Repository: UCLA-VAST/AutoSA
Branch: master
Commit: b61a1b4132d6
Files: 587
Total size: 8.9 MB

Directory structure:
gitextract_nqdxn4c0/

├── .dockerignore
├── .gitignore
├── .gitmodules
├── ChangeLog
├── Dockerfile
├── LICENSE
├── README.md
├── autosa_config/
│   ├── autosa_config.json
│   ├── hw_info.json
│   ├── hw_info_libs/
│   │   ├── hw_info.json.ku3
│   │   ├── hw_info.json.u200
│   │   └── hw_info.json.u250
│   ├── module_group.json
│   ├── optimizer_settings.json
│   └── optimizer_settings_libs/
│       ├── gemm3_fp32.json
│       ├── gemm3_int16.json
│       ├── gemm3_int16_32.json
│       ├── gemm3_int8.json
│       ├── gemm3_int8_64.json
│       ├── gemm4_fp32.json
│       ├── mm_small.json
│       ├── mttkrp_fp32.json
│       ├── ttm_fp32.json
│       └── ttmc_fp32.json
├── autosa_scripts/
│   ├── autosa.py
│   ├── codegen.py
│   ├── hls_scripts/
│   │   ├── hls_script.tcl
│   │   └── hls_script_synth.tcl
│   ├── intel_opencl_scripts/
│   │   ├── Makefile
│   │   ├── common/
│   │   │   ├── inc/
│   │   │   │   └── AOCLUtils/
│   │   │   │       ├── aocl_utils.h
│   │   │   │       ├── opencl.h
│   │   │   │       ├── options.h
│   │   │   │       └── scoped_ptrs.h
│   │   │   ├── readme.css
│   │   │   └── src/
│   │   │       └── AOCLUtils/
│   │   │           ├── opencl.cpp
│   │   │           └── options.cpp
│   │   └── compile_design.sh
│   ├── latency_model.py
│   ├── module_group.py
│   ├── odyssey/
│   │   ├── RL_utils.py
│   │   ├── analyze.py
│   │   ├── clean_up.sh
│   │   ├── cst/
│   │   │   ├── hw_cst.json
│   │   │   ├── single_test.json
│   │   │   ├── u250.json
│   │   │   └── vu9p.json
│   │   ├── design.py
│   │   ├── designs/
│   │   │   └── kernel3.json
│   │   ├── designs_lib/
│   │   │   ├── cnn/
│   │   │   │   ├── kernel0_0.json
│   │   │   │   ├── kernel0_1.json
│   │   │   │   ├── kernel0_2.json
│   │   │   │   ├── kernel1_0.json
│   │   │   │   ├── kernel1_1.json
│   │   │   │   ├── kernel1_2.json
│   │   │   │   ├── kernel2_0.json
│   │   │   │   ├── kernel2_1.json
│   │   │   │   ├── kernel2_2.json
│   │   │   │   ├── kernel3_0.json
│   │   │   │   ├── kernel3_1.json
│   │   │   │   ├── kernel3_2.json
│   │   │   │   ├── kernel4_0.json
│   │   │   │   ├── kernel4_1.json
│   │   │   │   ├── kernel4_2.json
│   │   │   │   ├── kernel5_0.json
│   │   │   │   ├── kernel5_1.json
│   │   │   │   ├── kernel5_2.json
│   │   │   │   ├── kernel6_0.json
│   │   │   │   ├── kernel6_1.json
│   │   │   │   ├── kernel6_2.json
│   │   │   │   ├── kernel7_0.json
│   │   │   │   ├── kernel7_1.json
│   │   │   │   ├── kernel7_2.json
│   │   │   │   ├── kernel8_0.json
│   │   │   │   ├── kernel8_1.json
│   │   │   │   ├── kernel8_2.json
│   │   │   │   ├── kernel9_0.json
│   │   │   │   ├── kernel9_1.json
│   │   │   │   └── kernel9_2.json
│   │   │   └── gemm/
│   │   │       ├── kernel0_0.json
│   │   │       ├── kernel0_1.json
│   │   │       ├── kernel0_2.json
│   │   │       ├── kernel1_0.json
│   │   │       ├── kernel1_1.json
│   │   │       ├── kernel1_2.json
│   │   │       ├── kernel2_0.json
│   │   │       ├── kernel2_1.json
│   │   │       ├── kernel2_2.json
│   │   │       ├── kernel3_0.json
│   │   │       ├── kernel3_1.json
│   │   │       ├── kernel3_2.json
│   │   │       ├── kernel4_0.json
│   │   │       ├── kernel4_1.json
│   │   │       ├── kernel4_2.json
│   │   │       ├── kernel5_0.json
│   │   │       ├── kernel5_1.json
│   │   │       └── kernel5_2.json
│   │   ├── explorer.py
│   │   ├── main.py
│   │   ├── requirements.txt
│   │   ├── scripts/
│   │   │   ├── compute_network_info.py
│   │   │   ├── grid_search_xgb_params.py
│   │   │   ├── img2col.py
│   │   │   ├── run_arch1.sh
│   │   │   ├── run_arch1_free.sh
│   │   │   ├── run_arch1_ml_cmp.sh
│   │   │   ├── run_arch2.sh
│   │   │   ├── run_arch3.sh
│   │   │   ├── run_arch4.sh
│   │   │   ├── run_dataflow_cmp_cnn.sh
│   │   │   ├── run_dataflow_cmp_mm.sh
│   │   │   ├── run_dataflow_cmp_mm_energy.sh
│   │   │   ├── run_img2col_single.sh
│   │   │   ├── run_method_cmp.sh
│   │   │   ├── run_metric_cmp.sh
│   │   │   ├── run_mutation_cmp.sh
│   │   │   └── split_cnn_layers.py
│   │   ├── search_task.py
│   │   ├── solver.py
│   │   ├── tuners.py
│   │   ├── unit_test.py
│   │   ├── utils.py
│   │   └── workload/
│   │       ├── conv.json
│   │       ├── mm.json
│   │       ├── mm64.json
│   │       ├── mobilenetv2.json
│   │       ├── mobilenetv2_1.json
│   │       ├── mobilenetv2_10.json
│   │       ├── mobilenetv2_11.json
│   │       ├── mobilenetv2_12.json
│   │       ├── mobilenetv2_13.json
│   │       ├── mobilenetv2_14.json
│   │       ├── mobilenetv2_15.json
│   │       ├── mobilenetv2_16.json
│   │       ├── mobilenetv2_17.json
│   │       ├── mobilenetv2_18.json
│   │       ├── mobilenetv2_19.json
│   │       ├── mobilenetv2_2.json
│   │       ├── mobilenetv2_20.json
│   │       ├── mobilenetv2_21.json
│   │       ├── mobilenetv2_22.json
│   │       ├── mobilenetv2_23.json
│   │       ├── mobilenetv2_24.json
│   │       ├── mobilenetv2_25.json
│   │       ├── mobilenetv2_26.json
│   │       ├── mobilenetv2_27.json
│   │       ├── mobilenetv2_28.json
│   │       ├── mobilenetv2_29.json
│   │       ├── mobilenetv2_3.json
│   │       ├── mobilenetv2_30.json
│   │       ├── mobilenetv2_31.json
│   │       ├── mobilenetv2_32.json
│   │       ├── mobilenetv2_33.json
│   │       ├── mobilenetv2_34.json
│   │       ├── mobilenetv2_35.json
│   │       ├── mobilenetv2_36.json
│   │       ├── mobilenetv2_4.json
│   │       ├── mobilenetv2_47.json
│   │       ├── mobilenetv2_5.json
│   │       ├── mobilenetv2_6.json
│   │       ├── mobilenetv2_7.json
│   │       ├── mobilenetv2_8.json
│   │       ├── mobilenetv2_9.json
│   │       ├── mobilenetv2_complete.json
│   │       ├── mobilenetv2_conv3_1_0.json
│   │       ├── mobilenetv2_first.json
│   │       ├── mobilenetv2_first1.json
│   │       ├── mobilenetv2_first2.json
│   │       ├── mobilenetv2_half.json
│   │       ├── mobilenetv2_img2col.json
│   │       ├── mobilenetv2_no_first.json
│   │       ├── mobilenetv2_original.json
│   │       ├── mobilenetv2_test.json
│   │       ├── mobilenetv2_test_single.json
│   │       ├── resnet152.json
│   │       ├── resnet50.json
│   │       ├── resnet50_1.json
│   │       ├── resnet50_10.json
│   │       ├── resnet50_11.json
│   │       ├── resnet50_12.json
│   │       ├── resnet50_13.json
│   │       ├── resnet50_14.json
│   │       ├── resnet50_15.json
│   │       ├── resnet50_16.json
│   │       ├── resnet50_17.json
│   │       ├── resnet50_18.json
│   │       ├── resnet50_19.json
│   │       ├── resnet50_2.json
│   │       ├── resnet50_20.json
│   │       ├── resnet50_21.json
│   │       ├── resnet50_22.json
│   │       ├── resnet50_23.json
│   │       ├── resnet50_24.json
│   │       ├── resnet50_25.json
│   │       ├── resnet50_26.json
│   │       ├── resnet50_27.json
│   │       ├── resnet50_28.json
│   │       ├── resnet50_29.json
│   │       ├── resnet50_3.json
│   │       ├── resnet50_30.json
│   │       ├── resnet50_31.json
│   │       ├── resnet50_32.json
│   │       ├── resnet50_33.json
│   │       ├── resnet50_34.json
│   │       ├── resnet50_35.json
│   │       ├── resnet50_36.json
│   │       ├── resnet50_37.json
│   │       ├── resnet50_38.json
│   │       ├── resnet50_39.json
│   │       ├── resnet50_4.json
│   │       ├── resnet50_40.json
│   │       ├── resnet50_41.json
│   │       ├── resnet50_42.json
│   │       ├── resnet50_43.json
│   │       ├── resnet50_44.json
│   │       ├── resnet50_45.json
│   │       ├── resnet50_46.json
│   │       ├── resnet50_47.json
│   │       ├── resnet50_48.json
│   │       ├── resnet50_49.json
│   │       ├── resnet50_5.json
│   │       ├── resnet50_6.json
│   │       ├── resnet50_7.json
│   │       ├── resnet50_8.json
│   │       ├── resnet50_9.json
│   │       ├── resnet50_batch4.json
│   │       ├── resnet50_conv5_1.json
│   │       ├── resnet50_img2col.json
│   │       ├── resnet50_last.json
│   │       ├── resnet50_last2.json
│   │       ├── resnet50_original.json
│   │       ├── vgg16-2-img2col.json
│   │       ├── vgg16-3.json
│   │       ├── vgg16-4.json
│   │       ├── vgg16.json
│   │       ├── vgg16_1.json
│   │       ├── vgg16_10.json
│   │       ├── vgg16_11.json
│   │       ├── vgg16_12.json
│   │       ├── vgg16_13.json
│   │       ├── vgg16_2.json
│   │       ├── vgg16_3.json
│   │       ├── vgg16_4.json
│   │       ├── vgg16_5.json
│   │       ├── vgg16_6.json
│   │       ├── vgg16_7.json
│   │       ├── vgg16_8.json
│   │       ├── vgg16_9.json
│   │       └── vgg16_img2col.json
│   ├── optimizer.py
│   ├── optimizer_prune.py
│   ├── pe_group.py
│   ├── ppcg_changes/
│   │   ├── isl/
│   │   │   ├── ast_type.h
│   │   │   ├── files.txt
│   │   │   ├── isl_patch.sh
│   │   │   ├── isl_schedule.c
│   │   │   ├── isl_schedule_band.c
│   │   │   ├── isl_schedule_band.h
│   │   │   ├── isl_schedule_node.c
│   │   │   ├── isl_schedule_tree.c
│   │   │   ├── isl_schedule_tree.h
│   │   │   ├── schedule.h
│   │   │   ├── schedule_node.h
│   │   │   └── vec.h
│   │   └── ppcg/
│   │       └── files.txt
│   ├── resource_model.py
│   ├── tapa_scripts/
│   │   └── CMakeLists.txt
│   ├── tuner/
│   │   ├── constraint.py
│   │   ├── cst/
│   │   │   └── hw_cst.json
│   │   ├── design.py
│   │   ├── main.py
│   │   ├── search_task.py
│   │   ├── task/
│   │   │   ├── cnn.json
│   │   │   ├── mm.json
│   │   │   └── mm2.json
│   │   ├── tuner.py
│   │   ├── unit_test.py
│   │   └── utils.py
│   ├── tuning_scripts/
│   │   ├── cnn.sh
│   │   ├── gemm.sh
│   │   └── model_validate.sh
│   └── vitis_scripts/
│       ├── Makefile
│       └── connectivity.cfg
├── autosa_tests/
│   ├── cnn/
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── connectivity.cfg
│   │   ├── hls_script.tcl
│   │   ├── kernel.c
│   │   ├── kernel.h
│   │   ├── param_names.json
│   │   └── simd_info.json
│   ├── dnn_ops/
│   │   ├── dc_simd_info.json
│   │   ├── fc_simd_info.json
│   │   ├── hls_script.tcl
│   │   ├── kernel.c
│   │   ├── kernel.h
│   │   └── pc_simd_info.json
│   ├── large/
│   │   ├── cnn/
│   │   │   ├── Makefile
│   │   │   ├── README.md
│   │   │   ├── connectivity.cfg
│   │   │   ├── hls_script.tcl
│   │   │   ├── kernel.c
│   │   │   ├── kernel.h
│   │   │   ├── simd_info.json
│   │   │   ├── step1-run-hls.tcl
│   │   │   ├── step2-autobridge.py
│   │   │   ├── step3-pack-xo.tcl
│   │   │   └── step4-run-vitis.sh
│   │   ├── mm/
│   │   │   ├── Makefile
│   │   │   ├── README.md
│   │   │   ├── connectivity.cfg
│   │   │   ├── hls_script.tcl
│   │   │   ├── kernel.c
│   │   │   ├── kernel.h
│   │   │   ├── simd_info.json
│   │   │   ├── step1-run-hls.tcl
│   │   │   ├── step2-autobridge.py
│   │   │   ├── step3-pack-xo.tcl
│   │   │   └── step4-run-vitis.sh
│   │   ├── mm_block_sparse/
│   │   │   ├── Makefile
│   │   │   ├── README.md
│   │   │   ├── connectivity.cfg
│   │   │   ├── hls_script.tcl
│   │   │   ├── kernel.c
│   │   │   ├── kernel.h
│   │   │   └── simd_info.json
│   │   ├── mm_int16/
│   │   │   ├── Makefile
│   │   │   ├── README.md
│   │   │   ├── code.c
│   │   │   ├── connectivity.cfg
│   │   │   ├── hls_script.tcl
│   │   │   ├── kernel.c
│   │   │   ├── kernel.h
│   │   │   ├── simd_info.json
│   │   │   ├── step1-run-hls.tcl
│   │   │   ├── step2-autobridge.py
│   │   │   ├── step3-pack-xo.tcl
│   │   │   ├── step4-run-vitis.sh
│   │   │   └── unroll.py
│   │   ├── mm_int8/
│   │   │   ├── Makefile
│   │   │   ├── README.md
│   │   │   ├── code.c
│   │   │   ├── connectivity.cfg
│   │   │   ├── hls_script.tcl
│   │   │   ├── kernel.c
│   │   │   ├── kernel.h
│   │   │   ├── kernel_kernel_opt.cpp
│   │   │   ├── simd_info.json
│   │   │   ├── step1-run-hls.tcl
│   │   │   ├── step2-autobridge.py
│   │   │   ├── step3-pack-xo.tcl
│   │   │   ├── step4-run-vitis.sh
│   │   │   └── unroll.py
│   │   ├── mm_intel/
│   │   │   ├── Makefile
│   │   │   ├── README.md
│   │   │   ├── kernel.c
│   │   │   ├── kernel.h
│   │   │   └── simd_info.json
│   │   ├── mttkrp/
│   │   │   ├── Makefile
│   │   │   ├── README.md
│   │   │   ├── connectivity.cfg
│   │   │   ├── kernel.c
│   │   │   ├── kernel.h
│   │   │   ├── simd_info.json
│   │   │   ├── step1-run-hls.tcl
│   │   │   ├── step2-autobridge.py
│   │   │   ├── step3-pack-xo.tcl
│   │   │   └── step4-run-vitis.sh
│   │   ├── ttm/
│   │   │   ├── Makefile
│   │   │   ├── README.md
│   │   │   ├── connectivity.cfg
│   │   │   ├── kernel.c
│   │   │   ├── kernel.h
│   │   │   └── simd_info.json
│   │   └── ttmc/
│   │       ├── Makefile
│   │       ├── README.md
│   │       ├── connectivity.cfg
│   │       ├── kernel.c
│   │       ├── kernel.h
│   │       ├── simd_info.json
│   │       ├── step1-run-hls.tcl
│   │       ├── step2-autobridge.py
│   │       ├── step3-pack-xo.tcl
│   │       └── step4-run-vitis.sh
│   ├── lu/
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── add_batch.py
│   │   ├── hls_script.tcl
│   │   ├── kernel.c
│   │   ├── kernel.h
│   │   └── simd_info.json
│   ├── mm/
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── connectivity.cfg
│   │   ├── hls_script.tcl
│   │   ├── kernel.c
│   │   ├── kernel.h
│   │   ├── param_names.json
│   │   └── simd_info.json
│   ├── mm_block_sparse/
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── connectivity.cfg
│   │   ├── hls_script.tcl
│   │   ├── kernel.c
│   │   ├── kernel.h
│   │   └── simd_info.json
│   ├── mm_catapult/
│   │   ├── README.md
│   │   ├── directives.tcl
│   │   ├── kernel.c
│   │   ├── kernel.h
│   │   ├── kernel_kernel_hw.h
│   │   └── simd_info.json
│   ├── mm_getting_started/
│   │   ├── Makefile
│   │   ├── connectivity.cfg
│   │   ├── hls_script.tcl
│   │   ├── kernel.c
│   │   ├── kernel.h
│   │   └── simd_info.json
│   ├── mm_hbm/
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── connectivity.cfg
│   │   ├── hls_script.tcl
│   │   ├── kernel.c
│   │   ├── kernel.h
│   │   └── simd_info.json
│   ├── mm_hcl/
│   │   ├── README.md
│   │   ├── hls_script.tcl
│   │   ├── kernel.c
│   │   ├── kernel.h
│   │   └── simd_info.json
│   ├── mm_hcl_intel/
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── kernel.c
│   │   ├── kernel.h
│   │   ├── kernel2.c
│   │   └── simd_info.json
│   ├── mm_int16/
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── connectivity.cfg
│   │   ├── hls_script.tcl
│   │   ├── kernel.c
│   │   ├── kernel.h
│   │   └── simd_info.json
│   └── mm_intel/
│       ├── Makefile
│       ├── README.md
│       ├── kernel.c
│       ├── kernel.h
│       └── simd_info.json
├── clean.sh
├── docs/
│   ├── Makefile
│   ├── conf.py
│   ├── docker_image.rst
│   ├── examples/
│   │   ├── cnn.rst
│   │   ├── cnn_large.rst
│   │   ├── dnn_ops.rst
│   │   ├── index.rst
│   │   ├── lu.rst
│   │   ├── mm.rst
│   │   ├── mm_block_sparse.rst
│   │   ├── mm_hbm.rst
│   │   ├── mm_int16.rst
│   │   ├── mm_int16_large.rst
│   │   ├── mm_int8_large.rst
│   │   ├── mm_large.rst
│   │   ├── mttkrp_large.rst
│   │   └── ttmc_large.rst
│   ├── index.rst
│   ├── install_from_source.rst
│   ├── installation.rst
│   ├── make.bat
│   └── tutorials/
│       ├── auto_bridge.rst
│       ├── auto_tuning_exhaustive.rst
│       ├── auto_tuning_genetic.rst
│       ├── catapult_backend.rst
│       ├── getting_started.rst
│       ├── hcl_integrate.rst
│       ├── host_serialize.rst
│       ├── index.rst
│       ├── intel_backend.rst
│       ├── matrix_multiplication.rst
│       ├── optimize_array.rst
│       ├── structural_sparsity.rst
│       └── theory_background.rst
├── install.sh
├── ltmain.sh
├── requirements.txt
└── src/
    ├── ChangeLog
    ├── LICENSE
    ├── Makefile.am
    ├── README
    ├── autogen.sh
    ├── autosa_catapult_hls_c.cpp
    ├── autosa_catapult_hls_c.h
    ├── autosa_codegen.cpp
    ├── autosa_codegen.h
    ├── autosa_comm.cpp
    ├── autosa_comm.h
    ├── autosa_common.cpp
    ├── autosa_common.h
    ├── autosa_cpu.cpp
    ├── autosa_cpu.h
    ├── autosa_intel_opencl.cpp
    ├── autosa_intel_opencl.h
    ├── autosa_print.cpp
    ├── autosa_print.h
    ├── autosa_schedule_tree.cpp
    ├── autosa_schedule_tree.h
    ├── autosa_t2s.cpp
    ├── autosa_tapa_cpp.cpp
    ├── autosa_tapa_cpp.h
    ├── autosa_trans.cpp
    ├── autosa_trans.h
    ├── autosa_tuning.cpp
    ├── autosa_tuning.h
    ├── autosa_utils.cpp
    ├── autosa_utils.h
    ├── autosa_xilinx_hls_c.cpp
    ├── autosa_xilinx_hls_c.h
    ├── configure.ac
    ├── cpu.c
    ├── cpu.h
    ├── examples/
    │   └── chemv.c
    ├── get_submodules.sh
    ├── grouping.c
    ├── grouping.h
    ├── hybrid.c
    ├── hybrid.h
    ├── json.hpp
    ├── m4/
    │   ├── ax_check_opencl.m4
    │   ├── ax_check_openmp.m4
    │   ├── ax_detect_git_head.m4
    │   └── ax_submodule.m4
    ├── main.cpp
    ├── ocl_utilities.c
    ├── ocl_utilities.h
    ├── opencl_test.sh.in
    ├── polybench_test.sh.in
    ├── ppcg.c
    ├── ppcg.h
    ├── ppcg_files/
    │   ├── cuda.c
    │   ├── cuda.h
    │   ├── cuda_common.c
    │   ├── cuda_common.h
    │   ├── gpu.c
    │   ├── gpu.h
    │   ├── gpu_array_tile.c
    │   ├── gpu_array_tile.h
    │   ├── gpu_group.c
    │   ├── gpu_group.h
    │   ├── gpu_hybrid.c
    │   ├── gpu_hybrid.h
    │   ├── gpu_print.c
    │   ├── gpu_print.h
    │   ├── gpu_tree.c
    │   ├── gpu_tree.h
    │   ├── opencl.c
    │   └── opencl.h
    ├── ppcg_options.c
    ├── ppcg_options.h
    ├── print.c
    ├── print.h
    ├── schedule.c
    ├── schedule.h
    ├── tests/
    │   ├── call.c
    │   ├── call2.c
    │   ├── call2_opencl_functions.cl
    │   ├── call3.c
    │   ├── call3_opencl_functions.cl
    │   ├── call4.c
    │   ├── call5.c
    │   ├── call_opencl_functions.cl
    │   ├── dead.c
    │   ├── iterator.c
    │   ├── live_out.c
    │   ├── local.c
    │   ├── loop.c
    │   ├── not_accessed.c
    │   ├── not_accessed_opencl_functions.cl
    │   ├── scalar.c
    │   ├── shared_sink.c
    │   ├── struct.c
    │   ├── struct2.c
    │   ├── struct3.c
    │   ├── struct4.c
    │   └── struct5.c
    ├── util.c
    ├── util.h
    └── version.c

================================================
FILE CONTENTS
================================================

================================================
FILE: .dockerignore
================================================
src/.deps/
src/.libs/
src/Makefile
src/Makefile.in
src/aclocal.m4
src/autom4te.cache/
src/compile
src/config.guess
src/config.log
src/config.status
src/config.sub
src/configure
src/depcomp
src/gitversion.h
src/install-sh
src/libtool
src/ltmain.sh
src/m4/libtool.m4
src/m4/ltoptions.m4
src/m4/ltsugar.m4
src/m4/ltversion.m4
src/m4/lt~obsolete.m4
src/missing
src/ppcg
src/test-driver
src/build
src/opencl_test.sh
src/polybench_test.sh
src/.nfs*
src/*.o
src/.vscode
src/autosa
src/tags

autosa
autosa.tmp
.nfs*


================================================
FILE: .gitignore
================================================
src/.deps/
src/.libs/
src/Makefile
src/Makefile.in
src/aclocal.m4
src/autom4te.cache/
src/compile
src/config.guess
src/config.log
src/config.status
src/config.sub
src/configure
src/depcomp
src/gitversion.h
src/install-sh
src/libtool
src/ltmain.sh
src/m4/libtool.m4
src/m4/ltoptions.m4
src/m4/ltsugar.m4
src/m4/ltversion.m4
src/m4/lt~obsolete.m4
src/missing
src/ppcg
src/test-driver
src/build
src/opencl_test.sh
src/polybench_test.sh
src/.nfs*
src/*.o
src/.vscode
src/autosa
src/tags

autosa
autosa.tmp
.nfs*
.vscode
.libs
autosa_scripts/__pycache__
docs/_build
autosa_scripts/tuner/__pycache__
autosa_scripts/tuner/outdir

autosa_scripts/odyssey/db/*
autosa_scripts/odyssey/outdir/*
autosa_scripts/odyssey/__pycache__
autosa_scripts/odyssey/tmp/*
autosa_scripts/odyssey/solver/*
autosa_scripts/odyssey/designs/register


================================================
FILE: .gitmodules
================================================
[submodule "src/isl"]
	path = src/isl
	url = git://repo.or.cz/isl.git
[submodule "src/pet"]
	path = src/pet
	url = git://repo.or.cz/pet.git
[submodule "src/cJSON"]
	path = src/cJSON
	url = https://github.com/DaveGamble/cJSON.git
[submodule "src/barvinok"]
	path = src/barvinok
	url = https://repo.or.cz/barvinok.git


================================================
FILE: ChangeLog
================================================
version: 0.01
2020-5-10 Jie Wang <jiewang@cs.ucla.edu>
changes:
  - initial release of AutoSA


================================================
FILE: Dockerfile
================================================
# Get the base Ubuntu image from Docker Hub.
# NOTE(review): the LLVM/Clang 9 packages installed below are tied to specific
# Ubuntu releases; with the floating `latest` tag they may stop resolving.
# Consider pinning the base image -- confirm the intended release first.
FROM ubuntu:latest
LABEL maintainer="jiewang@cs.ucla.edu"
# Keep apt from prompting for input during the image build.
ENV DEBIAN_FRONTEND=noninteractive

# Install the prerequisites.
# The package index is refreshed in the same RUN step as the installs so that
# a cached `apt-get update` layer is never reused with a changed package list.
RUN apt-get -y update && \
    apt-get -y install apt-utils automake autoconf libtool libtool-bin pkg-config libgmp3-dev libyaml-dev python3.6 python3-pip git wget cmake vim gdb && \
    apt-get -y install libllvm-9-ocaml-dev libllvm9 llvm-9 llvm-9-dev llvm-9-doc llvm-9-examples llvm-9-runtime clang-9 clang-tools-9 clang-9-doc libclang-common-9-dev libclang-9-dev libclang1-9 clang-format-9 python-clang-9 clangd-9
# Expose the versioned llvm-config-9 binary under the unversioned name.
RUN ln -s /usr/bin/llvm-config-9 /usr/bin/llvm-config

# Install NTL for barvinok: download the 11.4.3 release tarball, unpack it,
# then configure (with NTL_GMP_LIP=on), build and install from its src/ directory.
RUN mkdir /ntl
WORKDIR /ntl
RUN wget https://www.shoup.net/ntl/ntl-11.4.3.tar.gz
RUN gunzip ntl-11.4.3.tar.gz
RUN tar xf ntl-11.4.3.tar
WORKDIR /ntl/ntl-11.4.3/src
RUN ./configure NTL_GMP_LIP=on
RUN make -j4
RUN make install

# Copy the current folder (the build context) to the Docker image
COPY . /usr/src/docker_autosa

# Specify the working directory
WORKDIR /usr/src/docker_autosa

# Install AutoSA by running the repository's install script
RUN ./install.sh


================================================
FILE: LICENSE
================================================
MIT License (MIT)

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
<div align="center">
  <img src=".github/autosa_logo.png" width="200">
</div>

# AutoSA: Polyhedral-Based Systolic Array Auto-Compilation

[Documentation](https://autosa.readthedocs.io/en/latest/) |
[Installation](https://autosa.readthedocs.io/en/latest/installation.html) |
[Tutorials](https://autosa.readthedocs.io/en/latest/tutorials/index.html) |
[Examples](https://autosa.readthedocs.io/en/latest/examples/index.html)

This repository includes the code for AutoSA. AutoSA is an end-to-end systolic array compiler based on the polyhedral model. It takes algorithms written in a high-level programming language (C) as inputs, and performs polyhedral transformations and other architecture optimizations to map the algorithms to a systolic array architecture. The generated designs are in HLS C.

## Quick Start
We offer a Docker image for quick start.
```bash
docker pull whbldhwj/autosa:latest
```

Let's try one small example. The input code can be found at `${AUTOSA_ROOT}/autosa_tests/mm/kernel.c`. The code region to be transformed to systolic array is annotated using a pair of pragmas `scop` and `endscop`.

1. Generating HLS C Code.

Run the following command to generate a systolic array.
```c
./autosa ./autosa_tests/mm/kernel.c \
--config=./autosa_config/autosa_config.json \
--target=autosa_hls_c \
--output-dir=./autosa.tmp/output \
--sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" \
--simd-info=./autosa_tests/mm/simd_info.json \
--host-serialize
```
The generated code can be found in `${AUTOSA_ROOT}/autosa.tmp/output/src/`.
For a detailed explanation of each AutoSA compilation option, please run
```c
./autosa --help
```
or refer to [AutoSA Compilation Options](https://autosa.readthedocs.io/en/latest/tutorials/getting_started.html#autosa-compilation-options).

2. Generating FPGA Bitstream

To generate the final bitstream, set up your local Vitis development kit first.
Then execute the makefile to build the design.
```
cp ${AUTOSA_ROOT}/autosa_tests/mm/Makefile autosa.tmp/output/
cp ${AUTOSA_ROOT}/autosa_tests/mm/connectivity.cfg autosa.tmp/output/
cd ${AUTOSA_ROOT}/autosa.tmp/output
make all
```
**Makefile Options Descriptions**

* `MODE := hw_emu`: Set the build configuration mode to HW Emulation, other modes: sw_emu|hw
* `PLATFORM := xilinx_u250_xdma_201830_2`: Select the target platform
* `KERNEL_SRC := src/kernel_kernel.cpp`: List the kernel source files
* `HOST_SRC := src/kernel_host.cpp`: List the host source files

The `connectivity.cfg` describes the DRAM port mapping. For more details about how to change the DRAM port mapping, please refer to the Xilinx tutorials.

3. Verifying Designs Using Xilinx HLS

AutoSA also supports generating HLS projects. Add the flag
```
--hls
```
to the command when compiling the program.

```c
./autosa ./autosa_tests/mm/kernel.c \
--config=./autosa_config/autosa_config.json \
--target=autosa_hls_c \
--output-dir=./autosa.tmp/output \
--sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" \
--simd-info=./autosa_tests/mm/simd_info.json \
--host-serialize \
--hls
```

AutoSA will generate an HLS host file `${AUTOSA_ROOT}/autosa.tmp/output/src/kernel_host.cpp` instead of the OpenCL host file generated in the previous step. To build the HLS project, run the following commands.
```
cp ${AUTOSA_ROOT}/autosa_scripts/hls_scripts/hls_script.tcl autosa.tmp/output/
cd ${AUTOSA_ROOT}/autosa.tmp/output
vivado_hls -f hls_script.tcl
```

For more detailed instructions on using AutoSA, please refer to the [AutoSA Documentation](https://autosa.readthedocs.io/en/latest/).

## Send Us Failure Cases and Feedback!
AutoSA is open source for research purposes, and we would like to continuously improve it! Please let us know if...

1. you find any bug in the AutoSA code.
2. you find any application that fails the compilation flow of AutoSA.
3. you know how to further help improve any part of the compiler.
4. etc.

## Authors and Contributors
AutoSA is currently maintained by [Jie Wang](http://cadlab.cs.ucla.edu/~jaywang/).
Besides, we gratefully acknowledge the authors of [PPCG](https://github.com/Meinersbur/ppcg) for developing and actively maintaining PPCG as an open-source project.

## Papers
More implementation details of AutoSA are covered in [our paper](http://cadlab.cs.ucla.edu/~jaywang/papers/fpga21-autosa.pdf). If you find this project useful in your research, please consider citing:

    @inproceedings{wang2021autosa,
      title={AutoSA: A Polyhedral Compiler for High-Performance Systolic Arrays on FPGA},
      author={Wang, Jie and Guo, Licheng and Cong, Jason},
      booktitle={Proceedings of the 2021 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
      year={2021}
    }


================================================
FILE: autosa_config/autosa_config.json
================================================
{
    "space_time": {
        "mode": "manual"
    },
    "array_part": {
        "enable": 1,
        "mode": "manual"
    },
    "array_part_L2": {
        "enable": 1,
        "mode": "manual"
    },
    "latency": {
        "enable": 1,
        "mode": "manual"
    },
    "simd": {
        "enable": 1,
        "mode": "manual"
    },
    "hbm": {
        "mode": "manual"
    }
}


================================================
FILE: autosa_config/hw_info.json
================================================
{
  "BRAM18K": 5376,
  "DSP": 12288,
  "FF": 3456000,
  "LUT": 1728000,
  "URAM": 1280
}


================================================
FILE: autosa_config/hw_info_libs/hw_info.json.ku3
================================================
{
  "BRAM": 2160,
  "DSP": 2760,
  "FF": 663360,
  "LUT": 331680,
  "URAM": 0
}


================================================
FILE: autosa_config/hw_info_libs/hw_info.json.u200
================================================
{
  "BRAM": 4320,
  "DSP": 6840,
  "FF": 2364480,
  "LUT": 1182240,
  "URAM": 960
}


================================================
FILE: autosa_config/hw_info_libs/hw_info.json.u250
================================================
{
  "BRAM18K": 5376,
  "DSP": 12288,
  "FF": 3456000,
  "LUT": 1728000,
  "URAM": 1280
}


================================================
FILE: autosa_config/module_group.json
================================================
{
  "x": 8,
  "y": 1
}


================================================
FILE: autosa_config/optimizer_settings.json
================================================
{
    "training": {
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "random",
                "n": 2,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "random",
                "n": 2,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "random",
                "n": 2,
                "loop_limit": 4
            }
        },
        "pruning": {
            "array_part": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    32,
                    256
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ],
                "PE_ratio": 2
            }
        },
        "multiprocess": {
            "n_job": 1
        }
    },
    "synth": {
        "multiprocess": {
            "n_job": 16
        },
        "sample": {
            "n": 16
        }
    },
    "search": {
        "metric": "latency",
        "cycle_period": 5,
        "mode": "customized",
        "n_random": 5,
        "log": {
            "n_record": 10
        },
        "resource_target": ["BRAM18K", "DSP"],
        "time_out": -1,
        "update_time_interval": 2,        
        "pruning": {
            "random_start": {
                "enable": 1,
                "n_trial": 3,
                "n_random": 3
            },
            "resource": {                
                "range": {
                    "FF": [
                        0.25,
                        0.7
                    ],
                    "LUT": [
                        0.3,
                        0.75
                    ],
                    "DSP": [
                        0.6,
                        0.75
                    ],
                    "BRAM18K": [
                        0.3,
                        0.7
                    ],
                    "URAM": [
                        0,
                        0.6
                    ]
                }
            },
            "array_part": {
                "enable": 1,
                "PE_num": [
                    190,
                    210
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    64,
                    1280
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    190,
                    210
                ],
                "PE_ratio": 3
            }
        },
        "multiprocess": {
            "n_job": 32
        },
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 8
            }
        }
    }
}


================================================
FILE: autosa_config/optimizer_settings_libs/gemm3_fp32.json
================================================
{
    "training": {
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "random",
                "n": 2,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "random",
                "n": 2,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "random",
                "n": 2,
                "loop_limit": 8
            }
        },
        "pruning": {
            "array_part": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    16,
                    256
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ],
                "PE_ratio": 2
            }
        },
        "multiprocess": {
            "n_job": 1
        }
    },
    "synth": {
        "multiprocess": {
            "n_job": 16
        },
        "sample": {
            "n": 16
        }
    },
    "search": {
        "metric": "latency",
        "cycle_period": 5,
        "mode": "customized",
        "n_random": 5,
        "log": {
            "n_record": 10
        },
        "resource_target": ["BRAM18K", "DSP"],
        "time_out": -1,
        "update_time_interval": 2,        
        "pruning": {
            "random_start": {
                "enable": 1,
                "n_trial": 3,
                "n_random": 3
            },
            "resource": {                
                "range": {
                    "FF": [
                        0.25,
                        0.7
                    ],
                    "LUT": [
                        0.3,
                        0.75
                    ],
                    "DSP": [
                        0.6,
                        0.75
                    ],
                    "BRAM18K": [
                        0.3,
                        0.7
                    ],
                    "URAM": [
                        0,
                        0.6
                    ]
                }
            },
            "array_part": {
                "enable": 1,
                "PE_num": [
                    190,
                    210
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    64,
                    1280
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    190,
                    210
                ],
                "PE_ratio": 3
            }
        },
        "multiprocess": {
            "n_job": 32
        },
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 8
            }
        }
    }
}

================================================
FILE: autosa_config/optimizer_settings_libs/gemm3_int16.json
================================================
{
    "training": {
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "random",
                "n": 2,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "random",
                "n": 2,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "random",
                "n": 2,
                "loop_limit": 16
            }
        },
        "pruning": {
            "array_part": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    64,
                    256
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ],
                "PE_ratio": 2
            }
        },
        "multiprocess": {
            "n_job": 1
        }
    },
    "synth": {
        "multiprocess": {
            "n_job": 16
        },
        "sample": {
            "n": 16
        }
    },
    "search": {
        "metric": "latency",
        "cycle_period": 5,
        "mode": "customized",
        "n_random": 5,
        "log": {
            "n_record": 10
        },
        "resource_target": ["BRAM18K", "DSP"],
        "time_out": -1,
        "update_time_interval": 2,        
        "pruning": {
            "random_start": {
                "enable": 1,
                "n_trial": 3,
                "n_random": 3
            },
            "resource": {                
                "range": {
                    "FF": [
                        0.3,
                        0.7
                    ],
                    "LUT": [
                        0.3,
                        0.7
                    ],
                    "DSP": [
                        0.6,
                        0.75
                    ],
                    "BRAM18K": [
                        0.2,
                        0.7
                    ],
                    "URAM": [
                        0,
                        0.6
                    ]
                }
            },
            "array_part": {
                "enable": 1,
                "PE_num": [
                    480,
                    640
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    64,
                    1280
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    480,
                    640
                ],
                "PE_ratio": 3
            }
        },
        "multiprocess": {
            "n_job": 32
        },
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 16
            }
        }
    }
}

================================================
FILE: autosa_config/optimizer_settings_libs/gemm3_int16_32.json
================================================
{
    "training": {
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "random",
                "n": 2,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "random",
                "n": 2,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "random",
                "n": 2,
                "loop_limit": 8
            }
        },
        "pruning": {
            "array_part": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    16,
                    256
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ],
                "PE_ratio": 2
            }
        },
        "multiprocess": {
            "n_job": 1
        }
    },
    "synth": {
        "multiprocess": {
            "n_job": 16
        },
        "sample": {
            "n": 16
        }
    },
    "search": {
        "metric": "latency",
        "cycle_period": 5,
        "mode": "customized",
        "n_random": 5,
        "log": {
            "n_record": 10
        },
        "resource_target": ["DSP"],
        "time_out": -1,
        "update_time_interval": 2,        
        "pruning": {
            "random_start": {
                "enable": 1,
                "n_trial": 3,
                "n_random": 3
            },
            "resource": {                
                "range": {
                    "FF": [
                        0.25,
                        0.7
                    ],
                    "LUT": [
                        0.3,
                        0.75
                    ],
                    "DSP": [
                        0.5,
                        0.7
                    ],
                    "BRAM18K": [
                        0.3,
                        0.7
                    ],
                    "URAM": [
                        0,
                        0.6
                    ]
                }
            },
            "array_part": {
                "enable": 1,
                "PE_num": [
                    200,
                    300
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    64,
                    1024
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    200,
                    300
                ],
                "PE_ratio": 3
            }
        },
        "multiprocess": {
            "n_job": 32
        },
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 32
            }
        }
    }
}

================================================
FILE: autosa_config/optimizer_settings_libs/gemm3_int8.json
================================================
{
    "training": {
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "random",
                "n": 2,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "random",
                "n": 2,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "random",
                "n": 2,
                "loop_limit": 8
            }
        },
        "pruning": {
            "array_part": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    16,
                    256
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ],
                "PE_ratio": 2
            }
        },
        "multiprocess": {
            "n_job": 1
        }
    },
    "synth": {
        "multiprocess": {
            "n_job": 16
        },
        "sample": {
            "n": 16
        }
    },
    "search": {
        "metric": "latency",
        "cycle_period": 5,
        "mode": "customized",
        "n_random": 5,
        "log": {
            "n_record": 10
        },
        "resource_target": ["BRAM18K", "DSP"],
        "time_out": -1,
        "update_time_interval": 2,        
        "pruning": {
            "random_start": {
                "enable": 1,
                "n_trial": 3,
                "n_random": 3
            },
            "resource": {                
                "range": {
                    "FF": [
                        0.25,
                        0.7
                    ],
                    "LUT": [
                        0.3,
                        0.75
                    ],
                    "DSP": [
                        0.5,
                        0.7
                    ],
                    "BRAM18K": [
                        0.3,
                        0.75
                    ],
                    "URAM": [
                        0,
                        0.6
                    ]
                }
            },
            "array_part": {
                "enable": 1,
                "PE_num": [
                    350,
                    450
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    400,
                    1500
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    350,
                    450
                ],
                "PE_ratio": 3
            }
        },
        "multiprocess": {
            "n_job": 32
        },
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 32
            }
        }
    }
}

================================================
FILE: autosa_config/optimizer_settings_libs/gemm3_int8_64.json
================================================
{
    "training": {
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "random",
                "n": 2,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "random",
                "n": 2,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "random",
                "n": 2,
                "loop_limit": 8
            }
        },
        "pruning": {
            "array_part": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    16,
                    256
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ],
                "PE_ratio": 2
            }
        },
        "multiprocess": {
            "n_job": 1
        }
    },
    "synth": {
        "multiprocess": {
            "n_job": 16
        },
        "sample": {
            "n": 16
        }
    },
    "search": {
        "metric": "latency",
        "cycle_period": 5,
        "mode": "customized",
        "n_random": 5,
        "log": {
            "n_record": 10
        },
        "resource_target": ["DSP"],
        "time_out": -1,
        "update_time_interval": 2,        
        "pruning": {
            "random_start": {
                "enable": 1,
                "n_trial": 3,
                "n_random": 3
            },
            "resource": {                
                "range": {
                    "FF": [
                        0.25,
                        0.7
                    ],
                    "LUT": [
                        0.3,
                        0.75
                    ],
                    "DSP": [
                        0.5,
                        0.7
                    ],
                    "BRAM18K": [
                        0.3,
                        0.75
                    ],
                    "URAM": [
                        0,
                        0.6
                    ]
                }
            },
            "array_part": {
                "enable": 1,
                "PE_num": [
                    150,
                    200
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    256,
                    1000
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    150,
                    200
                ],
                "PE_ratio": 3
            }
        },
        "multiprocess": {
            "n_job": 32
        },
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 64
            }
        }
    }
}

================================================
FILE: autosa_config/optimizer_settings_libs/gemm4_fp32.json
================================================
{
    "training": {
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "random",
                "n": 2,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "random",
                "n": 2,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "random",
                "n": 2,
                "loop_limit": 8
            }
        },
        "pruning": {
            "array_part": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    64,
                    256
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ],
                "PE_ratio": 2
            }
        },
        "multiprocess": {
            "n_job": 1
        }
    },
    "synth": {
        "multiprocess": {
            "n_job": 16
        },
        "sample": {
            "n": 16
        }
    },
    "search": {
        "metric": "latency",
        "cycle_period": 5,
        "mode": "customized",
        "n_random": 5,
        "log": {
            "n_record": 10
        },
        "resource_target": ["BRAM18K", "DSP"],
        "time_out": -1,
        "update_time_interval": 2,        
        "pruning": {
            "random_start": {
                "enable": 1,
                "n_trial": 3,
                "n_random": 3
            },
            "resource": {                
                "range": {
                    "FF": [
                        0.25,
                        0.7
                    ],
                    "LUT": [
                        0.3,
                        0.75
                    ],
                    "DSP": [
                        0.5,
                        0.7
                    ],
                    "BRAM18K": [
                        0.3,
                        0.7
                    ],
                    "URAM": [
                        0,
                        0.6
                    ]
                }
            },
            "array_part": {
                "enable": 1,
                "PE_num": [
                    200,
                    210
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    64,
                    512
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    200,
                    210
                ],
                "PE_ratio": 3
            }
        },
        "multiprocess": {
            "n_job": 32
        },
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 32
            },
            "SIMD_vectorization": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 8
            }
        }
    }
}

================================================
FILE: autosa_config/optimizer_settings_libs/mm_small.json
================================================
{
    "training": {
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "random",
                "n": 2,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "random",
                "n": 2,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "random",
                "n": 2,
                "loop_limit": 4
            }
        },
        "pruning": {
            "array_part": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    32,
                    256
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ],
                "PE_ratio": 2
            }
        },
        "multiprocess": {
            "n_job": 1
        }
    },
    "synth": {
        "multiprocess": {
            "n_job": 16
        },
        "sample": {
            "n": 16
        }
    },
    "search": {
        "metric": "latency",
        "cycle_period": 5,
        "mode": "customized",
        "n_random": 5,
        "log": {
            "n_record": 10
        },
        "resource_target": ["BRAM18K", "DSP"],
        "time_out": -1,
        "update_time_interval": 2,        
        "pruning": {
            "random_start": {
                "enable": 1,
                "n_trial": 3,
                "n_random": 3
            },
            "resource": {                
                "range": {
                    "FF": [
                        0.25,
                        0.7
                    ],
                    "LUT": [
                        0.3,
                        0.75
                    ],
                    "DSP": [
                        0.0,
                        0.5
                    ],
                    "BRAM18K": [
                        0.0,
                        0.5
                    ],
                    "URAM": [
                        0,
                        0.6
                    ]
                }
            },
            "array_part": {
                "enable": 1,
                "PE_num": [
                    32,
                    128
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    32,
                    512
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    32,
                    128
                ],
                "PE_ratio": 3
            }
        },
        "multiprocess": {
            "n_job": 32
        },
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 8
            }
        }
    }
}


================================================
FILE: autosa_config/optimizer_settings_libs/mttkrp_fp32.json
================================================
{
    "training": {
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "random",
                "n": 2,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "random",
                "n": 2,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "random",
                "n": 2,
                "loop_limit": 8
            }
        },
        "pruning": {
            "array_part": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    80,
                    256
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ],
                "PE_ratio": 2
            }
        },
        "multiprocess": {
            "n_job": 1
        }
    },
    "synth": {
        "multiprocess": {
            "n_job": 16
        },
        "sample": {
            "n": 16
        }
    },
    "search": {
        "metric": "latency",
        "cycle_period": 5,
        "mode": "customized",
        "n_random": 5,
        "log": {
            "n_record": 10
        },
        "resource_target": ["BRAM18K", "DSP"],
        "time_out": -1,
        "update_time_interval": 2,        
        "pruning": {
            "random_start": {
                "enable": 1,
                "n_trial": 3,
                "n_random": 3
            },
            "resource": {                
                "range": {
                    "FF": [
                        0.25,
                        0.7
                    ],
                    "LUT": [
                        0.3,
                        0.75
                    ],
                    "DSP": [
                        0.6,
                        0.7
                    ],
                    "BRAM18K": [
                        0.2,
                        0.5
                    ],
                    "URAM": [
                        0,
                        0.6
                    ]
                }
            },
            "array_part": {
                "enable": 1,
                "PE_num": [
                    120,
                    130
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    70,
                    512
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    120,
                    130
                ],
                "PE_ratio": 3
            }
        },
        "multiprocess": {
            "n_job": 32
        },
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 8
            }
        }
    }
}

================================================
FILE: autosa_config/optimizer_settings_libs/ttm_fp32.json
================================================
{
    "training": {
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "random",
                "n": 2,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "random",
                "n": 2,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "random",
                "n": 2,
                "loop_limit": 8
            }
        },
        "pruning": {
            "array_part": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    64,
                    256
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ],
                "PE_ratio": 2
            }
        },
        "multiprocess": {
            "n_job": 1
        }
    },
    "synth": {
        "multiprocess": {
            "n_job": 16
        },
        "sample": {
            "n": 16
        }
    },
    "search": {
        "metric": "latency",
        "cycle_period": 5,
        "mode": "customized",
        "n_random": 5,
        "log": {
            "n_record": 10
        },
        "resource_target": ["BRAM18K", "DSP"],
        "time_out": 3,
        "update_time_interval": 2,        
        "pruning": {
            "random_start": {
                "enable": 1,
                "n_trial": 3,
                "n_random": 3
            },
            "resource": {                
                "range": {
                    "FF": [
                        0.25,
                        0.7
                    ],
                    "LUT": [
                        0.3,
                        0.75
                    ],
                    "DSP": [
                        0.6,
                        0.7
                    ],
                    "BRAM18K": [
                        0.1,
                        0.5
                    ],
                    "URAM": [
                        0,
                        0.6
                    ]
                }
            },
            "array_part": {
                "enable": 1,
                "PE_num": [
                    190,
                    200
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    64,
                    640
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    190,
                    200
                ],
                "PE_ratio": 3
            }
        },
        "multiprocess": {
            "n_job": 32
        },
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 8
            }
        }
    }
}

================================================
FILE: autosa_config/optimizer_settings_libs/ttmc_fp32.json
================================================
{
    "training": {
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "random",
                "n": 2,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "random",
                "n": 2,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "random",
                "n": 2,
                "loop_limit": 8
            }
        },
        "pruning": {
            "array_part": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    80,
                    256
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    8,
                    32
                ],
                "PE_ratio": 2
            }
        },
        "multiprocess": {
            "n_job": 1
        }
    },
    "synth": {
        "multiprocess": {
            "n_job": 16
        },
        "sample": {
            "n": 16
        }
    },
    "search": {
        "metric": "latency",
        "cycle_period": 5,
        "mode": "customized",
        "n_random": 5,
        "log": {
            "n_record": 10
        },
        "resource_target": ["BRAM18K", "DSP"],
        "time_out": 5,
        "update_time_interval": 2,        
        "pruning": {
            "random_start": {
                "enable": 1,
                "n_trial": 3,
                "n_random": 3
            },
            "resource": {                
                "range": {
                    "FF": [
                        0.25,
                        0.7
                    ],
                    "LUT": [
                        0.3,
                        0.75
                    ],
                    "DSP": [
                        0.6,
                        0.7
                    ],
                    "BRAM18K": [
                        0.1,
                        0.5
                    ],
                    "URAM": [
                        0,
                        0.6
                    ]
                }
            },
            "array_part": {
                "enable": 1,
                "PE_num": [
                    120,
                    140
                ]
            },
            "array_part_L2": {
                "enable": 1
            },
            "latency_hiding": {
                "enable": 1,
                "reg_size": [
                    64,
                    640
                ]
            },
            "SIMD_vectorization": {
                "enable": 1,
                "PE_num": [
                    120,
                    140
                ],
                "PE_ratio": 3
            }
        },
        "multiprocess": {
            "n_job": 32
        },
        "sample": {
            "space_time": {
                "mode": "exhaustive",
                "n": -1
            },
            "array_part": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": -1
            },
            "latency_hiding": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 64
            },
            "SIMD_vectorization": {
                "mode": "exhaustive",
                "n": -1,
                "loop_limit": 8
            }
        }
    }
}

================================================
FILE: autosa_scripts/autosa.py
================================================
#!/usr/bin/env python3
import sys
import subprocess
import os
import time

def exec_sys_cmd(cmd):
    """Run `cmd` through the shell, wait for it to finish, and return its exit status."""
    return subprocess.Popen(cmd, shell=True).wait()

if __name__ == "__main__":
    # Driver flow:
    #   1. parse the command line for the options this wrapper cares about,
    #   2. run the AutoSA binary (possibly several times when exploring loop
    #      permutations during tuning),
    #   3. build and run the generated top-module generator,
    #   4. unless tuning, call codegen.py to produce the final kernel and
    #      remove the intermediate files.

    # Some default values
    output_dir = './autosa.tmp/output'
    target = 'autosa_hls_c'
    src_file_prefix = 'kernel'
    xilinx_host = 'opencl'
    tuning = False
    # This flag forces ISL to perform loop fusion as much as possible
    isl_flag = '--isl-schedule-whole-component'
    hcl = False

    # Parse and update the arguments
    n_arg = len(sys.argv)
    argv = sys.argv
    insert_isl_flag = True
    assign_loop_permute = False
    explore_loop_permute = False
    for arg in argv:
        if 'output-dir' in arg:
            output_dir = arg.split('=')[-1]
        if 'target' in arg:
            target = arg.split('=')[-1]
        if 'tuning-method' in arg:
            tuning = True
        if 'isl-schedule-whole-component' in arg:
            # The user already passed the flag; do not add it a second time
            insert_isl_flag = False
        if 'loop-permute-order' in arg:
            assign_loop_permute = True
        if 'explore-loop-permute' in arg:
            explore_loop_permute = True
    if n_arg > 1:
        src_file = argv[1]
        src_file_prefix = os.path.basename(src_file).split('.')[0]
    if n_arg > 1 and target == 'autosa_hls_c':
        # Check whether to generate HLS or OpenCL host for Xilinx FPGAs
        for arg in argv:
            if '--hls' in arg:
                xilinx_host = 'hls'
            if '--hcl' in arg:
                hcl = True
    if n_arg > 1 and target == 'autosa_opencl':
        for arg in argv:
            if '--hcl' in arg:
                hcl = True

    # Check if the output directory exists before anything is written to it
    if not os.path.isdir(output_dir):
        raise RuntimeError('Output directory is not specified.')

    # Cache the AutoSA command.
    # The file is written directly instead of through `echo "..."` in a shell,
    # so quotes, `$` or backticks in the arguments are stored verbatim rather
    # than being interpreted. Caching stays best-effort: a missing `src`
    # sub-directory only produces a warning.
    autosa_cmd = ' '.join(argv)
    try:
        with open(f'{output_dir}/src/cmd', 'w') as cmd_file:
            cmd_file.write(autosa_cmd + '\n')
    except OSError as err:
        print(f'[AutoSA] Warning: failed to cache the command: {err}')

    argv[0] = './src/autosa'
    if insert_isl_flag:
        argv.append(isl_flag)

    # Execute the AutoSA
    complete = False
    permute_idx = 0
    while not complete:
        # Build the argument list for this run only, so that the permute flag
        # of an earlier iteration is not carried over into the next one.
        run_argv = list(argv)
        if permute_idx > 0:
            run_argv.append(f'--autosa-loop-permute-order={permute_idx}')
        process = subprocess.run(run_argv)
        if process.returncode != 0:
            print("[AutoSA] Error: Exit abnormally!")
            sys.exit(process.returncode)
        else:
            # Without the `completed` marker file there is nothing to post-process
            if not os.path.exists(output_dir + '/src/completed'):
                sys.exit(process.returncode)
        exec_sys_cmd(f'rm {output_dir}/src/completed')

        # Generate the top module
        print("[AutoSA] Post-processing the generated code...")
        if not os.path.exists(f'{output_dir}/src/{src_file_prefix}_top_gen.cpp'):
            raise RuntimeError(f'{output_dir}/src/{src_file_prefix}_top_gen.cpp not exists.')
        cmd = 'g++ -o ' + output_dir + '/src/top_gen ' + output_dir + \
              '/src/' + src_file_prefix + '_top_gen.cpp ' + \
              '-I./src/isl/include -L./src/isl/.libs -lisl'
        exec_sys_cmd(cmd)
        my_env = os.environ.copy()
        cwd = os.getcwd()
        isl_lib_dir = cwd + '/src/isl/.libs'
        # Only prepend a separator when there is an existing value; a leading
        # separator would add an empty entry (the current directory) to the
        # loader search path.
        if my_env.get('LD_LIBRARY_PATH'):
            my_env['LD_LIBRARY_PATH'] += os.pathsep + isl_lib_dir
        else:
            my_env['LD_LIBRARY_PATH'] = isl_lib_dir
        cmd = output_dir + '/src/top_gen'
        process = subprocess.run(cmd.split(), env=my_env)

        complete = True
        if tuning and explore_loop_permute:
            # A `permute*` file in the output directory tells whether another
            # run is needed: a name ending in `done` stops the exploration,
            # otherwise the trailing `_<n>` is the next permutation index.
            for filename in os.listdir(f'{output_dir}'):
                if filename.startswith("permute"):
                    if filename.endswith("done"):
                        complete = True
                    else:
                        permute_idx = int(filename.split("_")[-1])
                        if assign_loop_permute:
                            # The user fixed the order explicitly; run once only
                            complete = True
                        else:
                            complete = False

                    os.remove(f'{output_dir}/{filename}')
                    break

    if not tuning:
        # Generate the final code
        # NOTE(review): for a target not listed below, `cmd` still holds the
        # top_gen command assigned in the loop above and is executed again
        # here - confirm whether other targets are expected to reach this.
        if target == 'autosa_hls_c' or target == 'autosa_tapa':
            cmd = './autosa_scripts/codegen.py -c ' + output_dir + \
                  '/src/top.cpp -d ' + output_dir + '/src/' + src_file_prefix + \
                  '_kernel_modules.cpp -t ' + target + ' -o ' + output_dir + '/src/' + \
                  src_file_prefix + '_kernel.cpp'
            if hcl:
                cmd += ' --hcl'
        elif target == 'autosa_opencl':
            cmd = './autosa_scripts/codegen.py -c ' + output_dir + \
                  '/src/top.cpp -d ' + output_dir + '/src/' + src_file_prefix + \
                  '_kernel_modules.cl -t ' + target + ' -o ' + output_dir + '/src/' + \
                  src_file_prefix + '_kernel.cl'
            if hcl:
                cmd += ' --hcl'
        elif target == 'autosa_catapult_c':
            cmd = './autosa_scripts/codegen.py -c ' + output_dir + \
                  '/src/top.cpp -d ' + output_dir + '/src/' + src_file_prefix + \
                  '_kernel_modules.cpp -t ' + target + ' -o ' + output_dir + '/src/' + \
                  src_file_prefix + '_kernel_hw.h' + ' --tb ' + output_dir + '/src/' + \
                  src_file_prefix + '_host.cpp'
        if target == 'autosa_hls_c':
            cmd += ' --host '
            cmd += xilinx_host

        exec_sys_cmd(cmd)

        # Copy the input code to the output directory
        exec_sys_cmd(f'cp {argv[1]} {output_dir}/src/')
        # Copy the header sitting next to the source (same name, `.h`), if any
        headers = src_file.split('.')
        headers[-1] = 'h'
        headers = ".".join(headers)
        if os.path.exists(headers):
            exec_sys_cmd(f'cp {headers} {output_dir}/src/')

        # Clean up the temp files
        if target == 'autosa_hls_c' and xilinx_host == 'opencl':
            exec_sys_cmd(f'rm {output_dir}/src/{src_file_prefix}_kernel.h')
        exec_sys_cmd(f'rm {output_dir}/src/top_gen')
        exec_sys_cmd(f'rm {output_dir}/src/top.cpp')
        exec_sys_cmd(f'rm {output_dir}/src/{src_file_prefix}_top_gen.cpp')
        exec_sys_cmd(f'rm {output_dir}/src/{src_file_prefix}_top_gen.h')
        if target == 'autosa_hls_c' or target == 'autosa_catapult_c':
            exec_sys_cmd(f'rm {output_dir}/src/{src_file_prefix}_kernel_modules.cpp')
        elif target == 'autosa_opencl':
            exec_sys_cmd(f'rm {output_dir}/src/{src_file_prefix}_kernel_modules.cl')


================================================
FILE: autosa_scripts/codegen.py
================================================
#!/usr/bin/env python3

import sympy
import sys
import argparse
import re
import numpy as np
import os

def delete_arg_from_arg_list(line, arg, content):
    """ Repair the printed argument list after an argument line is deleted

    The caller drops `line` itself. When the dropped argument was the last one
    of its list (its line does not end with a comma), the previously printed
    entry still carries a trailing comma that must go. That comma is the last
    comma of the entry, so the entry is cut there and re-terminated with a
    newline. Entries without any comma are left untouched.

    Parameters
    ----------
    line: str
        codeline containing the argument that is being deleted
    arg: str
        argument to be deleted (not needed for the repair; kept so that
        existing callers keep working)
    content: list
        the printed content before current line; its last entry is modified
        in place when a trailing comma has to be removed

    Returns
    -------
    None
    """
    line = line.strip()
    # Nothing to repair for an empty line, for an argument that is followed
    # by further arguments, or when nothing has been printed yet.
    if not line or line.endswith(',') or not content:
        return

    comma_pos = content[-1].rfind(',')
    if comma_pos != -1:
        content[-1] = content[-1][:comma_pos] + '\n'

def print_module_def(
        f,
        arg_map,
        module_def,
        inline_module_defs,
        def_args,
        call_args_type):
    """ Print out module definitions for Intel OpenCL

    This function prints out the module definition with all arguments in the code
    replaced by the calling arguments.
    We will first extract the module ids and fifos from the module definition
    argument lists. These arguments are deleted from the argument lists as we will
    plug in the exact module ids and fifos from a call of this modules.
    As an example, the original module
      void A_IO_L3_in(int idx, fifo_type fifo)
    will be modified to
      void A_IO_L3_in_[arg_map[idx]]()

    When `inline_module_defs` is given, every call to one of those modules
    found inside `module_def` is handled first: the inline module is printed
    through a recursive call (once per invocation of this function) and the
    call site is rewritten to the specialized module name.

    Parameters
    ----------
    f:
        file handle; the generated text is emitted with f.writelines()
    arg_map:
        maps from module definition args to module call args
    module_def:
        a list storing the module definition texts; entries holding inline
        module calls are overwritten in place
    inline_module_defs:
        a dict containing all the inline module definitions (may be None)
    def_args:
        a list storing the module definition arguments
    call_args_type:
        a list storing the type of each module call arg, indexed like def_args
    """
    # Print inline module definitions
    if inline_module_defs:
        # Each inline module should be only printed once.
        # We assume the module ids and fifos are unchanged in multiple inline module
        # calls. Therefore, only the first encounter will be handled.
        inline_module_handled = []
        for inline_module in inline_module_defs:
            # Search for the inline modules
            for line_id in range(len(module_def)):
                line = module_def[line_id]
                if line.find(inline_module + '(') != -1:
                    # The current line contains the inline module call
                    if inline_module in inline_module_handled:
                        # Already printed: only rewrite the call site to the
                        # specialized name, i.e. the module name followed by
                        # '_<value>' for every module id of the enclosing module.
                        line_indent = line.find(inline_module)
                        line = ' ' * line_indent + inline_module
                        for i in range(len(def_args)):
                            def_arg = def_args[i]
                            arg_type = call_args_type[i]
                            if arg_type == 'module id':
                                line += '_'
                                line += arg_map[def_arg]
                        line += '(\n'
                        module_def[line_id] = line
                        continue
                    else:
                        inline_module_handled.append(inline_module)
                    # Print the inline module definition
                    inline_module_call_args = []
                    inline_module_call_args_type = []
                    inline_module_def_args = []
                    inline_module_arg_map = {}
                    inline_module_name = inline_module
                    inline_module_def = inline_module_defs[inline_module_name]
                    # Extract the arg list in module definition
                    # (if several lines contain 'void', the last match wins)
                    for inline_module_line in inline_module_def:
                        if inline_module_line.find('void') != -1:
                            m = re.search(r'\((.+?)\)', inline_module_line)
                            if m:
                                def_args_old = m.group(1)
                    def_args_old = def_args_old.split(', ')
                    for arg in def_args_old:
                        # Keep only the last whitespace-separated token of
                        # each declaration, i.e. the argument name
                        arg = arg.split()[-1]
                        inline_module_def_args.append(arg)
                    # Extract the arg list in module call
                    # The lines following the call are scanned until one
                    # contains ');'. On each of them the text inside
                    # '/* ... */' is the argument type and the text after
                    # '*/ ' (up to the first comma) is the argument.
                    next_line_id = line_id + 1
                    next_line = module_def[next_line_id]
                    while next_line.find(');') == -1:
                        m = re.search(r'/\*(.+?)\*/', next_line)
                        if m:
                            arg_type = m.group(1).strip()
                            inline_module_call_args_type.append(arg_type)
                            m = re.search(r'\*/ (.+)', next_line)
                            if m:
                                call_arg = m.group(1).split(',')[0]
                                inline_module_call_args.append(call_arg)
                        next_line_id += 1
                        next_line = module_def[next_line_id]
                    # Build a mapping between the def_arg to call_arg
                    #print(inline_module_def_args)
                    #print(inline_module_call_args)
                    for i in range(len(inline_module_def_args)):
                        def_arg = inline_module_def_args[i]
                        call_arg = inline_module_call_args[i]
                        inline_module_arg_map[def_arg] = call_arg
                    # Replace the module ids and fifos from the upper module
                    for def_arg in inline_module_arg_map:
                        call_arg = inline_module_arg_map[def_arg]
                        if call_arg in arg_map:
                            inline_module_arg_map[def_arg] = arg_map[call_arg]
                    # Recurse on a copy of the inline definition. None is
                    # passed for the inline modules, so calls nested inside
                    # the inline module are not expanded further.
                    print_module_def(
                        f,
                        inline_module_arg_map,
                        inline_module_def.copy(),
                        None,
                        inline_module_def_args,
                        inline_module_call_args_type)
                    # Replace the inline module call with the new inline module
                    # name
                    line_indent = line.find(inline_module)
                    line = ' ' * line_indent + inline_module
                    for i in range(len(def_args)):
                        def_arg = def_args[i]
                        arg_type = call_args_type[i]
                        if arg_type == 'module id':
                            line += '_'
                            line += arg_map[def_arg]
                    line += '(\n'
                    module_def[line_id] = line

    # Extract module ids and fifos from def_args
    module_id_args = []
    fifo_args = []
    # print(def_args)
    # print(call_args_type)
    for i in range(len(def_args)):
        def_arg = def_args[i]
        arg_type = call_args_type[i]
        if arg_type == 'module id':
            module_id_args.append(def_arg)
        if arg_type == 'fifo':
            fifo_args.append(def_arg)

    # Start printing
    # The output is collected in print_content and written in one go at the
    # end, because deleting an argument may require patching the entry that
    # was appended last (see delete_arg_from_arg_list).
    print_content = []
    print_content.append('/* Module Definition */\n')
    # line_id is incremented per line below but not read inside this loop
    line_id = 0
    for line in module_def:
        if line.find('void') != -1:
            # This line is kernel argument.
            # All module id and fifo arguments are deleted
            m = re.search(r'(.+?)\(', line)
            if m:
                # Everything before the first '(' (return type and module name)
                prefix = m.group(1)
            arg_start_pos = line.find('(')
            arg_end_pos = line.rfind(')')
            # From here on def_args holds the full declarations taken from
            # this line and no longer the argument names passed by the caller
            def_args = line[arg_start_pos + 1 : arg_end_pos]
            #m = re.search(r'\((.+?)\)', line)
            #if m:
            #    def_args = m.group(1)
            def_args = def_args.split(', ')
            new_def_args = []
            for i in range(len(def_args)):
                if call_args_type[i] != 'module id' and call_args_type[i] != 'fifo':
                    new_def_args.append(def_args[i])
            # f.write(prefix + '(')
            # Print the module_name
            print_content.append(prefix)
            for module_id in module_id_args:
                print_content.append('_' + arg_map[module_id])
            print_content.append('(')
            first = True
            for arg in new_def_args:
                if not first:
                    print_content.append(', ')
                print_content.append(arg)
                first = False
            #print_content.append(')\n')
            # Re-emit the rest of the line starting at the last ')'
            print_content.append(line[arg_end_pos:])
        else:
            # module ids
            for module_id in module_id_args:
                if line.find(module_id) != -1:
                    # Test if it is inside an argument list
                    m = re.search(
                        r'/\* module id \*/ ' +
                        re.escape(module_id),
                        line)
                    if m:
                        # Delete if from the argument list
                        delete_arg_from_arg_list(
                            line, module_id, print_content)
                        # None marks the line as dropped from the output
                        line = None
                        break
                    else:
                        # Plug in module ids
                        # Only occurrences delimited by non-identifier
                        # characters on both sides are replaced.
                        # NOTE(review): re.escape() is applied to the value
                        # placed in the replacement template; for values with
                        # regex-special characters the added backslashes
                        # presumably end up in the output - confirm.
                        line = re.sub(
                            r'([^a-zA-Z_])(' +
                            re.escape(module_id) +
                            r')([^a-zA-Z0-9_])',
                            r'\g<1>' +
                            re.escape(
                                arg_map[module_id]) +
                            r'\g<3>',
                            line)
            # fifos
            if line:
                for fifo in fifo_args:
                    if line.find(fifo) != -1:
                        # Test if it is inside a read/write API call
                        if line.find('read_channel_intel') != - \
                                1 or line.find('write_channel_intel') != -1:
                            # Plug in fifos
                            # NOTE(review): same re.escape() usage in the
                            # replacement template as for module ids above.
                            line = re.sub(
                                r'([^a-zA-Z_])(' +
                                re.escape(fifo) +
                                r')([^a-zA-Z0-9_])',
                                r'\g<1>' +
                                re.escape(
                                    arg_map[fifo]) +
                                r'\g<3>',
                                line)
                        else:
                            # Test if it is inside an argument list
                            m = re.search(
                                r'/\* fifo \*/ ' + re.escape(fifo), line)
                            if m:
                                # Delete it from the argument list
                                delete_arg_from_arg_list(
                                    line, fifo, print_content)
                                line = None
                                break
            if line is not None:
                print_content.append(line)
        line_id += 1
    print_content.append('/* Module Definition */\n\n')

    f.writelines(print_content)


def generate_intel_kernel(
        kernel,
        headers,
        module_defs,
        module_calls,
        fifo_decls):
    """ Write the final Intel OpenCL kernel file

    The file starts with the headers, the channel extension pragma and the
    channel declarations. After that, one specialized module definition is
    printed per module call, with the module ids and fifos of that call
    plugged in. Modules that are defined but never called at the top level
    are treated as inline modules and handed to print_module_def().

    Parameters
    ----------
    kernel:
        path of the output file
    headers:
        list containing the header lines to be printed
    module_defs:
        dict mapping a module name to the list of its definition lines
    module_calls:
        list of module calls, each a list of lines whose first line holds
        the module name followed by '('
    fifo_decls:
        list containing the fifo declarations (printed one per line)
    """
    name_pattern = r'(.+?)\('
    inline_module_defs = {}

    with open(kernel, 'w') as f:
        # Headers, followed by a blank separator line
        f.writelines(headers)
        f.write('\n')

        f.write('#pragma OPENCL EXTENSION cl_intel_channels : enable\n\n')

        # Channel declarations, enclosed by marker comments
        f.write('/* Channel Declaration */\n')
        f.writelines(fifo_decl + '\n' for fifo_decl in fifo_decls)
        f.write('/* Channel Declaration */\n\n')

        # Collect the inline modules: definitions whose name is not the
        # name of any top-level module call.
        for module_name in module_defs:
            for module_call in module_calls:
                found = re.search(name_pattern, module_call[0])
                if found:
                    cur_module_name = found.group(1)
                if module_name == cur_module_name:
                    break
            else:
                inline_module_defs[module_name] = module_defs[module_name]

        # Print one specialized definition per module call
        for module_call in module_calls:
            # Name of the called module: text before the first '('
            found = re.search(name_pattern, module_call[0])
            if found:
                module_name = found.group(1)
            module_def = module_defs[module_name]

            # Argument names of the definition, taken from the text between
            # the first '(' and the last ')' of the 'void' line
            for def_line in module_def:
                if 'void' in def_line:
                    def_args_old = def_line[def_line.find('(') + 1 : def_line.rfind(')')]
            def_args_old = def_args_old.split(', ')
            def_args = [decl.split()[-1] for decl in def_args_old]

            # Argument types ('/* type */') and values of the call
            call_args = []
            call_args_type = []
            for call_line in module_call:
                type_match = re.search(r'/\*(.+?)\*/', call_line)
                if not type_match:
                    continue
                call_args_type.append(type_match.group(1).strip())
                value_match = re.search(r'\*/ (.+)', call_line)
                if value_match:
                    call_args.append(value_match.group(1).strip(','))

            # Only module ids and fifos are substituted in the definition
            arg_map = {}
            for i, def_arg in enumerate(def_args):
                if call_args_type[i] in ('module id', 'fifo'):
                    arg_map[def_arg] = call_args[i]

            # Work on a copy, since print_module_def rewrites lines in place
            print_module_def(
                f,
                arg_map,
                module_def.copy(),
                inline_module_defs,
                def_args,
                call_args_type)

def contains_pipeline_for(pos, lines):
    """ Examine if there is any for loop with hls_pipeline annotation inside the current for loop

    Scanning starts at lines[pos]. The first line containing 'for' is taken as
    the current loop. Every later line containing 'for' counts as a nested
    loop, and it is pipelined when the line right after it contains
    'hls_pipeline'. Scanning stops once the number of lines with '{' equals
    the number of lines with '}' after the current loop has been seen, when
    more closing than opening lines have been seen, or at the end of `lines`.

    Parameters
    ----------
    pos: int
        index of the line to start scanning from
    lines: list
        contains the codelines of the program

    Returns
    -------
    int
        1 if a nested for loop annotated with hls_pipeline is found, else 0
    """
    n_l_bracket = 0
    n_r_bracket = 0
    code_len = len(lines)
    init_state = 1
    while pos < code_len and n_r_bracket <= n_l_bracket:
        line = lines[pos]
        if '{' in line:
            n_l_bracket += 1
        if '}' in line:
            n_r_bracket += 1
        if 'for' in line:
            if init_state:
                # This is the current loop itself, not a nested one
                init_state = 0
            elif pos + 1 < code_len and 'hls_pipeline' in lines[pos + 1]:
                # The bounds check covers a nested loop on the very last line
                return 1
        if n_l_bracket == n_r_bracket and not init_state:
            break
        pos += 1
    return 0


def insert_xlnx_pragmas(lines):
    """ Insert HLS pragmas for Xilinx program

    Replace the comments of "// hls_pipeline", "// hls_dependence.x" and
    "// hls_unroll" with HLS pragmas.
    For "// hls_pipeline", find the enclosing for loop above the mark (the
    upward search gives up at a line containing "while").
    Insert "#pragma HLS PIPELINE II=1" below the for loop.
    For "// hls_unroll", find the previous for loop before hitting the "simd" mark.
    Insert "#pragma HLS UNROLL" below the for loop.
    For "// hls_dependence.x", the position is the same with hls_pipeline.
    Insert "#pragma HLS DEPENDENCE variable=x inter false".

    A mark is left untouched when another mark of the same kind is found
    further down in the region scanned below it, when no suitable for loop is
    found, when the pragma is already present, or when the chosen loop
    contains a nested loop that carries its own hls_pipeline mark.

    The list is edited in place: each replacement inserts one line and deletes
    one line, so the number of lines does not change.

    Parameters
    ----------
    lines:
        contains the codelines of the program

    Returns
    -------
    The same list object, with the handled marks replaced by pragmas.
    """
    # Handle hls_dependence
    handle_dep_pragma = 1

    code_len = len(lines)
    pos = 0
    while pos < code_len:
        line = lines[pos]
        if line.find("// hls_pipeline") != - \
                1 or line.find("// hls_dependence") != -1:
            # Exactly one of the two flags is set; pipeline wins if both
            # marks appear on the same line
            is_pipeline = 0
            is_dep = 0
            if line.find('// hls_pipeline') != -1:
                is_pipeline = 1
            else:
                is_dep = 1
            # Find if there is any other hls_pipeline/hls_dependence annotation
            # below
            n_l_bracket = 0
            n_r_bracket = 0
            next_pos = pos + 1
            find_pipeline = 0
            init_state = 1
            # Scan downwards until the lines with '{' and with '}' balance
            # again (after at least one '{' was seen) or a surplus '}' shows up
            while next_pos < code_len and n_r_bracket <= n_l_bracket:
                if is_pipeline and lines[next_pos].find('hls_pipeline') != -1:
                    find_pipeline = 1
                    break
                if is_dep and lines[next_pos].find(
                        'hls_dependence') != -1 and handle_dep_pragma:
                    find_pipeline = 1
                    break
                if lines[next_pos].find('{') != -1:
                    n_l_bracket += 1
                    init_state = 0
                if lines[next_pos].find('}') != -1:
                    n_r_bracket += 1
                if n_l_bracket == n_r_bracket and not init_state:
                    break
                next_pos += 1
            if find_pipeline:
                # A later mark of the same kind exists; skip this one and
                # leave the comment in place
                pos += 1
                continue

            # Find the enclosing for loop above; the search gives up at a
            # line containing "while"
            prev_pos = pos - 1
            find_for = 0
            n_l_bracket = 0
            n_r_bracket = 0
            while prev_pos >= 0:
                if lines[prev_pos].find('while') != -1:
                    break
                if lines[prev_pos].find('{') != -1:
                    n_l_bracket += 1
                if lines[prev_pos].find('}') != -1:
                    n_r_bracket += 1
                if lines[prev_pos].find('for') != -1:
                    # Only a loop with more '{' lines than '}' lines counted
                    # so far (its own line included) is accepted; other 'for'
                    # lines are skipped and the search continues upwards
                    if n_l_bracket > n_r_bracket:
                        # check if the pragma is already inserted
                        if is_pipeline and lines[prev_pos +
                                                 1].find('#pragma HLS PIPELINE II=1\n') == -1:
                            find_for = 1
                        # The DEPENDENCE pragma is looked for two lines below
                        # the loop header
                        if is_dep and lines[prev_pos + 2].find(
                                '#pragma HLS DEPENDENCE') == -1 and handle_dep_pragma:
                            find_for = 1
                        # check if there is any other for loop with
                        # hls_pipeline annotation inside
                        if contains_pipeline_for(prev_pos, lines):
                            find_for = 0
                        break
                prev_pos -= 1
            if find_for == 1:
                # insert the pragma right after the for loop
                indent = lines[prev_pos].find('for')
                if line.find("hls_pipeline") != -1:
                    new_line = ' ' * indent + "#pragma HLS PIPELINE II=1\n"
                else:
                    # The variable name is the text after the last '.' of the
                    # mark
                    line_cp = line
                    var_name = line_cp.strip().split('.')[-1]
                    new_line = ' ' * indent + "#pragma HLS DEPENDENCE variable=" + \
                        var_name + " inter false\n"
                lines.insert(prev_pos + 1, new_line)
                # The insertion above shifted the mark from pos to pos + 1
                del lines[pos + 1]
        elif line.find("// hls_unroll") != -1:
            # Find the for loop above before hitting any "simd"
            prev_pos = pos - 1
            find_for = 0
            while prev_pos >= 0 and lines[prev_pos].find('simd') == -1:
                if lines[prev_pos].find('for') != -1:
                    find_for = 1
                    break
                prev_pos -= 1
            if find_for == 1:
                # insert the pragma right after the for loop
                indent = lines[prev_pos].find('for')
                new_line = ' ' * indent + "#pragma HLS UNROLL\n"
                lines.insert(prev_pos + 1, new_line)
                # The insertion above shifted the mark from pos to pos + 1
                del lines[pos + 1]
        pos = pos + 1

    return lines

def insert_catapult_pragmas(lines):
    """ Insert Catapult HLS pragmas for Catapult program

    Replace the comments of "// hls_unroll" with HLS pragmas.
    For "// hls_unroll", look at the line right below the mark. If it contains
    a for loop, "#pragma unroll yes" takes the place of the mark, i.e., it
    ends up right before the for loop. Otherwise the mark is left untouched.

    The list is edited in place; each replacement inserts one line and deletes
    one line, so the number of lines does not change.

    Parameters
    ----------
    lines:
        contains the codelines of the program

    Returns
    -------
    The same list object, with the handled marks replaced by pragmas.
    """
    code_len = len(lines)
    pos = 0
    while pos < code_len:
        line = lines[pos]
        if line.find("// hls_unroll") != -1:
            # Find the for loop below
            next_pos = pos + 1
            # A mark on the very last line has no loop below it; skip it
            # instead of indexing past the end of the program
            if next_pos < code_len and lines[next_pos].find('for') != -1:
                # insert the pragma right before the for loop
                indent = lines[next_pos].find('for')
                new_line = ' ' * indent + "#pragma unroll yes\n"
                lines.insert(next_pos, new_line)
                del lines[pos]
        pos = pos + 1

    return lines

def float_to_int(matchobj):
    """ Regex substitution callback turning integral floats into integers

    Parameters
    ----------
    matchobj:
        match object whose whole match is the text of a number

    Returns
    -------
    The integer spelling of the number if its value has no fractional part,
    otherwise the matched text unchanged.
    """
    text = matchobj.group(0)
    value = float(text)
    whole = int(value)
    return str(whole) if value == whole else text


def index_simplify(matchobj):
    """ Regex substitution callback simplifying one bracketed index expression

    Parameters
    ----------
    matchobj:
        match object whose whole match is "[<expr>]"

    Returns
    -------
    "[<simplified expr>]", or the matched text unchanged when the expression
    is one of the special tokens, contains "++", or when the simplified C
    code contains "floor", "ceil" or ".0".
    """
    original = matchobj.group(0)
    if original in ('[arb]', '[!arb]', '[index[n]') or '++' in original:
        return original

    # Simplification will sometimes cause bugs due to the different semantics
    # in C. E.g., x = 9, (x+3)/4 != x/4+3/4.
    # We could use cxxcode, but it will generate floating expressions which
    # are expensive on FPGA.
    # At present, we check if there is floor or ceil in the expression.
    # If so, we abort and use the original expression. Otherwise, we replace
    # it with the simplified one.
    inner = original[1:-1]
    simplified = sympy.simplify(sympy.sympify(inner))
    rendered = sympy.printing.ccode(simplified)

    for marker in ('floor', 'ceil', '.0'):
        if marker in rendered:
            return original
    return '[' + rendered + ']'


def mod_simplify(matchobj):
    """ Regex substitution callback simplifying the left operand of a modulo

    Parameters
    ----------
    matchobj:
        match object whose whole match is "(<expr>) %"

    Returns
    -------
    "(<simplified expr>) %", where the expression is printed with str().
    """
    matched = matchobj.group(0)
    # Drop the leading "(" and the trailing ") %"
    operand = matched[1:-3]
    simplified = sympy.simplify(sympy.sympify(operand))
    return '(' + str(simplified) + ') %'


def simplify_expressions(lines):
    """ Simplify the index expressions in the program

    Use Sympy to simplify all the array index expressions "[...]" in the
    program, then, in a second pass, all the operands of the form "(...) %".
    The list is updated in place.

    Parameters
    ----------
    lines:
        contains the codelines of the program

    Returns
    -------
    The same list object with the simplified codelines.
    """
    # First pass: array index expressions
    for idx, text in enumerate(lines):
        lines[idx] = re.sub(r'\[(.+?)\]', index_simplify, text)

    # Second pass: mod expressions
    for idx, text in enumerate(lines):
        lines[idx] = re.sub(r'\((.+?)\) %', mod_simplify, text)

    return lines

def _shallow_iter_type(bitwidth, target):
    """ Spell the unsigned integer type of the given bitwidth for the target """
    if target == 'xilinx':
        return 'ap_uint<' + str(bitwidth) + '>'
    if target == 'intel':
        return 'uint' + str(bitwidth) + '_t'
    if target == 'catapult':
        return 'ac_int<' + str(bitwidth) + ', false>'
    raise ValueError('Unsupported target for shrink_bit_width: ' + str(target))


def _ceil_log2(n_values):
    """ Exact integer ceil(log2(n_values)); 0 for n_values <= 1 """
    return max(n_values - 1, 0).bit_length()


def shrink_bit_width(lines, target):
    """ Calculate the bitwidth of the iterator and shrink it to the proper size

    We will examine the for loops. Examine the upper bound of the loop. If the
    upper bound is a number, we will compute the bitwidth of the iterator and
    replace the "int" type on that line.
    We will also look for iterator definitions marked with
    "/* UB: [...] */". The shallow bitwidth is calculated and replaces the
    previous data type.

    Only the standalone word "int" is replaced, so identifiers and types that
    merely contain these letters (e.g., "point", "int32_t") are left alone.
    The list is updated in place.

    Parameters
    ----------
    lines:
        contains the codelines of the program
    target:
        xilinx|intel|catapult

    Returns
    -------
    The same list object with the iterator types replaced.

    Raises
    ------
    ValueError
        if a type has to be replaced and the target is not one of the above
    """
    code_len = len(lines)
    for pos in range(code_len):
        line = lines[pos]
        if line.find('for') != -1:
            # Inclusive bound "<= ub": the iterator takes ub + 1 values.
            # One more bit is added on top of ceil(log2(.)), presumably so
            # that the loop exit value still fits.
            m = re.search('<=(.+?);', line)
            if m:
                ub = m.group(1).strip()
                if ub.isnumeric():
                    # Replace it with shallow bit width
                    bitwidth = _ceil_log2(int(ub) + 1) + 1
                    new_iter_t = _shallow_iter_type(bitwidth, target)
                    line = re.sub(r'\bint\b', new_iter_t, line)
                    lines[pos] = line
            # Exclusive bound "< ub": the iterator takes ub values. A bound
            # of 0 is handled as well (log2(0) is undefined).
            m = re.search('<(.+?);', line)
            if m:
                ub = m.group(1).strip()
                if ub.isnumeric():
                    # Replace it with shallow bit width
                    bitwidth = _ceil_log2(int(ub)) + 1
                    new_iter_t = _shallow_iter_type(bitwidth, target)
                    line = re.sub(r'\bint\b', new_iter_t, line)
                    lines[pos] = line

    for pos in range(code_len):
        line = lines[pos]
        m = re.search(r'/\* UB: (.+?) \*/', line)
        if m:
            ub = m.group(1).strip()
            if ub.isnumeric():
                # Replace it with shallow bit width
                bitwidth = _ceil_log2(int(ub) + 1) + 1
                new_iter_t = _shallow_iter_type(bitwidth, target)
                # Only rewrite "int <name>" declarations
                line = re.sub(
                    r'\bint\s([a-zA-Z])',
                    new_iter_t + r' \g<1>',
                    line)
                lines[pos] = line

    return lines


def lift_split_buffers(lines):
    """ Lift the split buffers in the program

    For each module, if we find any split buffers with the name "data_split",
    we will lift them out of the for loops and put them in the variable declaration
    section at the beginning of the module.

    A split buffer is recognized by a line containing "variable=data_split";
    that line and the line right above it are moved together. They are placed
    right before the closest preceding line containing "Variable Declaration"
    and re-indented to the column of the "/*" on that line. If no such line
    precedes the buffer, the buffer is left where it is.

    The list is edited in place and its length does not change.

    Parameters
    ----------
    lines:
        contains the codelines of the program

    Returns
    -------
    The same list object with the split buffers lifted.
    """
    code_len = len(lines)
    for pos in range(code_len):
        line = lines[pos]
        if line.find('variable=data_split') == -1:
            continue

        # Search upwards for the variable declaration section
        decl_pos = -1
        for prev_pos in range(pos - 1, -1, -1):
            if lines[prev_pos].find('Variable Declaration') != -1:
                decl_pos = prev_pos
                break
        if decl_pos == -1:
            # Without a declaration section there is no place to lift to.
            # Going on with -1 would address the last line of the program
            # and scramble the code.
            continue

        # Move the two code lines at [pos - 1] and [pos] to [decl_pos] and
        # [decl_pos + 1]
        indent = lines[decl_pos].find('/*')
        line1 = ' ' * indent + lines[pos - 1].lstrip()
        line2 = ' ' * indent + lines[pos].lstrip()
        del lines[pos - 1]
        # The second line has moved down to [pos - 1] after the first delete
        del lines[pos - 1]
        lines.insert(decl_pos, line1)
        lines.insert(decl_pos + 1, line2)

    return lines

def build_dummy_module_def(group_name, fifo_type, module_in, PE_ids):
    """ Build the definition of the dummy module

    The module is named "<group_name>_PE_dummy_<out|in>". With module_in == 0
    it writes 0 to its fifo if the fifo is not full; otherwise it reads one
    element from its fifo.

    Parameters
    ----------
    group_name: str
    fifo_type: str
        element type of the hls::stream argument
    module_in: int
        0 builds the "out" variant, any other value the "in" variant
    PE_ids: list
        one "int" index argument is generated per entry (at most three)

    Returns
    -------
    list of code fragments of the module definition, enclosed by
    "/* Module Definition */" marks.
    """
    axis_names = ['idx', 'idy', 'idz']
    if module_in == 0:
        direction = 'out'
    else:
        direction = 'in'
    stream = f'fifo_{group_name}_{direction}'

    header = [
        '/* Module Definition */\n',
        f'void {group_name}_PE_dummy_{direction}(',
    ]
    id_params = [f'int {axis_names[k]}, ' for k in range(len(PE_ids))]
    signature_end = [f'hls::stream<{fifo_type}> &{stream}){{\n']

    if module_in == 0:
        body = [
            f'  if (!{stream}.full())\n',
            f'    {stream}.write(0);\n',
        ]
    else:
        body = [f'  {fifo_type} fifo_data = {stream}.read();\n']

    footer = ['}\n', '/* Module Definition */\n']
    return header + id_params + signature_end + body + footer

def build_dummy_module_call(group_name, fifo_name, module_in, PE_ids):
    """ Build the call of the dummy module

    Parameters
    ----------
    group_name: str
    fifo_name: str
        fifo passed as the last call argument
    module_in: int
        0 calls the "out" variant, any other value the "in" variant
    PE_ids: list
        passed as the leading "module id" arguments, one per line

    Returns
    -------
    list of codelines: an empty line followed by the call enclosed by
    "/* Module Call */" marks.
    """
    if module_in == 0:
        direction = 'out'
    else:
        direction = 'in'

    opening = [
        '\n',
        '  /* Module Call */\n',
        f'  {group_name}_PE_dummy_{direction}(\n',
    ]
    id_args = [f'    /* module id */ {pe_id},\n' for pe_id in PE_ids]
    closing = [
        f'    /* fifo */ {fifo_name}\n',
        '  );\n',
        '  /* Module Call */\n',
    ]
    return opening + id_args + closing

def insert_dummy_modules(def_lines, call_lines):
    """ Insert the missing dummy modules

    Collect the FIFO information of PEs (fifo_name, fifo_type).
    Delete those FIFOs that are connected to other modules.
    Insert dummy modules for the rest of FIFOs.

    Both lists are modified in place: the dummy module definitions are
    appended to def_lines and the dummy module calls are inserted into
    call_lines around the PE module calls.

    Parameters
    ----------
    def_lines: list
        Contains the codelines of the module definitions
    call_lines: list
        Contains the codelines of the module calls

    Returns
    -------
    (def_lines, call_lines): the two updated lists
    """
    # Collect the type and the argument name of every fifo argument of
    # PE_wrapper, in argument order
    PE_fifos = []
    for line in def_lines:
        if line.find('void PE_wrapper') != -1:
            # Parse the argument list
            m = re.search(r'\((.+?)\)', line)
            args = m.group(1).strip().split(',')
            for arg in args:
                if arg.find('fifo') != -1:
                    m = re.search(r'stream<(.+?)>', arg)
                    fifo_type = m.group(1)
                    # The name is the text after the last '&'
                    fifo_name = arg.split('&')[-1]
                    PE_fifos.append({'type': fifo_type, 'name': fifo_name})
    #print(PE_fifos)
    # Collect all used fifos
    used_fifos = {}
    kernel_start = 0
    for line in call_lines:
        # Only lines from the "void kernel0" line onwards are considered
        if line.find('void kernel0') != -1:
            kernel_start = 1
        if kernel_start:
            if line.find('* fifo *') != -1:
                # "/* fifo */ name," -> "name,"
                fifo = line.strip().split('*')[2][2:]
                if fifo[-1] == ',':
                    fifo = fifo[:-1]
                # Only process PE level fifos
                if fifo.find('PE') == -1:
                    continue
                # Toggle membership: a fifo seen an even number of times is
                # dropped, so only fifos with an odd number of uses remain
                # (presumably the ones connected on one side only)
                if fifo not in used_fifos:
                    used_fifos[fifo] = -1
                else:
                    del used_fifos[fifo]
    #print(used_fifos)
    # Locate the fifo position
    inside_module = False
    inside_PE = False
    fifo_pos = 0
    PE_call_start = -1
    PE_call_end = -1
    line_id = 0
    for line in call_lines:
        # Every "Module Call" mark toggles between inside and outside a call
        if line.find('Module Call') != -1:
            inside_module = not inside_module
            if inside_PE:
                # Mark line that closes the most recent PE_wrapper call
                PE_call_end = line_id
            inside_PE = False
        if inside_module:
            if line.find('PE_wrapper') != -1:
                inside_PE = True
                fifo_pos = 0
                if PE_call_start == -1:
                    # Line right before the first PE_wrapper line
                    PE_call_start = line_id - 1
            if inside_PE:
                if line.find('fifo') != -1:
                    # Remember at which fifo argument position of the PE call
                    # each remaining fifo is passed
                    for used_fifo in used_fifos:
                        if line.find(used_fifo) != -1:
                            used_fifos[used_fifo] = fifo_pos
                    fifo_pos += 1
        line_id += 1
    #print(used_fifos)
    # Insert the dummy module definitions
    offset_line = 0
    for used_fifo in used_fifos:
        # NOTE(review): a fifo that was never matched inside a PE call keeps
        # position -1 and would select the last PE fifo here - confirm intent
        fifo_info = PE_fifos[used_fifos[used_fifo]]
        # Extract the module direction
        # A PE argument ending in "in" gets a dummy with module_in == 0
        # (the "out" variant of the builders), any other gets module_in == 1
        if fifo_info['name'].endswith('in'):
            module_in = 0
        else:
            module_in = 1
        # Extract the group name
        # Drops the first 5 characters and the trailing 3 or 4 characters;
        # presumably "fifo_<group>_in" / "fifo_<group>_out" - TODO confirm
        if fifo_info['name'].endswith('in'):
            group_name = fifo_info['name'][5:-3]
        else:
            group_name = fifo_info['name'][5:-4]
        # Extract the PE ids
        PE_ids = used_fifo[len(f'fifo_{group_name}_PE_'):].split('_')
        #print(used_fifo, module_in, group_name, PE_ids)

        # Build the dummy module definition
        module_def = build_dummy_module_def(group_name, fifo_info['type'], module_in, PE_ids)
        #print(module_def)
        def_lines += module_def
        def_lines.append('\n')

        # Build the dummy module call
        module_call = build_dummy_module_call(group_name, used_fifo, module_in, PE_ids) # TODO
        if module_in == 0:
            # Calls with module_in == 0 go in front of the PE calls
            for i in range(len(module_call)):
                call_lines.insert(PE_call_start - 1 + i, module_call[i])
            # Lines inserted in front shift the end of the PE calls
            offset_line += len(module_call)
        else:
            # The other calls go behind the PE calls
            for i in range(len(module_call)):
                call_lines.insert(PE_call_end + 1 + offset_line + i, module_call[i])

    #print(PE_call_start, PE_call_end)

    return def_lines, call_lines

def modify_tb(lines):
    """ Modify the test bench for Catapult HLS

    Replace every "int main" with "CCS_MAIN". The list is updated in place.

    Parameters
    ----------
    lines: list
        contains the codelines of the test bench

    Returns
    -------
    The same list object with the replaced codelines.
    """
    for idx, text in enumerate(lines):
        # replace() hands back the text unchanged when there is nothing to do
        lines[idx] = text.replace('int main', 'CCS_MAIN')
    return lines

def reorder_module_calls(lines, target):
    """ Reorder the module calls in the program

    For I/O module calls, we will reverse the sequence of calls for output modules.
    Starting from the first module, enlist the module calls until the boundary module
    is met.
    Reverse the list and print it.

    A module call is the group of lines enclosed by two "/* Module Call */"
    marks. It is an output I/O call if the line after the opening mark
    contains both "IO" and "out", and a boundary call if that line also
    contains "boundary". Calls are grouped by module name; the name is taken
    from that line with the template/argument part, the "_inst" suffix
    (catapult), the "_wrapper" suffix and the "_boundary" suffix removed.

    Parameters
    ----------
    lines: list
        contains the codelines of the program
    target: string
        xilinx|intel|catapult

    Returns
    -------
    list of codelines with the output I/O module calls reordered. This is a
    new list whenever a reordering took place.
    """

    code_len = len(lines)
    module_calls = []
    module_start = 0
    module_call = []
    output_io = 0
    boundary = 0
    new_module = 0
    prev_module_name = ""
    first_line = -1
    last_line = -1
    reset = 0

    for pos in range(code_len):
        line = lines[pos]
        if line.find("/* Module Call */") != -1:
            # Every mark toggles between inside and outside a module call
            if module_start == 0:
                module_start = 1
            else:
                module_start = 0

            if module_start:
                # Examine if the module is an output I/O module
                nxt_line = lines[pos + 1]
                if nxt_line.find("IO") != -1 and nxt_line.find("out") != -1:
                    output_io = 1
                    # Examine if the module is an boundary module
                    if nxt_line.find("boundary") != -1:
                        boundary = 1
                # Extract the module name
                nxt_line = nxt_line.strip()
                if nxt_line.find('<') != -1:
                    module_name = nxt_line.split('<')[0]
                else:
                    module_name = nxt_line.split('(')[0]
                if target == 'catapult':
                    # Cut at "_inst" only if it is present; slicing with the
                    # -1 of a failed find() would drop the last character
                    inst_pos = module_name.find('_inst')
                    if inst_pos != -1:
                        module_name = module_name[:inst_pos]

                # find() returns -1 (truthy) when "wrapper" is absent, so the
                # result has to be compared explicitly. Otherwise 8 characters
                # are cut from names without the suffix and different modules
                # may end up with the same name.
                if module_name.find('wrapper') != -1:
                    module_name = module_name[:-8]
                if boundary:
                    module_name = module_name[:-9]
                if prev_module_name == "":
                    prev_module_name = module_name
                    first_line = pos
                else:
                    if prev_module_name != module_name:
                        new_module = 1
                        prev_module_name = module_name
                        first_line = pos
                        reset = 0
                    else:
                        if reset:
                            # First call after a reordering: start a new run
                            first_line = pos
                            reset = 0
                        new_module = 0

            if not module_start:
                if output_io:
                    last_line = pos
                    module_call.append(line)
                    module_calls.append(module_call.copy())
                    module_call.clear()
                    if boundary:
                        # Pop out the previous module calls except the last one
                        if new_module:
                            module_calls = module_calls[-1:]
                        # Reverse the list
                        module_calls.reverse()
                        # Insert it back
                        left_lines = lines[last_line + 1:]
                        lines = lines[:first_line]
                        first = 1
                        for c in module_calls:
                            if not first:
                                lines.append("\n")
                            lines = lines + c
                            first = 0
                        lines = lines + left_lines
                        # Clean up
                        module_calls.clear()
                        boundary = 0
                        output_io = 0
                        reset = 1
                    if new_module:
                        # Pop out the previous module calls except the last one
                        module_calls = module_calls[-1:]


        # Record the lines of the output I/O module call being read
        if module_start and output_io:
            module_call.append(line)

    return lines

def xilinx_run(
        kernel_call,
        kernel_def,
        kernel='autosa.tmp/output/src/kernel_kernel.cpp',
        host='opencl',
        hcl=False):
    """ Generate the kernel file for Xilinx platform

    We will copy the content of kernel definitions before the kernel calls.
    The definitions are post-processed first (expression simplification,
    iterator bitwidth shrinking, HLS pragma insertion, split buffer lifting)
    and the module calls are reordered before they are written.

    Parameters
    ----------
    kernel_call:
        file containing kernel calls
    kernel_def:
        file containing kernel definitions
    kernel:
        output kernel file
    host:
        with 'opencl', the kernel header file (the output file name with the
        extension replaced by 'h') is copied to the top of the output
    hcl:
        integrated with HeteroCL; the kernel header is copied as well
    """
    with open(kernel_def, 'r') as def_file:
        def_lines = def_file.readlines()
    with open(kernel_call, 'r') as call_file:
        call_lines = call_file.readlines()

    # Post-process the kernel definitions. Dummy module insertion
    # (insert_dummy_modules) is not part of the flow.
    def_lines = simplify_expressions(def_lines)
    def_lines = shrink_bit_width(def_lines, 'xilinx')
    def_lines = insert_xlnx_pragmas(def_lines)
    def_lines = lift_split_buffers(def_lines)

    kernel = str(kernel)
    print("Please find the generated file: " + kernel)

    merge_header = host == 'opencl' or hcl == True
    with open(kernel, 'w') as out:
        if merge_header:
            # Same path as the kernel file, with the last extension swapped
            header_path = ".".join(kernel.split('.')[:-1] + ['h'])
            with open(header_path, 'r') as header_file:
                out.writelines(header_file.readlines())
            out.write('\n')

        out.writelines(def_lines)

        # The module calls follow the definitions, in reordered form
        out.writelines(reorder_module_calls(call_lines, 'xilinx'))


def catapult_run(
        kernel_call,
        kernel_def,
        tb,
        kernel='autosa.tmp/output/src/kernel_kernel_hw.h',
        host='opencl'):
    """ Generate the kernel file for Catapult HLS platform

    We will copy the content of kernel definitions before the kernel calls.
    The definitions are post-processed first (expression simplification,
    iterator bitwidth shrinking, pragma insertion, split buffer lifting) and
    the module calls are reordered before they are written. The test bench
    file is rewritten in place afterwards.

    Parameters
    ----------
    kernel_call:
        file containing kernel calls
    kernel_def:
        file containing kernel definitions
    tb:
        file containing test bench
    kernel:
        output kernel file
    host:
        not used by this function
    """
    with open(kernel_def, 'r') as def_file:
        def_lines = def_file.readlines()
    with open(kernel_call, 'r') as call_file:
        call_lines = call_file.readlines()

    # Post-process the kernel definitions
    def_lines = simplify_expressions(def_lines)
    def_lines = shrink_bit_width(def_lines, 'catapult')
    def_lines = insert_catapult_pragmas(def_lines)
    def_lines = lift_split_buffers(def_lines)

    kernel = str(kernel)
    print("Please find the generated file: " + kernel)

    with open(kernel, 'w') as out:
        out.writelines(def_lines)
        # The module calls follow the definitions, in reordered form
        out.writelines(reorder_module_calls(call_lines, 'catapult'))

    # Patch the test bench and write it back to the same file
    with open(tb, 'r') as tb_file:
        tb_lines = tb_file.readlines()
    tb_lines = modify_tb(tb_lines)
    with open(tb, 'w') as tb_file:
        tb_file.writelines(tb_lines)

def insert_intel_pragmas(lines):
    """ Insert Intel OpenCL pragmas for Intel program

    Replace the comments of "// hls_unroll" with OpenCL pragmas.
    For "// hls_unroll", find the previous for loop before hitting the "simd" mark.
    Insert "#pragma unroll" above the for loop and remove the comment. The
    comment is kept if no for loop is found.
    (The "// hls_coalesce" comments are not handled.)

    The list is edited in place and its length does not change.

    Parameters
    ----------
    lines:
        contains the codelines of the program

    Returns
    -------
    The same list object, with the handled marks replaced by pragmas.
    """
    for pos in range(len(lines)):
        if '// hls_unroll' not in lines[pos]:
            continue

        # Walk upwards to the closest for loop; give up at a "simd" mark
        loop_pos = None
        for cand in range(pos - 1, -1, -1):
            if 'simd' in lines[cand]:
                break
            if 'for' in lines[cand]:
                loop_pos = cand
                break
        if loop_pos is None:
            continue

        # Put the pragma right before the loop, at the column of "for"
        pragma = ' ' * lines[loop_pos].find('for') + "#pragma unroll\n"
        lines.insert(loop_pos, pragma)
        # The insertion shifted the mark from pos to pos + 1
        del lines[pos + 1]

    return lines


def intel_run(
        kernel_call,
        kernel_def,
        kernel='autosa.tmp/output/src/kernel_kernel.cpp',
        hcl=False):
    """ Generate the kernel file for Intel platform

    We will extract all the fifo declarations and module calls.
    Then plug in the module definitions into each module call.

    The kernel call file is read twice (fifo declarations, module calls) and
    the kernel definition file is read twice (header includes, module
    definitions). Each module definition is post-processed (expression
    simplification, OpenCL pragma insertion, iterator bitwidth shrinking)
    before everything is handed to generate_intel_kernel.

    Parameters
    ----------
    kernel_call:
        file containing kernel calls
    kernel_def:
        file containing kernel definitions
    kernel:
        output kernel file
    hcl:
        integrated with HeteroCL
    """
    # Load kernel call file
    module_calls = []
    fifo_decls = []
    with open(kernel_call, 'r') as f:
        # "add" is True while reading between two "/* FIFO Declaration */"
        # marks
        add = False
        while True:
            line = f.readline()
            if not line:
                break
            # Extract the fifo declaration and add to the list
            if add:
                line = line.strip()
                fifo_decls.append(line)
            if line.find('/* FIFO Declaration */') != -1:
                if add:
                    # The closing mark itself was appended above; drop it
                    fifo_decls.pop(len(fifo_decls) - 1)
                add = not add

    with open(kernel_call, 'r') as f:
        # "add" is True while reading between two "/* Module Call */" marks
        add = False
        module_call = []
        while True:
            line = f.readline()
            if not line:
                break
            # Extract the module call and add to the list
            if add:
                line = line.strip()
                module_call.append(line)
            if line.find('/* Module Call */') != -1:
                if add:
                    # Drop the closing mark and store the finished call
                    module_call.pop(len(module_call) - 1)
                    module_calls.append(module_call.copy())
                    module_call.clear()
                add = not add

    module_defs = {}
    headers = []
    #print(hcl)
    # Collect the "#include" lines of the kernel definition file
    with open(kernel_def, 'r') as f:
        while True:
            line = f.readline()
            if not line:
                break
            if line.find('#include') != -1:
                #line = line.strip()
                if hcl == True and line.find('_kernel.h') != -1:
                    # Replace the header include with header contents
                    #print(line)
                    # The header is looked up in the directory of the output
                    # kernel file
                    file_name = re.search(r'include \"(.+?)\"', line).group(1)
                    file_path = os.path.dirname(kernel) + '/' + file_name
                    with open(file_path, 'r') as f2:
                        header_lines = f2.readlines()
                        headers += header_lines
                else:
                    headers.append(line)

    with open(kernel_def, 'r') as f:
        # "add" is True while reading between two "/* Module Definition */"
        # marks
        add = False
        module_def = []
        while True:
            line = f.readline()
            if not line:
                break
            # Extract the module definition and add to the dict
            if add:
                module_def.append(line)
                # Extract the module name
                # Every matching "void <name>(" line overwrites the name, so
                # the last one inside the definition wins
                if (line.find('void')) != -1:
                    m = re.search(r'void (.+?)\(', line)
                    if m:
                        module_name = m.group(1)
                        #print(module_name)
            if line.find('/* Module Definition */') != -1:
                if add:
                    # Drop the closing mark and store the finished definition
                    # NOTE(review): module_name is only bound once a
                    # "void <name>(" line was seen; a definition without one
                    # reuses the previous name or fails on the first one
                    module_def.pop(len(module_def) - 1)
                    module_defs[module_name] = module_def.copy()
                    module_def.clear()
                    # Post-process the module definition
                    # Simplify the expressions
                    module_defs[module_name] = simplify_expressions(
                        module_defs[module_name])
                    # Insert the OpenCL pragmas
                    module_defs[module_name] = insert_intel_pragmas(
                        module_defs[module_name])
                    # Change the loop iterator type
                    module_defs[module_name] = shrink_bit_width(
                        module_defs[module_name], 'intel')
                add = not add

    # compose the kernel file
    kernel = str(kernel)
    generate_intel_kernel(
        kernel,
        headers,
        module_defs,
        module_calls,
        fifo_decls)


def tapa_run(
        kernel_call,
        kernel_def,
        kernel='autosa.tmp/output/src/kernel_kernel.cpp'):
    """ Generate the kernel file for TAPA platform

    The kernel definitions are post-processed (expression simplification,
    loop iterator narrowing, Xilinx pragma insertion, split-buffer lifting)
    and written to the output file, followed by the kernel calls, which are
    copied verbatim.

    Parameters
    ----------
    kernel_call:
        file containing kernel calls
    kernel_def:
        file containing kernel definitions
    kernel:
        output kernel file. ``None`` (what the command line passes when
        ``-o`` is omitted) selects the default path.
    """
    # The CLI forwards args.output unconditionally, which overrides the
    # default with None; without this guard the output would be written to a
    # file literally named "None".
    if kernel is None:
        kernel = 'autosa.tmp/output/src/kernel_kernel.cpp'

    # Load kernel definition file
    with open(kernel_def, 'r') as f:
        lines = f.readlines()
    # Load kernel call file
    with open(kernel_call, 'r') as f:
        call_lines = f.readlines()

    # Simplify the expressions
    lines = simplify_expressions(lines)

    # Change the loop iterator type
    lines = shrink_bit_width(lines, 'xilinx')

    # Insert the HLS pragmas
    lines = insert_xlnx_pragmas(lines)

    # Lift the split_buffers
    lines = lift_split_buffers(lines)

    kernel = str(kernel)
    print("Please find the generated file: " + kernel)

    # Definitions first, then the calls that use them.
    with open(kernel, 'w') as f:
        f.writelines(lines)
        f.writelines(call_lines)

if __name__ == "__main__":
    # Command-line entry point: parse the options and dispatch to the code
    # generator that matches the requested hardware target.
    parser = argparse.ArgumentParser(description='==== AutoSA CodeGen ====')
    parser.add_argument(
        '-c',
        '--kernel-call',
        metavar='KERNEL_CALL',
        required=True,
        help='kernel function call')
    parser.add_argument(
        '-d',
        '--kernel-def',
        metavar='KERNEL_DEF',
        required=True,
        help='kernel function definition')
    parser.add_argument(
        '--tb',
        metavar='TB',
        required=False,
        help='test bench')
    parser.add_argument(
        '-t',
        '--target',
        metavar='TARGET',
        required=True,
        help='hardware target: '
             'autosa_hls_c|autosa_opencl|autosa_tapa|autosa_catapult_c')
    parser.add_argument(
        '-o',
        '--output',
        metavar='OUTPUT',
        required=False,
        help='output kernel file')
    parser.add_argument(
        '--host',
        metavar='HOST',
        required=False,
        help='Xilinx host target: hls|opencl',
        default='opencl')
    parser.add_argument(
        '--hcl',
        action='store_true',
        default=False,
        help='HeteroCL integration')

    args = parser.parse_args()

    if args.target == 'autosa_opencl':
        intel_run(args.kernel_call, args.kernel_def, args.output, args.hcl)
    elif args.target == 'autosa_hls_c':
        xilinx_run(args.kernel_call, args.kernel_def, args.output, args.host, args.hcl)
    elif args.target == 'autosa_tapa':
        tapa_run(args.kernel_call, args.kernel_def, args.output)
    elif args.target == 'autosa_catapult_c':
        catapult_run(args.kernel_call, args.kernel_def, args.tb, args.output, args.host)
    else:
        # An unrecognised target used to fall through silently and exit with
        # status 0 without generating anything; report it instead.
        parser.error('unsupported target: ' + args.target)


================================================
FILE: autosa_scripts/hls_scripts/hls_script.tcl
================================================
############################################################
## This file is generated automatically by Vivado HLS.
## Please DO NOT edit it.
## Copyright (C) 1986-2019 Xilinx, Inc. All Rights Reserved.
############################################################
# Simulation variant of the HLS script: csim_design is the only design step
# left enabled below; the synthesis, co-simulation and export steps are
# commented out.

# Open (or create) the project "hls_prj" and select kernel0 as top function.
open_project hls_prj
set_top kernel0
# Design sources; the host program is registered as the test bench (-tb).
add_files src/kernel_kernel.h
add_files src/kernel_kernel.cpp
add_files -tb src/kernel_host.cpp
# Solution settings: target part and clock constraint.
open_solution "solution1"
set_part {xcu200-fsgd2104-2-e}
# NOTE(review): the period is presumably in ns (i.e. 200 MHz) - confirm.
create_clock -period 5 -name default
config_compile -name_max_length 50
#source "./prj/solution1/directives.tcl"
# Enabled step.
csim_design
#csynth_design
#cosim_design 
#cosim_design -trace_level all
#cosim_design -setup -trace_level all
#export_design -format ip_catalog
exit


================================================
FILE: autosa_scripts/hls_scripts/hls_script_synth.tcl
================================================
############################################################
## This file is generated automatically by Vivado HLS.
## Please DO NOT edit it.
## Copyright (C) 1986-2019 Xilinx, Inc. All Rights Reserved.
############################################################
# Synthesis variant of the HLS script: csynth_design is the only design step
# left enabled below; the simulation, co-simulation and export steps are
# commented out.

# Open (or create) the project "hls_prj" and select kernel0 as top function.
open_project hls_prj
set_top kernel0
# Design sources; the host program is registered as the test bench (-tb).
add_files src/kernel_kernel.h
add_files src/kernel_kernel.cpp
add_files -tb src/kernel_host.cpp
# Solution settings: target part and clock constraint.
open_solution "solution1"
set_part {xcu200-fsgd2104-2-e}
# NOTE(review): the period is presumably in ns (i.e. 200 MHz) - confirm.
create_clock -period 5 -name default
config_compile -name_max_length 50
#source "./prj/solution1/directives.tcl"
#csim_design
# Enabled step.
csynth_design
#cosim_design 
#cosim_design -trace_level all
#cosim_design -setup -trace_level all
#export_design -format ip_catalog
exit


================================================
FILE: autosa_scripts/intel_opencl_scripts/Makefile
================================================
# Build configuration for the Intel(R) FPGA SDK for OpenCL(TM) flow.
# Every variable assigned with `?=` can be overridden from the environment
# or on the make command line.

# Base name of the design: sources are src/$(APP)_host.cpp and
# src/$(APP)_kernel.cl, and the aoc outputs below are derived from it.
APP ?= kernel
AOCL_BOARD ?= s10mx_hbm_es
SW_EMU_AOCX ?= $(APP)_sw_emu.aocx
HW_EMU_AOCX ?= $(APP)_hw_emu.aocx
HW_AOCX ?= $(APP)_hw.aocx
AOCO ?= $(APP).aoco
AOCR ?= $(APP).aocr

# Compiler
AOC ?= aoc
CXX ?= g++
AOC_FLAGS ?= -board=$(AOCL_BOARD) -fp-relaxed -report -hyper-optimized-handshaking=off -I $(INTELFPGAOCLSDKROOT)/include/kernel_headers

# Host executable names and the directory all build products are written to.
TARGET ?= host
SW_EMU_TARGET ?= host_sw_emu
TARGET_DIR ?= bin
# Location of the AOCLUtils helper sources shipped with the SDK examples.
AOCL_UTILS ?= $(INTELFPGAOCLSDKROOT)/examples_aoc/common

# Directories
INC_DIRS := src $(AOCL_UTILS)/inc
LIB_DIRS := 

# Files
INCS := $(wildcard src/*.h)
HOST_SRCS := $(wildcard src/$(APP)_host.cpp $(AOCL_UTILS)/src/AOCLUtils/*.cpp)
KERNEL_SRCS := src/$(APP)_kernel.cl

# Recipes prefixed with $(ECHO) are silenced unless VERBOSE=1 is given.
ifeq ($(VERBOSE),1)
ECHO := 
else
ECHO := @
endif

# Where is the Intel(R) FPGA SDK for OpenCL(TM) software?
# Both checks abort the build early: the first when the root directory does
# not exist, the second when it does not contain the OpenCL host header.
ifeq ($(wildcard $(INTELFPGAOCLSDKROOT)),)
$(error Set INTELFPGAOCLSDKROOT to the root directory of the Intel(R) FPGA SDK for OpenCL(TM) software installation)
endif
ifeq ($(wildcard $(INTELFPGAOCLSDKROOT)/host/include/CL/opencl.h),)
$(error Set INTELFPGAOCLSDKROOT to the root directory of the Intel(R) FPGA SDK for OpenCL(TM) software installation.)
endif

# OpenCL compile and link flags.
# These are queried from the `aocl` utility once, when the Makefile is read.
AOCL_COMPILE_CONFIG := $(shell aocl compile-config )
AOCL_LINK_LIBS := $(shell aocl ldlibs )
AOCL_LINK_FLAGS := $(shell aocl ldflags )
# Linking with defences enabled
AOCL_LINK_FLAGS += -z noexecstack
AOCL_LINK_FLAGS += -Wl,-z,relro,-z,now
AOCL_LINK_FLAGS += -Wl,-Bsymbolic
AOCL_LINK_FLAGS += -pie
AOCL_LINK_CONFIG := $(AOCL_LINK_FLAGS) $(AOCL_LINK_LIBS)

# Compilation flags
# DEBUG=1 selects a debug build (-g); otherwise the host is built with -O2.
ifeq ($(DEBUG),1)
CXXFLAGS += -g
else
CXXFLAGS += -O2
endif
CXXFLAGS += -std=gnu++0x

# Compiling with defences enabled
CXXFLAGS += -fstack-protector
CXXFLAGS += -D_FORTIFY_SOURCE=2
CXXFLAGS += -Wformat -Wformat-security
CXXFLAGS += -fPIE

# We must force GCC to never assume that it can shove in its own
# sse2/sse3 versions of strlen and strcmp because they will CRASH.
# Very hard to debug!
# NOTE(review): the remark above does not obviously describe -fPIC; confirm
# which flag it was originally written for.
CXXFLAGS += -fPIC

# Libraries linked into the host executable (each is passed as -l<name>).
LIBS := rt pthread

## Make it all!
#all : $(TARGET_DIR)/$(TARGET)

# Convenience targets.
#   sw_emu / hw_emu / hw : build the host plus the matching kernel image
#   hls                  : stop after aoc's RTL generation stage (.aocr)
#   *_check              : build, then run the host against the kernel image
sw_emu : $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(SW_EMU_AOCX)

hls: $(TARGET_DIR)/$(AOCR)

hw : $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(HW_AOCX)

hw_emu: $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(HW_EMU_AOCX)

hw_emu_check: $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(HW_EMU_AOCX)
	CL_CONTEXT_MPSIM_DEVICE_INTELFPGA=1 $(TARGET_DIR)/$(TARGET) $(HW_EMU_AOCX)

# Runs the emulation build of the host (compiled with -DEMULATE), i.e. the
# very binary this target depends on.
sw_emu_check : $(TARGET_DIR)/$(SW_EMU_TARGET) $(TARGET_DIR)/$(SW_EMU_AOCX)
	CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 $(TARGET_DIR)/$(SW_EMU_TARGET) $(SW_EMU_AOCX)

hw_check : $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(HW_AOCX)
	$(TARGET_DIR)/$(TARGET) $(HW_AOCX)

# Host executable target.
# $(TARGET_DIR) is an order-only prerequisite: the directory must exist, but
# its timestamp (which changes whenever a file is added to it) must not
# force a rebuild of the host.
$(TARGET_DIR)/$(TARGET) : Makefile $(HOST_SRCS) $(INCS) | $(TARGET_DIR)
	$(ECHO)$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(EXTRACXXFLAGS) -fPIC $(foreach D,$(INC_DIRS),-I$D) \
			$(AOCL_COMPILE_CONFIG) $(HOST_SRCS) $(AOCL_LINK_CONFIG) \
			$(foreach D,$(LIB_DIRS),-L$D) \
			$(foreach L,$(LIBS),-l$L) \
			-o $@

# Emulation host executable: same sources, built with -DEMULATE.
# The output must be $@ (not the regular host binary); otherwise this target
# never exists, is rebuilt on every invocation, and silently replaces the
# hardware host with an emulation build.
$(TARGET_DIR)/$(SW_EMU_TARGET) : Makefile $(HOST_SRCS) $(INCS) | $(TARGET_DIR)
	$(ECHO)$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(EXTRACXXFLAGS) -fPIC $(foreach D,$(INC_DIRS),-I$D) \
			$(AOCL_COMPILE_CONFIG) $(HOST_SRCS) $(AOCL_LINK_CONFIG) \
			$(foreach D,$(LIB_DIRS),-L$D) \
			$(foreach L,$(LIBS),-l$L) \
			-o $@ -DEMULATE

# Create the output directory. -p makes this succeed when the directory
# already exists and also creates missing parents of a nested TARGET_DIR.
$(TARGET_DIR) :
	$(ECHO)mkdir -p $(TARGET_DIR)

# Kernel images built by aoc.
# Each rule lists $(TARGET_DIR) as an order-only prerequisite so the output
# directory exists even when no host target was built first (e.g. `make hls`
# on a fresh tree). Order-only prerequisites are not part of $^, so the aoc
# command lines are unchanged.

# Software-emulation image.
$(TARGET_DIR)/$(SW_EMU_AOCX) : $(KERNEL_SRCS) | $(TARGET_DIR)
	$(AOC) $(AOC_FLAGS) -march=emulator -legacy-emulator -o $@ $^

# Simulator (hardware-emulation) image.
$(TARGET_DIR)/$(HW_EMU_AOCX) : $(KERNEL_SRCS) | $(TARGET_DIR)
	$(AOC) $(AOC_FLAGS) -march=simulator -ghdl -o $@ $^

# Full hardware image.
$(TARGET_DIR)/$(HW_AOCX) : $(KERNEL_SRCS) | $(TARGET_DIR)
	$(AOC) $(AOC_FLAGS) -o $@ $^

# Two-stage flow used by the `hls` target: .cl -> .aoco -> .aocr
$(TARGET_DIR)/$(AOCO) : $(KERNEL_SRCS) | $(TARGET_DIR)
	$(AOC) $(AOC_FLAGS) -c -o $@ $^

$(TARGET_DIR)/$(AOCR) : $(TARGET_DIR)/$(AOCO) | $(TARGET_DIR)
	$(AOC) $(AOC_FLAGS) -rtl -o $@ $^

# Standard make targets
# Removes everything inside the output directory. The guard aborts when
# TARGET_DIR has been overridden with an empty value, because the rm command
# would otherwise expand to `rm -rf /*`.
clean :
	$(if $(strip $(TARGET_DIR)),,$(error TARGET_DIR is empty; refusing to run rm -rf /*))
	$(ECHO)rm -rf $(TARGET_DIR)/*

# Targets that name an action rather than a file. Declaring them phony keeps
# a stray file called e.g. `hw` or `clean` from suppressing the rule.
.PHONY : all clean sw_emu hls hw hw_emu hw_emu_check sw_emu_check hw_check


================================================
FILE: autosa_scripts/intel_opencl_scripts/common/inc/AOCLUtils/aocl_utils.h
================================================
// Copyright (C) 2013-2020 Altera Corporation, San Jose, California, USA. All rights reserved.
// Permission is hereby granted, free of charge, to any person obtaining a copy of this
// software and associated documentation files (the "Software"), to deal in the Software
// without restriction, including without limitation the rights to use, copy, modify, merge,
// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
// 
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
// 
// This agreement shall be governed in all respects by the laws of the State of California and
// by the laws of the United States of America.

// Main include file for AOCLUtils. Includes all other utility header files.

#ifndef AOCL_UTILS_H
#define AOCL_UTILS_H

#include "AOCLUtils/opencl.h"
#include "AOCLUtils/scoped_ptrs.h"
#include "AOCLUtils/options.h"

#endif


================================================
FILE: autosa_scripts/intel_opencl_scripts/common/inc/AOCLUtils/opencl.h
================================================
// Copyright (C) 2013-2020 Altera Corporation, San Jose, California, USA. All rights reserved.
// Permission is hereby granted, free of charge, to any person obtaining a copy of this
// software and associated documentation files (the "Software"), to deal in the Software
// without restriction, including without limitation the rights to use, copy, modify, merge,
// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
// 
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
// 
// This agreement shall be governed in all respects by the laws of the State of California and
// by the laws of the United States of America.

// OpenCL utility functions.

#ifndef AOCL_UTILS_OPENCL_H
#define AOCL_UTILS_OPENCL_H

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <string>

#include "CL/opencl.h"

// This is assumed to be externally provided by the application.
// NOTE(review): presumably invoked by the error path (_checkError) before
// the process exits - confirm against opencl.cpp.
extern void cleanup();

namespace aocl_utils {

// Host allocation functions
// alignedMalloc returns a block of `size` bytes aligned to AOCL_ALIGNMENT
// (defined in opencl.cpp as the minimum alignment for DMA use).
// Memory obtained from alignedMalloc must be released with alignedFree.
void *alignedMalloc(size_t size);
void alignedFree(void *ptr);

// Error functions
// printError prints the symbolic name of the given OpenCL error code.
void printError(cl_int error);
// Backend of the checkError macro below; `line` and `file` identify the
// call site. NOTE(review): `msg` is presumably a printf-style format for
// the trailing arguments - confirm against opencl.cpp.
void _checkError(int line,
                 const char *file,
                 cl_int error,
                 const char *msg,
                 ...); // does not return
// Supplies __LINE__ and __FILE__ automatically. At least the message
// argument must be given after `status`.
#define checkError(status, ...) _checkError(__LINE__, __FILE__, status, __VA_ARGS__)

// Sets the current working directory to the same directory that contains
// this executable. Returns true on success.
bool setCwdToExeDir();

// Find a platform that contains the search string in its name (case-insensitive match).
// Returns NULL if no match is found.
cl_platform_id findPlatform(const char *platform_name_search);

// Returns the name of the platform.
std::string getPlatformName(cl_platform_id pid);

// Returns the name of the device.
std::string getDeviceName(cl_device_id did);

// Returns an array of device ids for the given platform and the
// device type. The number of entries is written to *num_devices.
// Return value must be freed with delete[].
cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices);

// Create an OpenCL program from a binary file.
// The program is created for all given devices associated with the context. The same
// binary is used for all devices.
cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices);

// Load binary file. The size of the loaded data is written to *size.
// Return value must be freed with delete[].
unsigned char *loadBinaryFile(const char *file_name, size_t *size);

// Checks if a file exists.
bool fileExists(const char *file_name);

// Returns the path to the AOCX file to use for the given device.
// This is special handling for examples for the Intel(R) FPGA SDK for OpenCL(TM).
// It uses the device name to get the board name and then looks for a
// corresponding AOCX file. Specifically, it gets the device name and
// extracts the board name assuming the device name has the following format:
//  <board> : ...
//
// Then the AOCX file is <prefix>_<version>_<board>.aocx. If this
// file does not exist, then the file name defaults to <prefix>.aocx.
std::string getBoardBinaryFile(const char *prefix, cl_device_id device);

// Returns the time from a high-resolution timer in seconds. This value
// can be used with a value returned previously to measure a high-resolution
// time difference.
double getCurrentTimestamp();

// Returns the difference between the CL_PROFILING_COMMAND_END and
// CL_PROFILING_COMMAND_START values of a cl_event object.
// This requires that the command queue associated with the event be created
// with the CL_QUEUE_PROFILING_ENABLE property.
//
// The return value is in nanoseconds.
cl_ulong getStartEndTime(cl_event event);

// Returns the maximum time span for the given set of events.
// The time span starts at the earliest event start time.
// The time span ends at the latest event end time.
cl_ulong getStartEndTime(cl_event *events, unsigned num_events);

// Wait for the specified number of milliseconds.
void waitMilliseconds(unsigned ms);

// OpenCL context callback function that simply prints the error information
// to stdout (via printf). The three unnamed parameters are part of the
// callback signature and carry no name here.
void oclContextCallback(const char *errinfo, const void *, size_t, void *);

} // ns aocl_utils

#endif


================================================
FILE: autosa_scripts/intel_opencl_scripts/common/inc/AOCLUtils/options.h
================================================
// Copyright (C) 2013-2020 Altera Corporation, San Jose, California, USA. All rights reserved.
// Permission is hereby granted, free of charge, to any person obtaining a copy of this
// software and associated documentation files (the "Software"), to deal in the Software
// without restriction, including without limitation the rights to use, copy, modify, merge,
// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
// 
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
// 
// This agreement shall be governed in all respects by the laws of the State of California and
// by the laws of the United States of America.

// Declares a utility class used to parse command-line options.

#ifndef AOCL_UTILS_OPTIONS_H
#define AOCL_UTILS_OPTIONS_H

#include <map>
#include <sstream>
#include <string>
#include <vector>

namespace aocl_utils {

// Stores command-line options as name -> string value pairs, together with
// the arguments that were not recognized as options. Typed access is
// provided by the get<T>/set<T> templates defined after the class.
class Options {
public:
  typedef std::vector<std::string> StringVec;

  Options();
  // NOTE(review): presumably equivalent to default construction followed by
  // addFromCommandLine(num, argv) - confirm against options.cpp.
  Options(int num, char *argv[]);

  // Returns whether an option with the given name is present.
  bool has(const std::string &name) const;
  std::string &get(const std::string &name); // will create an empty option if it does not exist
  const std::string &get(const std::string &name) const; // error if option does not exist

  // Stores `value` under `name`, creating the option if necessary (it goes
  // through the non-const get above).
  void set(const std::string &name, const std::string &value) { get(name) = value; }

  // Command line options must be of the following form:
  //  [-]-name (indicates option exists)
  //  [-]-name=value
  //
  // This function assumes that the values are from main(int, char *[]).
  // This means that the argv[0] is skipped.
  void addFromCommandLine(int num, char *argv[]);

  // This templated function converts the option value to the given type.
  // errorWrongType is invoked if the conversion fails or does not consume
  // the whole value.
  template<typename T>
  T get(const std::string &name) const;

  // Serializes `value` with operator<< and stores the resulting string.
  template<typename T>
  void set(const std::string &name, const T &value);

  // Non-options are arguments processed in addFromCommandLine
  // that were not recognized as options.
  const StringVec &getNonOptions() const { return m_nonoptions; }
  size_t getNonOptionCount() const { return m_nonoptions.size(); }
  // No bounds check is performed on `i`.
  const std::string &getNonOption(size_t i) const { return m_nonoptions[i]; }

private:
  typedef std::map<std::string, std::string> OptionMap;

  // Displays an error message indicating that a nameless option
  // was provided.
  void errorNameless() const;

  // Displays an error message indicating that the given option
  // has the wrong type and then exits with an error code.
  void errorWrongType(const std::string &name) const;

  // Displays an error message indicating that the given option
  // does not exist and then exits with an error code.
  void errorNonExistent(const std::string &name) const;

  OptionMap m_options;
  StringVec m_nonoptions;

  // Copying is disabled: declared private and intentionally left undefined.
  Options(const Options &); // not implemented
  void operator =(const Options &); // not implemented
};

// Generic typed accessor: parses the stored string of option `name` into a T
// with operator>>. The parse is accepted only when extraction succeeded and
// the whole string was consumed; otherwise errorWrongType(name) is invoked.
template<typename T>
T Options::get(const std::string &name) const {
  std::stringstream parser;
  parser << get(name);

  T parsed;
  parser >> parsed;

  // Accept only a successful extraction that reached the end of the value.
  const bool consumed_all = !parser.fail() && parser.eof();
  if(!consumed_all) {
    errorWrongType(name);
  }
  return parsed;
}

// Specialization for bool.
// An option reads as true only when it exists and its value is exactly "1";
// a missing option or any other value reads as false. The lookup is skipped
// entirely for a missing option, so no "non-existent" error is raised.
template<>
inline bool Options::get<bool>(const std::string &name) const {
  return has(name) && get(name) == "1";
}

// Specialization for std::string: hands back a copy of the stored value
// unchanged. The generic version cannot be used because extracting a string
// from a stringstream stops at the first whitespace character, which would
// truncate values containing spaces.
template<>
inline std::string Options::get<std::string>(const std::string &name) const {
  const std::string &stored = get(name);
  return stored;
}

// Typed setter: writes `value` to a string with operator<< and stores that
// string under `name`. T must therefore round-trip through its stream
// operators for a later get<T>(name) to recover the value.
template<typename T>
void Options::set(const std::string &name, const T &value) {
  std::ostringstream serializer;
  serializer << value;
  const std::string text = serializer.str();
  set(name, text);
}

} // ns aocl_utils

#endif


================================================
FILE: autosa_scripts/intel_opencl_scripts/common/inc/AOCLUtils/scoped_ptrs.h
================================================
// Copyright (C) 2013-2020 Altera Corporation, San Jose, California, USA. All rights reserved.
// Permission is hereby granted, free of charge, to any person obtaining a copy of this
// software and associated documentation files (the "Software"), to deal in the Software
// without restriction, including without limitation the rights to use, copy, modify, merge,
// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
// 
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
// 
// This agreement shall be governed in all respects by the laws of the State of California and
// by the laws of the United States of America.

// Scoped pointer definitions.

#ifndef AOCL_UTILS_SCOPED_PTRS_H
#define AOCL_UTILS_SCOPED_PTRS_H

namespace aocl_utils {

// Interface is essentially the combination of std::auto_ptr and boost's smart pointers,
// along with some small extensions (auto conversion to T*).

// scoped_ptr: sole owner of one object that was created with operator new.
// The object is destroyed with operator delete when the wrapper is reset or
// goes out of scope.
template<typename T>
class scoped_ptr {
public:
  typedef scoped_ptr<T> this_type;

  scoped_ptr() : m_obj(NULL) {}
  scoped_ptr(T *ptr) : m_obj(ptr) {}
  ~scoped_ptr() { reset(); }

  // Observers. None of them transfers ownership.
  T *get() const { return m_obj; }
  operator T *() const { return get(); }
  T *operator ->() const { return get(); }
  T &operator *() const { return *get(); }

  // Assigning a raw pointer is shorthand for reset(ptr).
  this_type &operator =(T *ptr) {
    reset(ptr);
    return *this;
  }

  // Destroys the currently owned object (if any) and adopts `ptr`.
  void reset(T *ptr = NULL) {
    delete m_obj;
    m_obj = ptr;
  }

  // Hands the object over to the caller without destroying it; the wrapper
  // is empty afterwards.
  T *release() {
    T *const released = m_obj;
    m_obj = NULL;
    return released;
  }

private:
  T *m_obj;

  // noncopyable
  scoped_ptr(const this_type &);
  this_type &operator =(const this_type &);
};

// scoped_array: sole owner of an array that was created with operator new[].
// The array is destroyed with operator delete[] when the wrapper is reset or
// goes out of scope. Construction and reset also accept an element count, in
// which case an array of that many T is allocated.
template<typename T>
class scoped_array {
public:
  typedef scoped_array<T> this_type;

  scoped_array() : m_data(NULL) {}
  scoped_array(T *ptr) : m_data(ptr) {}
  explicit scoped_array(size_t n) : m_data(new T[n]) {}
  ~scoped_array() { reset(); }

  // Observers. None of them transfers ownership.
  T *get() const { return m_data; }
  operator T *() const { return get(); }
  T *operator ->() const { return get(); }
  T &operator *() const { return *get(); }
  T &operator [](int index) const { return get()[index]; }

  // Assigning a raw pointer is shorthand for reset(ptr).
  this_type &operator =(T *ptr) {
    reset(ptr);
    return *this;
  }

  // Destroys the currently owned array (if any) and adopts `ptr`.
  void reset(T *ptr = NULL) {
    delete[] m_data;
    m_data = ptr;
  }

  // Allocates n elements first, then replaces the current array with them.
  void reset(size_t n) {
    T *const fresh = new T[n];
    reset(fresh);
  }

  // Hands the array over to the caller without destroying it; the wrapper
  // is empty afterwards.
  T *release() {
    T *const released = m_data;
    m_data = NULL;
    return released;
  }

private:
  T *m_data;

  // noncopyable
  scoped_array(const this_type &);
  this_type &operator =(const this_type &);
};

// scoped_aligned_ptr: sole owner of a buffer obtained from alignedMalloc.
// The buffer is released with alignedFree when the wrapper is reset or goes
// out of scope. Construction and reset also accept an element count, in
// which case sizeof(T) * n bytes are requested from alignedMalloc; the
// elements are not constructed.
template<typename T>
class scoped_aligned_ptr {
public:
  typedef scoped_aligned_ptr<T> this_type;

  scoped_aligned_ptr() : m_buf(NULL) {}
  scoped_aligned_ptr(T *ptr) : m_buf(ptr) {}
  explicit scoped_aligned_ptr(size_t n) : m_buf(NULL) { reset(n); }
  ~scoped_aligned_ptr() { reset(); }

  // Observers. None of them transfers ownership.
  T *get() const { return m_buf; }
  operator T *() const { return get(); }
  T *operator ->() const { return get(); }
  T &operator *() const { return *get(); }
  T &operator [](int index) const { return get()[index]; }

  // Assigning a raw pointer is shorthand for reset(ptr).
  this_type &operator =(T *ptr) {
    reset(ptr);
    return *this;
  }

  // Releases the currently owned buffer (if any) and adopts `ptr`.
  void reset(T *ptr = NULL) {
    if(m_buf != NULL) {
      alignedFree(m_buf);
    }
    m_buf = ptr;
  }

  // Allocates room for n elements first, then replaces the current buffer.
  void reset(size_t n) {
    T *const fresh = (T*) alignedMalloc(sizeof(T) * n);
    reset(fresh);
  }

  // Hands the buffer over to the caller without freeing it; the wrapper is
  // empty afterwards.
  T *release() {
    T *const released = m_buf;
    m_buf = NULL;
    return released;
  }

private:
  T *m_buf;

  // noncopyable
  scoped_aligned_ptr(const this_type &);
  this_type &operator =(const this_type &);
};

#if USE_SVM_API == 1
// scoped_SVM_aligned_ptr: assumes pointer was allocated with clSVMAlloc; destroys with clSVMFree
// Also supports allocation/reset with a number, which is the number of
// elements of type T
template<typename T>
class scoped_SVM_aligned_ptr {
public:
	typedef scoped_SVM_aligned_ptr<T> this_type;

	scoped_SVM_aligned_ptr() : m_ptr(NULL) {}
	scoped_SVM_aligned_ptr(T *ptr) : m_ptr(NULL) { reset(ptr); }
	explicit scoped_SVM_aligned_ptr(cl_context ctx, size_t n) : m_ptr(NULL) { reset(ctx, n); }
	~scoped_SVM_aligned_ptr() { reset(); }

	T *get() const { return m_ptr; }
	operator T *() const { return m_ptr; }
	T *operator ->() const { return m_ptr; }
	T &operator *() const { return *m_ptr; }
	T &operator [](int index) const { return m_ptr[index]; }

	this_type &operator =(T *ptr) { reset(ptr); return *this; }

	void reset(T *ptr = NULL) { if (m_ptr) clSVMFree(m_ctx, m_ptr); m_ptr = ptr; }
	void reset(cl_context ctx, size_t n) { reset((T*)clSVMAlloc(ctx, 0, sizeof(T) * n, 0)); m_ctx = ctx; }
	T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; }

private:
	T *m_ptr;
	cl_context m_ctx;

	// noncopyable
	scoped_SVM_aligned_ptr(const this_type &);
	this_type &operator =(const this_type &);
};
#endif /* USE_SVM_API == 1 */

} // ns aocl_utils

#endif


================================================
FILE: autosa_scripts/intel_opencl_scripts/common/readme.css
================================================
/*
Copyright (C) 2013-2020 Altera Corporation, San Jose, California, USA. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy of this
software and associated documentation files (the "Software"), to deal in the Software
without restriction, including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or
substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

This agreement shall be governed in all respects by the laws of the State of California and
by the laws of the United States of America.
*/

/* Base elements: page body, lists, monospace text and headings. */
body {
  margin: 0 1em 1em 1em;
  font-family: sans-serif;
}
ul {
  list-style-type: square;
}
pre, code, kbd, samp, tt {
  font-family: monospace, sans-serif;
  font-size: 1em;
}

/* Headings: h1 is a white-on-blue banner, h2 a light-blue band; both use
   negative side margins to extend past the surrounding text. */
h1 {
  font-size: 200%;
  color: #fff;
  background-color: #0067a6;
  margin: 0 -0.5em;
  padding: 0.25em 0.5em;
}
h1 .preheading {
  font-size: 40%;
  font-weight: normal;
}
h2 {
  font-size: 125%;
  background-color: #bae5ff;
  margin: 1.5em -0.8em 0 -0.8em;
  padding: 0.2em 0.8em;
}
h3 {
  margin-top: 1.5em;
  font-size: 100%;
  border-bottom: 1px dotted #000;
}

/* Default table look shared by every table: solid outer frame, dotted cell
   borders and a light-blue header row. */
table {
  border: 2px solid #0067a6;
  border-collapse: collapse;
}
th {
  border-bottom: 1px solid #0067a6;
  border-left: 1px dotted #0067a6;
  border-right: 1px dotted #0067a6;
  background-color: #bae5ff;
  padding: 0.3em;
  font-size: 90%;
}
td {
  padding: 0.3em;
  border: 1px dotted #0067a6;
}

/* table.reqs: centered table with centered, non-wrapping cells; the first
   column is left-aligned. Cells of class "req" are highlighted in green,
   cells of class "unsupported" are greyed out. */
table.reqs {
  margin: 0 auto;
}
table.reqs td {
  white-space: nowrap;
  text-align: center;
}
table.reqs td:first-child,
table.reqs tr:first-child th:first-child {
  text-align: left;
}
table.reqs td.req {
  background-color: #b3ef71;
  font-size: 150%;
  padding: 0 0.3em;
}
table.reqs td.req .either {
  font-size: 50%;
}
table.reqs td.unsupported {
  white-space: normal;
  background-color: #ccc;
  max-width: 20em;
}
table.reqs a.note {
  text-decoration: none;
}
/* Spacing between the entries of the notes list (ol.req-notes). */
ol.req-notes > li {
  margin-bottom: 0.75em;
}

/* table.history: centered table whose first body row is highlighted in
   green; elements of class "changes" are left-aligned. */
table.history {
  margin: 0 auto;
}
table.history td {
  text-align: center;
  vertical-align: top;
}
table.history .changes {
  text-align: left;
}
table.history tbody tr:first-child td {
  background-color: #b3ef71;
}
table.history ul {
  margin: 0;
  padding-left: 1em;
}

/* table.pkg-contents: centered, top/left-aligned table. "path" cells use a
   monospace font and rows of class "highlight" are emphasised. */
table.pkg-contents {
  margin: 0 auto;
}
table.pkg-contents th,
table.pkg-contents td {
  text-align: left;
  vertical-align: top;
}
table.pkg-contents td.path {
  font-family: monospace, sans-serif;
  font-size: 1em;
}
table.pkg-contents tr.highlight td {
  background-color: #ffc;
  font-weight: bold;
  color: #000;
}
/* Drop the outer margins of the first and last paragraph inside a cell. */
table.pkg-contents td p:first-child {
  margin-top: 0;
}
table.pkg-contents td p:last-child {
  margin-bottom: 0;
}

/* table.parameters: indented table; cells are centered except the "name"
   and "desc" columns, and the "name"/"default" cells use a monospace font. */
table.parameters {
  margin-left: 3em;
  margin-right: 3em;
  font-family: monospace, sans-serif;
  font-size: 1em;
}
table.parameters th,
table.parameters td {
  font-family: sans-serif;
  text-align: center;
  vertical-align: top;
}
table.parameters .name,
table.parameters .desc {
  text-align: left;
}
table.parameters .name {
  white-space: nowrap;
}
table.parameters td.name,
table.parameters td.default {
  font-family: monospace, sans-serif;
  font-size: 1em;
}
table.parameters ul {
  margin-top: 0;
}
table.parameters td ul:last-child {
  margin-bottom: 0;
}

/* Small helpers: an indented table, ".doc" entries (title + description),
   and generic alignment / font / emphasis utility classes. */
table.indent {
  margin-left: 3em;
}

.doc .title {
  background-color: #eee;
  padding: 0.35em;
  margin-bottom: 0.5em;
}
.doc .title a {
  font-weight: bold;
}
.doc .desc {
  margin-left: 2em;
  margin-right: 2em;
}

/* Text alignment utilities. */
.left {
  text-align: left;
}
.center {
  text-align: center;
}
.right {
  text-align: right;
}

/* Font and emphasis utilities. */
.mono {
  font-family: monospace, sans-serif;
  font-size: 1em;
}
.highlight {
  font-weight: bold;
  color: #0067a6;
}
.nowrap {
  white-space: nowrap;
}

/* Boxes for commands, console output and code listings, followed by the
   "not released" marker and the small print for license/trademark text. */
.command {
  font-family: monospace, sans-serif;
  font-size: 1em;
  margin: 0 3em;
  background-color: #ffc;
  border: 1px solid #aaa;
  padding: 0.5em 1em;
}
/* position: relative makes this box the positioning reference for the
   absolutely positioned ".heading" strip below; the enlarged top padding
   leaves room for that strip. */
.console-output,
.code-block {
  display: block;
  font-family: monospace, sans-serif;
  font-size: 1em;
  margin: 0 3em;
  background-color: #fff;
  border: 1px solid #aaa !important;
  padding: 1.8em 1em 0.5em 1em !important;
  position: relative;
}
/* Full-width caption strip pinned to the top edge of the box. */
.console-output .heading,
.code-block .heading {
  position: absolute;
  left: 0;
  top: 0;
  width: 100%;
  font-size: 80%;
  text-transform: uppercase;
  background-color: #e8e8e8;
  padding: 0.3125em 0;
  border-bottom: 1px dotted #888;
}
.console-output .heading span,
.code-block .heading span {
  padding: 0 1.25em;
}
.not-released {
  font-weight: bold;
  color: red;
}
.license,
.trademark {
  font-size: 80%;
}


================================================
FILE: autosa_scripts/intel_opencl_scripts/common/src/AOCLUtils/opencl.cpp
================================================
// Copyright (C) 2013-2020 Altera Corporation, San Jose, California, USA. All rights reserved.
// Permission is hereby granted, free of charge, to any person obtaining a copy of this
// software and associated documentation files (the "Software"), to deal in the Software
// without restriction, including without limitation the rights to use, copy, modify, merge,
// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
// 
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
// 
// This agreement shall be governed in all respects by the laws of the State of California and
// by the laws of the United States of America.

#include "AOCLUtils/aocl_utils.h"
#include <algorithm>
#include <stdarg.h>

#ifdef _WIN32 // Windows
#include <windows.h>
#else         // Linux
#include <stdio.h> 
#include <unistd.h> // readlink, chdir
#endif

namespace aocl_utils {

static const char *const VERSION_STR = "191";

//////////////////////////////////////////
// Host allocation functions for alignment
//////////////////////////////////////////

// This is the minimum alignment requirement to ensure DMA can be used.
const unsigned AOCL_ALIGNMENT = 64;

// alignedMalloc: allocates `size` bytes aligned to AOCL_ALIGNMENT.
//   Returns NULL when the allocation fails.
// alignedFree: releases memory obtained from alignedMalloc.
#ifdef _WIN32 // Windows
void *alignedMalloc(size_t size) {
  return _aligned_malloc (size, AOCL_ALIGNMENT);
}

void alignedFree(void * ptr) {
  _aligned_free(ptr);
}
#else          // Linux
void *alignedMalloc(size_t size) {
  void *result = NULL;
  // posix_memalign reports failure through its return value; POSIX leaves
  // the output pointer unspecified in that case, so do not trust it.
  if (posix_memalign (&result, AOCL_ALIGNMENT, size) != 0) {
    return NULL;
  }
  return result;
}

void alignedFree(void * ptr) {
  free (ptr);
}
#endif

///////////////////////////////
// Error functions
///////////////////////////////

// Print the symbolic name associated with an OpenCL error code to stdout.
// Known codes are printed followed by a single space; anything else is
// reported as "UNRECOGNIZED ERROR CODE (<code>)".
void printError(cl_int error) {
  struct CodeName {
    cl_int code;
    const char *label;  // includes the trailing space that is printed
  };

  static const CodeName known_codes[] = {
    {-1, "CL_DEVICE_NOT_FOUND "},
    {-2, "CL_DEVICE_NOT_AVAILABLE "},
    {-3, "CL_COMPILER_NOT_AVAILABLE "},
    {-4, "CL_MEM_OBJECT_ALLOCATION_FAILURE "},
    {-5, "CL_OUT_OF_RESOURCES "},
    {-6, "CL_OUT_OF_HOST_MEMORY "},
    {-7, "CL_PROFILING_INFO_NOT_AVAILABLE "},
    {-8, "CL_MEM_COPY_OVERLAP "},
    {-9, "CL_IMAGE_FORMAT_MISMATCH "},
    {-10, "CL_IMAGE_FORMAT_NOT_SUPPORTED "},
    {-11, "CL_BUILD_PROGRAM_FAILURE "},
    {-12, "CL_MAP_FAILURE "},
    {-13, "CL_MISALIGNED_SUB_BUFFER_OFFSET "},
    {-14, "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST "},
    {-15, "CL_COMPILE_PROGRAM_FAILURE "},
    {-16, "CL_LINKER_NOT_AVAILABLE "},
    {-17, "CL_LINK_PROGRAM_FAILURE "},
    {-18, "CL_DEVICE_PARTITION_FAILED "},
    {-19, "CL_KERNEL_ARG_INFO_NOT_AVAILABLE "},

    {-30, "CL_INVALID_VALUE "},
    {-31, "CL_INVALID_DEVICE_TYPE "},
    {-32, "CL_INVALID_PLATFORM "},
    {-33, "CL_INVALID_DEVICE "},
    {-34, "CL_INVALID_CONTEXT "},
    {-35, "CL_INVALID_QUEUE_PROPERTIES "},
    {-36, "CL_INVALID_COMMAND_QUEUE "},
    {-37, "CL_INVALID_HOST_PTR "},
    {-38, "CL_INVALID_MEM_OBJECT "},
    {-39, "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR "},
    {-40, "CL_INVALID_IMAGE_SIZE "},
    {-41, "CL_INVALID_SAMPLER "},
    {-42, "CL_INVALID_BINARY "},
    {-43, "CL_INVALID_BUILD_OPTIONS "},
    {-44, "CL_INVALID_PROGRAM "},
    {-45, "CL_INVALID_PROGRAM_EXECUTABLE "},
    {-46, "CL_INVALID_KERNEL_NAME "},
    {-47, "CL_INVALID_KERNEL_DEFINITION "},
    {-48, "CL_INVALID_KERNEL "},
    {-49, "CL_INVALID_ARG_INDEX "},
    {-50, "CL_INVALID_ARG_VALUE "},
    {-51, "CL_INVALID_ARG_SIZE "},
    {-52, "CL_INVALID_KERNEL_ARGS "},
    {-53, "CL_INVALID_WORK_DIMENSION "},
    {-54, "CL_INVALID_WORK_GROUP_SIZE "},
    {-55, "CL_INVALID_WORK_ITEM_SIZE "},
    {-56, "CL_INVALID_GLOBAL_OFFSET "},
    {-57, "CL_INVALID_EVENT_WAIT_LIST "},
    {-58, "CL_INVALID_EVENT "},
    {-59, "CL_INVALID_OPERATION "},
    {-60, "CL_INVALID_GL_OBJECT "},
    {-61, "CL_INVALID_BUFFER_SIZE "},
    {-62, "CL_INVALID_MIP_LEVEL "},
    {-63, "CL_INVALID_GLOBAL_WORK_SIZE "},
    {-64, "CL_INVALID_PROPERTY "},
    {-65, "CL_INVALID_IMAGE_DESCRIPTOR "},
    {-66, "CL_INVALID_COMPILER_OPTIONS "},
    {-67, "CL_INVALID_LINKER_OPTIONS "},
    {-68, "CL_INVALID_DEVICE_PARTITION_COUNT "},
    {-69, "CL_INVALID_PIPE_SIZE "},
    {-70, "CL_INVALID_DEVICE_QUEUE "},

    {-1001, "CL_PLATFORM_NOT_FOUND_KHR "},

    {-1094, "CL_INVALID_ACCELERATOR_INTEL "},
    {-1095, "CL_INVALID_ACCELERATOR_TYPE_INTEL "},
    {-1096, "CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL "},
    {-1097, "CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL "},
  };
  const size_t known_count = sizeof(known_codes) / sizeof(known_codes[0]);

  for(size_t k = 0; k < known_count; ++k) {
    if(known_codes[k].code == error) {
      printf("%s", known_codes[k].label);
      return;
    }
  }

  printf("UNRECOGNIZED ERROR CODE (%d)", error);
}

// Print line, file name, and error code if there is an error. Exits the
// application upon error.
void _checkError(int line,
                 const char *file,
                 cl_int error,
                 const char *msg,
                 ...) {
  // If not successful
  if(error != CL_SUCCESS) {
    // Print line and file
    printf("ERROR: ");
    printError(error);
    printf("\nLocation: %s:%d\n", file, line);

    // Print custom message.
    va_list vl;
    va_start(vl, msg);
    vprintf(msg, vl);
    printf("\n");
    va_end(vl);

    // Cleanup and bail.
    cleanup();
    exit(error);
  }
}

// Sets the current working directory to be the same as the directory
// containing the running executable.
// Returns true on success. Returns false when the executable path cannot be
// determined, contains no directory separator, or when changing the
// directory fails.
bool setCwdToExeDir() {
#ifdef _WIN32 // Windows
  HMODULE hMod = GetModuleHandle(NULL);
  char path[MAX_PATH];
  DWORD len = GetModuleFileNameA(hMod, path, MAX_PATH);
  if(len == 0 || len >= MAX_PATH) {
    // Lookup failed, or the path was truncated to fit the buffer.
    return false;
  }

#else         // Linux
  // Get path of executable.
  char path[300];
  ssize_t n = readlink("/proc/self/exe", path, sizeof(path)/sizeof(path[0]) - 1);
  if(n <= 0) {
    return false;
  }
  path[n] = 0;
#endif

  // Find the last '\' or '/' and terminate the path there; it is now
  // the directory containing the executable.
  size_t end = strnlen(path, sizeof(path));
  while(end > 0 && path[end - 1] != '/' && path[end - 1] != '\\') {
    --end;
  }
  if(end == 0) {
    // No separator at all: there is no directory component to change to.
    return false;
  }
  // `end - 1` is the index of the last separator. Keep a lone leading
  // separator so an executable directly under the root resolves to "/"
  // rather than to an empty string.
  if(end == 1) {
    path[1] = '\0';
  }
  else {
    path[end - 1] = '\0';
  }

  // Change the current directory and report whether that worked.
#ifdef _WIN32 // Windows
  return SetCurrentDirectoryA(path) != 0;
#else         // Linux
  return chdir(path) == 0;
#endif
}

// Looks through every available OpenCL platform and returns the first one
// whose name contains `platform_name_search`, ignoring case.
// Returns NULL when no platform name matches.
cl_platform_id findPlatform(const char *platform_name_search) {
  // Lower-cased copy of the text being looked for.
  std::string wanted = platform_name_search;
  std::transform(wanted.begin(), wanted.end(), wanted.begin(), tolower);

  // How many platforms are there?
  cl_uint platform_count;
  cl_int rc = clGetPlatformIDs(0, NULL, &platform_count);
  checkError(rc, "Query for number of platforms failed");

  // Fetch all of their ids.
  scoped_array<cl_platform_id> platforms(platform_count);
  rc = clGetPlatformIDs(platform_count, platforms, NULL);
  checkError(rc, "Query for all platform ids failed");

  // Compare each lower-cased platform name against the search text.
  for(unsigned idx = 0; idx < platform_count; ++idx) {
    std::string candidate = getPlatformName(platforms[idx]);
    std::transform(candidate.begin(), candidate.end(), candidate.begin(), tolower);

    if(candidate.find(wanted) == std::string::npos) {
      continue;
    }
    return platforms[idx];
  }

  // Nothing matched.
  return NULL;
}

// Queries CL_PLATFORM_NAME for `pid` and returns it as a std::string.
// Any OpenCL failure is handled by checkError.
std::string getPlatformName(cl_platform_id pid) {
  // First call: ask how many bytes the name needs.
  size_t name_bytes;
  cl_int rc = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &name_bytes);
  checkError(rc, "Query for platform name size failed");

  // Second call: fetch the name into a buffer of that size.
  scoped_array<char> buffer(name_bytes);
  rc = clGetPlatformInfo(pid, CL_PLATFORM_NAME, name_bytes, buffer, NULL);
  checkError(rc, "Query for platform name failed");

  return buffer.get();
}

// Queries CL_DEVICE_NAME for `did` and returns it as a std::string.
// Any OpenCL failure is handled by checkError.
std::string getDeviceName(cl_device_id did) {
  // First call: ask how many bytes the name needs.
  size_t name_bytes;
  cl_int rc = clGetDeviceInfo(did, CL_DEVICE_NAME, 0, NULL, &name_bytes);
  checkError(rc, "Failed to get device name size");

  // Second call: fetch the name into a buffer of that size.
  scoped_array<char> buffer(name_bytes);
  rc = clGetDeviceInfo(did, CL_DEVICE_NAME, name_bytes, buffer, NULL);
  checkError(rc, "Failed to get device name");

  return buffer.get();
}

// Returns a new[]-allocated array holding the ids of all devices of type
// `dev_type` on platform `pid`, and stores the device count in
// `*num_devices`. The array is allocated here with new[] and returned as a
// raw pointer.
cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices) {
  cl_int rc = clGetDeviceIDs(pid, dev_type, 0, NULL, num_devices);
  checkError(rc, "Query for number of devices failed");

  cl_device_id *device_list = new cl_device_id[*num_devices];
  rc = clGetDeviceIDs(pid, dev_type, *num_devices, device_list, NULL);
  checkError(rc, "Query for device ids");

  // For Windows, clGetDeviceIDs() always gives num_devices = 128, so we have to find the actual number of available devices
  // See Release Notes here: https://www.intel.com/content/www/us/en/programmable/documentation/ewa1412772636144.html#ewa1412773000284
#ifdef _WIN32
  // Count the leading run of available devices; stop at the first one
  // that is not available.
  unsigned usable = 0;
  cl_bool device_ok;
  while (usable < *num_devices) {
    rc = clGetDeviceInfo(device_list[usable], CL_DEVICE_AVAILABLE, sizeof(device_ok), &device_ok, NULL);
    checkError(rc, "Failed to get device availability");
    if (device_ok != CL_TRUE) {
      break;
    }
    ++usable;
  }
  *num_devices = usable;
#endif

  return device_list;
}

// Builds a cl_program from one AOCX binary file, handing the same binary
// image to each of the `num_devices` entries in `devices`.
// A missing or unreadable file, or any OpenCL failure, goes through
// checkError.
cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices) {
  // Early exit for potentially the most common way to fail: AOCX does not exist.
  if(!fileExists(binary_file_name)) {
    printf("AOCX file '%s' does not exist.\n", binary_file_name);
    checkError(CL_INVALID_PROGRAM, "Failed to load binary file");
  }

  // Read the whole file into memory.
  size_t image_size;
  scoped_array<unsigned char> image(loadBinaryFile(binary_file_name, &image_size));
  if(image == NULL) {
    checkError(CL_INVALID_PROGRAM, "Failed to load binary file");
  }

  // Every device receives the same image and length.
  scoped_array<size_t> image_lengths(num_devices);
  scoped_array<unsigned char *> image_ptrs(num_devices);
  for(unsigned d = 0; d != num_devices; ++d) {
    image_lengths[d] = image_size;
    image_ptrs[d] = image;
  }

  cl_int create_status;
  scoped_array<cl_int> per_device_status(num_devices);

  cl_program program = clCreateProgramWithBinary(context, num_devices, devices, image_lengths,
      (const unsigned char **) image_ptrs.get(), per_device_status, &create_status);
  checkError(create_status, "Failed to create program with binary");

  // The overall call can succeed while an individual device rejected the binary.
  for(unsigned d = 0; d != num_devices; ++d) {
    checkError(per_device_status[d], "Failed to load binary for device");
  }

  return program;
}

// Loads a file in binary form.
// On success returns a new[]-allocated buffer holding the file contents and
// stores its length in `*size`; the caller releases it with delete[].
// Returns NULL when the file cannot be opened, its size cannot be
// determined, or it cannot be read (an empty file counts as unreadable
// because fread then reports zero elements).
unsigned char *loadBinaryFile(const char *file_name, size_t *size) {
  // Open the File
  FILE* fp;
#ifdef _WIN32
  if(fopen_s(&fp, file_name, "rb") != 0) {
    return NULL;
  }
#else
  fp = fopen(file_name, "rb");
  if(fp == NULL) {
    return NULL;
  }
#endif

  // Get the size of the file
  if(fseek(fp, 0, SEEK_END) != 0) {
    fclose(fp);
    return NULL;
  }
  long ftell_size = ftell(fp);
  if (ftell_size < 0) {
    fclose(fp);
    return NULL;
  }
  // Convert straight to size_t: going through `unsigned` would truncate
  // sizes above 4 GiB on platforms where long is 64 bits wide.
  *size = (size_t)ftell_size;

  // Allocate space for the binary
  unsigned char *binary = new unsigned char[*size];

  // Go back to the file start
  rewind(fp);

  // Read the file into the binary as one element of *size bytes.
  size_t elements_read = fread((void*)binary, *size, 1, fp);
  fclose(fp);
  if(elements_read == 0) {
    delete[] binary;
    return NULL;
  }

  return binary;
}

// Reports whether `file_name` can be used as an input file.
// Windows: the path has attributes and is not a directory.
// Linux: the path is readable by the calling process.
bool fileExists(const char *file_name) {
#ifdef _WIN32 // Windows
  const DWORD attributes = GetFileAttributesA(file_name);
  if(attributes == INVALID_FILE_ATTRIBUTES) {
    return false;
  }
  return (attributes & FILE_ATTRIBUTE_DIRECTORY) == 0;
#else         // Linux
  const int rc = access(file_name, R_OK);
  return rc != -1;
#endif
}

// Chooses the AOCX file name to load for `device`.
// Preference order:
//   1. <prefix>.aocx, if it exists;
//   2. <prefix>_<board name>_<version>.aocx, if it exists, where the board
//      name is the part of the device name before " :";
//   3. <prefix>.aocx anyway, even though it does not exist, so that a later
//      load step reports the problem.
std::string getBoardBinaryFile(const char *prefix, cl_device_id device) {
  const std::string plain_name = std::string(prefix) + ".aocx";
  if(fileExists(plain_name.c_str())) {
    return plain_name;
  }

  // For Intel(R) FPGA SDK for OpenCL(TM) boards, the name of the device is
  // presented as:
  //  <board name> : ...
  const std::string device_name = getDeviceName(device);
  const size_t separator = device_name.find(" :");
  if(separator != std::string::npos) {
    const std::string board_name = device_name.substr(0, separator);
    const std::string board_specific =
        std::string(prefix) + "_" + board_name + "_" + VERSION_STR + ".aocx";
    if(fileExists(board_specific.c_str())) {
      return board_specific;
    }
  }

  // Neither candidate exists; fall back to the plain name.
  return plain_name;
}

// High-resolution timer. Returns a timestamp in seconds as a double.
// Windows: performance counter divided by its frequency.
// Linux: CLOCK_MONOTONIC.
double getCurrentTimestamp() {
#ifdef _WIN32 // Windows
  // The counter frequency is queried once and cached.
  static LARGE_INTEGER frequency = {};
  if(frequency.QuadPart == 0) {
    QueryPerformanceFrequency(&frequency);
  }

  LARGE_INTEGER now;
  QueryPerformanceCounter(&now);

  return double(now.QuadPart) / double(frequency.QuadPart);
#else         // Linux
  timespec now;
  clock_gettime(CLOCK_MONOTONIC, &now);
  const double whole_seconds = double(now.tv_sec);
  const double fraction = double(now.tv_nsec) * 1.0e-9;
  return fraction + whole_seconds;
#endif
}

// Returns the difference between the CL_PROFILING_COMMAND_END and
// CL_PROFILING_COMMAND_START values reported for `event`.
cl_ulong getStartEndTime(cl_event event) {
  cl_ulong began;
  cl_int rc = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(began), &began, NULL);
  checkError(rc, "Failed to query event start time");

  cl_ulong finished;
  rc = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(finished), &finished, NULL);
  checkError(rc, "Failed to query event end time");

  return finished - began;
}

// Returns the span covered by a set of events: the latest
// CL_PROFILING_COMMAND_END minus the earliest CL_PROFILING_COMMAND_START
// over `events[0 .. num_events-1]`. Returns 0 when `num_events` is 0.
cl_ulong getStartEndTime(cl_event *events, unsigned num_events) {
  cl_ulong earliest_start = 0;
  cl_ulong latest_end = 0;

  for(unsigned idx = 0; idx < num_events; ++idx) {
    cl_ulong began, finished;
    cl_int rc = clGetEventProfilingInfo(events[idx], CL_PROFILING_COMMAND_START, sizeof(began), &began, NULL);
    checkError(rc, "Failed to query event start time");
    rc = clGetEventProfilingInfo(events[idx], CL_PROFILING_COMMAND_END, sizeof(finished), &finished, NULL);
    checkError(rc, "Failed to query event end time");

    // The first event seeds both bounds; later events only widen them.
    const bool first = (idx == 0);
    if(first || began < earliest_start) {
      earliest_start = began;
    }
    if(first || finished > latest_end) {
      latest_end = finished;
    }
  }

  return latest_end - earliest_start;
}

// Suspends the calling thread for `ms` milliseconds (Sleep on Windows,
// a single nanosleep call on Linux).
void waitMilliseconds(unsigned ms) {
#ifdef _WIN32 // Windows
  Sleep(ms);
#else         // Linux
  const unsigned whole_seconds = ms / 1000;
  const unsigned leftover_ms = ms % 1000;

  timespec delay = {0, 0};
  delay.tv_sec = whole_seconds;
  delay.tv_nsec = long(leftover_ms) * 1000000L;  // milliseconds -> nanoseconds
  nanosleep(&delay, NULL);
#endif
}

// Context error callback: prints the error text to stdout.
// The three remaining arguments are ignored.
void oclContextCallback(const char *errinfo,
                        const void * /*unused*/,
                        size_t /*unused*/,
                        void * /*unused*/) {
  printf("Context callback: %s\n", errinfo);
}

} // ns aocl_utils


================================================
FILE: autosa_scripts/intel_opencl_scripts/common/src/AOCLUtils/options.cpp
================================================
// Copyright (C) 2013-2020 Altera Corporation, San Jose, California, USA. All rights reserved.
// Permission is hereby granted, free of charge, to any person obtaining a copy of this
// software and associated documentation files (the "Software"), to deal in the Software
// without restriction, including without limitation the rights to use, copy, modify, merge,
// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
// 
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
// 
// This agreement shall be governed in all respects by the laws of the State of California and
// by the laws of the United States of America.

#include "AOCLUtils/aocl_utils.h"
#include <algorithm>
#include <iostream>
#include <stdlib.h>
#include <vector>

namespace aocl_utils {

// Creates an empty option set.
Options::Options() {}

// Creates an option set and fills it from the given command-line
// arguments via addFromCommandLine.
Options::Options(int num, char *argv[])
{
  this->addFromCommandLine(num, argv);
}

bool Options::has(const std::string &name) const {
  return m_options.find(name) != m_options.end();
}

// Mutable lookup through the map's operator[], returning a reference to
// the value stored under `name`.
std::string &Options::get(const std::string &name) {
  std::string &stored = m_options[name];
  return stored;
}

const std::string &Options::get(const std::string &name) const {
  OptionMap::const_iterator it = m_options.find(name);
  if(it == m_options.end()) {
    errorNonExistent(name);
  }
  return it->second;
}

void Options::addFromCommandLine(int num, char *argv[]) {
  for(int i = 1; i < num; ++i) {
    const std::string arg = argv[i];

    // Look for the first '-'.
    if(arg.size() > 1 && arg[0] == '-') {
      size_t eq = arg.find('=');
      size_t name_start = 1;

      // Check if there's a second '-'.
      if(arg.size() > 2 && arg[1] == '-') {
        name_start = 2;
      }

      if(eq == std::string::npos) {
        // No '='; treat as a boolean option.
        set(arg.substr(name_start), true);
      }
      else if(eq == name_start) {
        // No name?!
        errorNameless();
      }
      else {
        set(arg.substr(name_start, eq - name_start), arg.substr(eq + 1));
      }
    }
    else {
      // Not an option.
      m_nonoptions.push_back(arg);
    }
  }
}

// Reports an option with no name on stderr and exits with status 1.
void Options::errorNameless() const {
  const char *const message = "No name provided for option.\n";
  std::cerr << message;
  exit(1);
}

// Reports a lookup of an unknown option on stderr and exits with status 1.
void Options::errorNonExistent(const std::string &name) const {
  std::cerr << "Option '";
  std::cerr << name;
  std::cerr << "' does not exist.\n";
  exit(1);
}

// Reports on stderr that the value stored for `name` is not of the right
// type, quoting the stored value, and exits with status 1.
void Options::errorWrongType(const std::string &name) const {
  std::cerr << "Value for option '" << name;
  std::cerr << "' is not of the right type (value = '";
  std::cerr << get(name) << "').\n";
  exit(1);
}

} // ns aocl_utils


================================================
FILE: autosa_scripts/intel_opencl_scripts/compile_design.sh
================================================
#!/bin/bash

# - A script to compile and run the host program and bitstream on Intel OpenCL platform
#
# Usage: compile_design.sh [hw|emu|sim]
#   hw  - compile the bitstream for the target board
#   emu - compile for the emulator
#   sim - compile for the simulator
#
# Exits with status 1 on a usage error or an unsupported mode; otherwise the
# exit status is that of the aoc invocation.

if [ "$#" -ne 1 ]
then
  echo "Usage: compile_design.sh [hw|emu|sim]"
  exit 1
fi
mode=$1
# Quoted so that a mode containing spaces or glob characters is echoed verbatim.
echo "$mode"

echo "Compiling the bitstream..."
case "$mode" in
  "hw")
    # Compile the bitstream
    # Change the board to your target board if necessary
    aoc src/kernel_kernel.cl -o bin/kernel_kernel.aocx -fp-relaxed -board=s10mx_hbm_es
    ;;
  "emu")
    # Compiling for emulator
    aoc -march=emulator src/kernel_kernel.cl -o bin/kernel_kernel.aocx -fp-relaxed -DEMULATE -legacy-emulator
    ;;
  "sim")
    # Compiling for simulator
    aoc -march=simulator src/kernel_kernel.cl -o bin/kernel_kernel.aocx -fp-relaxed
    ;;
  *)
    echo "Error: Unsupported mode"
    exit 1
    ;;
esac

#echo "Compiling the host program..."
## Compile the host program
#make

#echo "Running the program..."
#case "$mode" in
#    "hw")
#      # Run the host program
#      bin/host
#      ;;
#    "emu")
#      # Run the host program with the emulator
#      bin/host -emulator
#      ;;
#    "sim")
#      # Run the host program with the simulator
#      CL_CONTEXT_MPSIM_DEVICE_INTELFPGA=1 bin/host
#      ;;
#esac


================================================
FILE: autosa_scripts/latency_model.py
================================================
import os
import json
import re
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
import joblib
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from scipy.stats.mstats import gmean
from statistics import mean
import shutil
import math
import argparse

def extract_latency_info(design_dir):
    """ Extract loop information of the design.

    Every file under ``<design_dir>/latency_est`` is parsed as JSON.
    ``array_info.json`` supplies the array information; every other file is
    treated as the loop information of one module, keyed by its
    ``module_name`` field.

    Returns a dictionary containing the following information:
    - loop_infos: dict
        module name -> loop information loaded from that module's file
    - module_list: list
        module names, in the order the files were listed
    - array_info: dict
        contents of array_info.json, or an empty dict when that file is absent
    - module_grouped: dict
        outer module name -> {'intra_trans': name, 'inter_trans': name}
        for whichever of the two sub-modules were found

    Parameters
    ----------
    design_dir: str
        The design directory
    """
    # Name suffixes that are stripped to recover the outer module name.
    intra_suffix = '_intra_trans'
    inter_suffix = '_inter_trans'
    inter_boundary_suffix = '_inter_trans_boundary'

    loop_path = f'{design_dir}/latency_est'
    loop_info_all = {}
    module_names = []
    # Bound up front so a directory without array_info.json no longer ends
    # in an UnboundLocalError when the result is assembled.
    array_info = {}

    for f_name in os.listdir(loop_path):
        with open(loop_path + '/' + f_name) as f:
            content = json.load(f)
        if f_name == 'array_info.json':
            array_info = content
        else:
            module_name = content['module_name']
            loop_info_all[module_name] = content
            module_names.append(module_name)

    module_grouped = {}
    # Place inter_trans and intra_trans module under the outer module
    for module_name in module_names:
        if 'intra_trans' in module_name:
            # The intra_trans module is registered under both the outer
            # module and its boundary variant.
            outer = module_name[:-len(intra_suffix)]
            module_grouped.setdefault(outer, {})['intra_trans'] = module_name
            module_grouped.setdefault(outer + '_boundary', {})['intra_trans'] = module_name
        elif 'inter_trans' in module_name:
            if 'boundary' in module_name:
                outer = module_name[:-len(inter_boundary_suffix)] + '_boundary'
            else:
                outer = module_name[:-len(inter_suffix)]
            module_grouped.setdefault(outer, {})['inter_trans'] = module_name
        else:
            module_grouped.setdefault(module_name, {})

    ret = {'loop_infos': loop_info_all, 'module_list': module_names, \
           'module_grouped': module_grouped, 'array_info': array_info}

    return ret

def convert_latency_infos_to_df(latency_infos):
    """ Placeholder for converting the latency infos into a dataframe.

    Not implemented: the argument is ignored and None is returned.
    """
    return None

def is_loop_struct_leaf_empty(loop_struct):
    """ Tell whether every leaf reachable from the loop struct is empty.

    Returns 1 when no "user" node with a non-None 'user_expr' is found
    below `loop_struct`, and 0 as soon as one is found. The node kind is
    decided by the first of the keys "loop", "mark", "user", "block", "if"
    that is present; a node with none of them counts as empty.

    Parameters
    ----------
    loop_struct: dict
        loop structure in JSON format
    """
    # "loop" and "mark" nodes both wrap a single child.
    if "loop" in loop_struct or "mark" in loop_struct:
        kind = 'loop' if "loop" in loop_struct else 'mark'
        inner = loop_struct[kind]['child']
        if inner is None:
            return 1
        return is_loop_struct_leaf_empty(inner)

    if "user" in loop_struct:
        return 1 if loop_struct['user']['user_expr'] is None else 0

    if "block" in loop_struct:
        members = loop_struct['block']['child']
        if members is None:
            return 1
        # One non-empty member makes the whole block non-empty.
        if any(is_loop_struct_leaf_empty(member) == 0 for member in members):
            return 0
        return 1

    if "if" in loop_struct:
        branching = loop_struct['if']
        if is_loop_struct_leaf_empty(branching['then']) == 0:
            return 0
        if 'else' in branching and is_loop_struct_leaf_empty(branching['else']) == 0:
            return 0
        return 1

    return 1

def loop_struct_has_non_simd_loop(loop_struct, config):
    """ Tell whether the loop struct contains a loop that is not under a SIMD mark.

    Returns 1 when a "loop" node is reached while config['under_simd'] is
    not 1, and 0 otherwise.

    Side effect: config['under_simd'] is set to 1 when a "mark" node named
    'simd' is visited, and it is never reset by this function.

    Parameters
    ----------
    loop_struct: dict
        loop structure in JSON format
    config: dict
        traversal state; the key 'under_simd' is read and may be written
    """
    if "loop" in loop_struct:
        return 0 if config['under_simd'] == 1 else 1

    if "mark" in loop_struct:
        node = loop_struct['mark']
        if node['mark_name'] == 'simd':
            config['under_simd'] = 1
        inner = node['child']
        if inner is None:
            return 0
        return loop_struct_has_non_simd_loop(inner, config)

    if "user" in loop_struct:
        return 0

    if "block" in loop_struct:
        members = loop_struct['block']['child']
        if members is None:
            return 0
        for member in members:
            if loop_struct_has_non_simd_loop(member, config) == 1:
                return 1
        return 0

    if "if" in loop_struct:
        branching = loop_struct['if']
        if loop_struct_has_non_simd_loop(branching['then'], config) == 1:
            return 1
        if 'else' in branching and \
                loop_struct_has_non_simd_loop(branching['else'], config) == 1:
            return 1
        return 0

    return 0

def loop_struct_has_for_loop(loop_struct):
    """ Tell whether the loop struct contains any for loop.

    Returns 1 when a "loop" node is found at or below `loop_struct`, and 0
    otherwise. The node kind is decided by the first of the keys "loop",
    "mark", "user", "block", "if" that is present; a node with none of them
    contains no loop.

    Parameters
    ----------
    loop_struct: dict
        loop structure in JSON format
    """
    if "loop" in loop_struct:
        return 1

    if "mark" in loop_struct:
        child = loop_struct['mark']['child']
        if child is None:
            return 0
        return loop_struct_has_for_loop(child)

    if "user" in loop_struct:
        # A user statement is a leaf and never contains a loop. Its
        # 'user_expr' is deliberately not read: the value was unused and the
        # lookup only made nodes without that key raise KeyError.
        return 0

    if "block" in loop_struct:
        children = loop_struct['block']['child']
        if children is None:
            return 0
        for child in children:
            if loop_struct_has_for_loop(child) == 1:
                return 1
        return 0

    if "if" in loop_struct:
        if_struct = loop_struct['if']
        if loop_struct_has_for_loop(if_struct['then']) == 1:
            return 1
        if 'else' in if_struct and loop_struct_has_for_loop(if_struct['else']) == 1:
            return 1
        return 0

    return 0

def predict_module_latency_xilinx(loop_struct, config):
    """ Predict the module latency for Xilinx FPGAs.

    Recursively walks the loop structure and accumulates the estimated
    latency in config['latency']. Nothing is returned; the result and all
    traversal state are passed through `config`, which is updated in place.

    Parameters
    ----------
    loop_struct: dict
        A node of the loop structure. The node kind is given by the key it
        contains: "loop", "mark", "user", "block" or "if".
    config: dict
        Traversal state. Keys used here: 'latency', 'context', 'loop_prefix',
        'loop_offset', 'under_unroll', 'under_coalesce', 'under_serialize',
        'under_loop', 'last_for', 'module_type' and 'cycle'.

    Raises
    ------
    NotImplementedError
        If a loop upper bound is not a numeric string.
    """
    latency = config['latency']
    if "loop" in loop_struct:
        config['under_loop'] = 1
        # Extract the loop information
        loop = loop_struct['loop']
        loop_info = loop['loop_info']
        lb = loop_info['lb']
        ub = loop_info['ub']
        iterator = loop_info['iter']
        # Check if lb/ub is real number
        if lb.isnumeric():
            lb_n = int(lb)
        else:
            # A non-numeric lower bound is approximated by 0.
            lb_n = 0
        if ub.isnumeric():
            ub_n = int(ub)
        else:
            raise NotImplementedError(f'Non-number loop upper bound ({ub}) is not supported.')
        config['context'][iterator] = {}
        config['context'][iterator]['lb'] = lb_n
        config['context'][iterator]['ub'] = ub_n
        if config['under_unroll'] == 0:
            # Bounds are inclusive, hence the "+ 1" in the trip count.
            latency = latency * (ub_n - lb_n + 1)
            config['latency'] = latency
        child = loop['child']
        # if it is an outer module, we will need to update loop_prefix at each loop level.
        if config['module_type'] == 1:
            if config['loop_prefix'] == 'Loop':
                config['loop_prefix'] = config['loop_prefix'] + str(config['loop_offset'])
            else:
                config['loop_prefix'] = config['loop_prefix'] + '.' + str(config['loop_offset'])
        # Store the current for loop
        config['last_for']['iter'] = iterator
        config['last_for']['lb'] = lb_n
        config['last_for']['ub'] = ub_n
        if config['under_coalesce'] == 1:
            config['last_for']['under_coalesce'] = 1
        else:
            config['last_for']['under_coalesce'] = 0
        predict_module_latency_xilinx(child, config)
    elif "mark" in loop_struct:
        mark = loop_struct['mark']
        mark_name = mark['mark_name']
        # If we meet the 'simd' mark, the loops below no longer count towards
        # the loop iterations.
        if mark_name == 'simd':
            config['under_unroll'] = 1
        if mark_name == 'access_coalesce':
            config['under_coalesce'] = 1
        if mark_name == 'access_serialize':
            config['under_serialize'] = 1
        child = mark['child']
        predict_module_latency_xilinx(child, config)
    elif "user" in loop_struct:
        user = loop_struct['user']
        user_expr = user['user_expr']
        # A user statement is a leaf: the unroll/coalesce scopes end here.
        config['under_unroll'] = 0
        config['under_coalesce'] = 0
        if config['module_type'] == 1:
            # For outer module, we directly return.
            config['under_serialize'] = 0
            # A latency that is still at its initial value of 1 means no
            # loop was met on the way down; report 0 in that case.
            if config['latency'] == 1:
                config['latency'] = 0
            return

        # Set II and depth to 1 by default.
        II = 1
        depth = 1
        if user_expr.find('dram') != -1:
            # This is a DRAM stmt, we will plug in the estimated model.
            last_for = config['last_for']
            # 'last_for' is empty when the statement is not nested in any
            # loop; treat that as "not coalesced" instead of failing.
            if last_for.get('under_coalesce', 0) == 1 and \
               config['under_serialize'] == 0:
                # This statement accesses the dram under a coalesced loop.
                # The burst length is the trip count of that loop (inclusive
                # bounds); clamp to 1 so the division below is always defined.
                burst_len = max(last_for['ub'] - last_for['lb'] + 1, 1)
                # The DRAM latency is estimated as 200ns
                dram_latency = 200 / config['cycle'] + burst_len + depth
                latency = latency / burst_len * dram_latency
            elif config['under_serialize'] == 1:
                # This statement accesses the dram with serialized data.
                latency = (latency - 1) * II + depth
            else:
                latency = latency * (200 / config['cycle'] + depth)
        else:
            latency = (latency - 1) * II + depth
        config['under_serialize'] = 0
        config['latency'] = latency
    elif "block" in loop_struct:
        block = loop_struct['block']
        block_child = block['child']

        # Check if only one child is valid and the rest only contain the empty leaf node.
        # If so, continue from the non-empty leaf node w/o further action.
        n_child = 0
        for child in block_child:
            is_empty = is_loop_struct_leaf_empty(child)
            if is_empty == 0:
                n_child += 1
                single_child = child

        if n_child == 1:
            predict_module_latency_xilinx(single_child, config)
            return

        # Check if the current block contains "simd" mark.
        # If so, continue from "simd" branch w/o any further action.
        simd_child = 0
        for child in block_child:
            if "mark" in child:
                mark_name = child['mark']['mark_name']
                if mark_name == 'simd':
                    config['under_unroll'] = 1
                    child = child['mark']['child']
                    simd_child = 1
                    break
        if simd_child == 1:
            predict_module_latency_xilinx(child, config)
            return

        # Proceed as normal.
        # Check if the child contains any non-simd loop. If yes, we will
        # update the loop prefix.
        for child in block_child:
            local_config = {}
            local_config['under_simd'] = 0
            has_non_simd_loop = loop_struct_has_non_simd_loop(child, local_config)
            if has_non_simd_loop:
                if config['module_type'] != 1 and config['under_loop'] == 1:
                    if config['loop_prefix'] == 'Loop':
                        config['loop_prefix'] = config['loop_prefix'] + str(config['loop_offset'])
                    else:
                        config['loop_prefix'] = config['loop_prefix'] + '.' + str(config['loop_offset'])
                break
        # Snapshot the prefix/loop state; each child starts from the same
        # values because the recursive calls overwrite them in config.
        loop_prefix = config['loop_prefix']
        loop_offset = 1
        under_loop = config['under_loop']

        # If the block is under loop and all childrens are user nodes,
        # we will proceed and dive into the user nodes.
        all_user_child = 1
        for child in block_child:
            has_for_loop = loop_struct_has_for_loop(child)
            if has_for_loop:
                all_user_child = 0
                break
        latency = config['latency']
        block_latency = 0
        for child in block_child:
            config['loop_offset'] = loop_offset
            config['loop_prefix'] = loop_prefix
            if under_loop == 1:
                config['under_loop'] = 0
            has_for_loop = loop_struct_has_for_loop(child)
            if all_user_child:
                # Select the statement with the longest latency.
                config['latency'] = latency
                predict_module_latency_xilinx(child, config)
                block_latency = max(block_latency, config['latency'])
            else:
                # Accumulate the latency. Children without any loop are
                # ignored in this mode.
                if has_for_loop:
                    config['latency'] = 1
                    predict_module_latency_xilinx(child, config)
                    loop_offset += 1
                    block_latency += config['latency']
        if all_user_child:
            latency = block_latency
        else:
            latency = latency * max(block_latency, 1)
        config['latency'] = latency
    elif "if" in loop_struct:
        # For the if-then-else clause, only branches containing a loop are
        # evaluated, and the longest branch is multiplied into the latency
        # accumulated so far.
        latency = config['latency']
        block_latency = 0
        if_struct = loop_struct['if']
        then_block = if_struct['then']
        if config['module_type'] != 1 and config['under_loop'] == 1:
            if config['loop_prefix'] == 'Loop':
                config['loop_prefix'] = config['loop_prefix'] + str(config['loop_offset'])
            else:
                config['loop_prefix'] = config['loop_prefix'] + '.' + str(config['loop_offset'])
        loop_prefix = config['loop_prefix']
        loop_offset = config['loop_offset']
        has_for_loop = loop_struct_has_for_loop(then_block)
        if has_for_loop:
            config['latency'] = 1
            predict_module_latency_xilinx(then_block, config)
            block_latency = max(block_latency, config['latency'])
        if 'else' in if_struct:
            loop_offset += 1
            config['loop_offset'] = loop_offset
            else_block = if_struct['else']
            has_for_loop = loop_struct_has_for_loop(else_block)
            if has_for_loop:
                config['latency'] = 1
                predict_module_latency_xilinx(else_block, config)
                block_latency = max(block_latency, config['latency'])
        latency = latency * max(block_latency, 1)
        config['latency'] = latency

def predict_design_latency(latency_info, cycle=5, early_stop=-1):
    """ Predict the latency for a single design.

    We assume that the II and depth for each stmt to be one.

    Every module in latency_info['module_grouped'] that has loop information
    is evaluated with predict_module_latency_xilinx(). Modules that carry
    'inter_trans'/'intra_trans' sub-modules are evaluated in three passes
    (outer loops, inter transfer, intra transfer) which are then combined.
    The design latency is the largest module latency plus the latency of
    draining the last tile.

    Parameters
    ----------
    latency_info: dict
        A dict containing the latency info of the design. The keys
        'module_grouped', 'array_info' and 'loop_infos' are read.
    cycle: int
        The cycle time. (in ns)
    early_stop: int
        The baseline latency. If set -1, early stop is disabled.

    Returns
    -------
    int or float
        The predicted design latency (truncated to int). When early stop
        triggers, the latency of the first module exceeding the baseline is
        returned as is.
    """
    latency_all = {}
    config = {}
    config['cycle'] = cycle
    module_grouped = latency_info['module_grouped']
    array_info = latency_info['array_info']
    loop_infos = latency_info['loop_infos']

    drain_latency = 0
    drain_outer = 1

    for module_name in module_grouped:
        if 'dummy' in module_name:
            # Simply skip the dummy module
            continue
        if module_name not in loop_infos:
            continue

        module = module_grouped[module_name]

        # Reset the traversal state for this module.
        config['context'] = {}
        config['latency'] = 1
        config['loop_prefix'] = 'Loop'
        config['loop_offset'] = 1
        config['under_unroll'] = 0
        config['under_coalesce'] = 0
        config['under_serialize'] = 0
        config['under_loop'] = 0
        config['last_for'] = {}
        config['array_info'] = array_info
        config['module_name'] = module_name
        # 0: default 1: outer 2: inter_trans 3: intra_trans
        config['module_type'] = 0

        if 'inter_trans' in module or 'intra_trans' in module:
            # This is a filter module. We take it as double buffered by default.
            config['module_type'] = 1
            module_loop_info = loop_infos[module_name]
            predict_module_latency_xilinx(module_loop_info, config)
            outer_latency = config['latency']

            # inter module
            config['module_type'] = 2
            config['latency'] = 1
            config['loop_prefix'] = 'Loop'
            config['loop_offset'] = 1
            sub_module_name = module['inter_trans']
            config['module_name'] = sub_module_name
            module_loop_info = loop_infos[sub_module_name]
            predict_module_latency_xilinx(module_loop_info, config)
            inter_trans_latency = config['latency']

            # intra module
            config['module_type'] = 3
            config['latency'] = 1
            config['loop_prefix'] = 'Loop'
            config['loop_offset'] = 1
            sub_module_name = module['intra_trans']
            config['module_name'] = sub_module_name
            module_loop_info = loop_infos[sub_module_name]
            predict_module_latency_xilinx(module_loop_info, config)
            intra_trans_latency = config['latency']

            # Note: the module properties are read from the loop info of the
            # intra_trans sub-module.
            if module_loop_info['module_prop']['double_buffer'] == 1:
                # Inter and intra transfers overlap; one extra, non-overlapped
                # phase is added on top.
                module_latency = outer_latency * max(inter_trans_latency, intra_trans_latency)
                if module_loop_info['module_prop']['in'] == 1:
                    module_latency += intra_trans_latency
                else:
                    module_latency += inter_trans_latency
            else:
                module_latency = outer_latency * (inter_trans_latency + intra_trans_latency)
            # Hack: For GEMM4
            if 'drain' in module_name:
                drain_outer = max(1, outer_latency)

            latency_all[module_name] = module_latency
        else:
            module_loop_info = loop_infos[module_name]
            predict_module_latency_xilinx(module_loop_info, config)
            latency_all[module_name] = config['latency']
            # Hack: For GEMM4
            if 'drain' in module_name:
                drain_latency = max(drain_latency, config['latency'])

        # If we set early stop, we are using a baseline latency to compare.
        # If any of the module latency is greater than the baseline, we
        # will return immediately.
        # The comparison uses the combined module latency: for filter modules
        # config['latency'] only holds the latency of the last pass.
        if early_stop != -1:
            if latency_all[module_name] > early_stop:
                return latency_all[module_name]

    # Latency of draining the last tile.
    drain_last_tile_latency = drain_latency / drain_outer
    # Largest module latency, never below 0.
    latency = max(max(latency_all.values(), default=0), 0)
    latency += drain_last_tile_latency

    return int(latency)

def unit_test_predict_design_latency(design_dir):
    """ Smoke test for the design latency prediction.

    Extracts the latency info from the design directory, predicts the
    design latency with a cycle time of 5 and prints the result.

    Parameters
    ----------
    design_dir: str
        Design directory
    """
    info = extract_latency_info(design_dir)
    print("latency: ", predict_design_latency(info, 5))

if __name__ == "__main__":
    # Command-line entry point: run the latency prediction on the design
    # directory given with -d.
    cli = argparse.ArgumentParser(description="==== AutoSA Latency Model ====")
    cli.add_argument('-d', required=True, help='design directory')
    unit_test_predict_design_latency(cli.parse_args().d)


================================================
FILE: autosa_scripts/module_group.py
================================================
#!/usr/bin/env python3

import sympy
import sys
import argparse
import re
import json
import numpy as np


def compose_final_file(output_f, prefix_content, module_defs, top_kernel):
    """ Write the kernel file.

    The file is emitted in this order: the prefix content, every module
    definition wrapped in '/* Module Definition */' markers, the top kernel
    prefix, the FIFO declarations and the module calls (each wrapped in
    their markers), and finally two closing braces.

    Args:
      output_f: path of the file to write
      prefix_content: list of lines preceding the first module definition
      module_defs: dict mapping a module name to the lines of its definition
      top_kernel: dict with the fields 'prefix_content', 'fifo_decls' and
      'module_calls'
    """
    indent = ' ' * 4
    def_marker = '/* Module Definition */\n'
    fifo_marker = indent + '/* FIFO Declaration */\n'
    call_marker = indent + '/* Module Call */\n'

    with open(output_f, 'w') as out:
        out.writelines(prefix_content)
        for definition in module_defs.values():
            out.write(def_marker)
            out.writelines(definition)
            out.write(def_marker + '\n')

        out.writelines(top_kernel['prefix_content'])

        out.write(fifo_marker)
        for declaration in top_kernel['fifo_decls'].values():
            out.writelines(declaration)
        out.write(fifo_marker + '\n')

        for call in top_kernel['module_calls']:
            out.write(call_marker)
            out.writelines(call['content'])
            out.write(call_marker + '\n')

        # The first brace closes the top kernel, the second one closes the
        # extern "C" block of the OpenCL kernel.
        out.write('}\n')
        out.write('}\n')


def extract_fifos_from_module_call(module_call):
    """ Collect the fifo arguments of a module call.

    Only lines tagged with '/* fifo */' are considered. The fifo name is the
    text between the closing '*/ ' and the last comma of the line; when the
    line has no comma, it is the rest of the stripped line.

    Returns a list containing all the fifos in the module call.
    """
    fifos = []
    for line in module_call:
        if '/* fifo */' not in line:
            continue
        # Argument followed by a comma first, last argument otherwise.
        hit = re.search(r'\*/ (.+),', line) or \
            re.search(r'\*/ (.+)', line.strip())
        if hit:
            fifos.append(hit.group(1))
    return fifos


def compose_group_wrapper(
        x_start,
        y_start,
        group_modules,
        module_fifo_decls,
        module_ext_fifos):
    """ Compose the definition and the call of the group wrapper module

    The wrapper is named 'PE_module_group_wrapper_<x_start>_<y_start>'. It
    takes the external fifos as reference arguments, declares the internal
    fifos and calls every module of the group.

    Returns a list [module_name, module_def, module_call]
    """
    pad = ' ' * 4
    module_name = 'PE_module_group_wrapper_' + \
        '_'.join([str(x_start), str(y_start)])

    # ---- Module definition ----
    # Signature: one reference argument per external fifo.
    module_def = ['void ' + module_name + '(\n']
    for pos, ext_fifo in enumerate(module_ext_fifos):
        fifo_name = ext_fifo['fifo_name']
        fifo_type = ext_fifo['fifo_type']
        if pos > 0:
            module_def.append(',\n')
        module_def.append(pad + fifo_type + ' &' + fifo_name)
    module_def += [
        ')\n',
        '{\n',
        '#pragma HLS INLINE OFF\n',
        '#pragma HLS DATAFLOW\n',
    ]

    # Internal fifo declarations
    module_def.append(pad + '/* FIFO Declaration */\n')
    for declaration in module_fifo_decls.values():
        module_def += declaration
    module_def.append(pad + '/* FIFO Declaration */\n\n')

    # Calls of the grouped modules
    for member in group_modules:
        module_def.append(pad + '/* Module Call */\n')
        module_def += member['content']
        module_def.append(pad + '/* Module Call */\n\n')

    module_def.append('}\n')

    # ---- Module call ----
    # The external fifos are passed in the same order as in the signature.
    module_call = [pad + module_name + '(\n']
    for pos, ext_fifo in enumerate(module_ext_fifos):
        if pos > 0:
            module_call.append(',\n')
        module_call.append(' ' * 8 + '/* fifo */ ' + ext_fifo['fifo_name'])
    module_call.append('\n')
    module_call.append(pad + ');\n')

    return [module_name, module_def, module_call]


def create_group_wrapper(
        x_start,
        y_start,
        group_modules,
        module_defs,
        top_kernel):
    """ Create a wrapper module for all the modules in the current group

    A fifo that is used a second time by the modules of the group is
    internal to the group; a fifo used only once is external.
    Declarations of internal fifos are moved from top_kernel['fifo_decls']
    into the wrapper. External fifos become the arguments of the wrapper;
    their type is parsed from the first line of their declaration.
    The wrapper definition is stored in module_defs. In
    top_kernel['module_calls'], the calls of the grouped modules are removed
    and a single call of the wrapper is inserted at the position of the
    first removed call.

    Args:
      x_start: the start x index of PE module ids
      y_start: the start y index of PE module ids
      group_modules: list containing all module calls in the current group
      module_defs: dict containing the module definitions
      top_kernel: dict containing the top kernel content
    """
    # Classify the fifos. A fifo toggles from external to internal when it
    # is met again; the order of first appearance is kept.
    internal_fifos = []
    external_fifos = []
    for member in group_modules:
        for fifo in extract_fifos_from_module_call(member['content']):
            if fifo in external_fifos:
                external_fifos.remove(fifo)
                internal_fifos.append(fifo)
            else:
                external_fifos.append(fifo)

    top_fifo_decls = top_kernel['fifo_decls']

    # Move the declarations of the internal fifos into the wrapper.
    module_fifo_decls = {}
    for fifo in internal_fifos:
        module_fifo_decls[fifo] = top_fifo_decls.pop(fifo)

    # Describe the external fifos: name plus the type found in the first
    # line of the declaration (left out when the pattern does not match).
    module_ext_fifos = []
    for fifo in external_fifos:
        entry = {'fifo_name': fifo}
        type_match = re.search(r'\*/ (.+?) fifo', top_fifo_decls[fifo][0])
        if type_match:
            entry['fifo_type'] = type_match.group(1)
        module_ext_fifos.append(entry)

    # Compose the definition and call of the wrapper module
    module_name, module_def, module_call = compose_group_wrapper(
        x_start, y_start, group_modules, module_fifo_decls, module_ext_fifos)
    module_defs[module_name] = module_def

    # Replace the calls of the grouped modules by one wrapper call, placed
    # where the earliest of them used to be.
    calls = top_kernel['module_calls']
    insert_at = len(calls)
    for member in group_modules:
        position = calls.index(member)
        insert_at = min(insert_at, position)
        calls.pop(position)
    calls.insert(insert_at,
                 {'module_name': module_name, 'content': module_call})


def module_grouping(
        output_f,
        prefix_content,
        module_defs,
        top_kernel,
        group_config):
    """ Group the PE-level modules and write the resulting kernel file.

    The PE-level module calls (names containing 'PE' or 'IO_L1') may only
    take module ids and fifos as arguments. If any other argument kind is
    found, an error is printed and the kernel is written without grouping.
    Otherwise the PE grid is cut into tiles of group_config['x'] by
    group_config['y'], one wrapper module is created per tile, and the
    kernel is written.

    Args:
      output_f: output kernel file
      prefix_content: list containing the file content before the first module
      definition
      module_defs: dict containing the module definitions
      top_kernel: dict containing the top kernel content
      {
        'prefix_content': list containing the file content before the first fifo declaration
        'fifo_decls': dict containing the fifo declarations
        'module_calls': list containing the module calls
                        a module call is a dict containing fields:
                        module_name, module_ids, content
      }
      group_config: dict with the tile sizes 'x' and 'y'
    """
    module_calls = top_kernel['module_calls']

    # Pass 1: check that grouping is legal and collect the module ids of
    # every PE-level module call.
    group_legal = True
    for call in module_calls:
        name = call['module_name']
        if 'PE' not in name and 'IO_L1' not in name:
            continue
        # The first line holds the module name, arguments start afterwards.
        for line in call['content'][1:]:
            tag = re.search(r'/\* (.+?) \*/', line)
            if not tag:
                continue
            arg_type = tag.group(1)
            if arg_type not in ('module id', 'fifo'):
                group_legal = False
                break
            if arg_type == 'module id':
                id_match = re.search(r'\*/ (.+?),', line)
                if id_match:
                    call['module_ids'].append(int(id_match.group(1)))

    if not group_legal:
        print(
            '[AutoSA] Error: Unable to group modules. PE-level modules contain non-fifo' +
            ' or non-module-id arguments.\n')
        compose_final_file(output_f, prefix_content, module_defs, top_kernel)
        return

    # Pass 2: the PE grid size is the largest module id in each dimension,
    # plus one.
    # TODO: At present, this scripts only work for 2D arrays
    max_x = 0
    max_y = 0
    for call in module_calls:
        name = call['module_name']
        if 'PE' in name and 'dummy' not in name:
            max_x = max(call['module_ids'][0], max_x)
            max_y = max(call['module_ids'][1], max_y)
    grid_x = max_x + 1
    grid_y = max_y + 1

    # Pass 3: assign the PE-level module calls to their tile.
    step_x = group_config['x']
    step_y = group_config['y']
    groups = []
    for x_start in range(0, grid_x, step_x):
        tile_x = range(x_start, x_start + step_x)
        for y_start in range(0, grid_y, step_y):
            tile_y = range(y_start, y_start + step_y)
            members = []
            for call in module_calls:
                name = call['module_name']
                if 'PE' in name and 'dummy' not in name:
                    ids = call['module_ids']
                    if ids[0] in tile_x and ids[1] in tile_y:
                        members.append(call)
                if 'IO_L1' in name:
                    # The PE indices are the last two '_'-separated fields
                    # of the last fifo argument (second to last line).
                    last_fifo = re.search(
                        r'\*/ (.+)', call['content'][-2].strip())
                    if last_fifo:
                        fields = last_fifo.group(1).split('_')
                        pe_x = int(fields[-2])
                        pe_y = int(fields[-1])
                        if pe_x in tile_x and pe_y in tile_y:
                            members.append(call)
            groups.append({'x_start': x_start, 'y_start': y_start,
                           'group_modules': members})

    # Pass 4: create the wrappers. This is done after all the tiles are
    # known because it modifies the list of module calls.
    for group in groups:
        create_group_wrapper(group['x_start'], group['y_start'],
                             group['group_modules'], module_defs, top_kernel)

    # Compose the final file
    compose_final_file(output_f, prefix_content, module_defs, top_kernel)


def run(input_f, output_f, config, host='opencl'):
    """ Module group

    This function will group the PE-level modules (PE and IO_L1)
    according to the group configuration files.
    Specifically, given the grouping constraint {x, y}, we will group all PE-level
    modules into blocks with dimensions x and y.
    We will insert new wrapper functions to wrap the original modules in the
    group.
    FIFOs connecting these modules internally will be placed inside the wrapper.

    The input file is parsed line by line with a small state machine driven
    by the marker comments '/* Module Definition */', '/* FIFO Declaration */'
    and '/* Module Call */' and by the 'extern "C"' line. The parsed pieces
    are then handed to module_grouping(), which writes 'output_f'.

    Note: This script only supports:
          - 2D array
          - Xilinx OpenCL kernel

    Args:
      input_f: input kernel file
      output_f: output kernel file
      config: grouping configuration file
      host: Xilinx host target (not referenced in the body of this function)
    """
    # Load the group configuration file
    group_config = {}
    with open(config, 'r') as f:
        group_config = json.load(f)

    # Extract:
    # - file content before the first module definition
    # - module definitions
    # - top kernel
    #   - file content before the first fifo declaration
    #   - fifo declarations
    #   - module calls
    #   - file content after the last module call (skipped, not stored)
    lines = []
    with open(input_f, 'r') as f:
        lines = f.readlines()

    prefix_content = []
    module_defs = {}
    top_kernel = {'prefix_content': [], 'fifo_decls': {}, 'module_calls': []}
    # File-level phases; exactly one of these flags is set at a time.
    prefix_content_flag = 1
    module_defs_flag = 0
    top_kernel_flag = 0
    # True while we are between an opening and a closing
    # '/* Module Definition */' marker.
    module_def_add = False
    module_def = []

    # Phases inside the top kernel; exactly one of these flags is set at a
    # time.
    top_kernel_prefix_content_flag = 1
    top_kernel_fifo_decls_flag = 0
    top_kernel_module_calls_flag = 0
    # True once the opening '/* FIFO Declaration */' marker has been seen.
    top_kernel_fifo_decls_add = False
    # True while we are between an opening and a closing '/* Module Call */'
    # marker.
    top_kernel_module_calls_add = False
    module_call = []

    # Note: the phase checks below are independent 'if's, not 'elif's. The
    # line that triggers a phase switch is therefore also examined by the
    # next phase during the same iteration.
    for line in lines:
        if prefix_content_flag:
            # Everything before the first module definition marker.
            if line.find('Module Definition') != -1:
                prefix_content_flag = 0
                module_defs_flag = 1
            else:
                prefix_content.append(line)
        if module_defs_flag:
            if line.find('extern \"C\"') != -1:
                # The extern "C" line ends the module definitions; it falls
                # through to the top kernel phase below and becomes the first
                # line of the top kernel prefix.
                # TODO: only opencl is supported
                module_defs_flag = 0
                top_kernel_flag = 1
            else:
                if module_def_add:
                    module_def.append(line)
                    # The module name is taken from the 'void <name>(' line.
                    if (line.find('void')) != -1:
                        m = re.search(r'void (.+?)\(', line.strip())
                        if m:
                            module_name = m.group(1)
                if line.find('/* Module Definition */') != -1:
                    if module_def_add:
                        # Closing marker: it was appended just above, so drop
                        # it again before storing the definition.
                        module_def.pop(len(module_def) - 1)
                        module_defs[module_name] = module_def.copy()
                        module_def.clear()
                    module_def_add = not module_def_add
        if top_kernel_flag:
            if top_kernel_prefix_content_flag:
                # Everything up to the first fifo declaration marker.
                if line.find('/* FIFO Declaration */') != -1:
                    top_kernel_prefix_content_flag = 0
                    top_kernel_fifo_decls_flag = 1
                else:
                    top_kernel['prefix_content'].append(line)
            if top_kernel_fifo_decls_flag:
                if line.find('/* FIFO Declaration */') != -1:
                    if not top_kernel_fifo_decls_add:
                        # Opening marker.
                        top_kernel_fifo_decls_add = not top_kernel_fifo_decls_add
                    else:
                        # Closing marker: the module calls follow.
                        top_kernel_fifo_decls_flag = 0
                        top_kernel_module_calls_flag = 1
                else:
                    # A fifo declaration is the 'hls::stream' line, keyed by
                    # the fifo name, followed by its 'HLS STREAM' pragma
                    # lines, which are attached to the same fifo name.
                    if line.find('hls::stream') != -1:
                        m = re.search(r'> (.+?);', line)
                        if m:
                            fifo_name = m.group(1)
                            top_kernel['fifo_decls'][fifo_name] = [line]
                    if line.find('HLS STREAM') != -1:
                        m = re.search(r'variable=(.+?) ', line)
                        if m:
                            fifo_name = m.group(1)
                            top_kernel['fifo_decls'][fifo_name].append(line)
            if top_kernel_module_calls_flag:
                if line.find('/* Module Call */') != -1:
                    if top_kernel_module_calls_add:
                        # Closing marker: store the collected call. The
                        # module ids are filled in later by module_grouping().
                        module_call_object = {'module_name': module_name,
                                              'module_ids': [],
                                              'content': module_call.copy()}
                        top_kernel['module_calls'].append(module_call_object)
                        module_call.clear()
                    top_kernel_module_calls_add = not top_kernel_module_calls_add
                else:
                    if top_kernel_module_calls_add:
                        module_call.append(line)
                        # The module name is the text before the first '('.
                        m = re.search(r'(.+?)\(', line.strip())
                        if m:
                            module_name = m.group(1)

    # Group modules and print out to 'output_f'
    module_grouping(
        output_f,
        prefix_content,
        module_defs,
        top_kernel,
        group_config)


if __name__ == "__main__":
    # Command-line entry point of the module grouping utility.
    cli = argparse.ArgumentParser(
        description='==== AutoSA Utils: Module Grouping ====')
    cli.add_argument('-i', '--input', required=True, help='kernel file')
    cli.add_argument('-o', '--output', required=True,
                     help='modified kernel file')
    cli.add_argument('-c', '--config', required=True,
                     help='grouping configuration')
    cli.add_argument('--host', required=False, default='opencl',
                     help='Xilinx host target: hls|opencl')

    options = cli.parse_args()
    run(options.input, options.output, options.config, options.host)


================================================
FILE: autosa_scripts/odyssey/RL_utils.py
================================================
import torch.nn as nn
import numpy as np
import random
import bisect
import copy

import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

import utils

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
LR_ACTOR = 1e-3 # learning rate of the actor
GAMMA = 0.9  # discount factor
# Small constant added to the standard deviation when rewards are normalized,
# so that the division never hits zero.
EPSILON = 2**(-12)
# Maximum gradient norm passed to clip_grad_norm_ when the actor is trained.
CLIPPING_MODEL = 100

class RLEnv():
    """Environment that scores GEMM tiling-factor choices for the RL agent.

    Every call to `step` turns a vector of discrete action levels into tiling
    factors, evaluates them through `search_task` and reports the reward
    together with bookkeeping information.
    """

    # Resources that are compared against the hardware budget.
    _CHECKED_RESOURCES = ('BRAM18K', 'URAM', 'DSP')

    def __init__(self, search_task, cst, param_idx_map, idx_param_map, search_obj, dim_size, n_action_steps, action_size):
        """
        search_task: search task object
        cst: hardware constraint object; its `hw_cst` mapping is the resource budget
        param_idx_map: maps a tunable parameter name to its index in the action vector
        idx_param_map: stored only; presumably the inverse of `param_idx_map`
        search_obj: search objective forwarded to `search_task.evaluate`
        dim_size: dimension of the problem space, 3 for GEMM
        n_action_steps: dimension of the action vector, 6 for GEMM
        action_size: number of discrete levels of every action step
        """
        self.search_task = search_task
        self.cst = cst
        self.param_idx_map = param_idx_map
        self.idx_param_map = idx_param_map
        self.search_obj = search_obj
        self.dim_size = dim_size
        self.n_action_steps = n_action_steps
        self.action_size = action_size
        self.action_bound, self.action_bottom = self.build_action_space()

        self.state = np.array([0.5]*n_action_steps) # (action vector)
        # Solutions visited since the last reset. Created here as well so that
        # `step` does not fail when it is called before the first `reset`.
        self.sol = []
        # Sum of adjusted rewards
        self.adjusted_epoch_rewards = 0
        # Sum of raw rewards
        self.epoch_rewards = 0
        self.prev_epoch_rewards = 0
        self.sig = 1
        # The minimal reward during the whole training process
        self.min_reward = float("inf")
        self.epoch = 0
        self.best_epoch_rewards = float("-inf")
        # Keep track of best rewards during the training process
        self.rewards_log = []
        self.best_rewards_log = []

    def reset(self):
        """ Reset the per-epoch state of the environment.

        Returns
        -------
        state:
            The zeroed state vector.
        infos:
            An empty dict.
        """
        # One entry per action step: (i_t1, j_t1, k_t1, i_t2, j_t2, k_t2) for GEMM.
        # `np.float` was removed from NumPy; the builtin `float` is the same dtype.
        self.state = np.zeros(self.n_action_steps, dtype=float)
        self.adjusted_epoch_rewards = 0
        self.epoch_rewards = 0
        self.sig = 1
        self.sol = []
        infos = {}

        return self.state, infos

    def get_state(self):
        """ Return the current state vector.
        """
        return self.state

    def set_constraint(self, cst):
        """ Set up hw constraint.
        """
        self.cst = cst

    def build_action_space(self):
        """ Build the upper and lower bound of every tiling factor.

        The first five bounds are the workload sizes (i, j, k, i, j); the last
        one is k capped by 64 and by 256 // search_task.dw.
        """
        workload = self.search_task.workload["params"]
        action_bound = [workload["i"],
                        workload["j"],
                        workload["k"],
                        workload["i"],
                        workload["j"],
                        min(256 // self.search_task.dw, 64, workload["k"])]
        action_bottom = [1 for _ in range(self.n_action_steps)]
        return action_bound, action_bottom

    def overuse_constraint(self, used_cst):
        """ Check the used resources against the hardware budget.

        Returns
        -------
        overuse:
            True if any checked resource exceeds its budget, or if `used_cst`
            is empty/None.
        score:
            Sum over the exceeded resources of half the relative excess.
        """
        score = 0
        if not used_cst:
            # If constraint doesn't exist, return True to exclude this design
            return True, score

        overuse = False
        budget = self.cst.hw_cst
        for res in self._CHECKED_RESOURCES:
            if used_cst[res] > budget[res]:
                score += 0.5 * (used_cst[res] - budget[res]) / budget[res]
                overuse = True

        return overuse, score

    def update_total_reward_constraint(self, constraint, reward):
        """ Accumulate the resource and rewards in one epoch.
        Currently we only consider rewards.
        """
        self.epoch_rewards += reward

    def get_reward(self, task_params):
        """ Call the cost models to get the reward for current solution.

        Returns
        -------
        reward:
            The adjusted reward for the current solution.
        constraint:
            The used constraint of the current solution.
        reward_raw:
            The unadjusted reward for the current solution.
        reward_meta:
            The third value returned by `search_task.evaluate`.

        When the evaluation yields no reward (None or 0), the tuple
        (-1, None, -1, None) is returned instead.
        """
        reward, used_constraint, reward_meta = self.search_task.evaluate(task_params, self.search_obj)
        if reward is None or reward == 0:
            return -1, None, -1, None
        reward_raw = reward
        self.min_reward = min(self.min_reward, reward_raw)
        # Adjust the reward by subtracting the minimal reward found so far
        # to stabilize the training.
        reward -= self.min_reward
        self.adjusted_epoch_rewards += reward

        return reward, used_constraint, reward_raw, reward_meta

    def norm_state(self, T):
        """ Normalize the state to the range of [-1, 1] to stabilize the training.
        The input state is in the range of [0, 1].

        `T` is modified in place and returned.
        """
        # NOTE(review): the last element is left untouched and therefore stays
        # in [0, 1]; confirm whether this is intended.
        T[:-1] = (T[:-1] - 0.5) * 2
        return T

    def update_mode_and_step(self):
        """ Placeholder; does nothing.
        """
        pass

    def update_reward_epoch(self):
        """ Record the current epoch reward if it is the best one seen so far.
        """
        if self.epoch_rewards > self.best_epoch_rewards:
            self.best_epoch_rewards = self.epoch_rewards

    def update_best_reward_list(self, succeed):
        """ Close the current epoch and append its rewards to the logs.
        """
        self.epoch += 1
        # If the current epoch fails, we roll back to the reward in the last successful epoch.
        if not succeed:
            self.epoch_rewards = self.prev_epoch_rewards
        self.prev_epoch_rewards = self.epoch_rewards
        self.rewards_log.append(self.epoch_rewards)
        self.best_rewards_log.append(self.best_epoch_rewards)

    def update_reward_impt(self, done):
        """ Return the reward importance weights; currently always None.
        """
        impt = None
        return impt

    def _compose_task_params(self, values):
        """ Build the full parameter dict from a vector of tunable values.

        Tunable parameters are read from `values` through `param_idx_map`,
        external parameters come from the workload, and the result is passed
        through `search_task.adjust_params` and `design.infer_params`.
        """
        params_config = self.search_task.design.params_config
        task_params = {}
        for _, param in params_config["tunable"].items():
            task_params[param["name"]] = values[self.param_idx_map[param["name"]]]
        for _, param in params_config["external"].items():
            task_params[param["name"]] = self.search_task.workload["params"][param["name"]]
        task_params = self.search_task.adjust_params(task_params)
        return self.search_task.design.infer_params(task_params)

    def convert_action_to_vals(self, action):
        """ Convert the actions to the real tiling factors.

        Parameters
        ----------
        action:
            Action levels, one per action step. The argument is not modified.

        Returns
        -------
        action:
            The tunable parameter values after adjustment and inference, in
            the order of the tunable parameter configuration.
        action_norm:
            The input levels divided by `action_size` and clipped to [0, 1].
        """
        action_norm = np.array([float(a) / self.action_size for a in action]).clip(0, 1)
        # Scale every level by its own bound:
        # (i_t1, j_t1, k_t1) use bounds 0..2 and (i_t2, j_t2, k_t2) use bounds 3..5.
        scaled = list(action)
        for i, bound in enumerate(self.action_bound):
            scaled[i] = int(scaled[i] / self.action_size * bound)

        task_params = self._compose_task_params(scaled)

        action = []
        for _, param in self.search_task.design.params_config["tunable"].items():
            action.append(task_params[param["name"]])

        return action, action_norm

    def step(self, action):
        """ Evaluate one action.

        Parameters
        ----------
        action:
            Tensor of action levels; it is moved to the CPU and flattened.

        Returns
        -------
        The tuple (state, reward, done, infos, sig, impt).
        """
        infos = {}
        infos['succeed'] = 0
        done = 0
        action = action.cpu().numpy().flatten()
        # Scale the action back to the real tiling factors
        # Actions are in the levels of 1 to self.action_size.
        # We will need to scale them back to the corresponding tiling factors.
        action_val = [int(a) + 1 for a in action]
        action_val, action_norm = self.convert_action_to_vals(action_val)
        # Compose the solution
        task_params = self._compose_task_params(action_val)
        infos['sol'] = task_params
        reward, used_constraint, reward_raw, reward_meta = self.get_reward(task_params)
        infos['cst'] = used_constraint
        infos['reward_meta'] = reward_meta
        self.update_total_reward_constraint(used_constraint, reward_raw)
        self.sol.append((copy.deepcopy(action_val)))

        # Penalize the solution that overuses the resource
        overuse, overuse_score = self.overuse_constraint(used_constraint)
        if overuse:
            reward = (-self.adjusted_epoch_rewards + reward) * overuse_score
            reward_raw = 0
            done = 1
        if reward == -1:
            done = 1
        infos['reward_raw'] = reward_raw

        # Normalize the state to [-1,1]
        self.state = self.norm_state(action_norm)
        if not done:
            infos["succeed"] = 1
            done = 1
            self.update_reward_epoch()
        infos["epoch_rewards"] = self.epoch_rewards
        if done:
            self.update_best_reward_list(infos["succeed"])
        impt = self.update_reward_impt(done)
        return self.state, reward, done, infos, self.sig, impt

class RLAgent():
    """Agent that samples action levels from an `Actor` network and trains it
    from the collected rewards.
    """

    def __init__(self, dim_size, n_action_steps, action_size, seed, batch, decay=0.95):
        """
        Parameters
        ----------
        dim_size:
            problem space dimensions, 3 for GEMM
        n_action_steps:
            dimension of one action, 6 for GEMM
        action_size:
            levels of each action step in one action
        seed:
            random seed
        batch:
            number of `step` calls after which the policy is trained
        decay:
            stored on the agent; not read inside this class
        """
        # Attributes
        self.dim_size = dim_size
        self.action_size = action_size
        self.n_action_steps = n_action_steps
        # [n_action_steps]
        self.state_div = np.cumsum([n_action_steps])
        # random.seed() returns None, so seed the generator first and keep the
        # seed value itself.
        random.seed(seed)
        self.seed = seed
        self.batch = batch

        # Actor
        self.actor = Actor(dim_size, n_action_steps, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=LR_ACTOR)
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.actor_optimizer, factor=0.9, min_lr=1e-6)
        self.decay = decay
        self.epoch = 0

        # Book-keeping
        self.saved_log_probs = []
        self.rewards = []
        self.baseline = None
        self.lowest_reward = 0
        self.best_epoch_reward = float("-Inf")
        self.has_succeeed_history = False
        self.bad_counts = 0

    def reset(self):
        """ Reset the internal status (collected log-probabilities and rewards).
        """
        self.saved_log_probs = []
        self.rewards = []

    def adjust_lr(self, ratio, min_lr=1e-8):
        """ Multiply the learning rate of every parameter group by `ratio`,
        never going below `min_lr`.
        """
        for param_group in self.actor_optimizer.param_groups:
            param_group['lr'] = max(min_lr, param_group['lr'] * ratio)

    def act(self, state, infos, eps=0.0, temperature=1):
        """ Perform one action given the current state.

        Parameters
        ----------
        state:
            numpy state vector; its first `n_action_steps` entries are used.
        infos:
            unused.
        eps:
            probability of shifting every sampled level by +1 or -1.
        temperature:
            softmax temperature forwarded to the actor.

        Returns
        -------
        action:
            Tensor with one sampled level per action step.
        log_prob:
            Log-probability of the returned levels under the policy.
        """
        actions = state[0:self.state_div[0]]

        # Convert them to pytorch data structs
        actions = torch.from_numpy(actions).type(torch.FloatTensor).to(device)

        # Run the policy network
        p = self.actor(actions, temperature=temperature)
        m = Categorical(p)
        action = m.sample()

        if random.random() < eps:
            # Shift all levels by one in a random direction. The tensor stays on
            # its device and keeps the shape of the sampled action, so the
            # log-probability has the same shape in both branches.
            shift = 1 if random.random() < 0.5 else -1
            action2 = torch.clamp(action + shift, 0, p.size(1) - 1)
            return action2.data, m.log_prob(action2)
        return action.data, m.log_prob(action)

    def step(self, state, actions, log_prob, reward, next_state, done, sig, impt, infos):
        """ Record one transition and train the policy network once `batch`
        transitions have been collected.
        """
        self.rewards.append(reward)
        self.saved_log_probs.append(log_prob)
        self.epoch += 1
        # NOTE(review): `self.epoch` is never reset inside this class, so this
        # equality holds only once unless the caller resets it - confirm.
        if self.epoch == self.batch:
            self.learn(GAMMA, impt, infos)

    def impt_adj_reward(self, reward, impt):
        """ Scale the leading rewards by `impt` (in place) when it is given.
        """
        if impt is not None:
            reward[:len(impt)] = reward[:len(impt)] * impt
        return reward

    def learn(self, gamma, impt, infos):
        """ Train the policy network on the collected rewards and
        log-probabilities, then clear them.
        """
        rewards = np.array(self.rewards)
        # Normalize the rewards
        rewards = (rewards - rewards.mean()) / (rewards.std() + EPSILON)
        # Adjust the rewards
        rewards = self.impt_adj_reward(rewards, impt)
        # Discounted return of every step, accumulated from the last step backwards
        dis_rewards = []
        R = 0
        for r in rewards[::-1]:
            R = r + gamma * R
            dis_rewards.insert(0, R)
        dis_rewards = np.array(dis_rewards)
        dis_rewards = (dis_rewards - dis_rewards.mean()) / (dis_rewards.std() + EPSILON)

        # Loss: sum of -log_prob weighted by the normalized discounted return
        policy_loss = []
        for log_prob, r in zip(self.saved_log_probs, dis_rewards):
            policy_loss.append(-log_prob * r)
        policy_loss = torch.cat(policy_loss).sum()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), CLIPPING_MODEL)
        self.actor_optimizer.step()
        self.reset()

class Actor(nn.Module):
    def __init__(self, dim_size, n_action_steps, action_size, seed, h_size=128, hidden_dim=10):
        """
        Fully-connected policy network with a shared trunk and six output heads.

        Parameters
        ----------
        dim_size:
            The problem space dimension (not used by the network itself)
        n_action_steps:
            Number of the action steps, 6 for GEMM (i_t1, j_t1, k_t1, i_t2, j_t2, k_t2)
        action_size:
            Level of action steps, max(i, j, k) for GEMM
        seed:
            Seed given to torch.manual_seed before the layers are created
        h_size:
            FC layer dimension
        hidden_dim:
            dimensions of the encoder layer
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)

        # Encoder: n_action_steps -> hidden_dim
        self.encoder_action = nn.Linear(n_action_steps, hidden_dim)

        # Shared trunk: hidden_dim -> h_size -> h_size -> h_size
        self.fc11 = nn.Linear(hidden_dim, h_size)
        self.fc12 = nn.Linear(h_size, h_size)
        self.fc13 = nn.Linear(h_size, h_size)

        # Six heads. All fc2x layers are created before the outputx layers so
        # that the layers are built and registered in a fixed order.
        for head in range(1, 7):
            setattr(self, f"fc2{head}", nn.Linear(h_size, action_size))
        for head in range(1, 7):
            setattr(self, f"output{head}", nn.Linear(action_size, action_size))

        self.decoder = [self.fc21, self.fc22, self.fc23, self.fc24, self.fc25, self.fc26]
        self.n_action_steps = n_action_steps

    def forward(self, action_steps, temperature=1):
        """
        Network forward inference.

        Parameters
        ----------
        action_steps:
            Input tensor fed to the encoder.
        temperature:
            Divisor applied to the logits of every head before the softmax.

        Returns
        -------
        The per-head probabilities in the order (i1, j1, k1, i2, j2, k2),
        concatenated along dimension 0.
        """
        hidden = self.encoder_action(action_steps).unsqueeze(0)
        for layer in (self.fc11, self.fc12, self.fc13):
            hidden = F.relu(layer(hidden))

        output_layers = (self.output1, self.output2, self.output3,
                         self.output4, self.output5, self.output6)
        head_probs = []
        for decode, project in zip(self.decoder, output_layers):
            logits = project(F.relu(decode(hidden)))
            head_probs.append(F.softmax(logits/temperature, dim=1))

        return torch.cat(head_probs, dim=0)

================================================
FILE: autosa_scripts/odyssey/analyze.py
================================================
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import csv
import pandas as pd
import os
import re
import scipy

# Directory that holds the search log to analyze.
folder = "resnet50_array24"


def _last_field(text):
    """Return the text after the last ':' without surrounding blanks and commas."""
    return text.split(":")[-1].strip().strip(',')


design_info = {}
with open(f"{folder}/history.log") as f:
    lines = f.readlines()

    # Pass 1: for every record collect the line indices of its begin marker,
    # its "arch sol" lines and its end marker.
    design_idx = 0
    design_lines = []
    start_end = []
    for line_idx, line in enumerate(lines):
        if f"<record{design_idx}><begin>" in line:
            start_end.append(line_idx)
        if "arch sol" in line:
            start_end.append(line_idx)
        if f"<record{design_idx}><end>" in line:
            start_end.append(line_idx)
            design_lines.append(start_end)
            start_end = []
            design_idx += 1

    # Pass 2: between the first two collected indices of every record, gather
    # one info dict per "design" line.
    layer_infos = []
    layer_info = {}
    for record in design_lines:
        for line in lines[record[0]:record[1]]:
            if "latency" in line and 'latency' not in layer_info:
                layer_info["latency"] = float(_last_field(line))
            if "DSP efficiency" in line:
                layer_info["DSP_eff"] = float(_last_field(line))
            if "CTC(FLOP/byte)" in line:
                layer_info["CTC"] = float(_last_field(line))
            if "design" in line:
                layer_info["design"] = _last_field(line)
                dataflow_idx = layer_info["design"][6:]
                layer_infos.append(layer_info)
                layer_info = {}
    design_info["array_infos"] = layer_infos

    # Extract the last array
    layer_infos = []
    layer_info = {}
    last_record = design_lines[-1]
    for line in lines[last_record[1]:last_record[2]]:
        # A 'sol' line closes the info dict collected so far.
        if "'sol':" in line:
            layer_infos.append(layer_info)
            layer_info = {}
        if "'latency':" in line and 'latency' not in layer_info:
            layer_info["latency"] = float(_last_field(line))
        if "CTC" in line:
            layer_info["CTC"] = float(_last_field(line))
        if "DSP_eff" in line:
            layer_info["DSP_eff"] = float(_last_field(line))
    design_info["last_array_info"] = layer_infos

# Plot
# Collect one row per layer: all entries of "array_infos" except the last one,
# followed by every entry of "last_array_info". Layers are numbered from 1.
dict_data = {"Latency": [], "DSP Eff": [], "CTC": [], "Layer": []}
layer_idx = 0
plotted_layers = design_info["array_infos"][:-1] + design_info["last_array_info"]
for layer_info in plotted_layers:
    dict_data["Latency"].append(layer_info["latency"])
    dict_data["DSP Eff"].append(layer_info["DSP_eff"])
    dict_data["CTC"].append(layer_info["CTC"])
    layer_idx += 1
    dict_data["Layer"].append(layer_idx)
print("max CTC: ", max(dict_data["CTC"]))
print("max latency: ", max(dict_data["Latency"]))

df = pd.DataFrame.from_dict(dict_data)
sns.set_theme()
sns.set(rc={'figure.figsize':(20,5)})

'''
g = sns.lineplot(
    data=df,
    x="Layer", y="Latency", markers=True
)
g.set(xticks=df.Layer.values)
plt.xlabel("Layer")
plt.ylabel("Latency")
g.figure.savefig("network_latency_cmp")

g = sns.lineplot(
    data=df,
    x="Layer", y="DSP Eff", markers=True
)
g.set(xticks=df.Layer.values)
plt.xlabel("Layer")
plt.ylabel("DSP Eff")
plt.ylim(0, 1.1)
g.figure.savefig("network_dsp_eff_cmp")
'''

# CTC per layer, saved as "network_ctc_cmp".
g = sns.lineplot(data=df, x="Layer", y="CTC", markers=True)
g.set(xticks=df.Layer.values)
plt.xlabel("Layer")
plt.ylabel("CTC")
plt.ylim(0, 250)
g.figure.savefig("network_ctc_cmp")


================================================
FILE: autosa_scripts/odyssey/clean_up.sh
================================================
#!/bin/bash

# Remove generated artifacts, relative to the current working directory.
# Every command uses -f, so missing paths are ignored silently.

# Empty the db/ directory (the directory itself is kept).
rm -rf db/*
# Delete opentuner.db (presumably the OpenTuner results database - confirm).
rm -rf opentuner.db
# Empty the outdir/ directory.
rm -rf outdir/*
# Delete the Python bytecode cache.
rm -rf __pycache__
# Empty the tmp/ directory.
rm -rf tmp/*


================================================
FILE: autosa_scripts/odyssey/cst/hw_cst.json
================================================
{
  "BRAM18K": {
    "total": 5376,
    "ratio": 0.7
  },
  "DSP": {
    "total": 12288,
    "ratio": 0.7
  },
  "FF": {
    "total": 3456000,
    "ratio": 0.7
  },
  "LUT": {
    "total": 1728000,
    "ratio": 0.7
  },
  "URAM": {
    "total": 1280,
    "ratio": 0.7
  }
}


================================================
FILE: autosa_scripts/odyssey/cst/single_test.json
================================================
{
  "BRAM18K": {
    "total": 300,
    "ratio": 1.0
  },
  "DSP": {
    "total": 800,
    "ratio": 1.0
  },
  "FF": {
    "total": 3456000,
    "ratio": 0.7
  },
  "LUT": {
    "total": 1728000,
    "ratio": 0.7
  },
  "URAM": {
    "total": 1280,
    "ratio": 0.7
  }
}


================================================
FILE: autosa_scripts/odyssey/cst/u250.json
================================================
{
  "BRAM18K": {
    "total": 5376,
    "ratio": 0.7
  },
  "DSP": {
    "total": 12288,
    "ratio": 0.7
  },
  "FF": {
    "total": 3456000,
    "ratio": 0.7
  },
  "LUT": {
    "total": 1728000,
    "ratio": 0.7
  },
  "URAM": {
    "total": 1280,
    "ratio": 0.7
  }
}


================================================
FILE: autosa_scripts/odyssey/cst/vu9p.json
================================================
{
  "BRAM18K": {
    "total": 4318,
    "ratio": 0.7
  },
  "DSP": {
    "total": 6840,
    "ratio": 0.7
  },
  "URAM": {
    "total": 960,
    "ratio": 0.7
  }
}


================================================
FILE: autosa_scripts/odyssey/design.py
================================================
import numpy as np
import json
import sys
import os
from numpy import ceil, floor

class Design(object):
    def __init__(self, name):
        self.name = name # design name        
        self.est_resource_func = None
        self.est_latency_func = None
        self.est_activity_func = None
        self.infer_params_func = None
        self.random_sampling_func = None
        self.bound_check_func = None
        self.params_config = None      
        self.desp = None  

    def print_resource_est_func(self, f, desp):
        f.write("def est_resource(params):\n")
        # Load parameters
        f.write("\t")
        is_first = True
        for p in desp["params"]:
            if not is_first:
                f.write(", ")
            f.write(p["name"])
            is_first = False
        f.write(" = ")
        is_first = True
        for p in desp["params"]:
            if not is_first:
                f.write(", ")
            f.write(f'params[\"{p["name"]}\"]')
            is_first = False
        f.write("\n\n")

        f.write("\t# DSP\n")
        f.write(f"\tDSP = {desp['compute']['PE']['num']} * ")
        f.write(f"{desp['compute']['PE']['unroll_factor']} * ")
        if desp["compute"]["PE"]["ele_type"] == "float":
            f.write(f"5\n")
        else:
            raise RuntimeError(f"Unsupported data type {desp['compute']['PE']['ele_type']} in resource estimation")        
        f.write("\n")

        # Print function est_BRAM18K
        f.write("\t# BRAM18K\n")
        f.write("\tdef est_BRAM18K(ele_size, ele_num, pack):\n")
        #f.write(f"\t\treturn ceil(ele_size*8*pack / 18) * ceil(ele_num/pack/1024)\n\n")
        f.write(f"\t\treturn ceil(ele_size*8*pack / 36) * ceil(ele_num/pack/512)\n\n")

        f.write(f"\tres_meta = {{}}\n")
        # Check if drain module can be merged.
        # Note: It should be supported in the codegen of AutoSA. However, currently, 
        # we move it here in the tuner.
        mem_meta_info = {}
        out_module = {}
        out_drain_module = {}
        for module in desp["memory"]:
            module_mem = desp["memory"][module]
            if module.endswith('_out'):
                item = {'buf_size': module_mem['buf_size'], 
                        'num': module_mem['num']}
                if module.find('drain') != -1:
                    item['merged'] = 0
                    out_drain_module[module_mem['array']] = item
                else:                    
                    if module_mem['array'] not in out_module:
                        out_module[module_mem['array']] = [item]
                    else:
                        out_module[module_mem['array']].append(item)
        for array in out_drain_module:
            if array in out_module:
                for m in out_module[array]:                
                    if m['buf_size'] == out_drain_module[array]['buf_size'] and \
                       m['num'] == out_drain_module[array]['num']:
                       out_drain_module[array]['merged'] = 1

        for module in desp["memory"]:
            module_mem = desp["memory"][module]
            if module.find('drain') != -1 and out_drain_module[module_mem['array']]['merged'] == 1:
                continue
            f.write(f"\t{module}_unit_memory = est_BRAM18K({module_mem['ele_size']}, ")
            f.write(f"{module_mem['buf_size']}, ")
            if "data_pack_factor_inter" in module_mem:
                f.write(f"{module_mem['data_pack_factor_inter']})\n")
            else:
                f.write(f"1)\n")
            f.write(f"\tres_meta[\"{module}\"] = {{\"ele_size\": {module_mem['ele_size']}, \"buf_size\": {module_mem['buf_size']}, \"data_pack_factor\": 1, \"num\": {module_mem['num']}}}\n")
            if module_mem['double_buffer']:
                f.write(f"\tres_meta[\"{module}\"][\"num\"] *= 2\n")
            if "data_pack_factor" in module_mem:
                f.write(f"\tres_meta[\"{module}\"][\"data_pack_factor\"] = {module_mem['data_pack_factor_inter']}\n")
        #f.write("\tprint(A_IO_L1_in_unit_memory)\n")
        #f.write("\tprint(A_IO_L2_in_unit_memory)\n")
        #f.write("\tprint(B_IO_L2_in_unit_memory)\n")        
        #f.write("\tprint(PE_unit_memory)\n")
        #f.write("\tprint(C_1_IO_L2_out_unit_memory)\n")        
        #f.write("\tprint(C_drain_IO_L1_out_unit_memory)\n")

        f.write("\tBRAM18K = ")
        is_first = True
        for module in desp["memory"]:
            module_mem = desp["memory"][module]
            if module.find('drain') != -1 and out_drain_module[module_mem['array']]['merged'] == 1:
                continue
            if not is_first:
                f.write(" + ")            
            f.write(f"{module}_unit_memory")
            if module_mem["double_buffer"]:
                f.write(f" * 2")
            else:
                f.write(f" * 1")
            f.write(f" * {module_mem['num']}")            
            is_first = False            
        f.write("\n\n")

        f.write("\t# URAM\n")
        f.write("\tURAM = 0\n\n")

        f.write("\tres = {\"DSP\": DSP, \"BRAM18K\": BRAM18K, \"URAM\": URAM}\n")
        for module in desp["memory"]:
            module_mem = desp["memory"][module]
            if module.find('drain') != -1 and out_drain_module[module_mem['array']]['merged'] == 1:
                continue
            f.write(f"\tres['{module}_unit_memory'] = {module}_unit_memory\n")

        f.write("\n\treturn res, res_meta\n")
        f.write("\n")

    def print_latency_est_func(self, f, desp):
        f.write("def est_latency(params):\n")
        # Load parameters
        f.write("\t")
        is_first = True
        for p in desp["params"]:
            if not is_first:
                f.write(", ")
            f.write(p["name"])
            is_first = False
        f.write(" = ")
        is_first = True
        for p in desp["params"]:
            if not is_first:
                f.write(", ")
            f.write(f'params[\"{p["name"]}\"]')
            is_first = False
        f.write("\n\n")

        def extract_latency_expr(lat, info):
            ret = ""
            if lat["type"] == "block":
                info["has_for_child"] = 0
                no_for_child = True
                is_first = True
                ret += "("
                for child in lat["child"]:
                    if not is_first:
                        ret += " + "                    
                    ret += extract_latency_expr(child, info)                    
                    if info["has_for_child"] == 1:
                        no_for_child = False
                    is_first = False
                ret += ")"
                if no_for_child:
                    ret = "1"
            elif lat["type"] == "for":                
                child = lat["child"]
                expr = extract_latency_expr(child, info)                
                if info["valid"]:
                    ret = lat["bounds"][1] + " * " + expr
                else:
                    ret = expr
                info["has_for_child"] = 1
            elif lat["type"] == "mark":      
                if info["under_mark"] and lat["content"] == info["under_mark"]:
                    info["valid"] = True
                if lat["content"] == "simd":
                    if info["valid"]:
                        ret = "1"
                    else:
                        ret = "0"
                else:
                    child = lat["child"]
                    ret = extract_latency_expr(child, info)
                if info["under_mark"] and lat["content"] == info["under_mark"]:
                    info["valid"] = False
            elif lat["type"] == "user":
                user_expr = lat["child"]["user_expr"]
                if 'inter_intra' in user_expr or 'intra_inter' in user_expr:                    
                    if user_expr[:-2].split(".")[-1] == "1":
                        double_buffer = 1
                    else:
                        double_buffer = 0                    
                    # Plug in submodule latency
                    if f"{info['name']}_inter" in info["modules"]:
                        inter_expr = info["modules"][f"{info['name']}_inter"]
                    else:
                        inter_expr = None
                    if f"{info['name']}_intra" in info["modules"]:
                        intra_expr = info["modules"][f"{info['name']}_intra"]
                    else:
                        intra_expr = None

                    if inter_expr and intra_expr:
                        if info["in"] == 1 or info["in"] == 0:
                            ret = inter_expr
                        else:
                            if double_buffer:
                                ret = f"max({inter_expr}, {intra_expr})"
                            else:
                                ret = f"({inter_expr} + {intra_expr})"
                        info["has_for_child"] = 1
                    else:                        
                        ret = "1"                        
                    if not info["valid"]:
                        ret = "0"
                elif "inter_trans" in user_expr:
                    # Plug in submodule latency
                    if f"{info['name']}_inter" in info["modules"]:
                        ret = info["modules"][f"{info['name']}_inter"]
                    else:
                        ret = "1"
                    if not info["valid"]:
                        ret = "0"
                elif "intra_trans" in user_expr:
                    # Plug in submodule latency                    
                    if f"{info['name']}_intra" in info["modules"]:
                        ret = info["modules"][f"{info['name']}_intra"]
                    else:
                        ret = "1"
                    if not info["valid"]:
                        ret = "0"
                else:
                    ret = "1"
            elif lat["type"] == "if":
                # Only examine the first child
                child = lat["child"][0]
                ret = extract_latency_expr(child, info)
            elif lat["type"] == "array_tile":      
                if info["module_attr"]["to_dram"] == 1:
                    if info["module_attr"]["serialize"] == 0:
                        # Consider the DRAM latency here.
                        ret = "(" + f"{lat['size']}/{lat['last_dim']}*(20+{lat['last_dim']}/(512/8/{lat['ele_size']}))" + ")"
                    else:
                        ret = "(" + lat["size"] + "/" + f"min({lat['data_pack_factor']}, 512/8/{lat['ele_size']})" + ")"
                else:
                    ret = "(" + lat["size"] + "/" + lat["data_pack_factor"] + ")"                    
            else:
                raise RuntimeError(f"Unsupported latency node type {lat['type']}")

            return ret

        # Check if drain module can be omitted
        # Note: It should be supported in the codegen of AutoSA. However, currently,
        # we move it here in the tuner.        
        out_module = {}
        out_drain_module = {}
        for module in desp["memory"]:
            module_mem = desp["memory"][module]
            if module.endswith('_out'):
                item = {'buf_size': module_mem['buf_size'], 
                        'num': module_mem['num']}
                if module.find('drain') != -1:
                    item['merged'] = 0
                    out_drain_module[module_mem['array']] = item
                else:                    
                    if module_mem['array'] not in out_module:
                        out_module[module_mem['array']] = [item]
                    else:
                        out_module[module_mem['array']].append(item)
        for array in out_drain_module:
            if array in out_module:
                for m in out_module[array]:                
                    if m['buf_size'] == out_drain_module[array]['buf_size'] and \
                       m['num'] == out_drain_module[array]['num']:
                       out_drain_module[array]['merged'] = 1

        # Latency prologue
        latency_prologue_items = []
        info = {"has_for_child": 0, "name": None, "modules": {}}
        for i in range(2):
            for module in desp["latency"]:
                if desp["attr"][module]["in"] != 1:
                    continue
                if "inter" in module or "intra" in module:                    
                    # Keep all the latency AST under the mark.
                    info["valid"] = True
                    info["under_mark"] = None
                    info["in"] = 1
                else:
                    # Only keep the latency AST under the mark.
                    info["valid"] = False
                    info["under_mark"] = "array"
                    info["in"] = 1
                module_lat = desp["latency"][module]  
                info["name"] = module     
                info["module_attr"] = desp["attr"][module]
                info["modules"][module] = extract_latency_expr(module_lat, info)
        for module in info["modules"]:
            if "inter" in module or "intra" in module:
                continue
            if module.find('drain') != -1 and out_drain_module[module_mem['array']]['merged'] == 1:
                continue
            f.write(f"\t{module}_single_latency = ")                        
            f.write(info["modules"][module])
            f.write(f"\n")      
            latency_prologue_items.append(f"{module}_single_latency")
        f.write("\tlatency_prologue = max(")
        is_first = True
        for module in info["modules"]:
            if "inter" in module or "intra" in module:
                continue 
            if module.find('drain') != -1 and out_drain_module[module_mem['array']]['merged'] == 1:
                continue           
            if not is_first:
                f.write(", ")
            f.write(f"{module}_single_latency")
            is_first = False
        f.write(")\n\n")

        # Latency epilogue
        latency_epilogue_items = []
        info = {"has_for_child": 0, "name": None, "modules": {}}
        for i in range(2):
            for module in desp["latency"]:
                if desp["attr"][module]["in"] != 0:
                    continue
                if "inter" in module or "intra" in module:
                    info["valid"] = True
                    info["under_mark"] = None
                    info["in"] = 0
                else:
                    info["valid"] = False
                    info["under_mark"] = "array"
                    info["in"] = 0
                module_lat = desp["latency"][module]  
                info["name"] = module                
                info["module_attr"] = desp["attr"][module]
                info["modules"][module] = extract_latency_expr(module_lat, info)
        for module in info["modules"]:            
            if "inter" in module or "intra" in module:
                continue
            if module.find('drain') != -1:
                array_name = module[:module.find("_drain_IO")]                
                if out_drain_module[array_name]['merged'] == 1:
                    continue
            f.write(f"\t{module}_single_latency = ")                        
            f.write(info["modules"][module])
            latency_epilogue_items.append(f"{module}_single_latency")
            f.write(f"\n")        
        cnt = 0
        for module in info["modules"]:
            if "inter" in module or "intra" in module:
                continue    
            if module.find('drain') != -1:
                array_name = module[:module.find("_drain_IO")]                
                if out_drain_module[array_name]['merged'] == 1:
                    continue                 
            cnt += 1
        if cnt == 1:
            f.write("\tlatency_epilogue = ")
        else:
            f.write("\tlatency_epilogue = max(")
        is_first = True
        for module in info["modules"]:
            if "inter" in module or "intra" in module:
                continue    
            if module.find('drain') != -1:
                array_name = module[:module.find("_drain_IO")]                
                if out_drain_module[array_name]['merged'] == 1:
                    continue                    
            if not is_first:
                f.write(", ")
            f.write(f"{module}_single_latency")
            is_first = False
        if cnt == 1:            
            f.write("\n\n")
        else:
            f.write(")\n\n")

        # Latency main
        latency_main_items = []
        info = {"has_for_child": 0, "name": None, "modules": {}}
        for i in range(2):
            # Run second time to fill in the incomplete expression            
            for module in desp["latency"]:
                module_lat = desp["latency"][module]  
                info["name"] = module
                info["valid"] = True
                info["under_mark"] = None
                info["in"] = -1
                info["module_attr"] = desp["attr"][module]
                info["modules"][module] = extract_latency_expr(module_lat, info)            
        for module in info["modules"]:
            if "inter" in module or "intra" in module:
                continue
            if module.find('drain') != -1:
                array_name = module[:module.find("_drain_IO")]                
                if out_drain_module[array_name]['merged'] == 1:
                    continue                  
            f.write(f"\t{module}_latency = ")                        
            f.write(info["modules"][module])
            f.write(f"\n")        
            latency_main_items.append(f"{module}_latency")
        f.write("\tlatency_main = max(")
        is_first = True
        for module in info["modules"]:
            if "inter" in module or "intra" in module:
                continue   
            if module.find('drain') != -1:
                array_name = module[:module.find("_drain_IO")]                
                if out_drain_module[array_name]['merged'] == 1:
                    continue                      
            if not is_first:
                f.write(", ")
            f.write(f"{module}_latency")
            is_first = False
        f.write(")\n\n")

        #f.write("\tprint(latency_prologue, latency_main, latency_epilogue)\n\n")

        f.write("\tlatency = latency_prologue + latency_main + latency_epilogue\n\n")
        
        f.write("\t# Meta information, used for conv fusion only\n")
        f.write("\tlatency_meta = {\"latency_prologue\": {}, \"latency_main\": {}, \"latency_epilogue\": {}}\n")
        # Prologue        
        for item in latency_prologue_items:            
            f.write(f"\tlatency_meta[\"latency_prologue\"][\"{item}\"] = {item}\n")
        # Epilogue
        for item in latency_epilogue_items:            
            f.write(f"\tlatency_meta[\"latency_epilogue\"][\"{item}\"] = {item}\n")
        # Main
        for item in latency_main_items:            
            f.write(f"\tlatency_meta[\"latency_main\"][\"{item}\"] = {item}\n")

        f.write("\treturn latency, latency_meta\n")
        f.write("\n")

    def print_activity_est_func(self, f, desp):
        """Write the source of an ``est_activity(params)`` function to ``f``.

        The generated function unpacks every entry of ``desp["params"]`` from
        its ``params`` argument into a local of the same name, then fills and
        returns an ``activity`` dict with the keys ``off_chip_acc_num_meta``,
        ``off_chip_acc_num``, ``noc_hop_num``, ``compute_stmt_call_num``,
        ``io_module_mem_acc_num``, ``pe_module_reg_acc_num`` and
        ``pe_module_mem_acc_num``.

        Args:
            f: Writable text stream; only ``f.write`` is used.
            desp: Design descriptor. This method reads ``desp["params"]``,
                ``desp["memory"]``, ``desp["latency"]``, ``desp["attr"]``,
                ``desp["io"]`` and ``desp["compute"]["PE"]``.

        Raises:
            RuntimeError: If a latency tree contains a node whose ``type`` is
                not one of block/for/mark/user/if/array_tile.
        """
        f.write("def est_activity(params):\n")
        # Load parameters: emit "<n0>, <n1>, ... = params["n0"], params["n1"], ..."
        f.write("\t")
        is_first = True
        for p in desp["params"]:
            if not is_first:
                f.write(", ")
            f.write(p["name"])
            is_first = False
        f.write(" = ")
        is_first = True
        for p in desp["params"]:
            if not is_first:
                f.write(", ")
            f.write(f'params[\"{p["name"]}\"]')
            is_first = False
        f.write("\n\n")

        def extract_stmt_call_num_expr(lat, info):
            """Recursively turn a latency tree node into an expression string.

            ``lat`` is a dict with a ``type`` key selecting the branch below.
            ``info`` is shared mutable state across the recursion; this helper
            reads ``under_mark``, ``valid``, ``name``, ``modules``, ``in`` and
            ``target`` and writes ``has_for_child`` and ``valid``.

            Returns the expression as a string; raises RuntimeError for an
            unknown node type.
            """
            ret = ""
            if lat["type"] == "block":
                # Sum the children. "has_for_child" is reset here and set by
                # any descendant that contributes a loop (or a plugged-in
                # submodule expression).
                info["has_for_child"] = 0
                no_for_child = True
                is_first = True
                ret += "("
                for child in lat["child"]:
                    if not is_first:
                        ret += " + "
                    ret += extract_stmt_call_num_expr(child, info)
                    if info["has_for_child"] == 1:
                        no_for_child = False
                    is_first = False
                ret += ")"
                # A block in which no child reported a loop collapses to "1".
                if no_for_child:
                    ret = "1"
            elif lat["type"] == "for":
                # Multiply the body expression by the loop's bounds[1].
                child = lat["child"]
                expr = extract_stmt_call_num_expr(child, info)
                # Disabled alternative that made the multiplication conditional:
                #if not info["ignore_inter"]:
                #    if info["valid"]:
                #        ret = lat["bounds"][1] + " * " + expr
                #    else:
                #        ret = expr
                #else:
                #ret = expr
                ret = lat["bounds"][1] + " * " + expr
                info["has_for_child"] = 1
            elif lat["type"] == "mark":
                # Entering the mark named by info["under_mark"] switches
                # "valid" on; it is switched off again after the subtree.
                if info["under_mark"] and lat["content"] == info["under_mark"]:
                    info["valid"] = True
                if lat["content"] == "simd":
                    # A "simd" mark is a leaf: "1" when valid, otherwise "0".
                    if info["valid"]:
                        ret = "1"
                    else:
                        ret = "0"
                else:
                    child = lat["child"]
                    ret = extract_stmt_call_num_expr(child, info)
                if info["under_mark"] and lat["content"] == info["under_mark"]:
                    info["valid"] = False
            elif lat["type"] == "user":
                user_expr = lat["child"]["user_expr"]
                if 'inter_intra' in user_expr or 'intra_inter' in user_expr:
                    # The flag is parsed from the text after the last "." once
                    # the final two characters are dropped. It is computed but
                    # not used anywhere in this helper.
                    if user_expr[:-2].split(".")[-1] == "1":
                        double_buffer = 1
                    else:
                        double_buffer = 0
                    # Plug in submodule latency: expressions already stored
                    # for "<name>_inter" / "<name>_intra". They may be missing
                    # on the first pass, which is why callers run two passes.
                    if f"{info['name']}_inter" in info["modules"]:
                        inter_expr = info["modules"][f"{info['name']}_inter"]
                    else:
                        inter_expr = None
                    if f"{info['name']}_intra" in info["modules"]:
                        intra_expr = info["modules"][f"{info['name']}_intra"]
                    else:
                        intra_expr = None

                    if inter_expr and intra_expr:
                        if info["in"] == 1 or info["in"] == 0:
                            ret = inter_expr
                        else:
                            # Select which half contributes for this target.
                            if info['target'] in ["on_chip_transfer_io"]:
                                ret = f"({inter_expr})"
                            elif info['target'] in ["on_chip_transfer_pe"]:
                                ret = f"({intra_expr})"
                            else:
                                ret = f"({inter_expr} + {intra_expr})"
                        info["has_for_child"] = 1
                    else:
                        # Submodule expressions not available (yet).
                        ret = "1"
                    if not info["valid"]:
                        ret = "0"
                # Disabled handling of standalone inter_trans / intra_trans
                # statements:
                #elif "inter_trans" in user_expr:
                #    # Plug in submodule latency
                #    if f"{info['name']}_inter" in info["modules"]:
                #        ret = info["modules"][f"{info['name']}_inter"]
                #    else:
                #        ret = "1"
                #    if not info["valid"]:
                #        ret = "0"
                #elif "intra_trans" in user_expr:
                #    # Plug in submodule latency
                #    if f"{info['name']}_intra" in info["modules"]:
                #        ret = info["modules"][f"{info['name']}_intra"]
                #    else:
                #        ret = "1"
                #    if not info["valid"]:
                #        ret = "0"
                else:
                    # Any other user statement counts as "0" for the listed
                    # targets and as "1" otherwise.
                    if info["target"] in ["on_chip_transfer_pe", "on_chip_transfer_io", "pe_compute_op", "on_chip_acc"]:
                        ret = "0"
                    else:
                        ret = "1"
            elif lat["type"] == "if":
                # Only examine the first child
                child = lat["child"][0]
                ret = extract_stmt_call_num_expr(child, info)
            elif lat["type"] == "array_tile":
                # For "on_chip_acc" the tile size is divided by the data pack
                # factor; every other target uses the raw size.
                if info["target"] in ["on_chip_acc"]:
                    ret = "(" + lat["size"] + "/" + lat["data_pack_factor"] + ")"
                else:
                    ret = "(" + lat["size"] + ")"
            else:
                raise RuntimeError(f"Unsupported latency node type {lat['type']}")

            return ret

        # Merge drain modules if necessary.
        # Collect the '*_out' memory modules per array, keeping "drain" modules
        # separate from the rest. A drain module is flagged merged when some
        # non-drain '*_out' module of the same array has an identical buf_size
        # and num; merged drain modules are skipped in the sections below.
        out_module = {}
        out_drain_module = {}
        for module in desp["memory"]:
            module_mem = desp["memory"][module]
            if module.endswith('_out'):
                item = {'buf_size': module_mem['buf_size'],
                        'num': module_mem['num']}
                if module.find('drain') != -1:
                    item['merged'] = 0
                    out_drain_module[module_mem['array']] = item
                else:
                    if module_mem['array'] not in out_module:
                        out_module[module_mem['array']] = [item]
                    else:
                        out_module[module_mem['array']].append(item)
        for array in out_drain_module:
            if array in out_module:
                for m in out_module[array]:
                    if m['buf_size'] == out_drain_module[array]['buf_size'] and \
                       m['num'] == out_drain_module[array]['num']:
                       out_drain_module[array]['merged'] = 1

        # Extract the off-chip access expression (target "off_chip_acc")
        info = {"has_for_child": 0, "name": None, "modules": {}}
        for i in range(2):
            # Run second time to fill in the incomplete expression
            for module in desp["latency"]:
                module_lat = desp["latency"][module]
                info["name"] = module
                info["valid"] = True
                info["under_mark"] = None
                info["target"] = "off_chip_acc"
                info["in"] = -1
                info["module_attr"] = desp["attr"][module]
                info["modules"][module] = extract_stmt_call_num_expr(module_lat, info)

        f.write("\tactivity = {}\n")
        f.write("\tactivity[\"off_chip_acc_num_meta\"] = {}\n")
        # Off-chip access
        # outermost I/O module latency * data_pack_factor
        # Only modules whose attr has to_dram == 1 are emitted; inter/intra
        # submodules and merged drain modules are skipped.
        f.write("\toff_chip_acc_num = 0\n")
        for module in info["modules"]:
            if desp["attr"][module]["to_dram"] != 1:
                continue
            if "inter" in module or "intra" in module:
                continue
            if module.find('drain') != -1:
                # The array name is the module name up to "_drain_IO".
                array_name = module[:module.find("_drain_IO")]
                if out_drain_module[array_name]['merged'] == 1:
                    continue
            f.write(f"\t{module}_off_chip_acc_num = ")
            f.write(info["modules"][module])
            f.write("\n")
            f.write(f"\tactivity[\"off_chip_acc_num_meta\"][\"{module}\"] = {module}_off_chip_acc_num\n")
            f.write(f"\toff_chip_acc_num += {module}_off_chip_acc_num\n")

        f.write("\tactivity[\"off_chip_acc_num\"] = off_chip_acc_num\n\n")

        # NOC access
        # For each I/O group,
        # sum_{io_level}(#io_modules(level)*inter_latency*data_pack_factor_inter) + #pe_modules*intra_latency*data_pack_factor_intra
        # Extract the on-chip data transfer expression (target "on_chip_transfer_io")
        info = {"has_for_child": 0, "name": None, "modules": {}}
        for i in range(2):
            # Run second time to fill in the incomplete expression
            for module in desp["latency"]:
                module_lat = desp["latency"][module]
                info["name"] = module
                info["valid"] = True
                info["under_mark"] = None
                info["target"] = "on_chip_transfer_io"
                info["in"] = -1
                info["module_attr"] = desp["attr"][module]
                info["modules"][module] = extract_stmt_call_num_expr(module_lat, info)

        f.write("\tnoc_hop_num = 0\n")
        for module in desp["io"]:
            if module.find('drain') != -1:
                array_name = module[:module.find("_drain_IO")]
                if out_drain_module[array_name]['merged'] == 1:
                    continue
            # Emitted value: (1 + last dim) / 2, times the module expression,
            # times every remaining (leading) dimension.
            f.write(f"\t{module}_io_noc_hop_num = (1 + {desp['io'][module]['dims'][-1]}) / 2\n")
            f.write(f"\t{module}_io_noc_hop_num *= {info['modules'][module]}\n")
            if len(desp['io'][module]['dims']) > 1:
                for idx in range(len(desp['io'][module]['dims']) - 1):
                    f.write(f"\t{module}_io_noc_hop_num *= {desp['io'][module]['dims'][idx]}\n")
            f.write(f"\tnoc_hop_num += {module}_io_noc_hop_num\n")

        # Same extraction with target "on_chip_transfer_pe".
        info = {"has_for_child": 0, "name": None, "modules": {}}
        for i in range(2):
            # Run second time to fill in the incomplete expression
            for module in desp["latency"]:
                module_lat = desp["latency"][module]
                info["name"] = module
                info["valid"] = True
                info["under_mark"] = None
                info["target"] = "on_chip_transfer_pe"
                info["in"] = -1
                info["module_attr"] = desp["attr"][module]
                info["modules"][module] = extract_stmt_call_num_expr(module_lat, info)
        for module in desp["io"]:
            if module.find('drain') != -1:
                array_name = module[:module.find("_drain_IO")]
                if out_drain_module[array_name]['merged'] == 1:
                    continue
            # Only modules flagged to_pe add a PE-side term:
            # PE count * module expression * data_pack_factor_intra.
            if desp["attr"][module]["to_pe"]:
                f.write(f"\t{module}_pe_noc_hop_num = {desp['compute']['PE']['num']}\n")
                f.write(f"\t{module}_pe_noc_hop_num *= {info['modules'][module]}\n")
                f.write(f"\t{module}_pe_noc_hop_num *= {desp['memory'][module]['data_pack_factor_intra']}\n")
                f.write(f"\tnoc_hop_num += {module}_pe_noc_hop_num\n")

        f.write("\tactivity[\"noc_hop_num\"] = noc_hop_num\n\n")

        # Computations (target "pe_compute_op")
        info = {"has_for_child": 0, "name": None, "modules": {}}
        for i in range(2):
            # Run second time to fill in the incomplete expression
            for module in desp["latency"]:
                module_lat = desp["latency"][module]
                info["name"] = module
                info["valid"] = True
                info["under_mark"] = None
                info["target"] = "pe_compute_op"
                info["in"] = -1
                info["module_attr"] = desp["attr"][module]
                info["modules"][module] = extract_stmt_call_num_expr(module_lat, info)

        # Compute operation
        # PE latency * simd
        # Emitted value: unroll_factor * PE expression * number of PEs. The
        # initial "= 0" line is immediately overwritten in the generated code.
        f.write("\tcompute_stmt_call_num = 0\n")
        f.write(f"\tcompute_stmt_call_num = {desp['compute']['PE']['unroll_factor']}\n")
        f.write(f"\tcompute_stmt_call_num *= {info['modules']['PE']}\n")
        f.write(f"\tcompute_stmt_call_num *= {desp['compute']['PE']['num']}\n")
        f.write("\tactivity[\"compute_stmt_call_num\"] = compute_stmt_call_num\n\n")

        # IO module access (target "on_chip_acc")
        # sum(inter latency * data_pack_factor_inter + intra latency * data_pack_factor_inter)
        info = {"has_for_child": 0, "name": None, "modules": {}}
        for i in range(2):
            # Run second time to fill in the incomplete expression
            for module in desp["latency"]:
                module_lat = desp["latency"][module]
                info["name"] = module
                info["valid"] = True
                info["under_mark"] = None
                info["target"] = "on_chip_acc"
                info["in"] = -1
                info["module_attr"] = desp["attr"][module]
                info["modules"][module] = extract_stmt_call_num_expr(module_lat, info)

        f.write("\tio_module_mem_acc_num = 0\n")
        for module in desp["memory"]:
            module_mem = desp["memory"][module]
            # Skip merged drain modules (looked up by the module's array) and
            # any module whose name contains "PE".
            if module.find('drain') != -1 and out_drain_module[module_mem['array']]['merged'] == 1:
                continue
            if "PE" in module:
                continue
            f.write(f"\t{module}_mem_acc_num = {info['modules'][module]}\n")
            f.write(f"\t{module}_mem_acc_num *= {desp['memory'][module]['data_pack_factor_inter']}\n")
            f.write(f"\tio_module_mem_acc_num += {module}_mem_acc_num\n")

        f.write("\tactivity[\"io_module_mem_acc_num\"] = io_module_mem_acc_num\n\n")

        # PE module access
        # PE latency * simd * 4 (op1, op2, res(R), res(W))
        # When "PE" has a memory entry the four accesses are split 2/2 between
        # register and memory counts; otherwise all four count as register
        # accesses. Note that info here still holds the expressions from the
        # "on_chip_acc" pass above.
        f.write("\tpe_module_reg_acc_num = 0\n")
        f.write("\tpe_module_mem_acc_num = 0\n")
        if "PE" in desp["memory"]:
            f.write("\tpe_module_reg_acc_num = 2\n") # op1, op2
            f.write("\tpe_module_mem_acc_num = 2\n") # res(R), res(W)
        else:
            f.write("\tpe_module_reg_acc_num = 4\n") # op1, op2, res(R), res(W)
            f.write("\tpe_module_mem_acc_num = 0\n") #
        f.write(f"\tpe_module_reg_acc_num *= {desp['compute']['PE']['unroll_factor']}\n")
        f.write(f"\tpe_module_reg_acc_num *= {info['modules']['PE']}\n")
        f.write(f"\tpe_module_reg_acc_num *= {desp['compute']['PE']['num']}\n")
        f.write(f"\tpe_module_mem_acc_num *= {desp['compute']['PE']['unroll_factor']}\n")
        f.write(f"\tpe_module_mem_acc_num *= {info['modules']['PE']}\n")
        f.write(f"\tpe_module_mem_acc_num *= {desp['compute']['PE']['num']}\n")
        f.write("\tactivity[\"pe_module_reg_acc_num\"] = pe_module_reg_acc_num\n")
        f.write("\tactivity[\"pe_module_mem_acc_num\"] = pe_module_mem_acc_num\n\n")

        f.write("\treturn activity\n")
        f.write("\n")
        
    def print_infer_params_func(self, f, desp):
        """Write the source of an ``infer_params(params)`` function to ``f``.

        The generated function first unpacks every parameter of
        ``desp["params"]`` that is *not* tagged ``auto_infer`` from ``params``.
        For each ``auto_infer`` parameter it then builds the list of multiples
        of ``bounds[0]`` (up to ``bounds[1]``) that divide ``bounds[1]``,
        returns ``None`` if that list is empty, and otherwise stores the
        largest choice back into ``params``. It finally returns ``params``.

        Args:
            f: Writable text stream; only ``f.write`` is used.
            desp: Design descriptor; only ``desp["params"]`` is read.
        """
        def inferred(entry):
            # A parameter is derived when it carries the "auto_infer" tag.
            return "tags" in entry and "auto_infer" in entry["tags"]

        supplied = [entry["name"] for entry in desp["params"] if not inferred(entry)]
        lookups = [f'params["{name}"]' for name in supplied]

        f.write("def infer_params(params):\n")
        # Tuple-unpack the caller-supplied parameters into locals.
        f.write("\t" + ", ".join(supplied) + " = " + ", ".join(lookups) + "\n\n")

        for entry in desp["params"]:
            if not inferred(entry):
                continue
            name = entry["name"]
            step = entry["bounds"][0]
            limit = entry["bounds"][1]
            f.write(f"\t{name}_choices = [n*{step} for n in range(1, {limit}//{step}+1) if {limit}%(n*{step})==0]\n")
            f.write(f"\tif len({name}_choices) == 0:\n")
            f.write("\t\treturn None\n")
            f.write(f"\tparams[\"{name}\"] = max({name}_choices)\n")

        f.write("\n")
        f.write("\treturn params\n\n")

    def print_random_sampling_func(self, f, desp):
        """Write the source of a ``random_sampling(params)`` function to ``f``.

        The generated function loads every external parameter from ``params``
        and then loops: it draws a random value for each tunable parameter,
        stores it in ``params``, and leaves the loop once the latency-hiding
        condition holds (or immediately when the design has no ``"PE"`` memory
        entry). It returns ``params``.

        Tunable parameters without a ``"divisors"`` entry are sampled with
        ``random.randint`` over their bounds. Parameters with ``"divisors"``
        are sampled from ``utils.get_divisors`` of ``bounds[1]``, restricted to
        powers of two when tagged ``power_of_two``.

        Sampling statements are emitted in declaration order. A parameter that
        lists another still-unsampled tunable parameter in ``"divisors"`` is
        deferred until that parameter has been emitted, so the generated code
        never reads a name before assigning it. The previous implementation
        removed items from the list it was iterating over, which scrambled the
        order, and its dependency test compared divisor names against
        parameter dicts and therefore never deferred anything.

        Args:
            f: Writable text stream; only ``f.write`` is used.
            desp: Design descriptor; only ``desp["memory"]`` is read.

        Raises:
            RuntimeError: If ``desp["memory"]["PE"]["ele_type"]`` is present
                and is not ``"float"``.
        """
        f.write("def random_sampling(params):\n")
        f.write(f"\tdef filter_non_power_of_two(x):\n")
        f.write(f"\t\tif np.log2(x) != int(np.log2(x)):\n")
        f.write(f"\t\t\treturn True\n")
        f.write(f"\t\treturn False\n\n")
        # Print the task params
        for p in self.params_config["external"]:
            f.write(f"\t{p} = params[\"{p}\"]\n")
        f.write("\twhile True:\n")

        def emit_store(param):
            # Bind the sample to a local and record it in params.
            f.write(f"\t\t{param['name']} = sample\n")
            f.write(f"\t\tparams[\"{param['name']}\"] = sample\n")

        def emit_divisor_sample(param):
            if "tags" in param and "power_of_two" in param["tags"]:
                f.write(f"\t\tsample = random.sample(utils.get_divisors(int({param['bounds'][1]}), filter_non_power_of_two), 1)[-1]\n")
            else:
                f.write(f"\t\tsample = random.sample(utils.get_divisors(int({param['bounds'][1]}), None), 1)[-1]\n")
            emit_store(param)

        def divisor_names(param):
            # Accept either a single name or a collection of names.
            divisors = param["divisors"]
            if isinstance(divisors, str):
                return [divisors]
            if isinstance(divisors, (list, tuple, set)):
                return [d for d in divisors if isinstance(d, str)]
            return []

        # First the free parameters, in declaration order; the ones that
        # depend on divisors are collected for the second stage.
        pending = []
        for param in self.params_config["tunable"].values():
            if "divisors" in param:
                pending.append(param)
                continue
            f.write(f"\t\tsample = random.randint(int({param['bounds'][0]}), int({param['bounds'][1]}))\n")
            emit_store(param)

        # Then the divisor-based parameters, deferring any whose divisor is a
        # tunable parameter that has not been emitted yet.
        while pending:
            unsampled = {param["name"] for param in pending}
            deferred = []
            for param in pending:
                name = param["name"]
                if any(d != name and d in unsampled for d in divisor_names(param)):
                    deferred.append(param)
                    continue
                emit_divisor_sample(param)
                unsampled.discard(name)
            if len(deferred) == len(pending):
                # No progress (cyclic dependencies): emit the remainder in
                # declaration order rather than looping forever.
                for param in deferred:
                    emit_divisor_sample(param)
                deferred = []
            pending = deferred

        # Latency hiding
        if "PE" not in desp["memory"]:
            f.write(f"\t\tbreak\n")
        else:
            f.write(f"\t\tlatency_factors = 1\n")
            for p, param in self.params_config["tunable"].items():
                if param["attr"] == "latency_tiling_factor":
                    f.write(f"\t\tlatency_factors *= {param['name']}\n")
                if param["attr"] == "SIMD_tiling_factor":
                    f.write(f"\t\tsimd_factor = {param['name']}\n")
            data_type = desp["memory"]["PE"]["ele_type"]
            if data_type == "float":
                # Accept the sample only when the product of the latency
                # tiling factors is at least 8x the SIMD factor.
                f.write(f"\t\tif latency_factors >= 8 * simd_factor:\n")
                f.write(f"\t\t\tbreak\n")
            else:
                raise RuntimeError(f"Unsupported data type in random sample generation: {data_type}")
        f.write("\n")
        f.write("\treturn params\n\n")

    def print_bound_check_func(self, f, desp):
        """Write the source of a ``bound_check(params)`` function to ``f``.

        The generated function unpacks every parameter of ``desp["params"]``
        and returns ``False`` as soon as one check fails, ``True`` otherwise:

        * a parameter with ``"bounds"`` must not be below ``bounds[0]``, and,
          unless its name ends in ``t1``, must not exceed ``bounds[1]``;
        * a parameter tagged ``power_of_two`` must be a power of two;
        * when ``desp["memory"]`` has a ``"PE"`` entry, the product of the
          ``latency_tiling_factor`` parameters must be at least 8 times the
          ``SIMD_tiling_factor`` parameter.

        Args:
            f: Writable text stream; only ``f.write`` is used.
            desp: Design descriptor; ``desp["params"]`` and ``desp["memory"]``
                are read.

        Raises:
            RuntimeError: If ``desp["memory"]["PE"]["ele_type"]`` is present
                and is not ``"float"``.
        """
        f.write(
            "def bound_check(params):\n"
            "\tdef filter_non_power_of_two(x):\n"
            "\t\tif np.log2(x) != int(np.log2(x)):\n"
            "\t\t\treturn True\n"
            "\t\treturn False\n\n"
        )

        # Tuple-unpack all parameters into locals of the same name.
        names = [entry["name"] for entry in desp["params"]]
        lookups = [f'params["{name}"]' for name in names]
        f.write("\t" + ", ".join(names) + " = " + ", ".join(lookups) + "\n\n")

        reject = "\t\treturn False\n"
        for entry in desp["params"]:
            name = entry["name"]
            if "bounds" in entry:
                f.write(f"\tif {name} < {entry['bounds'][0]}:\n")
                f.write(reject)
                # First-level tiling factors (names ending in "t1") are not
                # checked against their upper bound.
                if not name.endswith('t1'):
                    f.write(f"\tif {name} > {entry['bounds'][1]}:\n")
                    f.write(reject)
            if "tags" in entry and "power_of_two" in entry["tags"]:
                f.write(f"\tif filter_non_power_of_two({name}):\n")
                f.write(reject)

        if "PE" not in desp["memory"]:
            f.write("\treturn True\n\n")
            return

        # Latency hiding constraint.
        f.write("\tlatency_factors = 1\n")
        for tunable in self.params_config["tunable"].values():
            role = tunable["attr"]
            if role == "latency_tiling_factor":
                f.write(f"\tlatency_factors *= {tunable['name']}\n")
            if role == "SIMD_tiling_factor":
                f.write(f"\tsimd_factor = {tunable['name']}\n")
        data_type = desp["memory"]["PE"]["ele_type"]
        if data_type != "float":
            raise RuntimeError(f"Unsupported data type in random sample generation: {data_type}")
        f.write("\tif latency_factors < 8 * simd_factor:\n")
        f.write(reject)

        f.write("\treturn True\n\n")

    def print_compute_arch_cst_func(self, f, desp):
        """Emit the source code of ``compute_arch_cst(params)`` into ``f``.

        The generated function unpacks every descriptor parameter from the
        ``params`` dict into a local variable of the same name, then builds
        and returns an ``arch_features`` dict containing the PE array
        dimensions ("dims"), the PE unroll factor ("SIMD") and the data
        packing factor of each memory module that declares one ("data_pack").
        The generated function returns None as soon as a dimension is 0.

        f: writable text stream that receives the generated code.
        desp: design descriptor; this method reads desp["params"],
            desp["compute"]["PE"] and desp["memory"].
        """
        f.write("def compute_arch_cst(params):\n")

        # Unpack all parameters with a single tuple assignment.
        targets = ", ".join(entry["name"] for entry in desp["params"])
        sources = ", ".join(
            'params["{}"]'.format(entry["name"]) for entry in desp["params"]
        )
        f.write("\t" + targets + " = " + sources + "\n\n")

        f.write("\tarch_features = {}\n")

        # PE array dimensions; the generated code bails out on an empty dimension.
        f.write("\tarch_features['dims'] = []\n")
        for dim_expr in desp['compute']['PE']['dims']:
            f.write('\tarch_features["dims"].append({})\n'.format(dim_expr))
            f.write('\tif arch_features["dims"][-1] == 0:\n')
            f.write("\t\treturn None\n")
        f.write('\tarch_features["SIMD"] = {}\n'.format(desp['compute']['PE']['unroll_factor']))

        # Data packing factors, keyed by the array served by each memory module.
        f.write('\tarch_features["data_pack"] = {}\n')
        for mem_info in desp["memory"].values():
            if 'data_pack_factor' not in mem_info:
                continue
            f.write('\tarch_features["data_pack"]["{}"] = [{}]\n'.format(
                mem_info['array'], mem_info['data_pack_factor']))

        f.write("\n\treturn arch_features\n\n")

    def register(self, desp, py_f):
        """Register the design described by ``desp``.

        Classifies the descriptor parameters, generates a Python module at
        ``py_f`` containing all the functions needed to evaluate the design,
        imports that module and binds its functions onto this object.

        desp: design descriptor dict. Every entry of desp["params"] needs a
            "name" and a "tunable" field; "tags" is optional.
        py_f: path of the Python file to generate. Its directory is appended
            to sys.path and the file is imported under its base name (the
            part of the file name before the first ".").

        Side effects: sets self.params_config, self.desp and the
        self.*_func callables; overwrites ``py_f``; extends sys.path.
        """
        import importlib

        # Tuning parameters
        self.params_config = {"external": {}, "tunable": {}, "infer": {}}
        for param in desp["params"]:
            if param["tunable"]:
                self.params_config["tunable"][param["name"]] = param
                continue
            # "tags" is optional in the descriptor; a non-tunable parameter
            # without the relevant tags is simply not classified.
            tags = param.get("tags", ())
            if "external" in tags:
                self.params_config["external"][param["name"]] = param
            elif "auto_infer" in tags:
                self.params_config["infer"][param["name"]] = param

        # Print design function
        with open(py_f, 'w') as f:
            f.write("from math import ceil\n")
            f.write("import numpy as np\n")
            f.write("import random\n")
            f.write("import utils\n\n")

            # Generate resource est func
            self.print_resource_est_func(f, desp)

            # Generate latency est func
            self.print_latency_est_func(f, desp)

            # Generate activity est func
            self.print_activity_est_func(f, desp)

            # Generate infer parameter func
            self.print_infer_params_func(f, desp)

            # Generate the random sampling func
            self.print_random_sampling_func(f, desp)

            # Generate the bound check func
            self.print_bound_check_func(f, desp)

            # Generate the compute arch cst func
            self.print_compute_arch_cst_func(f, desp)

        sys.path.append(os.path.dirname(py_f))
        # The module file was created while the interpreter is running, so
        # the import system's cached directory listings may not include it.
        importlib.invalidate_caches()
        basename = os.path.basename(py_f).split(".")[0]
        module = __import__(basename)
        self.est_resource_func = module.est_resource
        self.est_latency_func = module.est_latency
        self.est_activity_func = module.est_activity
        self.infer_params_func = module.infer_params
        self.random_sampling_func = module.random_sampling
        self.bound_check_func = module.bound_check
        self.compute_arch_cst_func = module.compute_arch_cst
        self.desp = desp

    def est_latency(self, params):
        """Evaluate the registered latency model on ``params``.

        Raises RuntimeError when no latency estimation function is set.
        """
        latency_fn = self.est_latency_func
        if latency_fn:
            return latency_fn(params)
        raise RuntimeError(f"Latency estimation function for design {self.name} undefined")
    
    def est_resource(self, params):
        """Evaluate the registered resource model on ``params``.

        Returns whatever the resource estimation function returns.
        Raises RuntimeError when no resource estimation function is set.
        """
        # Guard on the resource function itself, not on the latency one.
        if not self.est_resource_func:
            raise RuntimeError(f"Resource estimation function for design {self.name} undefined")
        else:
            return self.est_resource_func(params)

    def est_activity(self, params):
        """Evaluate the registered activity model on ``params``.

        Raises RuntimeError when no activity estimation function is set.
        """
        activity_fn = self.est_activity_func
        if activity_fn:
            return activity_fn(params)
        raise RuntimeError(f"Activity estimation function for design {self.name} undefined")

    def infer_params(self, params):
        """Run the registered parameter inference function on ``params``.

        Raises RuntimeError when no inference function is set.
        """
        infer_fn = self.infer_params_func
        if infer_fn:
            return infer_fn(params)
        raise RuntimeError(f"Internal parameter inference function for design {self.name} undefined")

    def random_sampling(self, params):
        """Run the registered random sampling function on ``params``.

        Raises RuntimeError when no random sampling function is set.
        """
        sampling_fn = self.random_sampling_func
        if sampling_fn:
            return sampling_fn(params)
        raise RuntimeError(f"Random sampling function for design {self.name} undefined")

    def bound_check(self, params):
        """Run the registered bound check function on ``params``.

        Raises RuntimeError when no bound check function is set.
        """
        check_fn = self.bound_check_func
        if check_fn:
            return check_fn(params)
        raise RuntimeError(f"Bound check function for design {self.name} undefined")

    def compute_arch_cst(self, params):
        """Return the architecture features of the design for ``params``.

        The parameters are first completed through ``self.infer_params``;
        the registered architecture function is then evaluated on them and
        its result is extended with a 'res_usage' entry holding the output
        of ``self.est_resource``.

        Returns None when parameter inference yields a falsy result or when
        the architecture function itself returns None.
        Raises RuntimeError when no architecture function is set.
        """
        if not self.compute_arch_cst_func:
            raise RuntimeError(f"Compute architecture constraints function for design {self.name} undefined")

        params = self.infer_params(params)
        if not params:
            return None

        arch_cst = self.compute_arch_cst_func(params)
        if arch_cst is None:
            # The generated function returns None when a PE dimension
            # evaluates to 0; there is nothing to annotate in that case.
            return None
        arch_cst['res_usage'] = self.est_resource(params)
        return arch_cst

================================================
FILE: autosa_scripts/odyssey/designs/kernel3.json
================================================
{
    "attr": {
        "A_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "B_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "C_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(i_t1/i_t2)",
                "(j_t1/j_t2)"
            ],
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(j_t1/j_t2))",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "A_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "B_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "C_drain_IO_L1_out": {
            "dims": [
                "(j_t1/j_t2)",
                "(i_t1/i_t2)"
            ]
        },
        "C_drain_IO_L2_out": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "A_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "i_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "i_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "A_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(k_t1/k_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_A.fifo_A_local.1.16.2(c0, c1, c2, p0, 0, c5, c6, c7, 0, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c5)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "A_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p9",
                                                "ele_size": 4,
                                                "last_dim": "k_t1",
                                                "size": "i_t2*k_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_serialize",
                                            "type": "mark"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c3",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "j_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "j_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c4",
            "type": "for"
        },
        "B_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(k_t1/k_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_B.fifo_B_local.1.16.2(c0, c1, c2, p0, 0, c5, c6, c7, 0, 2 * p0 + 32 * c1 + c6, 32 * c2 + 2 * c5)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "B_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(j_t1/j_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p10",
                                                "ele_size": 4,
                                                "last_dim": "k_t1",
                                                "size": "j_t2*k_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_serialize",
                                            "type": "mark"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.0()"
                                },
                                "type": "user"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t2*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t2*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "j_t2"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "i_t2"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "user_expr": "in_trans.fifo_C_drain_local.fifo_C_drain.1.2.1(c0, c1, 1, p0, p1, 15, c6, c7, 1, 2 * p1 + 32 * c0 + c7, 2 * p0 + 32 * c1 + c6)"
                                            },
                                            "type": "user"
                                        },
                                        "content": "hls_pipeline",
                                        "type": "mark"
                                    },
                                    "content": "simd",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "latency",
                    "type": "mark"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p12",
                                                        "ele_size": 4,
                                                        "last_dim": "j_t2",
                                                        "size": "i_t2*j_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c3",
                                            "type": "for"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p12",
                                                        "ele_size": 4,
                                                        "last_dim": "j_t2",
                                                        "size": "i_t2*j_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c3",
                                            "type": "for"
                                        }
                                    ],
                                    "type": "if"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p12",
                                                    "ele_size": 4,
                                                    "last_dim": "j_t2",
                                                    "size": "i_t2*j_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_serialize",
                                                "type": "mark"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c3",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "j_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "i_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": [
                                                            {
                                                                "child": {
                                                                    "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c5)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p1 + 32 * c1 + c6, 32 * c2 + 2 * c5)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "k_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "S_0(2 * p0 + 32 * c0 + c7, 2 * p1 + 32 * c1 + c6, 32 * c2 + 2 * c5 + c8)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_unroll",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c8",
                                                                    "type": "for"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            {
                                                                "child": [
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "out.fifo_C_drain.1.1(c0, c1, 1, p0, p1, 15, c6, c7, 2 * p0 + 32 * c0 + c7, 2 * p1 + 32 * c1 + c6)"
                                                                        },
                                                                        "type": "user"
                                                                    }
                                                                ],
                                                                "type": "if"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "out.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p1 + 32 * c1 + c6, 32 * c2 + 2 * c5)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "out.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c5)"
                                                                },
                                                                "type": "user"
                                                            }
                                                        ],
                                                        "type": "block"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c7",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c5",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L2_in": {
            "array": "A",
            "buf_size": "(i_t2*k_t1)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "B_IO_L2_in": {
            "array": "B",
            "buf_size": "(j_t2*k_t1)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "C_drain_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t2*j_t2)",
            "data_pack_factor_inter": "p12",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((j_t1/j_t2)*(i_t1/i_t2))"
        },
        "PE": {
            "array": "C",
            "buf_size": "(i_t2*j_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(j_t1/j_t2))"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,16),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,16),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,4),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,4),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p12",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel0_0.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cin_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cin_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(o_t1/o_t2)"
            ],
            "ele_type": "float",
            "num": "(o_t1/o_t2)",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L1_in": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "cout_IO_L1_out": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "cout_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L1_in": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in.fifo_cout.1.1(1, c1, c2, c3, p0, 4 * c1 + c5, 4 * c2 + c6, 2 * p0 + c7 + 8)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(r_t1/r_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "(c_t1/c_t2)"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "(i_t1/i_t2)"
                                                    ],
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "p"
                                                        ],
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "q"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "c_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "r_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "bounds": [
                                                                                        "0",
                                                                                        "o_t2"
                                                                                    ],
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c5 + c8 + c11, 4 * c2 + 2 * c6 + c9 + c10, 8 * c0 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c3 + c12, c8, c9, 8 * c0 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "bounds": [
                                                                                                            "0",
                                                                                                            "i_t2"
                                                                                                        ],
                                                                                                        "child": {
                                                                                                            "child": {
                                                                                                                "child": {
                                                                                                                    "user_expr": "S_0(2 * p0 + 8 * c3 + c12, 4 * c1 + 2 * c5 + c11, 4 * c2 + 2 * c6 + c10, 8 * c0 + 2 * c7 + c13, c8, c9)"
                                                                                                                },
                                                                                                                "type": "user"
                                                                                                            },
                                                                                                            "content": "hls_unroll",
                                                                                                            "type": "mark"
                                                                                                        },
                                                                                                        "iterator": "c13",
                                                                                                        "type": "for"
                                                                                                    },
                                                                                                    "content": "simd",
                                                                                                    "type": "mark"
                                                                                                },
                                                                                                {
                                                                                                    "child": [
                                                                                                        {
                                                                                                            "child": {
                                                                                                                "user_expr": "out.fifo_cout_drain.1.1(1, c1, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 4 * c1 + 2 * c5 + c11, 4 * c2 + 2 * c6 + c10, 2 * p0 + 8 * c3 + c12)"
                                                                                                            },
                                                                                                            "type": "user"
                                                                                                        }
                                                                                                    ],
                                                                                                    "type": "if"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "out.fifo_cin.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c5 + c8 + c11, 4 * c2 + 2 * c6 + c9 + c10, 8 * c0 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                }
                                                                                            ],
                                                                                            "type": "block"
                                                                                        },
                                                                                        "content": "hls_pipeline",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    "iterator": "c10",
                                                                                    "type": "for"
                                                                                },
                                                                                "content": "latency",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c11",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c12",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c0",
                                                            "type": "for"
                                                        },
                                                        "iterator": "c1",
                                                        "type": "for"
                                                    },
                                                    "iterator": "c9",
                                                    "type": "for"
                                                },
                                                "iterator": "c8",
                                                "type": "for"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out.fifo_cout.1.1(0, c1, c2, c3, p0, 4 * c1 + c5, 4 * c2 + c6, 2 * p0 + c7)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.1.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L2_in_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p14",
                    "ele_size": 4,
                    "last_dim": "i_t1",
                    "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cin_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "p"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "q"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "o_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.8.2(c0, c1, c2, c3, 0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c1 + 2 * c5 + c8 + c11, 4 * c2 + 2 * c6 + c9 + c10, 8 * c0 + 2 * c7)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_pipeline",
                                                                        "type": "mark"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c10",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c11",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c12",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c0",
                                        "type": "for"
                                    },
                                    "iterator": "c1",
                                    "type": "for"
                                },
                                "iterator": "c9",
                                "type": "for"
                            },
                            "iterator": "c8",
                            "type": "for"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "r_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "o_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "out_trans.fifo_cout.fifo_cout_local.1.2.1(1, c1, c2, c3, p0, c5, c6, 0, 0, 0, c10, c11, c12, 0, 4 * c1 + 2 * c5 + c11, 4 * c2 + 2 * c6 + c10, 2 * p0 + 8 * c3 + c12)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "r_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "o_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_local.fifo_cout.1.2.1(0, c1, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c5 + c11, 4 * c2 + 2 * c6 + c10, 2 * p0 + 8 * c3 + c12)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(o_t1/o_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p15",
                                            "ele_size": 4,
                                            "last_dim": "o_t2",
                                            "size": "r_t1*c_t1*o_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(o_t1/o_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p15",
                                            "ele_size": 4,
                                            "last_dim": "o_t2",
                                            "size": "r_t1*c_t1*o_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.0()"
                                },
                                "type": "user"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "r_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "o_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.2.1(1, c1, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c5 + c11, 4 * c2 + 2 * c6 + c10, 2 * p0 + 8 * c3 + c12)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(o_t1/o_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t2",
                                            "size": "r_t1*c_t1*o_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "w_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "w_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "c_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "r_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.4.2(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p0 + 8 * c3 + c12, c8, c9, 8 * c0 + 2 * c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p17",
                                                "ele_size": 4,
                                                "last_dim": "i_t1",
                                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "cin_IO_L2_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "cout_IO_L1_in": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "cout_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "w_IO_L1_in": {
            "array": "w",
            "buf_size": "(((o_t2*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,4),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel0_1.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "cin_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "cin_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(o_t1/o_t2)"
            ],
            "ele_type": "float",
            "num": "(o_t1/o_t2)",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L1_in": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(c_t1/c_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "p"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "q"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "c_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "r_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "o_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": [
                                                                                        {
                                                                                            "child": {
                                                                                                "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c5 + c8 + c11, 4 * c2 + 2 * c6 + c9 + c10, 8 * c3 + 2 * c7)"
                                                                                            },
                                                                                            "type": "user"
                                                                                        },
                                                                                        {
                                                                                            "child": {
                                                                                                "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c0 + c12, c8, c9, 8 * c3 + 2 * c7)"
                                                                                            },
                                                                                            "type": "user"
                                                                                        },
                                                                                        {
                                                                                            "child": {
                                                                                                "bounds": [
                                                                                                    "0",
                                                                                                    "i_t2"
                                                                                                ],
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "child": {
                                                                                                            "user_expr": "S_0(2 * p0 + 8 * c0 + c12, 4 * c1 + 2 * c5 + c11, 4 * c2 + 2 * c6 + c10, 8 * c3 + 2 * c7 + c13, c8, c9)"
                                                                                                        },
                                                                                                        "type": "user"
                                                                                                    },
                                                                                                    "content": "hls_unroll",
                                                                                                    "type": "mark"
                                                                                                },
                                                                                                "iterator": "c13",
                                                                                                "type": "for"
                                                                                            },
                                                                                            "content": "simd",
                                                                                            "type": "mark"
                                                                                        },
                                                                                        {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "out.fifo_cout_drain.1.1(c0, c1, c2, 1, p0, c5, c6, 3, 2, 2, c10, c11, c12, 4 * c1 + 2 * c5 + c11, 4 * c2 + 2 * c6 + c10, 2 * p0 + 8 * c0 + c12)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                }
                                                                                            ],
                                                                                            "type": "if"
                                                                                        },
                                                                                        {
                                                                                            "child": {
                                                                                                "user_expr": "out.fifo_cin.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c5 + c8 + c11, 4 * c2 + 2 * c6 + c9 + c10, 8 * c3 + 2 * c7)"
                                                                                            },
                                                                                            "type": "user"
                                                                                        }
                                                                                    ],
                                                                                    "type": "block"
                                                                                },
                                                                                "content": "hls_pipeline",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c10",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c11",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c12",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c0",
                                                    "type": "for"
                                                },
                                                "iterator": "c1",
                                                "type": "for"
                                            },
                                            "iterator": "c9",
                                            "type": "for"
                                        },
                                        "iterator": "c8",
                                        "type": "for"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.1.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in_inter": {
            "child": {
                "child": {
                    "child": {
                        "data_pack_factor": "p14",
                        "ele_size": 4,
                        "last_dim": "i_t1",
                        "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                        "type": "array_tile"
                    },
                    "content": "access_serialize",
                    "type": "mark"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cin_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "p"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "q"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "o_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.8.2(c0, c1, c2, c3, 0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c1 + 2 * c5 + c8 + c11, 4 * c2 + 2 * c6 + c9 + c10, 8 * c3 + 2 * c7)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_pipeline",
                                                                        "type": "mark"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c10",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c11",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c12",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c0",
                                        "type": "for"
                                    },
                                    "iterator": "c1",
                                    "type": "for"
                                },
                                "iterator": "c9",
                                "type": "for"
                            },
                            "iterator": "c8",
                            "type": "for"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.0()"
                                },
                                "type": "user"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "r_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "o_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.2.1(c0, c1, c2, 1, p0, c5, c6, 3, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c5 + c11, 4 * c2 + 2 * c6 + c10, 2 * p0 + 8 * c0 + c12)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(o_t1/o_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p16",
                                                "ele_size": 4,
                                                "last_dim": "o_t2",
                                                "size": "r_t1*c_t1*o_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_serialize",
                                            "type": "mark"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "w_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "c_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "r_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.4.2(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p0 + 8 * c0 + c12, c8, c9, 8 * c3 + 2 * c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p17",
                                                    "ele_size": 4,
                                                    "last_dim": "i_t1",
                                                    "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_serialize",
                                                "type": "mark"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "cin_IO_L2_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "w_IO_L1_in": {
            "array": "w",
            "buf_size": "(((o_t2*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,4),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel0_2.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cin_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cin_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(o_t1/o_t2)"
            ],
            "ele_type": "float",
            "num": "(o_t1/o_t2)",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L1_in": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "cout_IO_L1_out": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "cout_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L1_in": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in.fifo_cout.1.1(c0, 1, c2, c3, p0, c5 + 4, 4 * c2 + c6, 2 * p0 + 8 * c0 + c7)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(r_t1/r_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "(c_t1/c_t2)"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "(i_t1/i_t2)"
                                                    ],
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "p"
                                                        ],
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "q"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "c_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "r_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "bounds": [
                                                                                        "0",
                                                                                        "o_t2"
                                                                                    ],
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 4 * c2 + 2 * c5 + c8 + c11, 4 * c3 + 2 * c6 + c9 + c10, 8 * c1 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c0 + c12, c8, c9, 8 * c1 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "bounds": [
                                                                                                            "0",
                                                                                                            "i_t2"
                                                                                                        ],
                                                                                                        "child": {
                                                                                                            "child": {
                                                                                                                "child": {
                                                                                                                    "user_expr": "S_0(2 * p0 + 8 * c0 + c12, 4 * c2 + 2 * c5 + c11, 4 * c3 + 2 * c6 + c10, 8 * c1 + 2 * c7 + c13, c8, c9)"
                                                                                                                },
                                                                                                                "type": "user"
                                                                                                            },
                                                                                                            "content": "hls_unroll",
                                                                                                            "type": "mark"
                                                                                                        },
                                                                                                        "iterator": "c13",
                                                                                                        "type": "for"
                                                                                                    },
                                                                                                    "content": "simd",
                                                                                                    "type": "mark"
                                                                                                },
                                                                                                {
                                                                                                    "child": [
                                                                                                        {
                                                                                                            "child": {
                                                                                                                "user_expr": "out.fifo_cout_drain.1.1(c0, 1, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 4 * c2 + 2 * c5 + c11, 4 * c3 + 2 * c6 + c10, 2 * p0 + 8 * c0 + c12)"
                                                                                                            },
                                                                                                            "type": "user"
                                                                                                        }
                                                                                                    ],
                                                                                                    "type": "if"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "out.fifo_cin.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 4 * c2 + 2 * c5 + c8 + c11, 4 * c3 + 2 * c6 + c9 + c10, 8 * c1 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                }
                                                                                            ],
                                                                                            "type": "block"
                                                                                        },
                                                                                        "content": "hls_pipeline",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    "iterator": "c10",
                                                                                    "type": "for"
                                                                                },
                                                                                "content": "latency",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c11",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c12",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c0",
                                                            "type": "for"
                                                        },
                                                        "iterator": "c1",
                                                        "type": "for"
                                                    },
                                                    "iterator": "c9",
                                                    "type": "for"
                                                },
                                                "iterator": "c8",
                                                "type": "for"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out.fifo_cout.1.1(c0, 0, c2, c3, p0, c5, 4 * c2 + c6, 2 * p0 + 8 * c0 + c7)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.1.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p14",
                    "ele_size": 4,
                    "last_dim": "i_t1",
                    "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cin_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "p"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "q"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "o_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.8.2(c0, c1, c2, c3, 0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c2 + 2 * c5 + c8 + c11, 4 * c3 + 2 * c6 + c9 + c10, 8 * c1 + 2 * c7)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_pipeline",
                                                                        "type": "mark"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c10",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c11",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c12",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c0",
                                        "type": "for"
                                    },
                                    "iterator": "c1",
                                    "type": "for"
                                },
                                "iterator": "c9",
                                "type": "for"
                            },
                            "iterator": "c8",
                            "type": "for"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "r_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "o_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "out_trans.fifo_cout.fifo_cout_local.1.2.1(c0, 1, c2, c3, p0, c5, c6, 0, 0, 0, c10, c11, c12, 0, 4 * c2 + 2 * c5 + c11, 4 * c3 + 2 * c6 + c10, 2 * p0 + 8 * c0 + c12)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "r_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "o_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_local.fifo_cout.1.2.1(c0, 0, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 1, 4 * c2 + 2 * c5 + c11, 4 * c3 + 2 * c6 + c10, 2 * p0 + 8 * c0 + c12)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(o_t1/o_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p15",
                                            "ele_size": 4,
                                            "last_dim": "o_t2",
                                            "size": "r_t1*c_t1*o_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(o_t1/o_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p15",
                                            "ele_size": 4,
                                            "last_dim": "o_t2",
                                            "size": "r_t1*c_t1*o_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.0()"
                                },
                                "type": "user"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "r_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "o_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.2.1(c0, 1, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 1, 4 * c2 + 2 * c5 + c11, 4 * c3 + 2 * c6 + c10, 2 * p0 + 8 * c0 + c12)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(o_t1/o_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t2",
                                            "size": "r_t1*c_t1*o_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "w_IO_L1_in_intra": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(c_t1/c_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "p"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "q"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "c_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "r_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "o_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.4.2(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p0 + 8 * c0 + c12, c8, c9, 8 * c1 + 2 * c7)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    "content": "hls_pipeline",
                                                                                    "type": "mark"
                                                                                },
                                                                                "content": "simd",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c10",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c11",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c12",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c0",
                                                    "type": "for"
                                                },
                                                "iterator": "c1",
                                                "type": "for"
                                            },
                                            "iterator": "c9",
                                            "type": "for"
                                        },
                                        "iterator": "c8",
                                        "type": "for"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "io_L1",
                            "type": "mark"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(o_t1/o_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p17",
                                        "ele_size": 4,
                                        "last_dim": "i_t1",
                                        "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c6",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "cin_IO_L2_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "cout_IO_L1_in": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "cout_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "w_IO_L1_in": {
            "array": "w",
            "buf_size": "(((o_t2*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,4),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel1_0.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(r_t1/r_t2)"
            ],
            "ele_type": "float",
            "num": "(r_t1/r_t2)",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L1_in": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_IO_L1_out": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in.fifo_cout.1.1(1, c1, c2, c3, p0, 2 * p0 + 4 * c1 + c5, 4 * c2 + c6, c7 + 8)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(o_t1/o_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "(c_t1/c_t2)"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "(i_t1/i_t2)"
                                                    ],
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "p"
                                                        ],
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "q"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "c_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "o_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "bounds": [
                                                                                        "0",
                                                                                        "r_t2"
                                                                                    ],
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 4 * c1 + c8 + c12, 4 * c2 + 2 * c6 + c9 + c10, 8 * c0 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 8 * c3 + 2 * c5 + c11, c8, c9, 8 * c0 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "bounds": [
                                                                                                            "0",
                                                                                                            "i_t2"
                                                                                                        ],
                                                                                                        "child": {
                                                                                                            "child": {
                                                                                                                "child": {
                                                                                                                    "user_expr": "S_0(8 * c3 + 2 * c5 + c11, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c6 + c10, 8 * c0 + 2 * c7 + c13, c8, c9)"
                                                                                                                },
                                                                                                                "type": "user"
                                                                                                            },
                                                                                                            "content": "hls_unroll",
                                                                                                            "type": "mark"
                                                                                                        },
                                                                                                        "iterator": "c13",
                                                                                                        "type": "for"
                                                                                                    },
                                                                                                    "content": "simd",
                                                                                                    "type": "mark"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 8 * c3 + 2 * c5 + c11, c8, c9, 8 * c0 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                {
                                                                                                    "child": [
                                                                                                        {
                                                                                                            "child": {
                                                                                                                "user_expr": "out.fifo_cout_drain.1.1(1, c1, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c6 + c10, 8 * c3 + 2 * c5 + c11)"
                                                                                                            },
                                                                                                            "type": "user"
                                                                                                        }
                                                                                                    ],
                                                                                                    "type": "if"
                                                                                                }
                                                                                            ],
                                                                                            "type": "block"
                                                                                        },
                                                                                        "content": "hls_pipeline",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    "iterator": "c10",
                                                                                    "type": "for"
                                                                                },
                                                                                "content": "latency",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c11",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c12",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c0",
                                                            "type": "for"
                                                        },
                                                        "iterator": "c1",
                                                        "type": "for"
                                                    },
                                                    "iterator": "c9",
                                                    "type": "for"
                                                },
                                                "iterator": "c8",
                                                "type": "for"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out.fifo_cout.1.1(0, c1, c2, c3, p0, 2 * p0 + 4 * c1 + c5, 4 * c2 + c6, c7)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(o_t1/o_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(i_t1/i_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "c_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "o_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "r_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": {
                                                                                        "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.4.2(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p0 + 4 * c1 + c8 + c12, 4 * c2 + 2 * c6 + c9 + c10, 8 * c0 + 2 * c7)"
                                                                                    },
                                                                                    "type": "user"
                                                                                },
                                                                                "content": "hls_pipeline",
                                                                                "type": "mark"
                                                                            },
                                                                            "content": "simd",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "io_L1",
                        "type": "mark"
                    },
                    "content": "io_L2",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p14",
                                            "ele_size": 4,
                                            "last_dim": "i_t1",
                                            "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cout_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "out_trans.fifo_cout.fifo_cout_local.1.4.1(1, c1, c2, c3, p0, c5, c6, 0, 0, 0, c10, c11, c12, 0, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c6 + c10, 8 * c3 + 2 * c5 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_local.fifo_cout.1.4.1(0, c1, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 1, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c6 + c10, 8 * c3 + 2 * c5 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p15",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t2*c_t1*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p15",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t2*c_t1*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.0()"
                                },
                                "type": "user"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.4.1(1, c1, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 1, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c6 + c10, 8 * c3 + 2 * c5 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t2*c_t1*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.1.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p17",
                    "ele_size": 4,
                    "last_dim": "i_t1",
                    "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "w_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "p"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "q"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "o_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "r_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "out_trans.fifo_w.fifo_w_local.1.8.2(c0, c1, c2, c3, 0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c3 + 2 * c5 + c11, c8, c9, 8 * c0 + 2 * c7)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_pipeline",
                                                                        "type": "mark"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c10",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c11",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c12",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c0",
                                        "type": "for"
                                    },
                                    "iterator": "c1",
                                    "type": "for"
                                },
                                "iterator": "c9",
                                "type": "for"
                            },
                            "iterator": "c8",
                            "type": "for"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "(((((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cout_IO_L1_in": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cout_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,4),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel1_1.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(r_t1/r_t2)"
            ],
            "ele_type": "float",
            "num": "(r_t1/r_t2)",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(c_t1/c_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "p"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "q"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "c_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "o_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "r_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": [
                                                                                        {
                                                                                            "child": {
                                                                                                "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 4 * c1 + c8 + c12, 4 * c2 + 2 * c6 + c9 + c10, 8 * c3 + 2 * c7)"
                                                                                            },
                                                                                            "type": "user"
                                                                                        },
                                                                                        {
                                                                                            "child": {
                                                                                                "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c5 + c11, c8, c9, 8 * c3 + 2 * c7)"
                                                                                            },
                                                                                            "type": "user"
                                                                                        },
                                                                                        {
                                                                                            "child": {
                                                                                                "bounds": [
                                                                                                    "0",
                                                                                                    "i_t2"
                                                                                                ],
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "child": {
                                                                                                            "user_expr": "S_0(8 * c0 + 2 * c5 + c11, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c6 + c10, 8 * c3 + 2 * c7 + c13, c8, c9)"
                                                                                                        },
                                                                                                        "type": "user"
                                                                                                    },
                                                                                                    "content": "hls_unroll",
                                                                                                    "type": "mark"
                                                                                                },
                                                                                                "iterator": "c13",
                                                                                                "type": "for"
                                                                                            },
                                                                                            "content": "simd",
                                                                                            "type": "mark"
                                                                                        },
                                                                                        {
                                                                                            "child": {
                                                                                                "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c5 + c11, c8, c9, 8 * c3 + 2 * c7)"
                                                                                            },
                                                                                            "type": "user"
                                                                                        },
                                                                                        {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "out.fifo_cout_drain.1.1(c0, c1, c2, 1, p0, c5, c6, 3, 2, 2, c10, c11, c12, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c6 + c10, 8 * c0 + 2 * c5 + c11)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                }
                                                                                            ],
                                                                                            "type": "if"
                                                                                        }
                                                                                    ],
                                                                                    "type": "block"
                                                                                },
                                                                                "content": "hls_pipeline",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c10",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c11",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c12",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c0",
                                                    "type": "for"
                                                },
                                                "iterator": "c1",
                                                "type": "for"
                                            },
                                            "iterator": "c9",
                                            "type": "for"
                                        },
                                        "iterator": "c8",
                                        "type": "for"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "c_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "o_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "r_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.4.2(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p0 + 4 * c1 + c8 + c12, 4 * c2 + 2 * c6 + c9 + c10, 8 * c3 + 2 * c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p14",
                                                    "ele_size": 4,
                                                    "last_dim": "i_t1",
                                                    "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_serialize",
                                                "type": "mark"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.0()"
                                },
                                "type": "user"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.4.1(c0, c1, c2, 1, p0, c5, c6, 3, 2, 2, c10, c11, c12, 1, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c6 + c10, 8 * c0 + 2 * c5 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p16",
                                                "ele_size": 4,
                                                "last_dim": "o_t1",
                                                "size": "r_t2*c_t1*o_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_serialize",
                                            "type": "mark"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.1.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "child": {
                "child": {
                    "child": {
                        "data_pack_factor": "p17",
                        "ele_size": 4,
                        "last_dim": "i_t1",
                        "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t1",
                        "type": "array_tile"
                    },
                    "content": "access_serialize",
                    "type": "mark"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "w_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "p"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "q"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "o_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "r_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "out_trans.fifo_w.fifo_w_local.1.8.2(c0, c1, c2, c3, 0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c0 + 2 * c5 + c11, c8, c9, 8 * c3 + 2 * c7)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_pipeline",
                                                                        "type": "mark"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c10",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c11",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c12",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c0",
                                        "type": "for"
                                    },
                                    "iterator": "c1",
                                    "type": "for"
                                },
                                "iterator": "c9",
                                "type": "for"
                            },
                            "iterator": "c8",
                            "type": "for"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "(((((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,4),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel1_2.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(r_t1/r_t2)"
            ],
            "ele_type": "float",
            "num": "(r_t1/r_t2)",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L1_in": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_IO_L1_out": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in.fifo_cout.1.1(c0, 1, c2, c3, p0, 2 * p0 + c5 + 4, 4 * c2 + c6, 8 * c0 + c7)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(o_t1/o_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "(c_t1/c_t2)"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "(i_t1/i_t2)"
                                                    ],
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "p"
                                                        ],
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "q"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "c_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "o_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "bounds": [
                                                                                        "0",
                                                                                        "r_t2"
                                                                                    ],
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 4 * c2 + c8 + c12, 4 * c3 + 2 * c6 + c9 + c10, 8 * c1 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c5 + c11, c8, c9, 8 * c1 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "bounds": [
                                                                                                            "0",
                                                                                                            "i_t2"
                                                                                                        ],
                                                                                                        "child": {
                                                                                                            "child": {
                                                                                                                "child": {
                                                                                                                    "user_expr": "S_0(8 * c0 + 2 * c5 + c11, 2 * p0 + 4 * c2 + c12, 4 * c3 + 2 * c6 + c10, 8 * c1 + 2 * c7 + c13, c8, c9)"
                                                                                                                },
                                                                                                                "type": "user"
                                                                                                            },
                                                                                                            "content": "hls_unroll",
                                                                                                            "type": "mark"
                                                                                                        },
                                                                                                        "iterator": "c13",
                                                                                                        "type": "for"
                                                                                                    },
                                                                                                    "content": "simd",
                                                                                                    "type": "mark"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c5 + c11, c8, c9, 8 * c1 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                {
                                                                                                    "child": [
                                                                                                        {
                                                                                                            "child": {
                                                                                                                "user_expr": "out.fifo_cout_drain.1.1(c0, 1, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 2 * p0 + 4 * c2 + c12, 4 * c3 + 2 * c6 + c10, 8 * c0 + 2 * c5 + c11)"
                                                                                                            },
                                                                                                            "type": "user"
                                                                                                        }
                                                                                                    ],
                                                                                                    "type": "if"
                                                                                                }
                                                                                            ],
                                                                                            "type": "block"
                                                                                        },
                                                                                        "content": "hls_pipeline",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    "iterator": "c10",
                                                                                    "type": "for"
                                                                                },
                                                                                "content": "latency",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c11",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c12",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c0",
                                                            "type": "for"
                                                        },
                                                        "iterator": "c1",
                                                        "type": "for"
                                                    },
                                                    "iterator": "c9",
                                                    "type": "for"
                                                },
                                                "iterator": "c8",
                                                "type": "for"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out.fifo_cout.1.1(c0, 0, c2, c3, p0, 2 * p0 + c5, 4 * c2 + c6, 8 * c0 + c7)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "c_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "o_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "r_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.4.2(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p0 + 4 * c2 + c8 + c12, 4 * c3 + 2 * c6 + c9 + c10, 8 * c1 + 2 * c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p14",
                                                "ele_size": 4,
                                                "last_dim": "i_t1",
                                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "out_trans.fifo_cout.fifo_cout_local.1.4.1(c0, 1, c2, c3, p0, c5, c6, 0, 0, 0, c10, c11, c12, 0, 2 * p0 + 4 * c2 + c12, 4 * c3 + 2 * c6 + c10, 8 * c0 + 2 * c5 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_local.fifo_cout.1.4.1(c0, 0, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 1, 2 * p0 + 4 * c2 + c12, 4 * c3 + 2 * c6 + c10, 8 * c0 + 2 * c5 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p15",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t2*c_t1*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p15",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t2*c_t1*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.0()"
                                },
                                "type": "user"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.4.1(c0, 1, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 1, 2 * p0 + 4 * c2 + c12, 4 * c3 + 2 * c6 + c10, 8 * c0 + 2 * c5 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t2*c_t1*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.1.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p17",
                    "ele_size": 4,
                    "last_dim": "i_t1",
                    "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "w_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "p"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "q"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "o_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "r_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "out_trans.fifo_w.fifo_w_local.1.8.2(c0, c1, c2, c3, 0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c0 + 2 * c5 + c11, c8, c9, 8 * c1 + 2 * c7)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_pipeline",
                                                                        "type": "mark"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c10",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c11",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c12",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c0",
                                        "type": "for"
                                    },
                                    "iterator": "c1",
                                    "type": "for"
                                },
                                "iterator": "c9",
                                "type": "for"
                            },
                            "iterator": "c8",
                            "type": "for"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "(((((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cout_IO_L1_in": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cout_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,4),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel2_0.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(c_t1/c_t2)"
            ],
            "ele_type": "float",
            "num": "(c_t1/c_t2)",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L1_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_IO_L1_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in.fifo_cout.1.1(1, c1, c2, c3, p0, 4 * c1 + c5, 2 * p0 + 4 * c2 + c6, c7 + 8)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(o_t1/o_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "(r_t1/r_t2)"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "(i_t1/i_t2)"
                                                    ],
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "p"
                                                        ],
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "q"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "r_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "o_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "bounds": [
                                                                                        "0",
                                                                                        "c_t2"
                                                                                    ],
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c6 + c8 + c10, 2 * p0 + 4 * c2 + c9 + c12, 8 * c0 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 8 * c3 + 2 * c5 + c11, c8, c9, 8 * c0 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "bounds": [
                                                                                                            "0",
                                                                                                            "i_t2"
                                                                                                        ],
                                                                                                        "child": {
                                                                                                            "child": {
                                                                                                                "child": {
                                                                                                                    "user_expr": "S_0(8 * c3 + 2 * c5 + c11, 4 * c1 + 2 * c6 + c10, 2 * p0 + 4 * c2 + c12, 8 * c0 + 2 * c7 + c13, c8, c9)"
                                                                                                                },
                                                                                                                "type": "user"
                                                                                                            },
                                                                                                            "content": "hls_unroll",
                                                                                                            "type": "mark"
                                                                                                        },
                                                                                                        "iterator": "c13",
                                                                                                        "type": "for"
                                                                                                    },
                                                                                                    "content": "simd",
                                                                                                    "type": "mark"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 8 * c3 + 2 * c5 + c11, c8, c9, 8 * c0 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                {
                                                                                                    "child": [
                                                                                                        {
                                                                                                            "child": {
                                                                                                                "user_expr": "out.fifo_cout_drain.1.1(1, c1, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 4 * c1 + 2 * c6 + c10, 2 * p0 + 4 * c2 + c12, 8 * c3 + 2 * c5 + c11)"
                                                                                                            },
                                                                                                            "type": "user"
                                                                                                        }
                                                                                                    ],
                                                                                                    "type": "if"
                                                                                                }
                                                                                            ],
                                                                                            "type": "block"
                                                                                        },
                                                                                        "content": "hls_pipeline",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    "iterator": "c10",
                                                                                    "type": "for"
                                                                                },
                                                                                "content": "latency",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c11",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c12",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c0",
                                                            "type": "for"
                                                        },
                                                        "iterator": "c1",
                                                        "type": "for"
                                                    },
                                                    "iterator": "c9",
                                                    "type": "for"
                                                },
                                                "iterator": "c8",
                                                "type": "for"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out.fifo_cout.1.1(0, c1, c2, c3, p0, 4 * c1 + c5, 2 * p0 + 4 * c2 + c6, c7)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(o_t1/o_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(i_t1/i_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "o_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "c_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": {
                                                                                        "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.4.2(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c1 + 2 * c6 + c8 + c10, 2 * p0 + 4 * c2 + c9 + c12, 8 * c0 + 2 * c7)"
                                                                                    },
                                                                                    "type": "user"
                                                                                },
                                                                                "content": "hls_pipeline",
                                                                                "type": "mark"
                                                                            },
                                                                            "content": "simd",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "io_L1",
                        "type": "mark"
                    },
                    "content": "io_L2",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p14",
                                            "ele_size": 4,
                                            "last_dim": "i_t1",
                                            "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cout_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "r_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "out_trans.fifo_cout.fifo_cout_local.1.4.1(1, c1, c2, c3, p0, c5, c6, 0, 0, 0, c10, c11, c12, 0, 4 * c1 + 2 * c6 + c10, 2 * p0 + 4 * c2 + c12, 8 * c3 + 2 * c5 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "r_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_local.fifo_cout.1.4.1(0, c1, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c6 + c10, 2 * p0 + 4 * c2 + c12, 8 * c3 + 2 * c5 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p15",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t1*c_t2*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p15",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t1*c_t2*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.0()"
                                },
                                "type": "user"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "r_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.4.1(1, c1, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c6 + c10, 2 * p0 + 4 * c2 + c12, 8 * c3 + 2 * c5 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t1*c_t2*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.1.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p17",
                    "ele_size": 4,
                    "last_dim": "i_t1",
                    "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "w_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(r_t1/r_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "p"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "q"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "o_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "c_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "out_trans.fifo_w.fifo_w_local.1.8.2(c0, c1, c2, c3, 0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c3 + 2 * c5 + c11, c8, c9, 8 * c0 + 2 * c7)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_pipeline",
                                                                        "type": "mark"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c10",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c11",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c12",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c0",
                                        "type": "for"
                                    },
                                    "iterator": "c1",
                                    "type": "for"
                                },
                                "iterator": "c9",
                                "type": "for"
                            },
                            "iterator": "c8",
                            "type": "for"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cout_IO_L1_in": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cout_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,4),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel2_1.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(c_t1/c_t2)"
            ],
            "ele_type": "float",
            "num": "(c_t1/c_t2)",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "p"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "q"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "r_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "o_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "c_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": [
                                                                                        {
                                                                                            "child": {
                                                                                                "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c6 + c8 + c10, 2 * p0 + 4 * c2 + c9 + c12, 8 * c3 + 2 * c7)"
                                                                                            },
                                                                                            "type": "user"
                                                                                        },
                                                                                        {
                                                                                            "child": {
                                                                                                "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c5 + c11, c8, c9, 8 * c3 + 2 * c7)"
                                                                                            },
                                                                                            "type": "user"
                                                                                        },
                                                                                        {
                                                                                            "child": {
                                                                                                "bounds": [
                                                                                                    "0",
                                                                                                    "i_t2"
                                                                                                ],
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "child": {
                                                                                                            "user_expr": "S_0(8 * c0 + 2 * c5 + c11, 4 * c1 + 2 * c6 + c10, 2 * p0 + 4 * c2 + c12, 8 * c3 + 2 * c7 + c13, c8, c9)"
                                                                                                        },
                                                                                                        "type": "user"
                                                                                                    },
                                                                                                    "content": "hls_unroll",
                                                                                                    "type": "mark"
                                                                                                },
                                                                                                "iterator": "c13",
                                                                                                "type": "for"
                                                                                            },
                                                                                            "content": "simd",
                                                                                            "type": "mark"
                                                                                        },
                                                                                        {
                                                                                            "child": {
                                                                                                "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c5 + c11, c8, c9, 8 * c3 + 2 * c7)"
                                                                                            },
                                                                                            "type": "user"
                                                                                        },
                                                                                        {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "out.fifo_cout_drain.1.1(c0, c1, c2, 1, p0, c5, c6, 3, 2, 2, c10, c11, c12, 4 * c1 + 2 * c6 + c10, 2 * p0 + 4 * c2 + c12, 8 * c0 + 2 * c5 + c11)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                }
                                                                                            ],
                                                                                            "type": "if"
                                                                                        }
                                                                                    ],
                                                                                    "type": "block"
                                                                                },
                                                                                "content": "hls_pipeline",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c10",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c11",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c12",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c0",
                                                    "type": "for"
                                                },
                                                "iterator": "c1",
                                                "type": "for"
                                            },
                                            "iterator": "c9",
                                            "type": "for"
                                        },
                                        "iterator": "c8",
                                        "type": "for"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "o_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "c_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.4.2(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c1 + 2 * c6 + c8 + c10, 2 * p0 + 4 * c2 + c9 + c12, 8 * c3 + 2 * c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p14",
                                                    "ele_size": 4,
                                                    "last_dim": "i_t1",
                                                    "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_serialize",
                                                "type": "mark"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.0()"
                                },
                                "type": "user"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "r_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.4.1(c0, c1, c2, 1, p0, c5, c6, 3, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c6 + c10, 2 * p0 + 4 * c2 + c12, 8 * c0 + 2 * c5 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p16",
                                                "ele_size": 4,
                                                "last_dim": "o_t1",
                                                "size": "r_t1*c_t2*o_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_serialize",
                                            "type": "mark"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.1.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "child": {
                "child": {
                    "child": {
                        "data_pack_factor": "p17",
                        "ele_size": 4,
                        "last_dim": "i_t1",
                        "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t1",
                        "type": "array_tile"
                    },
                    "content": "access_serialize",
                    "type": "mark"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "w_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(r_t1/r_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "p"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "q"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "o_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "c_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "out_trans.fifo_w.fifo_w_local.1.8.2(c0, c1, c2, c3, 0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c0 + 2 * c5 + c11, c8, c9, 8 * c3 + 2 * c7)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_pipeline",
                                                                        "type": "mark"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c10",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c11",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c12",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c0",
                                        "type": "for"
                                    },
                                    "iterator": "c1",
                                    "type": "for"
                                },
                                "iterator": "c9",
                                "type": "for"
                            },
                            "iterator": "c8",
                            "type": "for"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,4),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel2_2.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(c_t1/c_t2)"
            ],
            "ele_type": "float",
            "num": "(c_t1/c_t2)",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L1_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_IO_L1_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in.fifo_cout.1.1(c0, 1, c2, c3, p0, c5 + 4, 2 * p0 + 4 * c2 + c6, 8 * c0 + c7)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(o_t1/o_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "(r_t1/r_t2)"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "(i_t1/i_t2)"
                                                    ],
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "p"
                                                        ],
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "q"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "r_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "o_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "bounds": [
                                                                                        "0",
                                                                                        "c_t2"
                                                                                    ],
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 4 * c2 + 2 * c6 + c8 + c10, 2 * p0 + 4 * c3 + c9 + c12, 8 * c1 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c5 + c11, c8, c9, 8 * c1 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "bounds": [
                                                                                                            "0",
                                                                                                            "i_t2"
                                                                                                        ],
                                                                                                        "child": {
                                                                                                            "child": {
                                                                                                                "child": {
                                                                                                                    "user_expr": "S_0(8 * c0 + 2 * c5 + c11, 4 * c2 + 2 * c6 + c10, 2 * p0 + 4 * c3 + c12, 8 * c1 + 2 * c7 + c13, c8, c9)"
                                                                                                                },
                                                                                                                "type": "user"
                                                                                                            },
                                                                                                            "content": "hls_unroll",
                                                                                                            "type": "mark"
                                                                                                        },
                                                                                                        "iterator": "c13",
                                                                                                        "type": "for"
                                                                                                    },
                                                                                                    "content": "simd",
                                                                                                    "type": "mark"
                                                                                                },
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c5 + c11, c8, c9, 8 * c1 + 2 * c7)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                {
                                                                                                    "child": [
                                                                                                        {
                                                                                                            "child": {
                                                                                                                "user_expr": "out.fifo_cout_drain.1.1(c0, 1, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 4 * c2 + 2 * c6 + c10, 2 * p0 + 4 * c3 + c12, 8 * c0 + 2 * c5 + c11)"
                                                                                                            },
                                                                                                            "type": "user"
                                                                                                        }
                                                                                                    ],
                                                                                                    "type": "if"
                                                                                                }
                                                                                            ],
                                                                                            "type": "block"
                                                                                        },
                                                                                        "content": "hls_pipeline",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    "iterator": "c10",
                                                                                    "type": "for"
                                                                                },
                                                                                "content": "latency",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c11",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c12",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c0",
                                                            "type": "for"
                                                        },
                                                        "iterator": "c1",
                                                        "type": "for"
                                                    },
                                                    "iterator": "c9",
                                                    "type": "for"
                                                },
                                                "iterator": "c8",
                                                "type": "for"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out.fifo_cout.1.1(c0, 0, c2, c3, p0, c5, 2 * p0 + 4 * c2 + c6, 8 * c0 + c7)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "o_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "c_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.4.2(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c2 + 2 * c6 + c8 + c10, 2 * p0 + 4 * c3 + c9 + c12, 8 * c1 + 2 * c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p14",
                                                "ele_size": 4,
                                                "last_dim": "i_t1",
                                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "r_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "out_trans.fifo_cout.fifo_cout_local.1.4.1(c0, 1, c2, c3, p0, c5, c6, 0, 0, 0, c10, c11, c12, 0, 4 * c2 + 2 * c6 + c10, 2 * p0 + 4 * c3 + c12, 8 * c0 + 2 * c5 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "r_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_local.fifo_cout.1.4.1(c0, 0, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 1, 4 * c2 + 2 * c6 + c10, 2 * p0 + 4 * c3 + c12, 8 * c0 + 2 * c5 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p15",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t1*c_t2*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p15",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t1*c_t2*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.0()"
                                },
                                "type": "user"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "r_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.4.1(c0, 1, c2, c3, p0, c5, c6, 3, 2, 2, c10, c11, c12, 1, 4 * c2 + 2 * c6 + c10, 2 * p0 + 4 * c3 + c12, 8 * c0 + 2 * c5 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t1*c_t2*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.1.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p17",
                    "ele_size": 4,
                    "last_dim": "i_t1",
                    "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "w_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(r_t1/r_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "p"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "q"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "o_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "c_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "out_trans.fifo_w.fifo_w_local.1.8.2(c0, c1, c2, c3, 0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c0 + 2 * c5 + c11, c8, c9, 8 * c1 + 2 * c7)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_pipeline",
                                                                        "type": "mark"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c10",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c11",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c12",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c0",
                                        "type": "for"
                                    },
                                    "iterator": "c1",
                                    "type": "for"
                                },
                                "iterator": "c9",
                                "type": "for"
                            },
                            "iterator": "c8",
                            "type": "for"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cout_IO_L1_in": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cout_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,4),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel3_0.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_1_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_1_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_1_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(i_t1/i_t2)"
            ],
            "ele_type": "float",
            "num": "(i_t1/i_t2)",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(c_t1/c_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "p"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "q"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "c_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "r_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "o_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": [
                                                                                        {
                                                                                            "child": {
                                                                                                "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c6 + c8 + c11, 4 * c2 + 2 * c7 + c9 + c10, 2 * p0 + 8 * c0)"
                                                                                            },
                                                                                            "type": "user"
                                                                                        },
                                                                                        {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "in.fifo_cout_1.1.1(c0, c1, c2, c3, p0, c5, c6, c7, 0, 0, c10, c11, c12, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 8 * c3 + 2 * c5 + c12)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                }
                                                                                            ],
                                                                                            "type": "if"
                                                                                        },
                                                                                        {
                                                                                            "child": {
                                                                                                "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 8 * c3 + 2 * c5 + c12, c8, c9, 2 * p0 + 8 * c0)"
                                                                                            },
                                                                                            "type": "user"
                                                                                        },
                                                                                        {
                                                                                            "child": {
                                                                                                "bounds": [
                                                                                                    "0",
                                                                                                    "i_t2"
                                                                                                ],
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "child": {
                                                                                                            "user_expr": "S_0(8 * c3 + 2 * c5 + c12, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 2 * p0 + 8 * c0 + c13, c8, c9)"
                                                                                                        },
                                                                                                        "type": "user"
                                                                                                    },
                                                                                                    "content": "hls_unroll",
                                                                                                    "type": "mark"
                                                                                                },
                                                                                                "iterator": "c13",
                                                                                                "type": "for"
                                                                                            },
                                                                                            "content": "simd",
                                                                                            "type": "mark"
                                                                                        },
                                                                                        {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "out.fifo_cout_drain.1.1(1, c1, c2, c3, 3, c5, c6, c7, 2, 2, c10, c11, c12, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 8 * c3 + 2 * c5 + c12)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                }
                                                                                            ],
                                                                                            "type": "if"
                                                                                        },
                                                                                        {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "out.fifo_cout_1.1.1(c0, c1, c2, c3, p0, c5, c6, c7, 2, 2, c10, c11, c12, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 8 * c3 + 2 * c5 + c12)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                }
                                                                                            ],
                                                                                            "type": "if"
                                                                                        }
                                                                                    ],
                                                                                    "type": "block"
                                                                                },
                                                                                "content": "hls_pipeline",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c10",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c11",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c12",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c0",
                                                    "type": "for"
                                                },
                                                "iterator": "c1",
                                                "type": "for"
                                            },
                                            "iterator": "c9",
                                            "type": "for"
                                        },
                                        "iterator": "c8",
                                        "type": "for"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(o_t1/o_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(c_t1/c_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "c_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "r_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "o_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": {
                                                                                        "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.2.2(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c1 + 2 * c6 + c8 + c11, 4 * c2 + 2 * c7 + c9 + c10, 2 * p0 + 8 * c0)"
                                                                                    },
                                                                                    "type": "user"
                                                                                },
                                                                                "content": "hls_pipeline",
                                                                                "type": "mark"
                                                                            },
                                                                            "content": "simd",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "io_L1",
                        "type": "mark"
                    },
                    "content": "io_L2",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p14",
                                            "ele_size": 4,
                                            "last_dim": "i_t2",
                                            "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cout_1_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.1.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_1_IO_L2_in_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p16",
                    "ele_size": 4,
                    "last_dim": "o_t1",
                    "size": "r_t1*c_t1*o_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(r_t1/r_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "c_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "o_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "user_expr": "out_trans.fifo_cout_1.fifo_cout_1_local.1.8.1(1, c1, c2, c3, 0, c5, c6, c7, 0, 0, c10, c11, c12, 0, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 8 * c3 + 2 * c5 + c12)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "content": "simd",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c10",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c11",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c12",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c9",
                                "type": "for"
                            },
                            "iterator": "c8",
                            "type": "for"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.1.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_1_IO_L2_out_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p16",
                    "ele_size": 4,
                    "last_dim": "o_t1",
                    "size": "r_t1*c_t1*o_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L2_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(r_t1/r_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "c_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "o_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "user_expr": "in_trans.fifo_cout_1_local.fifo_cout_1.1.8.1(0, c1, c2, c3, 3, c5, c6, c7, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 8 * c3 + 2 * c5 + c12)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "content": "simd",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c10",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c11",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c12",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c9",
                                "type": "for"
                            },
                            "iterator": "c8",
                            "type": "for"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.0()"
                                },
                                "type": "user"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "child": {
                "child": [
                    {
                        "child": {
                            "data_pack_factor": "p17",
                            "ele_size": 4,
                            "last_dim": "o_t1",
                            "size": "r_t1*c_t1*o_t1",
                            "type": "array_tile"
                        },
                        "content": "access_coalesce",
                        "type": "mark"
                    },
                    {
                        "child": {
                            "data_pack_factor": "p17",
                            "ele_size": 4,
                            "last_dim": "o_t1",
                            "size": "r_t1*c_t1*o_t1",
                            "type": "array_tile"
                        },
                        "content": "access_coalesce",
                        "type": "mark"
                    }
                ],
                "type": "if"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "o_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.4.1(1, c1, c2, c3, 3, c5, c6, c7, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 8 * c3 + 2 * c5 + c12)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c10",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c11",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c12",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p17",
                                        "ele_size": 4,
                                        "last_dim": "o_t1",
                                        "size": "r_t1*c_t1*o_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "w_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p18",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p18",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "w_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "c_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "r_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.2.2(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c3 + 2 * c5 + c12, c8, c9, 2 * p0 + 8 * c0)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p18",
                                                "ele_size": 4,
                                                "last_dim": "i_t2",
                                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t2)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "cout_1_IO_L2_in": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "cout_1_IO_L2_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "w_IO_L1_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t2)",
            "data_pack_factor_inter": "p18",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,4),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,16),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,4),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p18",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel3_1.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "w_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(i_t1/i_t2)"
            ],
            "ele_type": "float",
            "num": "(i_t1/i_t2)",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(c_t1/c_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "p"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "q"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "c_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "r_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "o_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": [
                                                                                        {
                                                                                            "child": {
                                                                                                "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c6 + c8 + c11, 4 * c2 + 2 * c7 + c9 + c10, 2 * p0 + 8 * c3)"
                                                                                            },
                                                                                            "type": "user"
                                                                                        },
                                                                                        {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "in.fifo_cout_1.1.1(c0, c1, c2, c3, p0, c5, c6, c7, 0, 0, c10, c11, c12, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 8 * c0 + 2 * c5 + c12)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                }
                                                                                            ],
                                                                                            "type": "if"
                                                                                        },
                                                                                        {
                                                                                            "child": {
                                                                                                "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c5 + c12, c8, c9, 2 * p0 + 8 * c3)"
                                                                                            },
                                                                                            "type": "user"
                                                                                        },
                                                                                        {
                                                                                            "child": {
                                                                                                "bounds": [
                                                                                                    "0",
                                                                                                    "i_t2"
                                                                                                ],
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "child": {
                                                                                                            "user_expr": "S_0(8 * c0 + 2 * c5 + c12, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 2 * p0 + 8 * c3 + c13, c8, c9)"
                                                                                                        },
                                                                                                        "type": "user"
                                                                                                    },
                                                                                                    "content": "hls_unroll",
                                                                                                    "type": "mark"
                                                                                                },
                                                                                                "iterator": "c13",
                                                                                                "type": "for"
                                                                                            },
                                                                                            "content": "simd",
                                                                                            "type": "mark"
                                                                                        },
                                                                                        {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "out.fifo_cout_1.1.1(c0, c1, c2, c3, p0, c5, c6, c7, 2, 2, c10, c11, c12, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 8 * c0 + 2 * c5 + c12)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                }
                                                                                            ],
                                                                                            "type": "if"
                                                                                        }
                                                                                    ],
                                                                                    "type": "block"
                                                                                },
                                                                                "content": "hls_pipeline",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c10",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c11",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c12",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c0",
                                                    "type": "for"
                                                },
                                                "iterator": "c1",
                                                "type": "for"
                                            },
                                            "iterator": "c9",
                                            "type": "for"
                                        },
                                        "iterator": "c8",
                                        "type": "for"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "c_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "r_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.2.2(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c1 + 2 * c6 + c8 + c11, 4 * c2 + 2 * c7 + c9 + c10, 2 * p0 + 8 * c3)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p14",
                                                    "ele_size": 4,
                                                    "last_dim": "i_t2",
                                                    "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_serialize",
                                                "type": "mark"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": [
                            {
                                "child": {
                                    "user_expr": "io_module.intra_inter.1.1()"
                                },
                                "type": "user"
                            },
                            {
                                "child": {
                                    "user_expr": "io_module.state_handle()"
                                },
                                "type": "user"
                            }
                        ],
                        "type": "block"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_out_inter": {
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "data_pack_factor": "p16",
                            "ele_size": 4,
                            "last_dim": "o_t1",
                            "size": "r_t1*c_t1*o_t1",
                            "type": "array_tile"
                        },
                        "content": "access_serialize",
                        "type": "mark"
                    },
                    "content": "access_coalesce",
                    "type": "mark"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "content": "array",
            "type": "mark"
        },
        "cout_1_IO_L2_out_intra": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(o_t1/o_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(c_t1/c_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "o_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "in_trans_reduce_+.fifo_cout_1_local.fifo_cout_1.1.8.1(c0, c1, c2, c3, 3, c5, c6, c7, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 8 * c0 + 2 * c5 + c12)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_pipeline",
                                                                        "type": "mark"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c10",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c11",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c12",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "io_L1",
                        "type": "mark"
                    },
                    "content": "io_L2",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c5",
            "type": "for"
        },
        "w_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "w_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "c_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "r_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.2.2(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c0 + 2 * c5 + c12, c8, c9, 2 * p0 + 8 * c3)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p17",
                                                    "ele_size": 4,
                                                    "last_dim": "i_t2",
                                                    "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_serialize",
                                                "type": "mark"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t2)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "cout_1_IO_L2_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "w_IO_L1_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t2)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,4),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,16),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,4),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel3_2.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_1_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_1_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_1_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(i_t1/i_t2)"
            ],
            "ele_type": "float",
            "num": "(i_t1/i_t2)",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(c_t1/c_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "p"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "q"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "c_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "r_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "o_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": [
                                                                                        {
                                                                                            "child": {
                                                                                                "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 4 * c2 + 2 * c6 + c8 + c11, 4 * c3 + 2 * c7 + c9 + c10, 2 * p0 + 8 * c1)"
                                                                                            },
                                                                                            "type": "user"
                                                                                        },
                                                                                        {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "in.fifo_cout_1.1.1(c0, c1, c2, c3, p0, c5, c6, c7, 0, 0, c10, c11, c12, 4 * c2 + 2 * c6 + c11, 4 * c3 + 2 * c7 + c10, 8 * c0 + 2 * c5 + c12)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                }
                                                                                            ],
                                                                                            "type": "if"
                                                                                        },
                                                                                        {
                                                                                            "child": {
                                                                                                "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c5 + c12, c8, c9, 2 * p0 + 8 * c1)"
                                                                                            },
                                                                                            "type": "user"
                                                                                        },
                                                                                        {
                                                                                            "child": {
                                                                                                "bounds": [
                                                                                                    "0",
                                                                                                    "i_t2"
                                                                                                ],
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "child": {
                                                                                                            "user_expr": "S_0(8 * c0 + 2 * c5 + c12, 4 * c2 + 2 * c6 + c11, 4 * c3 + 2 * c7 + c10, 2 * p0 + 8 * c1 + c13, c8, c9)"
                                                                                                        },
                                                                                                        "type": "user"
                                                                                                    },
                                                                                                    "content": "hls_unroll",
                                                                                                    "type": "mark"
                                                                                                },
                                                                                                "iterator": "c13",
                                                                                                "type": "for"
                                                                                            },
                                                                                            "content": "simd",
                                                                                            "type": "mark"
                                                                                        },
                                                                                        {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "out.fifo_cout_drain.1.1(c0, 1, c2, c3, 3, c5, c6, c7, 2, 2, c10, c11, c12, 4 * c2 + 2 * c6 + c11, 4 * c3 + 2 * c7 + c10, 8 * c0 + 2 * c5 + c12)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                }
                                                                                            ],
                                                                                            "type": "if"
                                                                                        },
                                                                                        {
                                                                                            "child": [
                                                                                                {
                                                                                                    "child": {
                                                                                                        "user_expr": "out.fifo_cout_1.1.1(c0, c1, c2, c3, p0, c5, c6, c7, 2, 2, c10, c11, c12, 4 * c2 + 2 * c6 + c11, 4 * c3 + 2 * c7 + c10, 8 * c0 + 2 * c5 + c12)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                }
                                                                                            ],
                                                                                            "type": "if"
                                                                                        }
                                                                                    ],
                                                                                    "type": "block"
                                                                                },
                                                                                "content": "hls_pipeline",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c10",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c11",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c12",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c0",
                                                    "type": "for"
                                                },
                                                "iterator": "c1",
                                                "type": "for"
                                            },
                                            "iterator": "c9",
                                            "type": "for"
                                        },
                                        "iterator": "c8",
                                        "type": "for"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "c_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "r_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.2.2(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c2 + 2 * c6 + c8 + c11, 4 * c3 + 2 * c7 + c9 + c10, 2 * p0 + 8 * c1)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p14",
                                                "ele_size": 4,
                                                "last_dim": "i_t2",
                                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.1.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_in_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p16",
                    "ele_size": 4,
                    "last_dim": "o_t1",
                    "size": "r_t1*c_t1*o_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(r_t1/r_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "c_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "o_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "user_expr": "out_trans.fifo_cout_1.fifo_cout_1_local.1.8.1(c0, 1, c2, c3, 0, c5, c6, c7, 0, 0, c10, c11, c12, 0, 4 * c2 + 2 * c6 + c11, 4 * c3 + 2 * c7 + c10, 8 * c0 + 2 * c5 + c12)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "content": "simd",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c10",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c11",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c12",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c9",
                                "type": "for"
                            },
                            "iterator": "c8",
                            "type": "for"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.1.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_out_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p16",
                    "ele_size": 4,
                    "last_dim": "o_t1",
                    "size": "r_t1*c_t1*o_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L2_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(r_t1/r_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "c_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "o_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "user_expr": "in_trans.fifo_cout_1_local.fifo_cout_1.1.8.1(c0, 0, c2, c3, 3, c5, c6, c7, 2, 2, c10, c11, c12, 1, 4 * c2 + 2 * c6 + c11, 4 * c3 + 2 * c7 + c10, 8 * c0 + 2 * c5 + c12)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "content": "simd",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c10",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c11",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c12",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c9",
                                "type": "for"
                            },
                            "iterator": "c8",
                            "type": "for"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.0()"
                                },
                                "type": "user"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "child": {
                "child": [
                    {
                        "child": {
                            "data_pack_factor": "p17",
                            "ele_size": 4,
                            "last_dim": "o_t1",
                            "size": "r_t1*c_t1*o_t1",
                            "type": "array_tile"
                        },
                        "content": "access_coalesce",
                        "type": "mark"
                    },
                    {
                        "child": {
                            "data_pack_factor": "p17",
                            "ele_size": 4,
                            "last_dim": "o_t1",
                            "size": "r_t1*c_t1*o_t1",
                            "type": "array_tile"
                        },
                        "content": "access_coalesce",
                        "type": "mark"
                    }
                ],
                "type": "if"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "o_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.4.1(c0, 1, c2, c3, 3, c5, c6, c7, 2, 2, c10, c11, c12, 1, 4 * c2 + 2 * c6 + c11, 4 * c3 + 2 * c7 + c10, 8 * c0 + 2 * c5 + c12)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c10",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c11",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c12",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "iterator": "c7",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p17",
                                        "ele_size": 4,
                                        "last_dim": "o_t1",
                                        "size": "r_t1*c_t1*o_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p18",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p18",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "w_IO_L1_in_intra": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(c_t1/c_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "p"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "q"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "c_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "r_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "o_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.2.2(c0, c1, c2, c3, p0, c5, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c0 + 2 * c5 + c12, c8, c9, 2 * p0 + 8 * c1)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    "content": "hls_pipeline",
                                                                                    "type": "mark"
                                                                                },
                                                                                "content": "simd",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c10",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c11",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c12",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c0",
                                                    "type": "for"
                                                },
                                                "iterator": "c1",
                                                "type": "for"
                                            },
                                            "iterator": "c9",
                                            "type": "for"
                                        },
                                        "iterator": "c8",
                                        "type": "for"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "io_L1",
                            "type": "mark"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p18",
                                        "ele_size": 4,
                                        "last_dim": "i_t2",
                                        "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c6",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t2)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "cout_1_IO_L2_in": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "cout_1_IO_L2_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "w_IO_L1_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t2)",
            "data_pack_factor_inter": "p18",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,4),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,16),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,4),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p18",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel4_0.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(o_t1/o_t2)",
                "(r_t1/r_t2)"
            ],
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(r_t1/r_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L2_in": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L1_in": {
            "dims": [
                "(r_t1/r_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "cout_IO_L1_out": {
            "dims": [
                "(r_t1/r_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "cout_IO_L2_in": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_IO_L2_out": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(r_t1/r_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in.fifo_cout.1.1(1, c1, c2, c3, p0, p1, 2 * p1 + 4 * c1 + c6, 4 * c2 + c7, 2 * p0 + c8 + 8)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(c_t1/c_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "(i_t1/i_t2)"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "p"
                                                    ],
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "q"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "c_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "r_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "bounds": [
                                                                                    "0",
                                                                                    "o_t2"
                                                                                ],
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p1 + 4 * c1 + c8 + c11, 4 * c2 + 2 * c6 + c9 + c10, 8 * c0 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c3 + c12, c8, c9, 8 * c0 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "bounds": [
                                                                                                        "0",
                                                                                                        "i_t2"
                                                                                                    ],
                                                                                                    "child": {
                                                                                                        "child": {
                                                                                                            "child": {
                                                                                                                "user_expr": "S_0(2 * p0 + 8 * c3 + c12, 2 * p1 + 4 * c1 + c11, 4 * c2 + 2 * c6 + c10, 8 * c0 + 2 * c7 + c13, c8, c9)"
                                                                                                            },
                                                                                                            "type": "user"
                                                                                                        },
                                                                                                        "content": "hls_unroll",
                                                                                                        "type": "mark"
                                                                                                    },
                                                                                                    "iterator": "c13",
                                                                                                    "type": "for"
                                                                                                },
                                                                                                "content": "simd",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c3 + c12, c8, c9, 8 * c0 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": [
                                                                                                    {
                                                                                                        "child": {
                                                                                                            "user_expr": "out.fifo_cout_drain.1.1(1, c1, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 2 * p1 + 4 * c1 + c11, 4 * c2 + 2 * c6 + c10, 2 * p0 + 8 * c3 + c12)"
                                                                                                        },
                                                                                                        "type": "user"
                                                                                                    }
                                                                                                ],
                                                                                                "type": "if"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p1 + 4 * c1 + c8 + c11, 4 * c2 + 2 * c6 + c9 + c10, 8 * c0 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "block"
                                                                                    },
                                                                                    "content": "hls_pipeline",
                                                                                    "type": "mark"
                                                                                },
                                                                                "iterator": "c10",
                                                                                "type": "for"
                                                                            },
                                                                            "content": "latency",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c11",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c12",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c0",
                                                        "type": "for"
                                                    },
                                                    "iterator": "c1",
                                                    "type": "for"
                                                },
                                                "iterator": "c9",
                                                "type": "for"
                                            },
                                            "iterator": "c8",
                                            "type": "for"
                                        },
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out.fifo_cout.1.1(0, c1, c2, c3, p0, p1, 2 * p1 + 4 * c1 + c6, 4 * c2 + c7, 2 * p0 + c8)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "cin_IO_L2_in_intra": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(i_t1/i_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "c_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "r_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "o_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": {
                                                                                        "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.8.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p0 + 4 * c1 + c8 + c11, 4 * c2 + 2 * c6 + c9 + c10, 8 * c0 + 2 * c7)"
                                                                                    },
                                                                                    "type": "user"
                                                                                },
                                                                                "content": "hls_pipeline",
                                                                                "type": "mark"
                                                                            },
                                                                            "content": "simd",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "io_L1",
                            "type": "mark"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "io_L3",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p14",
                                            "ele_size": 4,
                                            "last_dim": "i_t1",
                                            "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cout_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t2*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t2*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(c_t1/c_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "c_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "r_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out_trans.fifo_cout.fifo_cout_local.1.2.1(1, c1, c2, c3, p0, p1, c6, 0, 0, 0, c10, c11, c12, 0, 2 * p0 + 4 * c1 + c11, 4 * c2 + 2 * c6 + c10, 2 * p1 + 8 * c3 + c12)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.intra_inter.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t2*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t2*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(c_t1/c_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "c_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "r_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_cout_local.fifo_cout.1.2.1(0, c1, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 1, 2 * p0 + 4 * c1 + c11, 4 * c2 + 2 * c6 + c10, 2 * p1 + 8 * c3 + c12)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t2*c_t1*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t2*c_t1*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t2*c_t1*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t2*c_t1*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p15",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t2",
                                                    "size": "r_t2*c_t1*o_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p15",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t2",
                                                    "size": "r_t2*c_t1*o_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.0()"
                                    },
                                    "type": "user"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t2*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t2*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(c_t1/c_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "c_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "r_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.2.1(1, c1, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 1, 2 * p0 + 4 * c1 + c11, 4 * c2 + 2 * c6 + c10, 2 * p1 + 8 * c3 + c12)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t2*c_t1*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t2*c_t1*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p16",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t2",
                                                    "size": "r_t2*c_t1*o_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "w_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "c_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "r_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.8.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p0 + 8 * c3 + c12, c8, c9, 8 * c0 + 2 * c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p17",
                                                "ele_size": 4,
                                                "last_dim": "i_t1",
                                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(r_t1/r_t2))"
        },
        "cin_IO_L2_in": {
            "array": "cin",
            "buf_size": "(((((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cout_IO_L1_in": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t2)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(o_t1/o_t2))"
        },
        "cout_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t2)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(o_t1/o_t2))"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t2)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(o_t1/o_t2))"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t2*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel4_1.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(o_t1/o_t2)",
                "(r_t1/r_t2)"
            ],
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(r_t1/r_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L2_in": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(r_t1/r_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(i_t1/i_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "c_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "r_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "o_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": [
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p1 + 4 * c1 + c8 + c11, 4 * c2 + 2 * c6 + c9 + c10, 8 * c3 + 2 * c7)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c0 + c12, c8, c9, 8 * c3 + 2 * c7)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "bounds": [
                                                                                                "0",
                                                                                                "i_t2"
                                                                                            ],
                                                                                            "child": {
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "user_expr": "S_0(2 * p0 + 8 * c0 + c12, 2 * p1 + 4 * c1 + c11, 4 * c2 + 2 * c6 + c10, 8 * c3 + 2 * c7 + c13, c8, c9)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                "content": "hls_unroll",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            "iterator": "c13",
                                                                                            "type": "for"
                                                                                        },
                                                                                        "content": "simd",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c0 + c12, c8, c9, 8 * c3 + 2 * c7)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_drain.1.1(c0, c1, c2, 1, p0, p1, c6, 3, 2, 2, c10, c11, c12, 2 * p1 + 4 * c1 + c11, 4 * c2 + 2 * c6 + c10, 2 * p0 + 8 * c0 + c12)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "out.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p1 + 4 * c1 + c8 + c11, 4 * c2 + 2 * c6 + c9 + c10, 8 * c3 + 2 * c7)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    }
                                                                                ],
                                                                                "type": "block"
                                                                            },
                                                                            "content": "hls_pipeline",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "cin_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "c_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "r_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.8.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p0 + 4 * c1 + c8 + c11, 4 * c2 + 2 * c6 + c9 + c10, 8 * c3 + 2 * c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p14",
                                                    "ele_size": 4,
                                                    "last_dim": "i_t1",
                                                    "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_serialize",
                                                "type": "mark"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.0()"
                                    },
                                    "type": "user"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t2*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t2*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(c_t1/c_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "c_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "r_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.2.1(c0, c1, c2, 1, p0, p1, c6, 3, 2, 2, c10, c11, c12, 1, 2 * p0 + 4 * c1 + c11, 4 * c2 + 2 * c6 + c10, 2 * p1 + 8 * c0 + c12)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t2*c_t1*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t2*c_t1*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p16",
                                                        "ele_size": 4,
                                                        "last_dim": "o_t2",
                                                        "size": "r_t2*c_t1*o_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_serialize",
                                                    "type": "mark"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "w_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "c_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "r_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.8.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p0 + 8 * c0 + c12, c8, c9, 8 * c3 + 2 * c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p17",
                                                    "ele_size": 4,
                                                    "last_dim": "i_t1",
                                                    "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_serialize",
                                                "type": "mark"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(r_t1/r_t2))"
        },
        "cin_IO_L2_in": {
            "array": "cin",
            "buf_size": "(((((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t2)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(o_t1/o_t2))"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t2*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel4_2.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(o_t1/o_t2)",
                "(r_t1/r_t2)"
            ],
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(r_t1/r_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L2_in": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L1_in": {
            "dims": [
                "(r_t1/r_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "cout_IO_L1_out": {
            "dims": [
                "(r_t1/r_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "cout_IO_L2_in": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_IO_L2_out": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(r_t1/r_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in.fifo_cout.1.1(c0, 1, c2, c3, p0, p1, 2 * p1 + c6 + 4, 4 * c2 + c7, 2 * p0 + 8 * c0 + c8)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(c_t1/c_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "(i_t1/i_t2)"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "p"
                                                    ],
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "q"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "c_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "r_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "bounds": [
                                                                                    "0",
                                                                                    "o_t2"
                                                                                ],
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p1 + 4 * c2 + c8 + c11, 4 * c3 + 2 * c6 + c9 + c10, 8 * c1 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c0 + c12, c8, c9, 8 * c1 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "bounds": [
                                                                                                        "0",
                                                                                                        "i_t2"
                                                                                                    ],
                                                                                                    "child": {
                                                                                                        "child": {
                                                                                                            "child": {
                                                                                                                "user_expr": "S_0(2 * p0 + 8 * c0 + c12, 2 * p1 + 4 * c2 + c11, 4 * c3 + 2 * c6 + c10, 8 * c1 + 2 * c7 + c13, c8, c9)"
                                                                                                            },
                                                                                                            "type": "user"
                                                                                                        },
                                                                                                        "content": "hls_unroll",
                                                                                                        "type": "mark"
                                                                                                    },
                                                                                                    "iterator": "c13",
                                                                                                    "type": "for"
                                                                                                },
                                                                                                "content": "simd",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c0 + c12, c8, c9, 8 * c1 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": [
                                                                                                    {
                                                                                                        "child": {
                                                                                                            "user_expr": "out.fifo_cout_drain.1.1(c0, 1, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 2 * p1 + 4 * c2 + c11, 4 * c3 + 2 * c6 + c10, 2 * p0 + 8 * c0 + c12)"
                                                                                                        },
                                                                                                        "type": "user"
                                                                                                    }
                                                                                                ],
                                                                                                "type": "if"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p1 + 4 * c2 + c8 + c11, 4 * c3 + 2 * c6 + c9 + c10, 8 * c1 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "block"
                                                                                    },
                                                                                    "content": "hls_pipeline",
                                                                                    "type": "mark"
                                                                                },
                                                                                "iterator": "c10",
                                                                                "type": "for"
                                                                            },
                                                                            "content": "latency",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c11",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c12",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c0",
                                                        "type": "for"
                                                    },
                                                    "iterator": "c1",
                                                    "type": "for"
                                                },
                                                "iterator": "c9",
                                                "type": "for"
                                            },
                                            "iterator": "c8",
                                            "type": "for"
                                        },
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out.fifo_cout.1.1(c0, 0, c2, c3, p0, p1, 2 * p1 + c6, 4 * c2 + c7, 2 * p0 + 8 * c0 + c8)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "cin_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "c_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "r_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.8.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p0 + 4 * c2 + c8 + c11, 4 * c3 + 2 * c6 + c9 + c10, 8 * c1 + 2 * c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p14",
                                                "ele_size": 4,
                                                "last_dim": "i_t1",
                                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t2*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t2*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(c_t1/c_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "c_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "r_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out_trans.fifo_cout.fifo_cout_local.1.2.1(c0, 1, c2, c3, p0, p1, c6, 0, 0, 0, c10, c11, c12, 0, 2 * p0 + 4 * c2 + c11, 4 * c3 + 2 * c6 + c10, 2 * p1 + 8 * c0 + c12)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.intra_inter.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t2*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t2*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(c_t1/c_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "c_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "r_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_cout_local.fifo_cout.1.2.1(c0, 0, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 1, 2 * p0 + 4 * c2 + c11, 4 * c3 + 2 * c6 + c10, 2 * p1 + 8 * c0 + c12)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t2*c_t1*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t2*c_t1*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t2*c_t1*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t2*c_t1*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p15",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t2",
                                                    "size": "r_t2*c_t1*o_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p15",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t2",
                                                    "size": "r_t2*c_t1*o_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.0()"
                                    },
                                    "type": "user"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t2*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t2*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(c_t1/c_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "c_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "r_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.2.1(c0, 1, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 1, 2 * p0 + 4 * c2 + c11, 4 * c3 + 2 * c6 + c10, 2 * p1 + 8 * c0 + c12)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t2*c_t1*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t2*c_t1*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p16",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t2",
                                                    "size": "r_t2*c_t1*o_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "w_IO_L2_in_intra": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(c_t1/c_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "p"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "q"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "c_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "r_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "o_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.8.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p0 + 8 * c0 + c12, c8, c9, 8 * c1 + 2 * c7)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    "content": "hls_pipeline",
                                                                                    "type": "mark"
                                                                                },
                                                                                "content": "simd",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c10",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c11",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c12",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c0",
                                                    "type": "for"
                                                },
                                                "iterator": "c1",
                                                "type": "for"
                                            },
                                            "iterator": "c9",
                                            "type": "for"
                                        },
                                        "iterator": "c8",
                                        "type": "for"
                                    },
                                    "content": "pe",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(o_t1/o_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p17",
                                        "ele_size": 4,
                                        "last_dim": "i_t1",
                                        "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c6",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(r_t1/r_t2))"
        },
        "cin_IO_L2_in": {
            "array": "cin",
            "buf_size": "(((((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cout_IO_L1_in": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t2)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(o_t1/o_t2))"
        },
        "cout_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t2)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(o_t1/o_t2))"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t2)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(o_t1/o_t2))"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t2*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel5_0.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(o_t1/o_t2)",
                "(c_t1/c_t2)"
            ],
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(c_t1/c_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L2_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L1_in": {
            "dims": [
                "(c_t1/c_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "cout_IO_L1_out": {
            "dims": [
                "(c_t1/c_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "cout_IO_L2_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_IO_L2_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(c_t1/c_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in.fifo_cout.1.1(1, c1, c2, c3, p0, p1, 4 * c1 + c6, 2 * p1 + 4 * c2 + c7, 2 * p0 + c8 + 8)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(r_t1/r_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "(i_t1/i_t2)"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "p"
                                                    ],
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "q"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "r_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "c_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "bounds": [
                                                                                    "0",
                                                                                    "o_t2"
                                                                                ],
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c6 + c8 + c10, 2 * p1 + 4 * c2 + c9 + c11, 8 * c0 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c3 + c12, c8, c9, 8 * c0 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "bounds": [
                                                                                                        "0",
                                                                                                        "i_t2"
                                                                                                    ],
                                                                                                    "child": {
                                                                                                        "child": {
                                                                                                            "child": {
                                                                                                                "user_expr": "S_0(2 * p0 + 8 * c3 + c12, 4 * c1 + 2 * c6 + c10, 2 * p1 + 4 * c2 + c11, 8 * c0 + 2 * c7 + c13, c8, c9)"
                                                                                                            },
                                                                                                            "type": "user"
                                                                                                        },
                                                                                                        "content": "hls_unroll",
                                                                                                        "type": "mark"
                                                                                                    },
                                                                                                    "iterator": "c13",
                                                                                                    "type": "for"
                                                                                                },
                                                                                                "content": "simd",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c3 + c12, c8, c9, 8 * c0 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": [
                                                                                                    {
                                                                                                        "child": {
                                                                                                            "user_expr": "out.fifo_cout_drain.1.1(1, c1, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 4 * c1 + 2 * c6 + c10, 2 * p1 + 4 * c2 + c11, 2 * p0 + 8 * c3 + c12)"
                                                                                                        },
                                                                                                        "type": "user"
                                                                                                    }
                                                                                                ],
                                                                                                "type": "if"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c6 + c8 + c10, 2 * p1 + 4 * c2 + c9 + c11, 8 * c0 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "block"
                                                                                    },
                                                                                    "content": "hls_pipeline",
                                                                                    "type": "mark"
                                                                                },
                                                                                "iterator": "c10",
                                                                                "type": "for"
                                                                            },
                                                                            "content": "latency",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c11",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c12",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c0",
                                                        "type": "for"
                                                    },
                                                    "iterator": "c1",
                                                    "type": "for"
                                                },
                                                "iterator": "c9",
                                                "type": "for"
                                            },
                                            "iterator": "c8",
                                            "type": "for"
                                        },
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out.fifo_cout.1.1(0, c1, c2, c3, p0, p1, 4 * c1 + c6, 2 * p1 + 4 * c2 + c7, 2 * p0 + c8)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "cin_IO_L2_in_intra": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(i_t1/i_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "c_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "o_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": {
                                                                                        "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.8.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c1 + 2 * c6 + c8 + c10, 2 * p0 + 4 * c2 + c9 + c11, 8 * c0 + 2 * c7)"
                                                                                    },
                                                                                    "type": "user"
                                                                                },
                                                                                "content": "hls_pipeline",
                                                                                "type": "mark"
                                                                            },
                                                                            "content": "simd",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "io_L1",
                            "type": "mark"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "io_L3",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p14",
                                            "ele_size": 4,
                                            "last_dim": "i_t1",
                                            "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cout_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t2*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t2*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "r_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out_trans.fifo_cout.fifo_cout_local.1.2.1(1, c1, c2, c3, p0, p1, c6, 0, 0, 0, c10, c11, c12, 0, 4 * c1 + 2 * c6 + c10, 2 * p0 + 4 * c2 + c11, 2 * p1 + 8 * c3 + c12)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.intra_inter.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t2*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t2*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "r_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_cout_local.fifo_cout.1.2.1(0, c1, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c6 + c10, 2 * p0 + 4 * c2 + c11, 2 * p1 + 8 * c3 + c12)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t1*c_t2*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t1*c_t2*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t1*c_t2*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t1*c_t2*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p15",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t2",
                                                    "size": "r_t1*c_t2*o_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p15",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t2",
                                                    "size": "r_t1*c_t2*o_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.0()"
                                    },
                                    "type": "user"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t2*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t2*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "r_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.2.1(1, c1, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c6 + c10, 2 * p0 + 4 * c2 + c11, 2 * p1 + 8 * c3 + c12)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t1*c_t2*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t1*c_t2*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p16",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t2",
                                                    "size": "r_t1*c_t2*o_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "w_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "c_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.8.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p0 + 8 * c3 + c12, c8, c9, 8 * c0 + 2 * c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p17",
                                                "ele_size": 4,
                                                "last_dim": "i_t1",
                                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(c_t1/c_t2))"
        },
        "cin_IO_L2_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cout_IO_L1_in": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t2)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(o_t1/o_t2))"
        },
        "cout_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t2)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(o_t1/o_t2))"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t2)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(o_t1/o_t2))"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t2*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel5_1.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(o_t1/o_t2)",
                "(c_t1/c_t2)"
            ],
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(c_t1/c_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L2_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(c_t1/c_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(i_t1/i_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "c_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "o_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": [
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c6 + c8 + c10, 2 * p1 + 4 * c2 + c9 + c11, 8 * c3 + 2 * c7)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c0 + c12, c8, c9, 8 * c3 + 2 * c7)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "bounds": [
                                                                                                "0",
                                                                                                "i_t2"
                                                                                            ],
                                                                                            "child": {
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "user_expr": "S_0(2 * p0 + 8 * c0 + c12, 4 * c1 + 2 * c6 + c10, 2 * p1 + 4 * c2 + c11, 8 * c3 + 2 * c7 + c13, c8, c9)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                "content": "hls_unroll",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            "iterator": "c13",
                                                                                            "type": "for"
                                                                                        },
                                                                                        "content": "simd",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c0 + c12, c8, c9, 8 * c3 + 2 * c7)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_drain.1.1(c0, c1, c2, 1, p0, p1, c6, 3, 2, 2, c10, c11, c12, 4 * c1 + 2 * c6 + c10, 2 * p1 + 4 * c2 + c11, 2 * p0 + 8 * c0 + c12)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "out.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c6 + c8 + c10, 2 * p1 + 4 * c2 + c9 + c11, 8 * c3 + 2 * c7)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    }
                                                                                ],
                                                                                "type": "block"
                                                                            },
                                                                            "content": "hls_pipeline",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "cin_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "c_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.8.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c1 + 2 * c6 + c8 + c10, 2 * p0 + 4 * c2 + c9 + c11, 8 * c3 + 2 * c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p14",
                                                    "ele_size": 4,
                                                    "last_dim": "i_t1",
                                                    "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_serialize",
                                                "type": "mark"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.0()"
                                    },
                                    "type": "user"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t2*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t2*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "r_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.2.1(c0, c1, c2, 1, p0, p1, c6, 3, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c6 + c10, 2 * p0 + 4 * c2 + c11, 2 * p1 + 8 * c0 + c12)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t1*c_t2*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t1*c_t2*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p16",
                                                        "ele_size": 4,
                                                        "last_dim": "o_t2",
                                                        "size": "r_t1*c_t2*o_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_serialize",
                                                    "type": "mark"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "w_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "c_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.8.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p0 + 8 * c0 + c12, c8, c9, 8 * c3 + 2 * c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p17",
                                                    "ele_size": 4,
                                                    "last_dim": "i_t1",
                                                    "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_serialize",
                                                "type": "mark"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(c_t1/c_t2))"
        },
        "cin_IO_L2_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t2)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(o_t1/o_t2))"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t2*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel5_2.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(o_t1/o_t2)",
                "(c_t1/c_t2)"
            ],
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(c_t1/c_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L2_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L1_in": {
            "dims": [
                "(c_t1/c_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "cout_IO_L1_out": {
            "dims": [
                "(c_t1/c_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "cout_IO_L2_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_IO_L2_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(c_t1/c_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in.fifo_cout.1.1(c0, 1, c2, c3, p0, p1, c6 + 4, 2 * p1 + 4 * c2 + c7, 2 * p0 + 8 * c0 + c8)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(r_t1/r_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "(i_t1/i_t2)"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "p"
                                                    ],
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "q"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "r_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "c_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "bounds": [
                                                                                    "0",
                                                                                    "o_t2"
                                                                                ],
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 4 * c2 + 2 * c6 + c8 + c10, 2 * p1 + 4 * c3 + c9 + c11, 8 * c1 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c0 + c12, c8, c9, 8 * c1 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "bounds": [
                                                                                                        "0",
                                                                                                        "i_t2"
                                                                                                    ],
                                                                                                    "child": {
                                                                                                        "child": {
                                                                                                            "child": {
                                                                                                                "user_expr": "S_0(2 * p0 + 8 * c0 + c12, 4 * c2 + 2 * c6 + c10, 2 * p1 + 4 * c3 + c11, 8 * c1 + 2 * c7 + c13, c8, c9)"
                                                                                                            },
                                                                                                            "type": "user"
                                                                                                        },
                                                                                                        "content": "hls_unroll",
                                                                                                        "type": "mark"
                                                                                                    },
                                                                                                    "iterator": "c13",
                                                                                                    "type": "for"
                                                                                                },
                                                                                                "content": "simd",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c0 + c12, c8, c9, 8 * c1 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": [
                                                                                                    {
                                                                                                        "child": {
                                                                                                            "user_expr": "out.fifo_cout_drain.1.1(c0, 1, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 4 * c2 + 2 * c6 + c10, 2 * p1 + 4 * c3 + c11, 2 * p0 + 8 * c0 + c12)"
                                                                                                        },
                                                                                                        "type": "user"
                                                                                                    }
                                                                                                ],
                                                                                                "type": "if"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 4 * c2 + 2 * c6 + c8 + c10, 2 * p1 + 4 * c3 + c9 + c11, 8 * c1 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "block"
                                                                                    },
                                                                                    "content": "hls_pipeline",
                                                                                    "type": "mark"
                                                                                },
                                                                                "iterator": "c10",
                                                                                "type": "for"
                                                                            },
                                                                            "content": "latency",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c11",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c12",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c0",
                                                        "type": "for"
                                                    },
                                                    "iterator": "c1",
                                                    "type": "for"
                                                },
                                                "iterator": "c9",
                                                "type": "for"
                                            },
                                            "iterator": "c8",
                                            "type": "for"
                                        },
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out.fifo_cout.1.1(c0, 0, c2, c3, p0, p1, c6, 2 * p1 + 4 * c2 + c7, 2 * p0 + 8 * c0 + c8)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "cin_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "c_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.8.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c2 + 2 * c6 + c8 + c10, 2 * p0 + 4 * c3 + c9 + c11, 8 * c1 + 2 * c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p14",
                                                "ele_size": 4,
                                                "last_dim": "i_t1",
                                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t2*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t2*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "r_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out_trans.fifo_cout.fifo_cout_local.1.2.1(c0, 1, c2, c3, p0, p1, c6, 0, 0, 0, c10, c11, c12, 0, 4 * c2 + 2 * c6 + c10, 2 * p0 + 4 * c3 + c11, 2 * p1 + 8 * c0 + c12)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.intra_inter.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t2*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t2*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "r_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_cout_local.fifo_cout.1.2.1(c0, 0, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 1, 4 * c2 + 2 * c6 + c10, 2 * p0 + 4 * c3 + c11, 2 * p1 + 8 * c0 + c12)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t1*c_t2*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t1*c_t2*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t1*c_t2*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t1*c_t2*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p15",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t2",
                                                    "size": "r_t1*c_t2*o_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p15",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t2",
                                                    "size": "r_t1*c_t2*o_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.0()"
                                    },
                                    "type": "user"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t2*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t2*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "r_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.2.1(c0, 1, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 1, 4 * c2 + 2 * c6 + c10, 2 * p0 + 4 * c3 + c11, 2 * p1 + 8 * c0 + c12)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t1*c_t2*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(o_t1/o_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t2",
                                                            "size": "r_t1*c_t2*o_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p16",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t2",
                                                    "size": "r_t1*c_t2*o_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "w_IO_L2_in_intra": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "p"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "q"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "r_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "c_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "o_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.8.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p0 + 8 * c0 + c12, c8, c9, 8 * c1 + 2 * c7)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    "content": "hls_pipeline",
                                                                                    "type": "mark"
                                                                                },
                                                                                "content": "simd",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c10",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c11",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c12",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c0",
                                                    "type": "for"
                                                },
                                                "iterator": "c1",
                                                "type": "for"
                                            },
                                            "iterator": "c9",
                                            "type": "for"
                                        },
                                        "iterator": "c8",
                                        "type": "for"
                                    },
                                    "content": "pe",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(o_t1/o_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p17",
                                        "ele_size": 4,
                                        "last_dim": "i_t1",
                                        "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c6",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(c_t1/c_t2))"
        },
        "cin_IO_L2_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cout_IO_L1_in": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t2)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(o_t1/o_t2))"
        },
        "cout_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t2)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(o_t1/o_t2))"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t2)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(o_t1/o_t2))"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t2*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel6_0.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(o_t1/o_t2)",
                "(i_t1/i_t2)"
            ],
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(i_t1/i_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L2_in": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "cout_1_IO_L2_out": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "cout_1_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(i_t1/i_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cout_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(c_t1/c_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "c_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "r_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "o_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": [
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c6 + c8 + c11, 4 * c2 + 2 * c7 + c9 + c10, 2 * p1 + 8 * c0)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 0, 0, c10, c11, c12, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 2 * p0 + 8 * c3 + c12)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c3 + c12, c8, c9, 2 * p1 + 8 * c0)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "bounds": [
                                                                                                "0",
                                                                                                "i_t2"
                                                                                            ],
                                                                                            "child": {
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "user_expr": "S_0(2 * p0 + 8 * c3 + c12, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 2 * p1 + 8 * c0 + c13, c8, c9)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                "content": "hls_unroll",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            "iterator": "c13",
                                                                                            "type": "for"
                                                                                        },
                                                                                        "content": "simd",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_drain.1.1(1, c1, c2, c3, p0, 3, c6, c7, 2, 2, c10, c11, c12, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 2 * p0 + 8 * c3 + c12)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 2, 2, c10, c11, c12, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 2 * p0 + 8 * c3 + c12)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "out.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c6 + c8 + c11, 4 * c2 + 2 * c7 + c9 + c10, 2 * p1 + 8 * c0)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    }
                                                                                ],
                                                                                "type": "block"
                                                                            },
                                                                            "content": "hls_pipeline",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "cin_IO_L2_in_intra": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(c_t1/c_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "c_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "r_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "o_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": {
                                                                                        "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.2.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c1 + 2 * c6 + c8 + c11, 4 * c2 + 2 * c7 + c9 + c10, 2 * p0 + 8 * c0)"
                                                                                    },
                                                                                    "type": "user"
                                                                                },
                                                                                "content": "hls_pipeline",
                                                                                "type": "mark"
                                                                            },
                                                                            "content": "simd",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "io_L1",
                            "type": "mark"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "io_L3",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p14",
                                            "ele_size": 4,
                                            "last_dim": "i_t2",
                                            "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cout_1_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_1_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_1_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "o_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "out_trans.fifo_cout_1.fifo_cout_1_local.1.2.1(1, c1, c2, c3, p0, 0, c6, c7, 0, 0, c10, c11, c12, 0, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 2 * p0 + 8 * c3 + c12)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c10",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c11",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c12",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_1_IO_L2_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_1_IO_L2_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "o_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "in_trans.fifo_cout_1_local.fifo_cout_1.1.2.1(0, c1, c2, c3, p0, 3, c6, c7, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 2 * p0 + 8 * c3 + c12)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c10",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c11",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c12",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(o_t1/o_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t2",
                                            "size": "r_t1*c_t1*o_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_1_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(o_t1/o_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t2",
                                            "size": "r_t1*c_t1*o_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "child": [
                {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "ceil((o/o_t1))"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "user_expr": "io_module.intra_inter.0.0()"
                                            },
                                            "type": "user"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "content": "io_L3",
                                    "type": "mark"
                                },
                                "content": "array",
                                "type": "mark"
                            },
                            "iterator": "c2",
                            "type": "for"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                }
            ],
            "type": "if"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "r_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "o_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.2.1(1, c1, c2, c3, 3, p1, c6, c7, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 2 * p1 + 8 * c3 + c12)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "bounds": [
                                                "0",
                                                "(o_t1/o_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p17",
                                                        "ele_size": 4,
                                                        "last_dim": "o_t2",
                                                        "size": "r_t1*c_t1*o_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(o_t1/o_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p17",
                                                        "ele_size": 4,
                                                        "last_dim": "o_t2",
                                                        "size": "r_t1*c_t1*o_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        }
                                    ],
                                    "type": "if"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p17",
                                                "ele_size": 4,
                                                "last_dim": "o_t2",
                                                "size": "r_t1*c_t1*o_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "child": {
                                                    "user_expr": "io_module.inter_intra.0.1()"
                                                },
                                                "type": "user"
                                            },
                                            {
                                                "child": {
                                                    "user_expr": "io_module.state_handle()"
                                                },
                                                "type": "user"
                                            }
                                        ],
                                        "type": "block"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "w_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p18",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p18",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "w_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "p"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "q"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "c_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "o_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "user_expr": "out_trans.fifo_w.fifo_w_local.1.2.2(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p1 + 8 * c3 + c12, c8, c9, 2 * p0 + 8 * c0)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "content": "simd",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c10",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c11",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c12",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c0",
                                "type": "for"
                            },
                            "iterator": "c1",
                            "type": "for"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": [
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(o_t1/o_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p18",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(o_t1/o_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p18",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(o_t1/o_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p18",
                                                        "ele_size": 4,
                                                        "last_dim": "i_t2",
                                                        "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(i_t1/i_t2))"
        },
        "cin_IO_L2_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t2)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "cout_1_IO_L2_in": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "cout_1_IO_L2_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "w_IO_L1_in": {
            "array": "w",
            "buf_size": "(((o_t2*((p-1)+1))*((q-1)+1))*i_t2)",
            "data_pack_factor_inter": "p18",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(o_t1/o_t2))"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,16),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,16),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,4),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p18",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel6_1.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(o_t1/o_t2)",
                "(i_t1/i_t2)"
            ],
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(i_t1/i_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L2_out": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "cout_1_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(c_t1/c_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "c_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "r_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "o_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": [
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c6 + c8 + c11, 4 * c2 + 2 * c7 + c9 + c10, 2 * p1 + 8 * c3)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 0, 0, c10, c11, c12, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 2 * p0 + 8 * c0 + c12)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c0 + c12, c8, c9, 2 * p1 + 8 * c3)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "bounds": [
                                                                                                "0",
                                                                                                "i_t2"
                                                                                            ],
                                                                                            "child": {
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "user_expr": "S_0(2 * p0 + 8 * c0 + c12, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 2 * p1 + 8 * c3 + c13, c8, c9)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                "content": "hls_unroll",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            "iterator": "c13",
                                                                                            "type": "for"
                                                                                        },
                                                                                        "content": "simd",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 2, 2, c10, c11, c12, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 2 * p0 + 8 * c0 + c12)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "out.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c6 + c8 + c11, 4 * c2 + 2 * c7 + c9 + c10, 2 * p1 + 8 * c3)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    }
                                                                                ],
                                                                                "type": "block"
                                                                            },
                                                                            "content": "hls_pipeline",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "cin_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "c_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "r_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.2.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c1 + 2 * c6 + c8 + c11, 4 * c2 + 2 * c7 + c9 + c10, 2 * p0 + 8 * c3)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p14",
                                                    "ele_size": 4,
                                                    "last_dim": "i_t2",
                                                    "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_serialize",
                                                "type": "mark"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": [
                            {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.1()"
                                },
                                "type": "user"
                            },
                            {
                                "child": {
                                    "user_expr": "io_module.state_handle()"
                                },
                                "type": "user"
                            }
                        ],
                        "type": "block"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_out_inter": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "data_pack_factor": "p16",
                                        "ele_size": 4,
                                        "last_dim": "o_t2",
                                        "size": "r_t1*c_t1*o_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                {
                                    "child": {
                                        "data_pack_factor": "p16",
                                        "ele_size": 4,
                                        "last_dim": "o_t2",
                                        "size": "r_t1*c_t1*o_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                }
                            ],
                            "type": "if"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "iterator": "c6",
                    "type": "for"
                },
                "content": "io_L3",
                "type": "mark"
            },
            "content": "array",
            "type": "mark"
        },
        "cout_1_IO_L2_out_intra": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(c_t1/c_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "o_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "in_trans_reduce_+.fifo_cout_1_local.fifo_cout_1.1.2.1(c0, c1, c2, c3, p0, 3, c6, c7, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 2 * p0 + 8 * c0 + c12)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_pipeline",
                                                                        "type": "mark"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c10",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c11",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c12",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "io_L1",
                            "type": "mark"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "io_L3",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cout_1_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(o_t1/o_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p16",
                                                "ele_size": 4,
                                                "last_dim": "o_t2",
                                                "size": "r_t1*c_t1*o_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_serialize",
                                            "type": "mark"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "child": {
                                                    "user_expr": "io_module.inter_intra.0.1()"
                                                },
                                                "type": "user"
                                            },
                                            {
                                                "child": {
                                                    "user_expr": "io_module.state_handle()"
                                                },
                                                "type": "user"
                                            }
                                        ],
                                        "type": "block"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "w_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "p"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "q"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "c_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "o_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "user_expr": "out_trans.fifo_w.fifo_w_local.1.2.2(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p1 + 8 * c0 + c12, c8, c9, 2 * p0 + 8 * c3)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "content": "simd",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c10",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c11",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c12",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c0",
                                "type": "for"
                            },
                            "iterator": "c1",
                            "type": "for"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": [
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(o_t1/o_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p17",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(o_t1/o_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p17",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(o_t1/o_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p17",
                                                            "ele_size": 4,
                                                            "last_dim": "i_t2",
                                                            "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_serialize",
                                                        "type": "mark"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(i_t1/i_t2))"
        },
        "cin_IO_L2_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t2)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "cout_1_IO_L2_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "w_IO_L1_in": {
            "array": "w",
            "buf_size": "(((o_t2*((p-1)+1))*((q-1)+1))*i_t2)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(o_t1/o_t2))"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,16),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,16),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,4),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel6_2.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(o_t1/o_t2)",
                "(i_t1/i_t2)"
            ],
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(i_t1/i_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L2_in": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "cout_1_IO_L2_out": {
            "dims": [
                "(o_t1/o_t2)"
            ]
        },
        "cout_1_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(i_t1/i_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cout_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)",
                "(o_t1/o_t2)"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(c_t1/c_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "c_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "r_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "o_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": [
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 4 * c2 + 2 * c6 + c8 + c11, 4 * c3 + 2 * c7 + c9 + c10, 2 * p1 + 8 * c1)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 0, 0, c10, c11, c12, 4 * c2 + 2 * c6 + c11, 4 * c3 + 2 * c7 + c10, 2 * p0 + 8 * c0 + c12)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 8 * c0 + c12, c8, c9, 2 * p1 + 8 * c1)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "bounds": [
                                                                                                "0",
                                                                                                "i_t2"
                                                                                            ],
                                                                                            "child": {
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "user_expr": "S_0(2 * p0 + 8 * c0 + c12, 4 * c2 + 2 * c6 + c11, 4 * c3 + 2 * c7 + c10, 2 * p1 + 8 * c1 + c13, c8, c9)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                "content": "hls_unroll",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            "iterator": "c13",
                                                                                            "type": "for"
                                                                                        },
                                                                                        "content": "simd",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_drain.1.1(c0, 1, c2, c3, p0, 3, c6, c7, 2, 2, c10, c11, c12, 4 * c2 + 2 * c6 + c11, 4 * c3 + 2 * c7 + c10, 2 * p0 + 8 * c0 + c12)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 2, 2, c10, c11, c12, 4 * c2 + 2 * c6 + c11, 4 * c3 + 2 * c7 + c10, 2 * p0 + 8 * c0 + c12)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "out.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 4 * c2 + 2 * c6 + c8 + c11, 4 * c3 + 2 * c7 + c9 + c10, 2 * p1 + 8 * c1)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    }
                                                                                ],
                                                                                "type": "block"
                                                                            },
                                                                            "content": "hls_pipeline",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "cin_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "c_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "r_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.2.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c2 + 2 * c6 + c8 + c11, 4 * c3 + 2 * c7 + c9 + c10, 2 * p0 + 8 * c1)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p14",
                                                "ele_size": 4,
                                                "last_dim": "i_t2",
                                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_1_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "o_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "out_trans.fifo_cout_1.fifo_cout_1_local.1.2.1(c0, 1, c2, c3, p0, 0, c6, c7, 0, 0, c10, c11, c12, 0, 4 * c2 + 2 * c6 + c11, 4 * c3 + 2 * c7 + c10, 2 * p0 + 8 * c0 + c12)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c10",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c11",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c12",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_1_IO_L2_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "o_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "in_trans.fifo_cout_1_local.fifo_cout_1.1.2.1(c0, 0, c2, c3, p0, 3, c6, c7, 2, 2, c10, c11, c12, 1, 4 * c2 + 2 * c6 + c11, 4 * c3 + 2 * c7 + c10, 2 * p0 + 8 * c0 + c12)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c10",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c11",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c12",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(o_t1/o_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t2",
                                            "size": "r_t1*c_t1*o_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(o_t1/o_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t2",
                                            "size": "r_t1*c_t1*o_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "child": [
                {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((r/r_t1))"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "ceil((c/c_t1))"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "user_expr": "io_module.intra_inter.0.0()"
                                            },
                                            "type": "user"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "content": "io_L3",
                                    "type": "mark"
                                },
                                "content": "array",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "iterator": "c3",
                        "type": "for"
                    },
                    "iterator": "c2",
                    "type": "for"
                }
            ],
            "type": "if"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "o_t2",
                                "size": "r_t1*c_t1*o_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "r_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "o_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.2.1(c0, 1, c2, c3, 3, p1, c6, c7, 2, 2, c10, c11, c12, 1, 4 * c2 + 2 * c6 + c11, 4 * c3 + 2 * c7 + c10, 2 * p1 + 8 * c0 + c12)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "bounds": [
                                                "0",
                                                "(o_t1/o_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p17",
                                                        "ele_size": 4,
                                                        "last_dim": "o_t2",
                                                        "size": "r_t1*c_t1*o_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(o_t1/o_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p17",
                                                        "ele_size": 4,
                                                        "last_dim": "o_t2",
                                                        "size": "r_t1*c_t1*o_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        }
                                    ],
                                    "type": "if"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p17",
                                                "ele_size": 4,
                                                "last_dim": "o_t2",
                                                "size": "r_t1*c_t1*o_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "child": {
                                                    "user_expr": "io_module.inter_intra.0.1()"
                                                },
                                                "type": "user"
                                            },
                                            {
                                                "child": {
                                                    "user_expr": "io_module.state_handle()"
                                                },
                                                "type": "user"
                                            }
                                        ],
                                        "type": "block"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(o_t1/o_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p18",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p18",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "w_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "p"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "q"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "c_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "o_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "user_expr": "out_trans.fifo_w.fifo_w_local.1.2.2(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p1 + 8 * c0 + c12, c8, c9, 2 * p0 + 8 * c1)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "content": "simd",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c10",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c11",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c12",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c0",
                                "type": "for"
                            },
                            "iterator": "c1",
                            "type": "for"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": [
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(o_t1/o_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p18",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(o_t1/o_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p18",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(o_t1/o_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p18",
                                                        "ele_size": 4,
                                                        "last_dim": "i_t2",
                                                        "size": "o_t2*((p-1)+1)*((q-1)+1)*i_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((o_t1/o_t2)*(i_t1/i_t2))"
        },
        "cin_IO_L2_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t2)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "cout_1_IO_L2_in": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "cout_1_IO_L2_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t1)*o_t2)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(o_t1/o_t2)"
        },
        "w_IO_L1_in": {
            "array": "w",
            "buf_size": "(((o_t2*((p-1)+1))*((q-1)+1))*i_t2)",
            "data_pack_factor_inter": "p18",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(o_t1/o_t2))"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,16),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,16),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t2,4),1)"
            ],
            "divisors": [
                "o_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,4),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p18",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel7_0.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(r_t1/r_t2)",
                "(c_t1/c_t2)"
            ],
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(c_t1/c_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(c_t1/c_t2)",
                "(r_t1/r_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L1_in": {
            "dims": [
                "(c_t1/c_t2)",
                "(r_t1/r_t2)"
            ]
        },
        "cout_IO_L1_out": {
            "dims": [
                "(c_t1/c_t2)",
                "(r_t1/r_t2)"
            ]
        },
        "cout_IO_L2_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_IO_L2_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(c_t1/c_t2)",
                "(r_t1/r_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in.fifo_cout.1.1(1, c1, c2, c3, p0, p1, 2 * p0 + 4 * c1 + c6, 2 * p1 + 4 * c2 + c7, c8 + 8)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(o_t1/o_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "(i_t1/i_t2)"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "p"
                                                    ],
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "q"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "o_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "c_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "bounds": [
                                                                                    "0",
                                                                                    "r_t2"
                                                                                ],
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 4 * c1 + c8 + c12, 2 * p1 + 4 * c2 + c9 + c11, 8 * c0 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c3 + 2 * c6 + c10, c8, c9, 8 * c0 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "bounds": [
                                                                                                        "0",
                                                                                                        "i_t2"
                                                                                                    ],
                                                                                                    "child": {
                                                                                                        "child": {
                                                                                                            "child": {
                                                                                                                "user_expr": "S_0(8 * c3 + 2 * c6 + c10, 2 * p0 + 4 * c1 + c12, 2 * p1 + 4 * c2 + c11, 8 * c0 + 2 * c7 + c13, c8, c9)"
                                                                                                            },
                                                                                                            "type": "user"
                                                                                                        },
                                                                                                        "content": "hls_unroll",
                                                                                                        "type": "mark"
                                                                                                    },
                                                                                                    "iterator": "c13",
                                                                                                    "type": "for"
                                                                                                },
                                                                                                "content": "simd",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c3 + 2 * c6 + c10, c8, c9, 8 * c0 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": [
                                                                                                    {
                                                                                                        "child": {
                                                                                                            "user_expr": "out.fifo_cout_drain.1.1(1, c1, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 2 * p0 + 4 * c1 + c12, 2 * p1 + 4 * c2 + c11, 8 * c3 + 2 * c6 + c10)"
                                                                                                        },
                                                                                                        "type": "user"
                                                                                                    }
                                                                                                ],
                                                                                                "type": "if"
                                                                                            }
                                                                                        ],
                                                                                        "type": "block"
                                                                                    },
                                                                                    "content": "hls_pipeline",
                                                                                    "type": "mark"
                                                                                },
                                                                                "iterator": "c10",
                                                                                "type": "for"
                                                                            },
                                                                            "content": "latency",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c11",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c12",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c0",
                                                        "type": "for"
                                                    },
                                                    "iterator": "c1",
                                                    "type": "for"
                                                },
                                                "iterator": "c9",
                                                "type": "for"
                                            },
                                            "iterator": "c8",
                                            "type": "for"
                                        },
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out.fifo_cout.1.1(0, c1, c2, c3, p0, p1, 2 * p0 + 4 * c1 + c6, 2 * p1 + 4 * c2 + c7, c8)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "child": {
                                                    "user_expr": "io_module.inter_intra.0.1()"
                                                },
                                                "type": "user"
                                            },
                                            {
                                                "child": {
                                                    "user_expr": "io_module.state_handle()"
                                                },
                                                "type": "user"
                                            }
                                        ],
                                        "type": "block"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(i_t1/i_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "p"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "q"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.4.2(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p1 + 4 * c1 + c8 + c12, 2 * p0 + 4 * c2 + c9 + c11, 8 * c0 + 2 * c7)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "content": "simd",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c10",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c11",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c12",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c0",
                                "type": "for"
                            },
                            "iterator": "c1",
                            "type": "for"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": [
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(r_t1/r_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t1",
                                                                "size": "(((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(r_t1/r_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t1",
                                                                "size": "(((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(r_t1/r_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p14",
                                                        "ele_size": 4,
                                                        "last_dim": "i_t1",
                                                        "size": "(((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cout_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "o_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out_trans.fifo_cout.fifo_cout_local.1.4.1(1, c1, c2, c3, p0, p1, c6, 0, 0, 0, c10, c11, c12, 0, 2 * p1 + 4 * c1 + c12, 2 * p0 + 4 * c2 + c11, 8 * c3 + 2 * c6 + c10)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.intra_inter.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "o_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_cout_local.fifo_cout.1.4.1(0, c1, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 1, 2 * p1 + 4 * c1 + c12, 2 * p0 + 4 * c2 + c11, 8 * c3 + 2 * c6 + c10)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(r_t1/r_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t1",
                                                            "size": "r_t2*c_t2*o_t1",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(r_t1/r_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t1",
                                                            "size": "r_t2*c_t2*o_t1",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(r_t1/r_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t1",
                                                            "size": "r_t2*c_t2*o_t1",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(r_t1/r_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t1",
                                                            "size": "r_t2*c_t2*o_t1",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p15",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t1",
                                                    "size": "r_t2*c_t2*o_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p15",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t1",
                                                    "size": "r_t2*c_t2*o_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.0()"
                                    },
                                    "type": "user"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "o_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.4.1(1, c1, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 1, 2 * p1 + 4 * c1 + c12, 2 * p0 + 4 * c2 + c11, 8 * c3 + 2 * c6 + c10)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(r_t1/r_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t1",
                                                            "size": "r_t2*c_t2*o_t1",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(r_t1/r_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t1",
                                                            "size": "r_t2*c_t2*o_t1",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p16",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t1",
                                                    "size": "r_t2*c_t2*o_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "w_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "c_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "r_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.8.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c3 + 2 * c6 + c10, c8, c9, 8 * c0 + 2 * c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p17",
                                                "ele_size": 4,
                                                "last_dim": "i_t1",
                                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t2*c_t2)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(c_t1/c_t2))"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "(((((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(r_t1/r_t2))"
        },
        "cout_IO_L1_in": {
            "array": "cout",
            "buf_size": "((r_t2*c_t2)*o_t1)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(r_t1/r_t2))"
        },
        "cout_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t2)*o_t1)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(r_t1/r_t2))"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t2)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(r_t1/r_t2))"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,4),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel7_1.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(r_t1/r_t2)",
                "(c_t1/c_t2)"
            ],
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(c_t1/c_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(c_t1/c_t2)",
                "(r_t1/r_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(c_t1/c_t2)",
                "(r_t1/r_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(i_t1/i_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "o_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "c_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "r_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": [
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 4 * c1 + c8 + c12, 2 * p1 + 4 * c2 + c9 + c11, 8 * c3 + 2 * c7)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c6 + c10, c8, c9, 8 * c3 + 2 * c7)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "bounds": [
                                                                                                "0",
                                                                                                "i_t2"
                                                                                            ],
                                                                                            "child": {
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "user_expr": "S_0(8 * c0 + 2 * c6 + c10, 2 * p0 + 4 * c1 + c12, 2 * p1 + 4 * c2 + c11, 8 * c3 + 2 * c7 + c13, c8, c9)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                "content": "hls_unroll",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            "iterator": "c13",
                                                                                            "type": "for"
                                                                                        },
                                                                                        "content": "simd",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c6 + c10, c8, c9, 8 * c3 + 2 * c7)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_drain.1.1(c0, c1, c2, 1, p0, p1, c6, 3, 2, 2, c10, c11, c12, 2 * p0 + 4 * c1 + c12, 2 * p1 + 4 * c2 + c11, 8 * c0 + 2 * c6 + c10)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    }
                                                                                ],
                                                                                "type": "block"
                                                                            },
                                                                            "content": "hls_pipeline",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "child": {
                                                    "user_expr": "io_module.inter_intra.0.1()"
                                                },
                                                "type": "user"
                                            },
                                            {
                                                "child": {
                                                    "user_expr": "io_module.state_handle()"
                                                },
                                                "type": "user"
                                            }
                                        ],
                                        "type": "block"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(i_t1/i_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "p"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "q"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.4.2(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p1 + 4 * c1 + c8 + c12, 2 * p0 + 4 * c2 + c9 + c11, 8 * c3 + 2 * c7)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "content": "simd",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c10",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c11",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c12",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c0",
                                "type": "for"
                            },
                            "iterator": "c1",
                            "type": "for"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": [
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(r_t1/r_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t1",
                                                                "size": "(((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(r_t1/r_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t1",
                                                                "size": "(((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(r_t1/r_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p14",
                                                            "ele_size": 4,
                                                            "last_dim": "i_t1",
                                                            "size": "(((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_serialize",
                                                        "type": "mark"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.0()"
                                    },
                                    "type": "user"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "o_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.4.1(c0, c1, c2, 1, p0, p1, c6, 3, 2, 2, c10, c11, c12, 1, 2 * p1 + 4 * c1 + c12, 2 * p0 + 4 * c2 + c11, 8 * c0 + 2 * c6 + c10)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(r_t1/r_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t1",
                                                            "size": "r_t2*c_t2*o_t1",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(r_t1/r_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t1",
                                                            "size": "r_t2*c_t2*o_t1",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p16",
                                                        "ele_size": 4,
                                                        "last_dim": "o_t1",
                                                        "size": "r_t2*c_t2*o_t1",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_serialize",
                                                    "type": "mark"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "w_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "c_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "r_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.8.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c0 + 2 * c6 + c10, c8, c9, 8 * c3 + 2 * c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p17",
                                                    "ele_size": 4,
                                                    "last_dim": "i_t1",
                                                    "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_serialize",
                                                "type": "mark"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t2*c_t2)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(c_t1/c_t2))"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "(((((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(r_t1/r_t2))"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t2)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(r_t1/r_t2))"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,4),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel7_2.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(r_t1/r_t2)",
                "(c_t1/c_t2)"
            ],
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(c_t1/c_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(c_t1/c_t2)",
                "(r_t1/r_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L1_in": {
            "dims": [
                "(c_t1/c_t2)",
                "(r_t1/r_t2)"
            ]
        },
        "cout_IO_L1_out": {
            "dims": [
                "(c_t1/c_t2)",
                "(r_t1/r_t2)"
            ]
        },
        "cout_IO_L2_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_IO_L2_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(c_t1/c_t2)",
                "(r_t1/r_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in.fifo_cout.1.1(c0, 1, c2, c3, p0, p1, 2 * p0 + c6 + 4, 2 * p1 + 4 * c2 + c7, 8 * c0 + c8)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(o_t1/o_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "(i_t1/i_t2)"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "p"
                                                    ],
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "q"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "o_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "c_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "bounds": [
                                                                                    "0",
                                                                                    "r_t2"
                                                                                ],
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 4 * c2 + c8 + c12, 2 * p1 + 4 * c3 + c9 + c11, 8 * c1 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c6 + c10, c8, c9, 8 * c1 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "bounds": [
                                                                                                        "0",
                                                                                                        "i_t2"
                                                                                                    ],
                                                                                                    "child": {
                                                                                                        "child": {
                                                                                                            "child": {
                                                                                                                "user_expr": "S_0(8 * c0 + 2 * c6 + c10, 2 * p0 + 4 * c2 + c12, 2 * p1 + 4 * c3 + c11, 8 * c1 + 2 * c7 + c13, c8, c9)"
                                                                                                            },
                                                                                                            "type": "user"
                                                                                                        },
                                                                                                        "content": "hls_unroll",
                                                                                                        "type": "mark"
                                                                                                    },
                                                                                                    "iterator": "c13",
                                                                                                    "type": "for"
                                                                                                },
                                                                                                "content": "simd",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c6 + c10, c8, c9, 8 * c1 + 2 * c7)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            },
                                                                                            {
                                                                                                "child": [
                                                                                                    {
                                                                                                        "child": {
                                                                                                            "user_expr": "out.fifo_cout_drain.1.1(c0, 1, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 2 * p0 + 4 * c2 + c12, 2 * p1 + 4 * c3 + c11, 8 * c0 + 2 * c6 + c10)"
                                                                                                        },
                                                                                                        "type": "user"
                                                                                                    }
                                                                                                ],
                                                                                                "type": "if"
                                                                                            }
                                                                                        ],
                                                                                        "type": "block"
                                                                                    },
                                                                                    "content": "hls_pipeline",
                                                                                    "type": "mark"
                                                                                },
                                                                                "iterator": "c10",
                                                                                "type": "for"
                                                                            },
                                                                            "content": "latency",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c11",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c12",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c0",
                                                        "type": "for"
                                                    },
                                                    "iterator": "c1",
                                                    "type": "for"
                                                },
                                                "iterator": "c9",
                                                "type": "for"
                                            },
                                            "iterator": "c8",
                                            "type": "for"
                                        },
                                        {
                                            "child": [
                                                {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out.fifo_cout.1.1(c0, 0, c2, c3, p0, p1, 2 * p0 + c6, 2 * p1 + 4 * c2 + c7, 8 * c0 + c8)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                }
                                            ],
                                            "type": "if"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "child": {
                                                    "user_expr": "io_module.inter_intra.0.1()"
                                                },
                                                "type": "user"
                                            },
                                            {
                                                "child": {
                                                    "user_expr": "io_module.state_handle()"
                                                },
                                                "type": "user"
                                            }
                                        ],
                                        "type": "block"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "(((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(i_t1/i_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "p"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "q"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.4.2(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p1 + 4 * c2 + c8 + c12, 2 * p0 + 4 * c3 + c9 + c11, 8 * c1 + 2 * c7)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "content": "simd",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c10",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c11",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c12",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c0",
                                "type": "for"
                            },
                            "iterator": "c1",
                            "type": "for"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": [
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(r_t1/r_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t1",
                                                                "size": "(((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(r_t1/r_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t1",
                                                                "size": "(((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(r_t1/r_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p14",
                                                        "ele_size": 4,
                                                        "last_dim": "i_t1",
                                                        "size": "(((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t1",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "o_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out_trans.fifo_cout.fifo_cout_local.1.4.1(c0, 1, c2, c3, p0, p1, c6, 0, 0, 0, c10, c11, c12, 0, 2 * p1 + 4 * c2 + c12, 2 * p0 + 4 * c3 + c11, 8 * c0 + 2 * c6 + c10)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.intra_inter.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p15",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "o_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_cout_local.fifo_cout.1.4.1(c0, 0, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 1, 2 * p1 + 4 * c2 + c12, 2 * p0 + 4 * c3 + c11, 8 * c0 + 2 * c6 + c10)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(r_t1/r_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t1",
                                                            "size": "r_t2*c_t2*o_t1",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(r_t1/r_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t1",
                                                            "size": "r_t2*c_t2*o_t1",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(r_t1/r_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t1",
                                                            "size": "r_t2*c_t2*o_t1",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(r_t1/r_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p15",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t1",
                                                            "size": "r_t2*c_t2*o_t1",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p15",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t1",
                                                    "size": "r_t2*c_t2*o_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p15",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t1",
                                                    "size": "r_t2*c_t2*o_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.0()"
                                    },
                                    "type": "user"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "o_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.4.1(c0, 1, c2, c3, p0, p1, c6, 3, 2, 2, c10, c11, c12, 1, 2 * p1 + 4 * c2 + c12, 2 * p0 + 4 * c3 + c11, 8 * c0 + 2 * c6 + c10)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c10",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c11",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c12",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(r_t1/r_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t1",
                                                            "size": "r_t2*c_t2*o_t1",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(r_t1/r_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p16",
                                                            "ele_size": 4,
                                                            "last_dim": "o_t1",
                                                            "size": "r_t2*c_t2*o_t1",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p16",
                                                    "ele_size": 4,
                                                    "last_dim": "o_t1",
                                                    "size": "r_t2*c_t2*o_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t1",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "w_IO_L2_in_intra": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "p"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "q"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "o_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "c_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "r_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.8.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c0 + 2 * c6 + c10, c8, c9, 8 * c1 + 2 * c7)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    "content": "hls_pipeline",
                                                                                    "type": "mark"
                                                                                },
                                                                                "content": "simd",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c10",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c11",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c12",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c0",
                                                    "type": "for"
                                                },
                                                "iterator": "c1",
                                                "type": "for"
                                            },
                                            "iterator": "c9",
                                            "type": "for"
                                        },
                                        "iterator": "c8",
                                        "type": "for"
                                    },
                                    "content": "pe",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p17",
                                        "ele_size": 4,
                                        "last_dim": "i_t1",
                                        "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t2*c_t2)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(c_t1/c_t2))"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "(((((r_t2-1)+(p-1))+1)*(((c_t2-1)+(q-1))+1))*i_t1)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(r_t1/r_t2))"
        },
        "cout_IO_L1_in": {
            "array": "cout",
            "buf_size": "((r_t2*c_t2)*o_t1)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(r_t1/r_t2))"
        },
        "cout_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t2)*o_t1)",
            "data_pack_factor_inter": "p15",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(r_t1/r_t2))"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t2)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(r_t1/r_t2))"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,4),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t1,16),i_t2)"
            ],
            "divisors": [
                "i_t1"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel8_0.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(r_t1/r_t2)",
                "(i_t1/i_t2)"
            ],
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(i_t1/i_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)",
                "(r_t1/r_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L2_in": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_1_IO_L2_out": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_1_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(i_t1/i_t2)",
                "(r_t1/r_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cout_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(c_t1/c_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "c_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "o_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "r_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": [
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 4 * c1 + c8 + c12, 4 * c2 + 2 * c7 + c9 + c10, 2 * p1 + 8 * c0)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 0, 0, c10, c11, c12, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c7 + c10, 8 * c3 + 2 * c6 + c11)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c3 + 2 * c6 + c11, c8, c9, 2 * p1 + 8 * c0)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "bounds": [
                                                                                                "0",
                                                                                                "i_t2"
                                                                                            ],
                                                                                            "child": {
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "user_expr": "S_0(8 * c3 + 2 * c6 + c11, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c7 + c10, 2 * p1 + 8 * c0 + c13, c8, c9)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                "content": "hls_unroll",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            "iterator": "c13",
                                                                                            "type": "for"
                                                                                        },
                                                                                        "content": "simd",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c3 + 2 * c6 + c11, c8, c9, 2 * p1 + 8 * c0)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_drain.1.1(1, c1, c2, c3, p0, 3, c6, c7, 2, 2, c10, c11, c12, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c7 + c10, 8 * c3 + 2 * c6 + c11)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 2, 2, c10, c11, c12, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c7 + c10, 8 * c3 + 2 * c6 + c11)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    }
                                                                                ],
                                                                                "type": "block"
                                                                            },
                                                                            "content": "hls_pipeline",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "child": {
                                                    "user_expr": "io_module.inter_intra.0.1()"
                                                },
                                                "type": "user"
                                            },
                                            {
                                                "child": {
                                                    "user_expr": "io_module.state_handle()"
                                                },
                                                "type": "user"
                                            }
                                        ],
                                        "type": "block"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "p"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "q"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "c_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "o_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.2.2(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p1 + 4 * c1 + c8 + c12, 4 * c2 + 2 * c7 + c9 + c10, 2 * p0 + 8 * c0)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "content": "simd",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c10",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c11",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c12",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c0",
                                "type": "for"
                            },
                            "iterator": "c1",
                            "type": "for"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": [
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(r_t1/r_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(r_t1/r_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(r_t1/r_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p14",
                                                        "ele_size": 4,
                                                        "last_dim": "i_t2",
                                                        "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cout_1_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_1_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_1_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "r_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "out_trans.fifo_cout_1.fifo_cout_1_local.1.8.1(1, c1, c2, c3, p0, 0, c6, c7, 0, 0, c10, c11, c12, 0, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c7 + c10, 8 * c3 + 2 * c6 + c11)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c10",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c11",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c12",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_1_IO_L2_out_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_1_IO_L2_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "r_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "in_trans.fifo_cout_1_local.fifo_cout_1.1.8.1(0, c1, c2, c3, p0, 3, c6, c7, 2, 2, c10, c11, c12, 1, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c7 + c10, 8 * c3 + 2 * c6 + c11)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c10",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c11",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c12",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t2*c_t1*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_1_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t2*c_t1*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "child": [
                {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "ceil((o/o_t1))"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "user_expr": "io_module.intra_inter.0.0()"
                                            },
                                            "type": "user"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "content": "io_L3",
                                    "type": "mark"
                                },
                                "content": "array",
                                "type": "mark"
                            },
                            "iterator": "c2",
                            "type": "for"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                }
            ],
            "type": "if"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.4.1(1, c1, c2, c3, 3, p1, c6, c7, 2, 2, c10, c11, c12, 1, 2 * p1 + 4 * c1 + c12, 4 * c2 + 2 * c7 + c10, 8 * c3 + 2 * c6 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "bounds": [
                                                "0",
                                                "(r_t1/r_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p17",
                                                        "ele_size": 4,
                                                        "last_dim": "o_t1",
                                                        "size": "r_t2*c_t1*o_t1",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(r_t1/r_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p17",
                                                        "ele_size": 4,
                                                        "last_dim": "o_t1",
                                                        "size": "r_t2*c_t1*o_t1",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        }
                                    ],
                                    "type": "if"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p17",
                                                "ele_size": 4,
                                                "last_dim": "o_t1",
                                                "size": "r_t2*c_t1*o_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p18",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p18",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "w_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "c_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "o_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "r_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.2.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c3 + 2 * c6 + c11, c8, c9, 2 * p0 + 8 * c0)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p18",
                                                "ele_size": 4,
                                                "last_dim": "i_t2",
                                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(i_t1/i_t2))"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "(((((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t2)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(r_t1/r_t2))"
        },
        "cout_1_IO_L2_in": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cout_1_IO_L2_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t2)",
            "data_pack_factor_inter": "p18",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,4),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,16),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,16),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p18",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel8_1.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(r_t1/r_t2)",
                "(i_t1/i_t2)"
            ],
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(i_t1/i_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)",
                "(r_t1/r_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L2_out": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_1_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(c_t1/c_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "c_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "o_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "r_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": [
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 4 * c1 + c8 + c12, 4 * c2 + 2 * c7 + c9 + c10, 2 * p1 + 8 * c3)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 0, 0, c10, c11, c12, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c7 + c10, 8 * c0 + 2 * c6 + c11)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c6 + c11, c8, c9, 2 * p1 + 8 * c3)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "bounds": [
                                                                                                "0",
                                                                                                "i_t2"
                                                                                            ],
                                                                                            "child": {
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "user_expr": "S_0(8 * c0 + 2 * c6 + c11, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c7 + c10, 2 * p1 + 8 * c3 + c13, c8, c9)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                "content": "hls_unroll",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            "iterator": "c13",
                                                                                            "type": "for"
                                                                                        },
                                                                                        "content": "simd",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c6 + c11, c8, c9, 2 * p1 + 8 * c3)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 2, 2, c10, c11, c12, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c7 + c10, 8 * c0 + 2 * c6 + c11)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    }
                                                                                ],
                                                                                "type": "block"
                                                                            },
                                                                            "content": "hls_pipeline",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "child": {
                                                    "user_expr": "io_module.inter_intra.0.1()"
                                                },
                                                "type": "user"
                                            },
                                            {
                                                "child": {
                                                    "user_expr": "io_module.state_handle()"
                                                },
                                                "type": "user"
                                            }
                                        ],
                                        "type": "block"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "p"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "q"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "c_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "o_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.2.2(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p1 + 4 * c1 + c8 + c12, 4 * c2 + 2 * c7 + c9 + c10, 2 * p0 + 8 * c3)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "content": "simd",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c10",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c11",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c12",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c0",
                                "type": "for"
                            },
                            "iterator": "c1",
                            "type": "for"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": [
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(r_t1/r_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(r_t1/r_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(r_t1/r_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p14",
                                                            "ele_size": 4,
                                                            "last_dim": "i_t2",
                                                            "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_serialize",
                                                        "type": "mark"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": [
                            {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.1()"
                                },
                                "type": "user"
                            },
                            {
                                "child": {
                                    "user_expr": "io_module.state_handle()"
                                },
                                "type": "user"
                            }
                        ],
                        "type": "block"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_out_inter": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(r_t1/r_t2)"
                    ],
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "data_pack_factor": "p16",
                                        "ele_size": 4,
                                        "last_dim": "o_t1",
                                        "size": "r_t2*c_t1*o_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                {
                                    "child": {
                                        "data_pack_factor": "p16",
                                        "ele_size": 4,
                                        "last_dim": "o_t1",
                                        "size": "r_t2*c_t1*o_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                }
                            ],
                            "type": "if"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "iterator": "c6",
                    "type": "for"
                },
                "content": "io_L3",
                "type": "mark"
            },
            "content": "array",
            "type": "mark"
        },
        "cout_1_IO_L2_out_intra": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(c_t1/c_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "o_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "r_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "in_trans_reduce_+.fifo_cout_1_local.fifo_cout_1.1.8.1(c0, c1, c2, c3, p0, 3, c6, c7, 2, 2, c10, c11, c12, 1, 2 * p0 + 4 * c1 + c12, 4 * c2 + 2 * c7 + c10, 8 * c0 + 2 * c6 + c11)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_pipeline",
                                                                        "type": "mark"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c10",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c11",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c12",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "io_L1",
                            "type": "mark"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "io_L3",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cout_1_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p16",
                                                "ele_size": 4,
                                                "last_dim": "o_t1",
                                                "size": "r_t2*c_t1*o_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_serialize",
                                            "type": "mark"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "w_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "c_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "o_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "r_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.2.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c0 + 2 * c6 + c11, c8, c9, 2 * p0 + 8 * c3)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p17",
                                                    "ele_size": 4,
                                                    "last_dim": "i_t2",
                                                    "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_serialize",
                                                "type": "mark"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(i_t1/i_t2))"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "(((((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t2)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(r_t1/r_t2))"
        },
        "cout_1_IO_L2_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t2)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,4),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,16),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,16),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel8_2.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(r_t1/r_t2)",
                "(i_t1/i_t2)"
            ],
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(i_t1/i_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)",
                "(r_t1/r_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L2_in": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_1_IO_L2_out": {
            "dims": [
                "(r_t1/r_t2)"
            ]
        },
        "cout_1_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(i_t1/i_t2)",
                "(r_t1/r_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cout_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(c_t1/c_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "c_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "o_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "r_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": [
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 2 * p0 + 4 * c2 + c8 + c12, 4 * c3 + 2 * c7 + c9 + c10, 2 * p1 + 8 * c1)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 0, 0, c10, c11, c12, 2 * p0 + 4 * c2 + c12, 4 * c3 + 2 * c7 + c10, 8 * c0 + 2 * c6 + c11)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c6 + c11, c8, c9, 2 * p1 + 8 * c1)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "bounds": [
                                                                                                "0",
                                                                                                "i_t2"
                                                                                            ],
                                                                                            "child": {
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "user_expr": "S_0(8 * c0 + 2 * c6 + c11, 2 * p0 + 4 * c2 + c12, 4 * c3 + 2 * c7 + c10, 2 * p1 + 8 * c1 + c13, c8, c9)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                "content": "hls_unroll",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            "iterator": "c13",
                                                                                            "type": "for"
                                                                                        },
                                                                                        "content": "simd",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c6 + c11, c8, c9, 2 * p1 + 8 * c1)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_drain.1.1(c0, 1, c2, c3, p0, 3, c6, c7, 2, 2, c10, c11, c12, 2 * p0 + 4 * c2 + c12, 4 * c3 + 2 * c7 + c10, 8 * c0 + 2 * c6 + c11)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 2, 2, c10, c11, c12, 2 * p0 + 4 * c2 + c12, 4 * c3 + 2 * c7 + c10, 8 * c0 + 2 * c6 + c11)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    }
                                                                                ],
                                                                                "type": "block"
                                                                            },
                                                                            "content": "hls_pipeline",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "child": {
                                                    "user_expr": "io_module.inter_intra.0.1()"
                                                },
                                                "type": "user"
                                            },
                                            {
                                                "child": {
                                                    "user_expr": "io_module.state_handle()"
                                                },
                                                "type": "user"
                                            }
                                        ],
                                        "type": "block"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "p"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "q"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "c_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "o_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.2.2(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 0, 2 * p1 + 4 * c2 + c8 + c12, 4 * c3 + 2 * c7 + c9 + c10, 2 * p0 + 8 * c1)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "content": "simd",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c10",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c11",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c12",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c0",
                                "type": "for"
                            },
                            "iterator": "c1",
                            "type": "for"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": [
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(r_t1/r_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(r_t1/r_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(r_t1/r_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p14",
                                                        "ele_size": 4,
                                                        "last_dim": "i_t2",
                                                        "size": "(((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1)*i_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_1_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "r_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "out_trans.fifo_cout_1.fifo_cout_1_local.1.8.1(c0, 1, c2, c3, p0, 0, c6, c7, 0, 0, c10, c11, c12, 0, 2 * p0 + 4 * c2 + c12, 4 * c3 + 2 * c7 + c10, 8 * c0 + 2 * c6 + c11)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c10",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c11",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c12",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_out_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_1_IO_L2_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(c_t1/c_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "c_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "r_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "in_trans.fifo_cout_1_local.fifo_cout_1.1.8.1(c0, 0, c2, c3, p0, 3, c6, c7, 2, 2, c10, c11, c12, 1, 2 * p0 + 4 * c2 + c12, 4 * c3 + 2 * c7 + c10, 8 * c0 + 2 * c6 + c11)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c10",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c11",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c12",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t2*c_t1*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(r_t1/r_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t2*c_t1*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "child": [
                {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((r/r_t1))"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "ceil((c/c_t1))"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "user_expr": "io_module.intra_inter.0.0()"
                                            },
                                            "type": "user"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "content": "io_L3",
                                    "type": "mark"
                                },
                                "content": "array",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "iterator": "c3",
                        "type": "for"
                    },
                    "iterator": "c2",
                    "type": "for"
                }
            ],
            "type": "if"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(r_t1/r_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t2*c_t1*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(c_t1/c_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "c_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.4.1(c0, 1, c2, c3, 3, p1, c6, c7, 2, 2, c10, c11, c12, 1, 2 * p1 + 4 * c2 + c12, 4 * c3 + 2 * c7 + c10, 8 * c0 + 2 * c6 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "bounds": [
                                                "0",
                                                "(r_t1/r_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p17",
                                                        "ele_size": 4,
                                                        "last_dim": "o_t1",
                                                        "size": "r_t2*c_t1*o_t1",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(r_t1/r_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p17",
                                                        "ele_size": 4,
                                                        "last_dim": "o_t1",
                                                        "size": "r_t2*c_t1*o_t1",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        }
                                    ],
                                    "type": "if"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(r_t1/r_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p17",
                                                "ele_size": 4,
                                                "last_dim": "o_t1",
                                                "size": "r_t2*c_t1*o_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p18",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p18",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "w_IO_L2_in_intra": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(c_t1/c_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "p"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "q"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "c_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "o_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "r_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.2.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c0 + 2 * c6 + c11, c8, c9, 2 * p0 + 8 * c1)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    "content": "hls_pipeline",
                                                                                    "type": "mark"
                                                                                },
                                                                                "content": "simd",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c10",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c11",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c12",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c0",
                                                    "type": "for"
                                                },
                                                "iterator": "c1",
                                                "type": "for"
                                            },
                                            "iterator": "c9",
                                            "type": "for"
                                        },
                                        "iterator": "c8",
                                        "type": "for"
                                    },
                                    "content": "pe",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p18",
                                        "ele_size": 4,
                                        "last_dim": "i_t2",
                                        "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((r_t1/r_t2)*(i_t1/i_t2))"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "(((((r_t2-1)+(p-1))+1)*((((((c_t1/c_t2)-1)*c_t2)+(c_t2-1))+(q-1))+1))*i_t2)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(r_t1/r_t2))"
        },
        "cout_1_IO_L2_in": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cout_1_IO_L2_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t2*c_t1)*o_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(r_t1/r_t2)"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t2)",
            "data_pack_factor_inter": "p18",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,4),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,16),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,16),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p18",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel9_0.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(c_t1/c_t2)",
                "(i_t1/i_t2)"
            ],
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(i_t1/i_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)",
                "(c_t1/c_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L2_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_1_IO_L2_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_1_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(i_t1/i_t2)",
                "(c_t1/c_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cout_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "o_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "c_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": [
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c7 + c8 + c10, 2 * p0 + 4 * c2 + c9 + c12, 2 * p1 + 8 * c0)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 0, 0, c10, c11, c12, 4 * c1 + 2 * c7 + c10, 2 * p0 + 4 * c2 + c12, 8 * c3 + 2 * c6 + c11)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c3 + 2 * c6 + c11, c8, c9, 2 * p1 + 8 * c0)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "bounds": [
                                                                                                "0",
                                                                                                "i_t2"
                                                                                            ],
                                                                                            "child": {
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "user_expr": "S_0(8 * c3 + 2 * c6 + c11, 4 * c1 + 2 * c7 + c10, 2 * p0 + 4 * c2 + c12, 2 * p1 + 8 * c0 + c13, c8, c9)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                "content": "hls_unroll",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            "iterator": "c13",
                                                                                            "type": "for"
                                                                                        },
                                                                                        "content": "simd",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c3 + 2 * c6 + c11, c8, c9, 2 * p1 + 8 * c0)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_drain.1.1(1, c1, c2, c3, p0, 3, c6, c7, 2, 2, c10, c11, c12, 4 * c1 + 2 * c7 + c10, 2 * p0 + 4 * c2 + c12, 8 * c3 + 2 * c6 + c11)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 2, 2, c10, c11, c12, 4 * c1 + 2 * c7 + c10, 2 * p0 + 4 * c2 + c12, 8 * c3 + 2 * c6 + c11)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    }
                                                                                ],
                                                                                "type": "block"
                                                                            },
                                                                            "content": "hls_pipeline",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "child": {
                                                    "user_expr": "io_module.inter_intra.0.1()"
                                                },
                                                "type": "user"
                                            },
                                            {
                                                "child": {
                                                    "user_expr": "io_module.state_handle()"
                                                },
                                                "type": "user"
                                            }
                                        ],
                                        "type": "block"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "p"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "q"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "r_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "o_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "c_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.2.2(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c1 + 2 * c7 + c8 + c10, 2 * p1 + 4 * c2 + c9 + c12, 2 * p0 + 8 * c0)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "content": "simd",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c10",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c11",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c12",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c0",
                                "type": "for"
                            },
                            "iterator": "c1",
                            "type": "for"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": [
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(c_t1/c_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(c_t1/c_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(c_t1/c_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p14",
                                                        "ele_size": 4,
                                                        "last_dim": "i_t2",
                                                        "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cout_1_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_1_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_1_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(r_t1/r_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "r_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "c_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "out_trans.fifo_cout_1.fifo_cout_1_local.1.8.1(1, c1, c2, c3, p0, 0, c6, c7, 0, 0, c10, c11, c12, 0, 4 * c1 + 2 * c7 + c10, 2 * p0 + 4 * c2 + c12, 8 * c3 + 2 * c6 + c11)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c10",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c11",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c12",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_1_IO_L2_out_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_1_IO_L2_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(r_t1/r_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "r_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "c_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "in_trans.fifo_cout_1_local.fifo_cout_1.1.8.1(0, c1, c2, c3, p0, 3, c6, c7, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c7 + c10, 2 * p0 + 4 * c2 + c12, 8 * c3 + 2 * c6 + c11)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c10",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c11",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c12",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t1*c_t2*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_1_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t1*c_t2*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "child": [
                {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "ceil((o/o_t1))"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "user_expr": "io_module.intra_inter.0.0()"
                                            },
                                            "type": "user"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "content": "io_L3",
                                    "type": "mark"
                                },
                                "content": "array",
                                "type": "mark"
                            },
                            "iterator": "c2",
                            "type": "for"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                }
            ],
            "type": "if"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "r_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.4.1(1, c1, c2, c3, 3, p1, c6, c7, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c7 + c10, 2 * p1 + 4 * c2 + c12, 8 * c3 + 2 * c6 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "bounds": [
                                                "0",
                                                "(c_t1/c_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p17",
                                                        "ele_size": 4,
                                                        "last_dim": "o_t1",
                                                        "size": "r_t1*c_t2*o_t1",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(c_t1/c_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p17",
                                                        "ele_size": 4,
                                                        "last_dim": "o_t1",
                                                        "size": "r_t1*c_t2*o_t1",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        }
                                    ],
                                    "type": "if"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "cout_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p17",
                                                "ele_size": 4,
                                                "last_dim": "o_t1",
                                                "size": "r_t1*c_t2*o_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p18",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p18",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "w_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(r_t1/r_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "o_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "c_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.2.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c3 + 2 * c6 + c11, c8, c9, 2 * p0 + 8 * c0)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((o/o_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p18",
                                                "ele_size": 4,
                                                "last_dim": "i_t2",
                                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c2",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c5",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(i_t1/i_t2))"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1))*i_t2)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(c_t1/c_t2))"
        },
        "cout_1_IO_L2_in": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cout_1_IO_L2_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t2)",
            "data_pack_factor_inter": "p18",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,4),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,16),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,16),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p18",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel9_1.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(c_t1/c_t2)",
                "(i_t1/i_t2)"
            ],
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(i_t1/i_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)",
                "(c_t1/c_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L2_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_1_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "o_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "c_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": [
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 4 * c1 + 2 * c7 + c8 + c10, 2 * p0 + 4 * c2 + c9 + c12, 2 * p1 + 8 * c3)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 0, 0, c10, c11, c12, 4 * c1 + 2 * c7 + c10, 2 * p0 + 4 * c2 + c12, 8 * c0 + 2 * c6 + c11)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c6 + c11, c8, c9, 2 * p1 + 8 * c3)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "bounds": [
                                                                                                "0",
                                                                                                "i_t2"
                                                                                            ],
                                                                                            "child": {
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "user_expr": "S_0(8 * c0 + 2 * c6 + c11, 4 * c1 + 2 * c7 + c10, 2 * p0 + 4 * c2 + c12, 2 * p1 + 8 * c3 + c13, c8, c9)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                "content": "hls_unroll",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            "iterator": "c13",
                                                                                            "type": "for"
                                                                                        },
                                                                                        "content": "simd",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c6 + c11, c8, c9, 2 * p1 + 8 * c3)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 2, 2, c10, c11, c12, 4 * c1 + 2 * c7 + c10, 2 * p0 + 4 * c2 + c12, 8 * c0 + 2 * c6 + c11)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    }
                                                                                ],
                                                                                "type": "block"
                                                                            },
                                                                            "content": "hls_pipeline",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "child": {
                                                    "user_expr": "io_module.inter_intra.0.1()"
                                                },
                                                "type": "user"
                                            },
                                            {
                                                "child": {
                                                    "user_expr": "io_module.state_handle()"
                                                },
                                                "type": "user"
                                            }
                                        ],
                                        "type": "block"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "p"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "q"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "r_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "o_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "c_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.2.2(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c1 + 2 * c7 + c8 + c10, 2 * p1 + 4 * c2 + c9 + c12, 2 * p0 + 8 * c3)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "content": "simd",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c10",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c11",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c12",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c0",
                                "type": "for"
                            },
                            "iterator": "c1",
                            "type": "for"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": [
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(c_t1/c_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(c_t1/c_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(c_t1/c_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p14",
                                                            "ele_size": 4,
                                                            "last_dim": "i_t2",
                                                            "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_serialize",
                                                        "type": "mark"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": [
                            {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.1()"
                                },
                                "type": "user"
                            },
                            {
                                "child": {
                                    "user_expr": "io_module.state_handle()"
                                },
                                "type": "user"
                            }
                        ],
                        "type": "block"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_out_inter": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(c_t1/c_t2)"
                    ],
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "data_pack_factor": "p16",
                                        "ele_size": 4,
                                        "last_dim": "o_t1",
                                        "size": "r_t1*c_t2*o_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                {
                                    "child": {
                                        "data_pack_factor": "p16",
                                        "ele_size": 4,
                                        "last_dim": "o_t1",
                                        "size": "r_t1*c_t2*o_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                }
                            ],
                            "type": "if"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "iterator": "c6",
                    "type": "for"
                },
                "content": "io_L3",
                "type": "mark"
            },
            "content": "array",
            "type": "mark"
        },
        "cout_1_IO_L2_out_intra": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "r_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "o_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "c_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "in_trans_reduce_+.fifo_cout_1_local.fifo_cout_1.1.8.1(c0, c1, c2, c3, p0, 3, c6, c7, 2, 2, c10, c11, c12, 1, 4 * c1 + 2 * c7 + c10, 2 * p0 + 4 * c2 + c12, 8 * c0 + 2 * c6 + c11)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_pipeline",
                                                                        "type": "mark"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c10",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c11",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c12",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "io_L1",
                            "type": "mark"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "io_L3",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c5",
            "type": "for"
        },
        "cout_1_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p16",
                                                "ele_size": 4,
                                                "last_dim": "o_t1",
                                                "size": "r_t1*c_t2*o_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_serialize",
                                            "type": "mark"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "w_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(r_t1/r_t2)"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "p"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "q"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "r_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "o_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "c_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "child": {
                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.2.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c0 + 2 * c6 + c11, c8, c9, 2 * p0 + 8 * c3)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    "content": "hls_pipeline",
                                                                    "type": "mark"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c10",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c11",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c12",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c0",
                                    "type": "for"
                                },
                                "iterator": "c1",
                                "type": "for"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p17",
                                                    "ele_size": 4,
                                                    "last_dim": "i_t2",
                                                    "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_serialize",
                                                "type": "mark"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(i_t1/i_t2))"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1))*i_t2)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(c_t1/c_t2))"
        },
        "cout_1_IO_L2_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t2)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,4),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,16),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,16),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/cnn/kernel9_2.json
================================================
{
    "attr": {
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cin_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cin_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_1_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_1_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "cout_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "cout_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "cout_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "w_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "w_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(c_t1/c_t2)",
                "(i_t1/i_t2)"
            ],
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(i_t1/i_t2))",
            "unroll_factor": "i_t2"
        }
    },
    "io": {
        "cin_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)",
                "(c_t1/c_t2)"
            ]
        },
        "cin_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cin_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L2_in": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_1_IO_L2_out": {
            "dims": [
                "(c_t1/c_t2)"
            ]
        },
        "cout_1_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "cout_1_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "cout_drain_IO_L1_out": {
            "dims": [
                "(i_t1/i_t2)",
                "(c_t1/c_t2)"
            ]
        },
        "cout_drain_IO_L2_out": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "cout_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "w_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "w_IO_L3_in": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "PE": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(o_t1/o_t2)"
                                    ],
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(r_t1/r_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "p"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "q"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "r_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "bounds": [
                                                                    "0",
                                                                    "o_t2"
                                                                ],
                                                                "child": {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "c_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": [
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_cin.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 4 * c2 + 2 * c7 + c8 + c10, 2 * p0 + 4 * c3 + c9 + c12, 2 * p1 + 8 * c1)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "in.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 0, 0, c10, c11, c12, 4 * c2 + 2 * c7 + c10, 2 * p0 + 4 * c3 + c12, 8 * c0 + 2 * c6 + c11)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "in.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c6 + c11, c8, c9, 2 * p1 + 8 * c1)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "bounds": [
                                                                                                "0",
                                                                                                "i_t2"
                                                                                            ],
                                                                                            "child": {
                                                                                                "child": {
                                                                                                    "child": {
                                                                                                        "user_expr": "S_0(8 * c0 + 2 * c6 + c11, 4 * c2 + 2 * c7 + c10, 2 * p0 + 4 * c3 + c12, 2 * p1 + 8 * c1 + c13, c8, c9)"
                                                                                                    },
                                                                                                    "type": "user"
                                                                                                },
                                                                                                "content": "hls_unroll",
                                                                                                "type": "mark"
                                                                                            },
                                                                                            "iterator": "c13",
                                                                                            "type": "for"
                                                                                        },
                                                                                        "content": "simd",
                                                                                        "type": "mark"
                                                                                    },
                                                                                    {
                                                                                        "child": {
                                                                                            "user_expr": "out.fifo_w.2.1(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 8 * c0 + 2 * c6 + c11, c8, c9, 2 * p1 + 8 * c1)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_drain.1.1(c0, 1, c2, c3, p0, 3, c6, c7, 2, 2, c10, c11, c12, 4 * c2 + 2 * c7 + c10, 2 * p0 + 4 * c3 + c12, 8 * c0 + 2 * c6 + c11)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    },
                                                                                    {
                                                                                        "child": [
                                                                                            {
                                                                                                "child": {
                                                                                                    "user_expr": "out.fifo_cout_1.1.1(c0, c1, c2, c3, p0, p1, c6, c7, 2, 2, c10, c11, c12, 4 * c2 + 2 * c7 + c10, 2 * p0 + 4 * c3 + c12, 8 * c0 + 2 * c6 + c11)"
                                                                                                },
                                                                                                "type": "user"
                                                                                            }
                                                                                        ],
                                                                                        "type": "if"
                                                                                    }
                                                                                ],
                                                                                "type": "block"
                                                                            },
                                                                            "content": "hls_pipeline",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c10",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "latency",
                                                                    "type": "mark"
                                                                },
                                                                "iterator": "c11",
                                                                "type": "for"
                                                            },
                                                            "content": "latency",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c12",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c0",
                                                "type": "for"
                                            },
                                            "iterator": "c1",
                                            "type": "for"
                                        },
                                        "iterator": "c9",
                                        "type": "for"
                                    },
                                    "iterator": "c8",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "child": {
                                                    "user_expr": "io_module.inter_intra.0.1()"
                                                },
                                                "type": "user"
                                            },
                                            {
                                                "child": {
                                                    "user_expr": "io_module.state_handle()"
                                                },
                                                "type": "user"
                                            }
                                        ],
                                        "type": "block"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p14",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cin_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "p"
                            ],
                            "child": {
                                "bounds": [
                                    "0",
                                    "q"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "r_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "o_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "c_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "child": {
                                                                        "user_expr": "out_trans.fifo_cin.fifo_cin_local.1.2.2(c0, c1, c2, c3, p0, p1, c6, c7, c8, c9, c10, c11, c12, 0, 4 * c2 + 2 * c7 + c8 + c10, 2 * p1 + 4 * c3 + c9 + c12, 2 * p0 + 8 * c1)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "content": "simd",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c10",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c11",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c12",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c0",
                                "type": "for"
                            },
                            "iterator": "c1",
                            "type": "for"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cin_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": [
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(c_t1/c_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                {
                                                    "bounds": [
                                                        "0",
                                                        "(c_t1/c_t2)"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "data_pack_factor": "p14",
                                                                "ele_size": 4,
                                                                "last_dim": "i_t2",
                                                                "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t2",
                                                                "type": "array_tile"
                                                            },
                                                            "content": "access_coalesce",
                                                            "type": "mark"
                                                        },
                                                        "content": "io_L1",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                }
                                            ],
                                            "type": "if"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cin_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((r/r_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((c/c_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(c_t1/c_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p14",
                                                        "ele_size": 4,
                                                        "last_dim": "i_t2",
                                                        "size": "((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1)*i_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_1_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(r_t1/r_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "r_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "c_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "out_trans.fifo_cout_1.fifo_cout_1_local.1.8.1(c0, 1, c2, c3, p0, 0, c6, c7, 0, 0, c10, c11, c12, 0, 4 * c2 + 2 * c7 + c10, 2 * p0 + 4 * c3 + c12, 8 * c0 + 2 * c6 + c11)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c10",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c11",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c12",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L2_out_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p16",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_1_IO_L2_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(o_t1/o_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(r_t1/r_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "r_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "o_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "c_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "in_trans.fifo_cout_1_local.fifo_cout_1.1.8.1(c0, 0, c2, c3, p0, 3, c6, c7, 2, 2, c10, c11, c12, 1, 4 * c2 + 2 * c7 + c10, 2 * p0 + 4 * c3 + c12, 8 * c0 + 2 * c6 + c11)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c10",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c11",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c12",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c9",
                            "type": "for"
                        },
                        "iterator": "c8",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "cout_1_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t1*c_t2*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_1_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(c_t1/c_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p16",
                                            "ele_size": 4,
                                            "last_dim": "o_t1",
                                            "size": "r_t1*c_t2*o_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L1_out": {
            "child": [
                {
                    "bounds": [
                        "0",
                        "ceil((o/o_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((r/r_t1))"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "ceil((c/c_t1))"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "user_expr": "io_module.intra_inter.0.0()"
                                            },
                                            "type": "user"
                                        },
                                        "content": "io_L2",
                                        "type": "mark"
                                    },
                                    "content": "io_L3",
                                    "type": "mark"
                                },
                                "content": "array",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "iterator": "c3",
                        "type": "for"
                    },
                    "iterator": "c2",
                    "type": "for"
                }
            ],
            "type": "if"
        },
        "cout_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(c_t1/c_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p17",
                                "ele_size": 4,
                                "last_dim": "o_t1",
                                "size": "r_t1*c_t2*o_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c6",
            "type": "for"
        },
        "cout_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(o_t1/o_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(r_t1/r_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "r_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "o_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "c_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "user_expr": "in_trans.fifo_cout_drain_local.fifo_cout_drain.1.4.1(c0, 1, c2, c3, 3, p1, c6, c7, 2, 2, c10, c11, c12, 1, 4 * c2 + 2 * c7 + c10, 2 * p1 + 4 * c3 + c12, 8 * c0 + 2 * c6 + c11)"
                                                            },
                                                            "type": "user"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "content": "simd",
                                                    "type": "mark"
                                                },
                                                "iterator": "c10",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c11",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c12",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c9",
                        "type": "for"
                    },
                    "iterator": "c8",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "cout_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "bounds": [
                                                "0",
                                                "(c_t1/c_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p17",
                                                        "ele_size": 4,
                                                        "last_dim": "o_t1",
                                                        "size": "r_t1*c_t2*o_t1",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(c_t1/c_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p17",
                                                        "ele_size": 4,
                                                        "last_dim": "o_t1",
                                                        "size": "r_t1*c_t2*o_t1",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        }
                                    ],
                                    "type": "if"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "cout_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((r/r_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((c/c_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(c_t1/c_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p17",
                                                "ele_size": 4,
                                                "last_dim": "o_t1",
                                                "size": "r_t1*c_t2*o_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "iterator": "c3",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "w_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p18",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p18",
                                "ele_size": 4,
                                "last_dim": "i_t2",
                                "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c7",
            "type": "for"
        },
        "w_IO_L2_in_intra": {
            "bounds": [
                "0",
                "ceil((r/r_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((c/c_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(o_t1/o_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(r_t1/r_t2)"
                                            ],
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "p"
                                                ],
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "q"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "r_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "o_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "c_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "user_expr": "out_trans.fifo_w.fifo_w_local.1.2.2(c0, c1, c2, c3, p0, 0, c6, c7, c8, c9, c10, c11, c12, 0, 8 * c0 + 2 * c6 + c11, c8, c9, 2 * p0 + 8 * c1)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    "content": "hls_pipeline",
                                                                                    "type": "mark"
                                                                                },
                                                                                "content": "simd",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c10",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "latency",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c11",
                                                                    "type": "for"
                                                                },
                                                                "content": "latency",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c12",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c0",
                                                    "type": "for"
                                                },
                                                "iterator": "c1",
                                                "type": "for"
                                            },
                                            "iterator": "c9",
                                            "type": "for"
                                        },
                                        "iterator": "c8",
                                        "type": "for"
                                    },
                                    "content": "pe",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c4",
                "type": "for"
            },
            "iterator": "c3",
            "type": "for"
        },
        "w_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((o/o_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p18",
                                        "ele_size": 4,
                                        "last_dim": "i_t2",
                                        "size": "o_t1*((p-1)+1)*((q-1)+1)*i_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c5",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "PE": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((c_t1/c_t2)*(i_t1/i_t2))"
        },
        "cin_IO_L1_in": {
            "array": "cin",
            "buf_size": "((((((((r_t1/r_t2)-1)*r_t2)+(r_t2-1))+(p-1))+1)*(((c_t2-1)+(q-1))+1))*i_t2)",
            "data_pack_factor_inter": "p14",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(c_t1/c_t2))"
        },
        "cout_1_IO_L2_in": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cout_1_IO_L2_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "data_pack_factor_inter": "p16",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "cout_drain_IO_L1_out": {
            "array": "cout",
            "buf_size": "((r_t1*c_t2)*o_t1)",
            "data_pack_factor_inter": "p17",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(c_t1/c_t2)"
        },
        "w_IO_L2_in": {
            "array": "w",
            "buf_size": "(((o_t1*((p-1)+1))*((q-1)+1))*i_t2)",
            "data_pack_factor_inter": "p18",
            "data_pack_factor_intra": "i_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "q",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "p",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "o",
            "split_by": "o_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "r",
            "split_by": "r_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "c",
            "split_by": "c_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "c"
            ],
            "name": "c_t1",
            "split_by": "c_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "o"
            ],
            "name": "o_t1",
            "split_by": "o_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "r"
            ],
            "name": "r_t1",
            "split_by": "r_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "c_t1"
            ],
            "divisors": [
                "c_t1"
            ],
            "name": "c_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "o_t1"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "o_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "r_t1"
            ],
            "divisors": [
                "r_t1"
            ],
            "name": "r_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(i_t1,8)"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,4),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p14",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p15",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,16),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p16",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(o_t1,4),1)"
            ],
            "divisors": [
                "o_t1"
            ],
            "name": "p17",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "i_t2",
                "max(min(i_t2,16),i_t2)"
            ],
            "divisors": [
                "i_t2"
            ],
            "multiples": [
                "i_t2"
            ],
            "name": "p18",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel0_0.json
================================================
{
    "attr": {
        "A_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "B_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "B_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "B_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(i_t1/i_t2)"
            ],
            "ele_type": "float",
            "num": "(i_t1/i_t2)",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "A_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "C_IO_L1_out": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "C_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "C_drain_IO_L1_out": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "C_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "A_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "i_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "i_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "A_IO_L1_in_intra": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(j_t1/j_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(k_t1/k_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "j_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "i_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "out_trans.fifo_A.fifo_A_local.1.4.2(c0, c1, c2, p0, c4, c5, c6, c7, 0, 2 * p0 + 32 * c0 + c7, 32 * c1 + 2 * c5)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c5",
                                    "type": "for"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "io_L1",
                        "type": "mark"
                    },
                    "content": "io_L2",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c1",
            "type": "for"
        },
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p9",
                                        "ele_size": 4,
                                        "last_dim": "k_t1",
                                        "size": "i_t2*k_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.1.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L2_in_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p10",
                    "ele_size": 4,
                    "last_dim": "k_t1",
                    "size": "j_t1*k_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "B_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(j_t1/j_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(k_t1/k_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "i_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out_trans.fifo_B.fifo_B_local.1.16.2(c0, c1, c2, 0, c4, c5, c6, c7, 0, 32 * c2 + 2 * c4 + c6, 32 * c1 + 2 * c5)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c5",
                            "type": "for"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(j_t1/j_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "j_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "i_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "out_trans.fifo_C.fifo_C_local.1.4.1(c0, 1, c2, p0, c4, 0, c6, c7, 0, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c4 + c6)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(j_t1/j_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "j_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "i_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "in_trans.fifo_C_local.fifo_C.1.4.1(c0, 0, c2, p0, c4, 15, c6, c7, 1, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c4 + c6)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t1",
                                        "size": "i_t2*j_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t1",
                                        "size": "i_t2*j_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "user_expr": "io_module.intra_inter.0.0()"
                            },
                            "type": "user"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(j_t1/j_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "j_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "i_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "in_trans.fifo_C_drain_local.fifo_C_drain.1.4.1(c0, 1, c2, p0, c4, 15, c6, c7, 1, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c4 + c6)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p12",
                                        "ele_size": 4,
                                        "last_dim": "j_t1",
                                        "size": "i_t2*j_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": [
                                            {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "in.fifo_C.1.1(c0, 1, c2, p0, 2 * p0 + 32 * c0 + c4, c5 + 32)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    {
                                        "bounds": [
                                            "0",
                                            "(j_t1/j_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(k_t1/k_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "j_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "i_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": [
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 2 * p0 + 32 * c0 + c7, 32 * c1 + 2 * c5)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c2 + 2 * c4 + c6, 32 * c1 + 2 * c5)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        {
                                                                            "child": {
                                                                                "bounds": [
                                                                                    "0",
                                                                                    "k_t2"
                                                                                ],
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "user_expr": "S_0(2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c4 + c6, 32 * c1 + 2 * c5 + c8)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    "content": "hls_unroll",
                                                                                    "type": "mark"
                                                                                },
                                                                                "iterator": "c8",
                                                                                "type": "for"
                                                                            },
                                                                            "content": "simd",
                                                                            "type": "mark"
                                                                        },
                                                                        {
                                                                            "child": [
                                                                                {
                                                                                    "child": {
                                                                                        "user_expr": "out.fifo_C_drain.1.1(c0, 1, c2, p0, c4, 15, c6, c7, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c4 + c6)"
                                                                                    },
                                                                                    "type": "user"
                                                                                }
                                                                            ],
                                                                            "type": "if"
                                                                        },
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "out.fifo_B.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c2 + 2 * c4 + c6, 32 * c1 + 2 * c5)"
                                                                            },
                                                                            "type": "user"
                                                                        }
                                                                    ],
                                                                    "type": "block"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c6",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c7",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c5",
                                            "type": "for"
                                        },
                                        "iterator": "c4",
                                        "type": "for"
                                    },
                                    {
                                        "child": [
                                            {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out.fifo_C.1.1(c0, 0, c2, p0, 2 * p0 + 32 * c0 + c4, c5)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            }
                                        ],
                                        "type": "if"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L1_in": {
            "array": "A",
            "buf_size": "(i_t2*k_t1)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "B_IO_L2_in": {
            "array": "B",
            "buf_size": "(j_t1*k_t1)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "C_IO_L1_in": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "C_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "C_drain_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "data_pack_factor_inter": "p12",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "PE": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,4),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,16),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t1,4),1)"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t1,4),1)"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "p12",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel0_1.json
================================================
{
    "attr": {
        "A_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "B_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "B_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "B_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(i_t1/i_t2)"
            ],
            "ele_type": "float",
            "num": "(i_t1/i_t2)",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "A_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "C_IO_L1_out": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "C_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "C_drain_IO_L1_out": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "C_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "A_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "i_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "i_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "A_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(j_t1/j_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(k_t1/k_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_A.fifo_A_local.1.4.2(c0, c1, c2, p0, c4, c5, c6, c7, 0, 2 * p0 + 32 * c2 + c7, 32 * c0 + 2 * c5)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p9",
                                            "ele_size": 4,
                                            "last_dim": "k_t1",
                                            "size": "i_t2*k_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c3",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.1.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "B_IO_L2_in_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p10",
                    "ele_size": 4,
                    "last_dim": "k_t1",
                    "size": "j_t1*k_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "B_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(j_t1/j_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(k_t1/k_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "i_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out_trans.fifo_B.fifo_B_local.1.16.2(c0, c1, c2, 0, c4, c5, c6, c7, 0, 32 * c1 + 2 * c4 + c6, 32 * c0 + 2 * c5)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c5",
                            "type": "for"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(j_t1/j_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "j_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "i_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "out_trans.fifo_C.fifo_C_local.1.4.1(1, c1, c2, p0, c4, 0, c6, c7, 0, 2 * p0 + 32 * c2 + c7, 32 * c1 + 2 * c4 + c6)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(j_t1/j_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "j_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "i_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "in_trans.fifo_C_local.fifo_C.1.4.1(0, c1, c2, p0, c4, 15, c6, c7, 1, 2 * p0 + 32 * c2 + c7, 32 * c1 + 2 * c4 + c6)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t1",
                                        "size": "i_t2*j_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t1",
                                        "size": "i_t2*j_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "user_expr": "io_module.intra_inter.0.0()"
                            },
                            "type": "user"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(j_t1/j_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "j_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "i_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "in_trans.fifo_C_drain_local.fifo_C_drain.1.4.1(1, c1, c2, p0, c4, 15, c6, c7, 1, 2 * p0 + 32 * c2 + c7, 32 * c1 + 2 * c4 + c6)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p12",
                                        "ele_size": 4,
                                        "last_dim": "j_t1",
                                        "size": "i_t2*j_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": [
                                            {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "in.fifo_C.1.1(1, c1, c2, p0, 2 * p0 + c4 + 32, 32 * c1 + c5)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    {
                                        "bounds": [
                                            "0",
                                            "(j_t1/j_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(k_t1/k_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "j_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "i_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": [
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 2 * p0 + 32 * c2 + c7, 32 * c0 + 2 * c5)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c1 + 2 * c4 + c6, 32 * c0 + 2 * c5)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        {
                                                                            "child": {
                                                                                "bounds": [
                                                                                    "0",
                                                                                    "k_t2"
                                                                                ],
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "user_expr": "S_0(2 * p0 + 32 * c2 + c7, 32 * c1 + 2 * c4 + c6, 32 * c0 + 2 * c5 + c8)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    "content": "hls_unroll",
                                                                                    "type": "mark"
                                                                                },
                                                                                "iterator": "c8",
                                                                                "type": "for"
                                                                            },
                                                                            "content": "simd",
                                                                            "type": "mark"
                                                                        },
                                                                        {
                                                                            "child": [
                                                                                {
                                                                                    "child": {
                                                                                        "user_expr": "out.fifo_C_drain.1.1(1, c1, c2, p0, c4, 15, c6, c7, 2 * p0 + 32 * c2 + c7, 32 * c1 + 2 * c4 + c6)"
                                                                                    },
                                                                                    "type": "user"
                                                                                }
                                                                            ],
                                                                            "type": "if"
                                                                        },
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "out.fifo_B.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c1 + 2 * c4 + c6, 32 * c0 + 2 * c5)"
                                                                            },
                                                                            "type": "user"
                                                                        }
                                                                    ],
                                                                    "type": "block"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c6",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c7",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c5",
                                            "type": "for"
                                        },
                                        "iterator": "c4",
                                        "type": "for"
                                    },
                                    {
                                        "child": [
                                            {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out.fifo_C.1.1(0, c1, c2, p0, 2 * p0 + c4, 32 * c1 + c5)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            }
                                        ],
                                        "type": "if"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L1_in": {
            "array": "A",
            "buf_size": "(i_t2*k_t1)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "B_IO_L2_in": {
            "array": "B",
            "buf_size": "(j_t1*k_t1)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "C_IO_L1_in": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "C_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "C_drain_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "data_pack_factor_inter": "p12",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "PE": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,4),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,16),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t1,4),1)"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t1,4),1)"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "p12",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel0_2.json
================================================
{
    "attr": {
        "A_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "B_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "B_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "B_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(i_t1/i_t2)"
            ],
            "ele_type": "float",
            "num": "(i_t1/i_t2)",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L1_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "A_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "C_drain_IO_L1_out": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "C_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "A_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "i_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "i_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "A_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(j_t1/j_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(k_t1/k_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_A.fifo_A_local.1.4.2(c0, c1, c2, p0, c4, c5, c6, c7, 0, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c5)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p9",
                                                "ele_size": 4,
                                                "last_dim": "k_t1",
                                                "size": "i_t2*k_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_serialize",
                                            "type": "mark"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c3",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.1.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L2_in_inter": {
            "child": {
                "child": {
                    "child": {
                        "data_pack_factor": "p10",
                        "ele_size": 4,
                        "last_dim": "k_t1",
                        "size": "j_t1*k_t1",
                        "type": "array_tile"
                    },
                    "content": "access_serialize",
                    "type": "mark"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "B_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(j_t1/j_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(k_t1/k_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "i_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out_trans.fifo_B.fifo_B_local.1.16.2(c0, c1, c2, 0, c4, c5, c6, c7, 0, 32 * c1 + 2 * c4 + c6, 32 * c2 + 2 * c5)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c5",
                            "type": "for"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "user_expr": "io_module.intra_inter.0.0()"
                            },
                            "type": "user"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(j_t1/j_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "j_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "i_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "in_trans.fifo_C_drain_local.fifo_C_drain.1.4.1(c0, c1, 1, p0, c4, 15, c6, c7, 1, 2 * p0 + 32 * c0 + c7, 32 * c1 + 2 * c4 + c6)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p12",
                                            "ele_size": 4,
                                            "last_dim": "j_t1",
                                            "size": "i_t2*j_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_serialize",
                                        "type": "mark"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(j_t1/j_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(k_t1/k_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "j_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "i_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": [
                                                                {
                                                                    "child": {
                                                                        "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c5)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                {
                                                                    "child": {
                                                                        "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c1 + 2 * c4 + c6, 32 * c2 + 2 * c5)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "k_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": {
                                                                                    "user_expr": "S_0(2 * p0 + 32 * c0 + c7, 32 * c1 + 2 * c4 + c6, 32 * c2 + 2 * c5 + c8)"
                                                                                },
                                                                                "type": "user"
                                                                            },
                                                                            "content": "hls_unroll",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c8",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                {
                                                                    "child": [
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "out.fifo_C_drain.1.1(c0, c1, 1, p0, c4, 15, c6, c7, 2 * p0 + 32 * c0 + c7, 32 * c1 + 2 * c4 + c6)"
                                                                            },
                                                                            "type": "user"
                                                                        }
                                                                    ],
                                                                    "type": "if"
                                                                },
                                                                {
                                                                    "child": {
                                                                        "user_expr": "out.fifo_B.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c1 + 2 * c4 + c6, 32 * c2 + 2 * c5)"
                                                                    },
                                                                    "type": "user"
                                                                }
                                                            ],
                                                            "type": "block"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c5",
                                    "type": "for"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L1_in": {
            "array": "A",
            "buf_size": "(i_t2*k_t1)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "B_IO_L2_in": {
            "array": "B",
            "buf_size": "(j_t1*k_t1)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "C_drain_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "data_pack_factor_inter": "p12",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "PE": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,4),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,16),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t1,4),1)"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t1,4),1)"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "p12",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel1_0.json
================================================
{
    "attr": {
        "A_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "A_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "A_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "B_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(j_t1/j_t2)"
            ],
            "ele_type": "float",
            "num": "(j_t1/j_t2)",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L1_in": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L1_in": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_IO_L1_out": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "C_drain_IO_L1_out": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.1.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "A_IO_L2_in_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p9",
                    "ele_size": 4,
                    "last_dim": "k_t1",
                    "size": "i_t1*k_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "A_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(i_t1/i_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(k_t1/k_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "i_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "j_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out_trans.fifo_A.fifo_A_local.1.16.2(c0, c1, c2, 0, c4, c5, c6, c7, 0, 32 * c0 + 2 * c4 + c6, 32 * c1 + 2 * c5)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c5",
                            "type": "for"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "B_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "j_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "j_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "B_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(k_t1/k_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "i_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "j_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_B.fifo_B_local.1.4.2(c0, c1, c2, p0, c4, c5, c6, c7, 0, 2 * p0 + 32 * c2 + c7, 32 * c1 + 2 * c5)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(j_t1/j_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p10",
                                            "ele_size": 4,
                                            "last_dim": "k_t1",
                                            "size": "j_t2*k_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c3",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "i_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "out_trans.fifo_C.fifo_C_local.1.2.1(c0, 1, c2, p0, c4, 0, c6, c7, 0, 32 * c0 + 2 * c4 + c6, 2 * p0 + 32 * c2 + c7)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "i_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "in_trans.fifo_C_local.fifo_C.1.2.1(c0, 0, c2, p0, c4, 15, c6, c7, 1, 32 * c0 + 2 * c4 + c6, 2 * p0 + 32 * c2 + c7)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t2",
                                        "size": "i_t1*j_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t2",
                                        "size": "i_t1*j_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "user_expr": "io_module.intra_inter.0.0()"
                            },
                            "type": "user"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "i_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "in_trans.fifo_C_drain_local.fifo_C_drain.1.2.1(c0, 1, c2, p0, c4, 15, c6, c7, 1, 32 * c0 + 2 * c4 + c6, 2 * p0 + 32 * c2 + c7)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p12",
                                        "ele_size": 4,
                                        "last_dim": "j_t2",
                                        "size": "i_t1*j_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": [
                                            {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "in.fifo_C.1.1(c0, 1, c2, p0, 32 * c0 + c4, 2 * p0 + c5 + 32)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    {
                                        "bounds": [
                                            "0",
                                            "(i_t1/i_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(k_t1/k_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "i_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "j_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": [
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c0 + 2 * c4 + c6, 32 * c1 + 2 * c5)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 2 * p0 + 32 * c2 + c7, 32 * c1 + 2 * c5)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        {
                                                                            "child": {
                                                                                "bounds": [
                                                                                    "0",
                                                                                    "k_t2"
                                                                                ],
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "user_expr": "S_0(32 * c0 + 2 * c4 + c6, 2 * p0 + 32 * c2 + c7, 32 * c1 + 2 * c5 + c8)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    "content": "hls_unroll",
                                                                                    "type": "mark"
                                                                                },
                                                                                "iterator": "c8",
                                                                                "type": "for"
                                                                            },
                                                                            "content": "simd",
                                                                            "type": "mark"
                                                                        },
                                                                        {
                                                                            "child": [
                                                                                {
                                                                                    "child": {
                                                                                        "user_expr": "out.fifo_C_drain.1.1(c0, 1, c2, p0, c4, 15, c6, c7, 32 * c0 + 2 * c4 + c6, 2 * p0 + 32 * c2 + c7)"
                                                                                    },
                                                                                    "type": "user"
                                                                                }
                                                                            ],
                                                                            "type": "if"
                                                                        },
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "out.fifo_A.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c0 + 2 * c4 + c6, 32 * c1 + 2 * c5)"
                                                                            },
                                                                            "type": "user"
                                                                        }
                                                                    ],
                                                                    "type": "block"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c6",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c7",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c5",
                                            "type": "for"
                                        },
                                        "iterator": "c4",
                                        "type": "for"
                                    },
                                    {
                                        "child": [
                                            {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out.fifo_C.1.1(c0, 0, c2, p0, 32 * c0 + c4, 2 * p0 + c5)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            }
                                        ],
                                        "type": "if"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L2_in": {
            "array": "A",
            "buf_size": "(i_t1*k_t1)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "B_IO_L1_in": {
            "array": "B",
            "buf_size": "(j_t2*k_t1)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "C_IO_L1_in": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "C_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "C_drain_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "data_pack_factor_inter": "p12",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "PE": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,16),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,4),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,4),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,4),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p12",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel1_1.json
================================================
{
    "attr": {
        "A_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "A_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "A_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "B_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(j_t1/j_t2)"
            ],
            "ele_type": "float",
            "num": "(j_t1/j_t2)",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L1_in": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L1_in": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_IO_L1_out": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "C_drain_IO_L1_out": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.1.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "A_IO_L2_in_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p9",
                    "ele_size": 4,
                    "last_dim": "k_t1",
                    "size": "i_t1*k_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "A_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(i_t1/i_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(k_t1/k_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "i_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "j_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out_trans.fifo_A.fifo_A_local.1.16.2(c0, c1, c2, 0, c4, c5, c6, c7, 0, 32 * c2 + 2 * c4 + c6, 32 * c0 + 2 * c5)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c5",
                            "type": "for"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "B_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "B_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "j_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "j_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "B_IO_L1_in_intra": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(k_t1/k_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "i_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "j_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "out_trans.fifo_B.fifo_B_local.1.4.2(c0, c1, c2, p0, c4, c5, c6, c7, 0, 2 * p0 + 32 * c1 + c7, 32 * c0 + 2 * c5)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c5",
                                    "type": "for"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "io_L1",
                        "type": "mark"
                    },
                    "content": "io_L2",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p10",
                                        "ele_size": 4,
                                        "last_dim": "k_t1",
                                        "size": "j_t2*k_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "C_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "i_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "out_trans.fifo_C.fifo_C_local.1.2.1(1, c1, c2, p0, c4, 0, c6, c7, 0, 32 * c2 + 2 * c4 + c6, 2 * p0 + 32 * c1 + c7)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "i_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "in_trans.fifo_C_local.fifo_C.1.2.1(0, c1, c2, p0, c4, 15, c6, c7, 1, 32 * c2 + 2 * c4 + c6, 2 * p0 + 32 * c1 + c7)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t2",
                                        "size": "i_t1*j_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t2",
                                        "size": "i_t1*j_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "user_expr": "io_module.intra_inter.0.0()"
                            },
                            "type": "user"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "i_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "in_trans.fifo_C_drain_local.fifo_C_drain.1.2.1(1, c1, c2, p0, c4, 15, c6, c7, 1, 32 * c2 + 2 * c4 + c6, 2 * p0 + 32 * c1 + c7)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p12",
                                        "ele_size": 4,
                                        "last_dim": "j_t2",
                                        "size": "i_t1*j_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": [
                                            {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "in.fifo_C.1.1(1, c1, c2, p0, c4 + 32, 2 * p0 + 32 * c1 + c5)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    {
                                        "bounds": [
                                            "0",
                                            "(i_t1/i_t2)"
                                        ],
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "(k_t1/k_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "i_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "bounds": [
                                                                "0",
                                                                "j_t2"
                                                            ],
                                                            "child": {
                                                                "child": {
                                                                    "child": [
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c2 + 2 * c4 + c6, 32 * c0 + 2 * c5)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 2 * p0 + 32 * c1 + c7, 32 * c0 + 2 * c5)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        {
                                                                            "child": {
                                                                                "bounds": [
                                                                                    "0",
                                                                                    "k_t2"
                                                                                ],
                                                                                "child": {
                                                                                    "child": {
                                                                                        "child": {
                                                                                            "user_expr": "S_0(32 * c2 + 2 * c4 + c6, 2 * p0 + 32 * c1 + c7, 32 * c0 + 2 * c5 + c8)"
                                                                                        },
                                                                                        "type": "user"
                                                                                    },
                                                                                    "content": "hls_unroll",
                                                                                    "type": "mark"
                                                                                },
                                                                                "iterator": "c8",
                                                                                "type": "for"
                                                                            },
                                                                            "content": "simd",
                                                                            "type": "mark"
                                                                        },
                                                                        {
                                                                            "child": [
                                                                                {
                                                                                    "child": {
                                                                                        "user_expr": "out.fifo_C_drain.1.1(1, c1, c2, p0, c4, 15, c6, c7, 32 * c2 + 2 * c4 + c6, 2 * p0 + 32 * c1 + c7)"
                                                                                    },
                                                                                    "type": "user"
                                                                                }
                                                                            ],
                                                                            "type": "if"
                                                                        },
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "out.fifo_A.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c2 + 2 * c4 + c6, 32 * c0 + 2 * c5)"
                                                                            },
                                                                            "type": "user"
                                                                        }
                                                                    ],
                                                                    "type": "block"
                                                                },
                                                                "content": "hls_pipeline",
                                                                "type": "mark"
                                                            },
                                                            "iterator": "c6",
                                                            "type": "for"
                                                        },
                                                        "content": "latency",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c7",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c5",
                                            "type": "for"
                                        },
                                        "iterator": "c4",
                                        "type": "for"
                                    },
                                    {
                                        "child": [
                                            {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out.fifo_C.1.1(0, c1, c2, p0, c4, 2 * p0 + 32 * c1 + c5)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            }
                                        ],
                                        "type": "if"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L2_in": {
            "array": "A",
            "buf_size": "(i_t1*k_t1)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "B_IO_L1_in": {
            "array": "B",
            "buf_size": "(j_t2*k_t1)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "C_IO_L1_in": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "C_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "C_drain_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "data_pack_factor_inter": "p12",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "PE": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,16),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,4),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,4),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,4),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p12",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel1_2.json
================================================
{
    "attr": {
        "A_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "A_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "A_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "B_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(j_t1/j_t2)"
            ],
            "ele_type": "float",
            "num": "(j_t1/j_t2)",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L1_in": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "C_drain_IO_L1_out": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.1.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "A_IO_L2_in_inter": {
            "child": {
                "child": {
                    "child": {
                        "data_pack_factor": "p9",
                        "ele_size": 4,
                        "last_dim": "k_t1",
                        "size": "i_t1*k_t1",
                        "type": "array_tile"
                    },
                    "content": "access_serialize",
                    "type": "mark"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "A_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(i_t1/i_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(k_t1/k_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "i_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "j_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out_trans.fifo_A.fifo_A_local.1.16.2(c0, c1, c2, 0, c4, c5, c6, c7, 0, 32 * c0 + 2 * c4 + c6, 32 * c2 + 2 * c5)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c5",
                            "type": "for"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "B_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "j_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "j_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "B_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(k_t1/k_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "i_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "j_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_B.fifo_B_local.1.4.2(c0, c1, c2, p0, c4, c5, c6, c7, 0, 2 * p0 + 32 * c1 + c7, 32 * c2 + 2 * c5)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(j_t1/j_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p10",
                                                "ele_size": 4,
                                                "last_dim": "k_t1",
                                                "size": "j_t2*k_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_serialize",
                                            "type": "mark"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c3",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "user_expr": "io_module.intra_inter.0.0()"
                            },
                            "type": "user"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "i_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "in_trans.fifo_C_drain_local.fifo_C_drain.1.2.1(c0, c1, 1, p0, c4, 15, c6, c7, 1, 32 * c0 + 2 * c4 + c6, 2 * p0 + 32 * c1 + c7)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p12",
                                            "ele_size": 4,
                                            "last_dim": "j_t2",
                                            "size": "i_t1*j_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_serialize",
                                        "type": "mark"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(k_t1/k_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "i_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "j_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": [
                                                                {
                                                                    "child": {
                                                                        "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c0 + 2 * c4 + c6, 32 * c2 + 2 * c5)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                {
                                                                    "child": {
                                                                        "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 2 * p0 + 32 * c1 + c7, 32 * c2 + 2 * c5)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "k_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": {
                                                                                    "user_expr": "S_0(32 * c0 + 2 * c4 + c6, 2 * p0 + 32 * c1 + c7, 32 * c2 + 2 * c5 + c8)"
                                                                                },
                                                                                "type": "user"
                                                                            },
                                                                            "content": "hls_unroll",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c8",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                {
                                                                    "child": [
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "out.fifo_C_drain.1.1(c0, c1, 1, p0, c4, 15, c6, c7, 32 * c0 + 2 * c4 + c6, 2 * p0 + 32 * c1 + c7)"
                                                                            },
                                                                            "type": "user"
                                                                        }
                                                                    ],
                                                                    "type": "if"
                                                                },
                                                                {
                                                                    "child": {
                                                                        "user_expr": "out.fifo_A.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c0 + 2 * c4 + c6, 32 * c2 + 2 * c5)"
                                                                    },
                                                                    "type": "user"
                                                                }
                                                            ],
                                                            "type": "block"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c5",
                                    "type": "for"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L2_in": {
            "array": "A",
            "buf_size": "(i_t1*k_t1)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "B_IO_L1_in": {
            "array": "B",
            "buf_size": "(j_t2*k_t1)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "C_drain_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "data_pack_factor_inter": "p12",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "PE": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,16),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,4),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,4),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,4),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p12",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel2_0.json
================================================
{
    "attr": {
        "A_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "B_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(k_t1/k_t2)"
            ],
            "ele_type": "float",
            "num": "(k_t1/k_t2)",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L1_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "A_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L1_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "C_drain_IO_L1_out": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "C_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "A_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(k_t1/k_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "A_IO_L1_in_intra": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(j_t1/j_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "j_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "i_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "out_trans.fifo_A.fifo_A_local.1.2.2(c0, c1, c2, p0, c4, c5, c6, c7, 0, 32 * c0 + 2 * c4 + c7, 2 * p0 + 32 * c1)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c5",
                                    "type": "for"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "io_L1",
                        "type": "mark"
                    },
                    "content": "io_L2",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c1",
            "type": "for"
        },
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(k_t1/k_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p9",
                                        "ele_size": 4,
                                        "last_dim": "k_t2",
                                        "size": "i_t1*k_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(k_t1/k_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "B_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(j_t1/j_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_B.fifo_B_local.1.2.2(c0, c1, c2, p0, c4, c5, c6, c7, 0, 32 * c2 + 2 * c5 + c6, 2 * p0 + 32 * c1)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p10",
                                            "ele_size": 4,
                                            "last_dim": "k_t2",
                                            "size": "j_t1*k_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c3",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": [
                            {
                                "child": {
                                    "user_expr": "io_module.inter_intra.1.1()"
                                },
                                "type": "user"
                            },
                            {
                                "child": {
                                    "user_expr": "io_module.state_handle()"
                                },
                                "type": "user"
                            }
                        ],
                        "type": "block"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_in_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p11",
                    "ele_size": 4,
                    "last_dim": "j_t1",
                    "size": "i_t1*j_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(i_t1/i_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "i_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out_trans.fifo_C.fifo_C_local.1.16.1(c0, 1, c2, 0, c4, c5, c6, c7, 0, 32 * c0 + 2 * c4 + c7, 32 * c2 + 2 * c5 + c6)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c5",
                            "type": "for"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": [
                            {
                                "child": {
                                    "user_expr": "io_module.intra_inter.1.1()"
                                },
                                "type": "user"
                            },
                            {
                                "child": {
                                    "user_expr": "io_module.state_handle()"
                                },
                                "type": "user"
                            }
                        ],
                        "type": "block"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_out_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p11",
                    "ele_size": 4,
                    "last_dim": "j_t1",
                    "size": "i_t1*j_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_IO_L2_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(i_t1/i_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "i_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_C_local.fifo_C.1.16.1(c0, 0, c2, 15, c4, c5, c6, c7, 1, 32 * c0 + 2 * c4 + c7, 32 * c2 + 2 * c5 + c6)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c5",
                            "type": "for"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "user_expr": "io_module.intra_inter.0.0()"
                            },
                            "type": "user"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L1_out_inter": {
            "child": {
                "child": [
                    {
                        "child": {
                            "data_pack_factor": "p12",
                            "ele_size": 4,
                            "last_dim": "j_t1",
                            "size": "i_t1*j_t1",
                            "type": "array_tile"
                        },
                        "content": "access_coalesce",
                        "type": "mark"
                    },
                    {
                        "child": {
                            "data_pack_factor": "p12",
                            "ele_size": 4,
                            "last_dim": "j_t1",
                            "size": "i_t1*j_t1",
                            "type": "array_tile"
                        },
                        "content": "access_coalesce",
                        "type": "mark"
                    }
                ],
                "type": "if"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(j_t1/j_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "in_trans.fifo_C_drain_local.fifo_C_drain.1.4.1(c0, 1, c2, 15, c4, c5, c6, c7, 1, 32 * c0 + 2 * c4 + c7, 32 * c2 + 2 * c5 + c6)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "data_pack_factor": "p12",
                                    "ele_size": 4,
                                    "last_dim": "j_t1",
                                    "size": "i_t1*j_t1",
                                    "type": "array_tile"
                                },
                                "content": "access_coalesce",
                                "type": "mark"
                            },
                            "content": "io_L1",
                            "type": "mark"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(j_t1/j_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "j_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "i_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": [
                                                                {
                                                                    "child": {
                                                                        "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c0 + 2 * c4 + c7, 2 * p0 + 32 * c1)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                {
                                                                    "child": {
                                                                        "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c2 + 2 * c5 + c6, 2 * p0 + 32 * c1)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                {
                                                                    "child": [
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "in.fifo_C.1.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c0 + 2 * c4 + c7, 32 * c2 + 2 * c5 + c6)"
                                                                            },
                                                                            "type": "user"
                                                                        }
                                                                    ],
                                                                    "type": "if"
                                                                },
                                                                {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "k_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": {
                                                                                    "user_expr": "S_0(32 * c0 + 2 * c4 + c7, 32 * c2 + 2 * c5 + c6, 2 * p0 + 32 * c1 + c8)"
                                                                                },
                                                                                "type": "user"
                                                                            },
                                                                            "content": "hls_unroll",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c8",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                {
                                                                    "child": [
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "out.fifo_C_drain.1.1(c0, 1, c2, 15, c4, c5, c6, c7, 32 * c0 + 2 * c4 + c7, 32 * c2 + 2 * c5 + c6)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "out.fifo_C.1.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c0 + 2 * c4 + c7, 32 * c2 + 2 * c5 + c6)"
                                                                            },
                                                                            "type": "user"
                                                                        }
                                                                    ],
                                                                    "type": "if"
                                                                }
                                                            ],
                                                            "type": "block"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c5",
                                    "type": "for"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L1_in": {
            "array": "A",
            "buf_size": "(i_t1*k_t2)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(k_t1/k_t2)"
        },
        "B_IO_L1_in": {
            "array": "B",
            "buf_size": "(j_t1*k_t2)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(k_t1/k_t2)"
        },
        "C_IO_L2_in": {
            "array": "C",
            "buf_size": "(i_t1*j_t1)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "C_IO_L2_out": {
            "array": "C",
            "buf_size": "(i_t1*j_t1)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "C_drain_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t1*j_t1)",
            "data_pack_factor_inter": "p12",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,4),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,4),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t1,16),1)"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t1,4),1)"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "p12",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel2_1.json
================================================
{
    "attr": {
        "A_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "B_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(k_t1/k_t2)"
            ],
            "ele_type": "float",
            "num": "(k_t1/k_t2)",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L1_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "A_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L1_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L2_out": {
            "dims": [
                "1"
            ]
        },
        "C_drain_IO_L1_out": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "C_drain_IO_L2_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "A_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(k_t1/k_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "A_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(j_t1/j_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_A.fifo_A_local.1.2.2(c0, c1, c2, p0, c4, c5, c6, c7, 0, 32 * c2 + 2 * c4 + c7, 2 * p0 + 32 * c0)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p9",
                                            "ele_size": 4,
                                            "last_dim": "k_t2",
                                            "size": "i_t1*k_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c3",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "B_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "B_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(k_t1/k_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "B_IO_L1_in_intra": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(j_t1/j_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "j_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "i_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "out_trans.fifo_B.fifo_B_local.1.2.2(c0, c1, c2, p0, c4, c5, c6, c7, 0, 32 * c1 + 2 * c5 + c6, 2 * p0 + 32 * c0)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c5",
                                    "type": "for"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "io_L1",
                        "type": "mark"
                    },
                    "content": "io_L2",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(k_t1/k_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p10",
                                        "ele_size": 4,
                                        "last_dim": "k_t2",
                                        "size": "j_t1*k_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L1",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "C_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": [
                            {
                                "child": {
                                    "user_expr": "io_module.inter_intra.1.1()"
                                },
                                "type": "user"
                            },
                            {
                                "child": {
                                    "user_expr": "io_module.state_handle()"
                                },
                                "type": "user"
                            }
                        ],
                        "type": "block"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L2_in_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p11",
                    "ele_size": 4,
                    "last_dim": "j_t1",
                    "size": "i_t1*j_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(i_t1/i_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "i_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "out_trans.fifo_C.fifo_C_local.1.16.1(1, c1, c2, 0, c4, c5, c6, c7, 0, 32 * c2 + 2 * c4 + c7, 32 * c1 + 2 * c5 + c6)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c5",
                            "type": "for"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": [
                            {
                                "child": {
                                    "user_expr": "io_module.intra_inter.1.1()"
                                },
                                "type": "user"
                            },
                            {
                                "child": {
                                    "user_expr": "io_module.state_handle()"
                                },
                                "type": "user"
                            }
                        ],
                        "type": "block"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L2_out_inter": {
            "child": {
                "child": {
                    "data_pack_factor": "p11",
                    "ele_size": 4,
                    "last_dim": "j_t1",
                    "size": "i_t1*j_t1",
                    "type": "array_tile"
                },
                "content": "access_coalesce",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_IO_L2_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(i_t1/i_t2)"
                        ],
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "i_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "user_expr": "in_trans.fifo_C_local.fifo_C.1.16.1(0, c1, c2, 15, c4, c5, c6, c7, 1, 32 * c2 + 2 * c4 + c7, 32 * c1 + 2 * c5 + c6)"
                                                        },
                                                        "type": "user"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "content": "simd",
                                                "type": "mark"
                                            },
                                            "iterator": "c6",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c7",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c5",
                            "type": "for"
                        },
                        "iterator": "c4",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "user_expr": "io_module.intra_inter.0.0()"
                            },
                            "type": "user"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_drain_IO_L1_out_inter": {
            "child": {
                "child": [
                    {
                        "child": {
                            "data_pack_factor": "p12",
                            "ele_size": 4,
                            "last_dim": "j_t1",
                            "size": "i_t1*j_t1",
                            "type": "array_tile"
                        },
                        "content": "access_coalesce",
                        "type": "mark"
                    },
                    {
                        "child": {
                            "data_pack_factor": "p12",
                            "ele_size": 4,
                            "last_dim": "j_t1",
                            "size": "i_t1*j_t1",
                            "type": "array_tile"
                        },
                        "content": "access_coalesce",
                        "type": "mark"
                    }
                ],
                "type": "if"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(j_t1/j_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "in_trans.fifo_C_drain_local.fifo_C_drain.1.4.1(1, c1, c2, 15, c4, c5, c6, c7, 1, 32 * c2 + 2 * c4 + c7, 32 * c1 + 2 * c5 + c6)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "data_pack_factor": "p12",
                                    "ele_size": 4,
                                    "last_dim": "j_t1",
                                    "size": "i_t1*j_t1",
                                    "type": "array_tile"
                                },
                                "content": "access_coalesce",
                                "type": "mark"
                            },
                            "content": "io_L1",
                            "type": "mark"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(j_t1/j_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "j_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "i_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": [
                                                                {
                                                                    "child": {
                                                                        "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c2 + 2 * c4 + c7, 2 * p0 + 32 * c0)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                {
                                                                    "child": {
                                                                        "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c1 + 2 * c5 + c6, 2 * p0 + 32 * c0)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                {
                                                                    "child": [
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "in.fifo_C.1.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c2 + 2 * c4 + c7, 32 * c1 + 2 * c5 + c6)"
                                                                            },
                                                                            "type": "user"
                                                                        }
                                                                    ],
                                                                    "type": "if"
                                                                },
                                                                {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "k_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": {
                                                                                    "user_expr": "S_0(32 * c2 + 2 * c4 + c7, 32 * c1 + 2 * c5 + c6, 2 * p0 + 32 * c0 + c8)"
                                                                                },
                                                                                "type": "user"
                                                                            },
                                                                            "content": "hls_unroll",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c8",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                {
                                                                    "child": [
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "out.fifo_C_drain.1.1(1, c1, c2, 15, c4, c5, c6, c7, 32 * c2 + 2 * c4 + c7, 32 * c1 + 2 * c5 + c6)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "out.fifo_C.1.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c2 + 2 * c4 + c7, 32 * c1 + 2 * c5 + c6)"
                                                                            },
                                                                            "type": "user"
                                                                        }
                                                                    ],
                                                                    "type": "if"
                                                                }
                                                            ],
                                                            "type": "block"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c5",
                                    "type": "for"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L1_in": {
            "array": "A",
            "buf_size": "(i_t1*k_t2)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(k_t1/k_t2)"
        },
        "B_IO_L1_in": {
            "array": "B",
            "buf_size": "(j_t1*k_t2)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(k_t1/k_t2)"
        },
        "C_IO_L2_in": {
            "array": "C",
            "buf_size": "(i_t1*j_t1)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "C_IO_L2_out": {
            "array": "C",
            "buf_size": "(i_t1*j_t1)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        },
        "C_drain_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t1*j_t1)",
            "data_pack_factor_inter": "p12",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,4),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,4),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t1,16),1)"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t1,4),1)"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "p12",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel2_2.json
================================================
{
    "attr": {
        "A_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "B_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "C_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 1
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(k_t1/k_t2)"
            ],
            "ele_type": "float",
            "num": "(k_t1/k_t2)",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L1_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "A_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L1_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L2_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "A_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(k_t1/k_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "A_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(j_t1/j_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_A.fifo_A_local.1.2.2(c0, c1, c2, p0, c4, c5, c6, c7, 0, 32 * c0 + 2 * c4 + c7, 2 * p0 + 32 * c2)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p9",
                                                "ele_size": 4,
                                                "last_dim": "k_t2",
                                                "size": "i_t1*k_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_serialize",
                                            "type": "mark"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c3",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(k_t1/k_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "B_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "(j_t1/j_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_B.fifo_B_local.1.2.2(c0, c1, c2, p0, c4, c5, c6, c7, 0, 32 * c1 + 2 * c5 + c6, 2 * p0 + 32 * c2)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "iterator": "c4",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p10",
                                                "ele_size": 4,
                                                "last_dim": "k_t2",
                                                "size": "j_t1*k_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_serialize",
                                            "type": "mark"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c3",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": [
                        {
                            "child": {
                                "user_expr": "io_module.intra_inter.1.1()"
                            },
                            "type": "user"
                        },
                        {
                            "child": {
                                "user_expr": "io_module.state_handle()"
                            },
                            "type": "user"
                        }
                    ],
                    "type": "block"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_out_inter": {
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "data_pack_factor": "p11",
                            "ele_size": 4,
                            "last_dim": "j_t1",
                            "size": "i_t1*j_t1",
                            "type": "array_tile"
                        },
                        "content": "access_serialize",
                        "type": "mark"
                    },
                    "content": "access_coalesce",
                    "type": "mark"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "content": "array",
            "type": "mark"
        },
        "C_IO_L2_out_intra": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(j_t1/j_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "j_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "i_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "in_trans_reduce_+.fifo_C_local.fifo_C.1.16.1(c0, c1, c2, 15, c4, c5, c6, c7, 1, 32 * c0 + 2 * c4 + c7, 32 * c1 + 2 * c5 + c6)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c5",
                                    "type": "for"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "io_L1",
                        "type": "mark"
                    },
                    "content": "io_L2",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c2",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(j_t1/j_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "j_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "i_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": [
                                                                {
                                                                    "child": {
                                                                        "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c0 + 2 * c4 + c7, 2 * p0 + 32 * c2)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                {
                                                                    "child": {
                                                                        "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c1 + 2 * c5 + c6, 2 * p0 + 32 * c2)"
                                                                    },
                                                                    "type": "user"
                                                                },
                                                                {
                                                                    "child": [
                                                                        {
                                                                            "child": {
                                                                                "user_expr": "in.fifo_C.1.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c0 + 2 * c4 + c7, 32 * c1 + 2 * c5 + c6)"
                                                                            },
                                                                            "type": "user"
                                                                        }
                                                                    ],
                                                                    "type": "if"
                                                                },
                                                                {
                                                                    "child": {
                                                                        "bounds": [
                                                                            "0",
                                                                            "k_t2"
                                                                        ],
                                                                        "child": {
                                                                            "child": {
                                                                                "child": {
                                                                                    "user_expr": "S_0(32 * c0 + 2 * c4 + c7, 32 * c1 + 2 * c5 + c6, 2 * p0 + 32 * c2 + c8)"
                                                                                },
                                                                                "type": "user"
                                                                            },
                                                                            "content": "hls_unroll",
                                                                            "type": "mark"
                                                                        },
                                                                        "iterator": "c8",
                                                                        "type": "for"
                                                                    },
                                                                    "content": "simd",
                                                                    "type": "mark"
                                                                },
                                                                {
                                                                    "child": {
                                                                        "user_expr": "out.fifo_C.1.1(c0, c1, c2, p0, c4, c5, c6, c7, 32 * c0 + 2 * c4 + c7, 32 * c1 + 2 * c5 + c6)"
                                                                    },
                                                                    "type": "user"
                                                                }
                                                            ],
                                                            "type": "block"
                                                        },
                                                        "content": "hls_pipeline",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c5",
                                    "type": "for"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L1_in": {
            "array": "A",
            "buf_size": "(i_t1*k_t2)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(k_t1/k_t2)"
        },
        "B_IO_L1_in": {
            "array": "B",
            "buf_size": "(j_t1*k_t2)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(k_t1/k_t2)"
        },
        "C_IO_L2_out": {
            "array": "C",
            "buf_size": "(i_t1*j_t1)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "1"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,4),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,4),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t1,16),1)"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel3_0.json
================================================
{
    "attr": {
        "A_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "B_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "C_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "C_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "C_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(i_t1/i_t2)",
                "(j_t1/j_t2)"
            ],
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(j_t1/j_t2))",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "A_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "B_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L1_in": {
            "dims": [
                "(j_t1/j_t2)",
                "(i_t1/i_t2)"
            ]
        },
        "C_IO_L1_out": {
            "dims": [
                "(j_t1/j_t2)",
                "(i_t1/i_t2)"
            ]
        },
        "C_IO_L2_in": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_IO_L2_out": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "C_drain_IO_L1_out": {
            "dims": [
                "(j_t1/j_t2)",
                "(i_t1/i_t2)"
            ]
        },
        "C_drain_IO_L2_out": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "A_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "i_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "i_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "A_IO_L2_in_intra": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(k_t1/k_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "j_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "i_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "out_trans.fifo_A.fifo_A_local.1.16.2(c0, c1, c2, p0, 0, c5, c6, c7, 0, 2 * p0 + 32 * c0 + c7, 32 * c1 + 2 * c5)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c5",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "io_L1",
                            "type": "mark"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "io_L3",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c1",
            "type": "for"
        },
        "A_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p9",
                                        "ele_size": 4,
                                        "last_dim": "k_t1",
                                        "size": "i_t2*k_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "j_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "j_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c4",
            "type": "for"
        },
        "B_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(k_t1/k_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_B.fifo_B_local.1.16.2(c0, c1, c2, p0, 0, c5, c6, c7, 0, 2 * p0 + 32 * c2 + c6, 32 * c1 + 2 * c5)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "B_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(j_t1/j_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p10",
                                            "ele_size": 4,
                                            "last_dim": "k_t1",
                                            "size": "j_t2*k_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t2*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t2*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L1_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "j_t2"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "i_t2"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "user_expr": "out_trans.fifo_C.fifo_C_local.1.2.1(c0, 1, c2, p0, p1, 0, c6, c7, 0, 2 * p1 + 32 * c0 + c7, 2 * p0 + 32 * c2 + c6)"
                                            },
                                            "type": "user"
                                        },
                                        "content": "hls_pipeline",
                                        "type": "mark"
                                    },
                                    "content": "simd",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "latency",
                    "type": "mark"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t2*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t2*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L1_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "j_t2"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "i_t2"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "user_expr": "in_trans.fifo_C_local.fifo_C.1.2.1(c0, 0, c2, p0, p1, 15, c6, c7, 1, 2 * p1 + 32 * c0 + c7, 2 * p0 + 32 * c2 + c6)"
                                            },
                                            "type": "user"
                                        },
                                        "content": "hls_pipeline",
                                        "type": "mark"
                                    },
                                    "content": "simd",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "latency",
                    "type": "mark"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p11",
                                                        "ele_size": 4,
                                                        "last_dim": "j_t2",
                                                        "size": "i_t2*j_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c3",
                                            "type": "for"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p11",
                                                        "ele_size": 4,
                                                        "last_dim": "j_t2",
                                                        "size": "i_t2*j_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c3",
                                            "type": "for"
                                        }
                                    ],
                                    "type": "if"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p11",
                                                        "ele_size": 4,
                                                        "last_dim": "j_t2",
                                                        "size": "i_t2*j_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c3",
                                            "type": "for"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p11",
                                                        "ele_size": 4,
                                                        "last_dim": "j_t2",
                                                        "size": "i_t2*j_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c3",
                                            "type": "for"
                                        }
                                    ],
                                    "type": "if"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p11",
                                                "ele_size": 4,
                                                "last_dim": "j_t2",
                                                "size": "i_t2*j_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c3",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p11",
                                                "ele_size": 4,
                                                "last_dim": "j_t2",
                                                "size": "i_t2*j_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c3",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.0()"
                                },
                                "type": "user"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t2*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t2*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "j_t2"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "i_t2"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "user_expr": "in_trans.fifo_C_drain_local.fifo_C_drain.1.2.1(c0, 1, c2, p0, p1, 15, c6, c7, 1, 2 * p1 + 32 * c0 + c7, 2 * p0 + 32 * c2 + c6)"
                                            },
                                            "type": "user"
                                        },
                                        "content": "hls_pipeline",
                                        "type": "mark"
                                    },
                                    "content": "simd",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "latency",
                    "type": "mark"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p12",
                                                        "ele_size": 4,
                                                        "last_dim": "j_t2",
                                                        "size": "i_t2*j_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c3",
                                            "type": "for"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p12",
                                                        "ele_size": 4,
                                                        "last_dim": "j_t2",
                                                        "size": "i_t2*j_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c3",
                                            "type": "for"
                                        }
                                    ],
                                    "type": "if"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p12",
                                                "ele_size": 4,
                                                "last_dim": "j_t2",
                                                "size": "i_t2*j_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c3",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": [
                                            {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "in.fifo_C.1.1(c0, 1, c2, p0, p1, 2 * p0 + 32 * c0 + c5, 2 * p1 + c6 + 32)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    {
                                        "bounds": [
                                            "0",
                                            "(k_t1/k_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "j_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "i_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": [
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c0 + c7, 32 * c1 + 2 * c5)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p1 + 32 * c2 + c6, 32 * c1 + 2 * c5)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "k_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": {
                                                                                        "user_expr": "S_0(2 * p0 + 32 * c0 + c7, 2 * p1 + 32 * c2 + c6, 32 * c1 + 2 * c5 + c8)"
                                                                                    },
                                                                                    "type": "user"
                                                                                },
                                                                                "content": "hls_unroll",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c8",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "simd",
                                                                        "type": "mark"
                                                                    },
                                                                    {
                                                                        "child": [
                                                                            {
                                                                                "child": {
                                                                                    "user_expr": "out.fifo_C_drain.1.1(c0, 1, c2, p0, p1, 15, c6, c7, 2 * p0 + 32 * c0 + c7, 2 * p1 + 32 * c2 + c6)"
                                                                                },
                                                                                "type": "user"
                                                                            }
                                                                        ],
                                                                        "type": "if"
                                                                    },
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "out.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p1 + 32 * c2 + c6, 32 * c1 + 2 * c5)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "out.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c0 + c7, 32 * c1 + 2 * c5)"
                                                                        },
                                                                        "type": "user"
                                                                    }
                                                                ],
                                                                "type": "block"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c6",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c7",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c5",
                                        "type": "for"
                                    },
                                    {
                                        "child": [
                                            {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out.fifo_C.1.1(c0, 0, c2, p0, p1, 2 * p0 + 32 * c0 + c5, 2 * p1 + c6)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            }
                                        ],
                                        "type": "if"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L2_in": {
            "array": "A",
            "buf_size": "(i_t2*k_t1)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "B_IO_L2_in": {
            "array": "B",
            "buf_size": "(j_t2*k_t1)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "C_IO_L1_in": {
            "array": "C",
            "buf_size": "(i_t2*j_t2)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((j_t1/j_t2)*(i_t1/i_t2))"
        },
        "C_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t2*j_t2)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((j_t1/j_t2)*(i_t1/i_t2))"
        },
        "C_drain_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t2*j_t2)",
            "data_pack_factor_inter": "p12",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((j_t1/j_t2)*(i_t1/i_t2))"
        },
        "PE": {
            "array": "C",
            "buf_size": "(i_t2*j_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(j_t1/j_t2))"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,16),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,16),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,4),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,4),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p12",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel3_1.json
================================================
{
    "attr": {
        "A_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "B_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L1_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "C_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "C_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "C_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(i_t1/i_t2)",
                "(j_t1/j_t2)"
            ],
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(j_t1/j_t2))",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "A_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "B_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L1_in": {
            "dims": [
                "(j_t1/j_t2)",
                "(i_t1/i_t2)"
            ]
        },
        "C_IO_L1_out": {
            "dims": [
                "(j_t1/j_t2)",
                "(i_t1/i_t2)"
            ]
        },
        "C_IO_L2_in": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_IO_L2_out": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "C_drain_IO_L1_out": {
            "dims": [
                "(j_t1/j_t2)",
                "(i_t1/i_t2)"
            ]
        },
        "C_drain_IO_L2_out": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "A_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "i_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "i_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "A_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(k_t1/k_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_A.fifo_A_local.1.16.2(c0, c1, c2, p0, 0, c5, c6, c7, 0, 2 * p0 + 32 * c2 + c7, 32 * c0 + 2 * c5)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "A_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p9",
                                            "ele_size": 4,
                                            "last_dim": "k_t1",
                                            "size": "i_t2*k_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c3",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "B_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "j_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "j_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c4",
            "type": "for"
        },
        "B_IO_L2_in_intra": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(k_t1/k_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "j_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "i_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "out_trans.fifo_B.fifo_B_local.1.16.2(c0, c1, c2, p0, 0, c5, c6, c7, 0, 2 * p0 + 32 * c1 + c6, 32 * c0 + 2 * c5)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c5",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "io_L1",
                            "type": "mark"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "io_L3",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p10",
                                        "ele_size": 4,
                                        "last_dim": "k_t1",
                                        "size": "j_t2*k_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "C_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t2*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t2*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L1_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "j_t2"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "i_t2"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "user_expr": "out_trans.fifo_C.fifo_C_local.1.2.1(1, c1, c2, p0, p1, 0, c6, c7, 0, 2 * p1 + 32 * c2 + c7, 2 * p0 + 32 * c1 + c6)"
                                            },
                                            "type": "user"
                                        },
                                        "content": "hls_pipeline",
                                        "type": "mark"
                                    },
                                    "content": "simd",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "latency",
                    "type": "mark"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t2*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t2*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L1_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "j_t2"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "i_t2"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "user_expr": "in_trans.fifo_C_local.fifo_C.1.2.1(0, c1, c2, p0, p1, 15, c6, c7, 1, 2 * p1 + 32 * c2 + c7, 2 * p0 + 32 * c1 + c6)"
                                            },
                                            "type": "user"
                                        },
                                        "content": "hls_pipeline",
                                        "type": "mark"
                                    },
                                    "content": "simd",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "latency",
                    "type": "mark"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p11",
                                                        "ele_size": 4,
                                                        "last_dim": "j_t2",
                                                        "size": "i_t2*j_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c3",
                                            "type": "for"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p11",
                                                        "ele_size": 4,
                                                        "last_dim": "j_t2",
                                                        "size": "i_t2*j_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c3",
                                            "type": "for"
                                        }
                                    ],
                                    "type": "if"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p11",
                                                        "ele_size": 4,
                                                        "last_dim": "j_t2",
                                                        "size": "i_t2*j_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c3",
                                            "type": "for"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p11",
                                                        "ele_size": 4,
                                                        "last_dim": "j_t2",
                                                        "size": "i_t2*j_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c3",
                                            "type": "for"
                                        }
                                    ],
                                    "type": "if"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p11",
                                                "ele_size": 4,
                                                "last_dim": "j_t2",
                                                "size": "i_t2*j_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c3",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p11",
                                                "ele_size": 4,
                                                "last_dim": "j_t2",
                                                "size": "i_t2*j_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c3",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.0()"
                                },
                                "type": "user"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t2*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t2*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "j_t2"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "i_t2"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "user_expr": "in_trans.fifo_C_drain_local.fifo_C_drain.1.2.1(1, c1, c2, p0, p1, 15, c6, c7, 1, 2 * p1 + 32 * c2 + c7, 2 * p0 + 32 * c1 + c6)"
                                            },
                                            "type": "user"
                                        },
                                        "content": "hls_pipeline",
                                        "type": "mark"
                                    },
                                    "content": "simd",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "latency",
                    "type": "mark"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p12",
                                                        "ele_size": 4,
                                                        "last_dim": "j_t2",
                                                        "size": "i_t2*j_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c3",
                                            "type": "for"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p12",
                                                        "ele_size": 4,
                                                        "last_dim": "j_t2",
                                                        "size": "i_t2*j_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c3",
                                            "type": "for"
                                        }
                                    ],
                                    "type": "if"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p12",
                                                "ele_size": 4,
                                                "last_dim": "j_t2",
                                                "size": "i_t2*j_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c3",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": [
                                            {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "in.fifo_C.1.1(1, c1, c2, p0, p1, 2 * p0 + c5 + 32, 2 * p1 + 32 * c1 + c6)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    {
                                        "bounds": [
                                            "0",
                                            "(k_t1/k_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "j_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "bounds": [
                                                            "0",
                                                            "i_t2"
                                                        ],
                                                        "child": {
                                                            "child": {
                                                                "child": [
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c2 + c7, 32 * c0 + 2 * c5)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p1 + 32 * c1 + c6, 32 * c0 + 2 * c5)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    {
                                                                        "child": {
                                                                            "bounds": [
                                                                                "0",
                                                                                "k_t2"
                                                                            ],
                                                                            "child": {
                                                                                "child": {
                                                                                    "child": {
                                                                                        "user_expr": "S_0(2 * p0 + 32 * c2 + c7, 2 * p1 + 32 * c1 + c6, 32 * c0 + 2 * c5 + c8)"
                                                                                    },
                                                                                    "type": "user"
                                                                                },
                                                                                "content": "hls_unroll",
                                                                                "type": "mark"
                                                                            },
                                                                            "iterator": "c8",
                                                                            "type": "for"
                                                                        },
                                                                        "content": "simd",
                                                                        "type": "mark"
                                                                    },
                                                                    {
                                                                        "child": [
                                                                            {
                                                                                "child": {
                                                                                    "user_expr": "out.fifo_C_drain.1.1(1, c1, c2, p0, p1, 15, c6, c7, 2 * p0 + 32 * c2 + c7, 2 * p1 + 32 * c1 + c6)"
                                                                                },
                                                                                "type": "user"
                                                                            }
                                                                        ],
                                                                        "type": "if"
                                                                    },
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "out.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p1 + 32 * c1 + c6, 32 * c0 + 2 * c5)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "out.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c2 + c7, 32 * c0 + 2 * c5)"
                                                                        },
                                                                        "type": "user"
                                                                    }
                                                                ],
                                                                "type": "block"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "iterator": "c6",
                                                        "type": "for"
                                                    },
                                                    "content": "latency",
                                                    "type": "mark"
                                                },
                                                "iterator": "c7",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c5",
                                        "type": "for"
                                    },
                                    {
                                        "child": [
                                            {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out.fifo_C.1.1(0, c1, c2, p0, p1, 2 * p0 + c5, 2 * p1 + 32 * c1 + c6)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            }
                                        ],
                                        "type": "if"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L2_in": {
            "array": "A",
            "buf_size": "(i_t2*k_t1)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "B_IO_L2_in": {
            "array": "B",
            "buf_size": "(j_t2*k_t1)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "C_IO_L1_in": {
            "array": "C",
            "buf_size": "(i_t2*j_t2)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((j_t1/j_t2)*(i_t1/i_t2))"
        },
        "C_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t2*j_t2)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((j_t1/j_t2)*(i_t1/i_t2))"
        },
        "C_drain_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t2*j_t2)",
            "data_pack_factor_inter": "p12",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((j_t1/j_t2)*(i_t1/i_t2))"
        },
        "PE": {
            "array": "C",
            "buf_size": "(i_t2*j_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(j_t1/j_t2))"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,16),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,16),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,4),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,4),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p12",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel3_2.json
================================================
{
    "attr": {
        "A_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "B_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "C_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(i_t1/i_t2)",
                "(j_t1/j_t2)"
            ],
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(j_t1/j_t2))",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "A_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "B_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "C_drain_IO_L1_out": {
            "dims": [
                "(j_t1/j_t2)",
                "(i_t1/i_t2)"
            ]
        },
        "C_drain_IO_L2_out": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "A_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "i_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "i_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "A_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(k_t1/k_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_A.fifo_A_local.1.16.2(c0, c1, c2, p0, 0, c5, c6, c7, 0, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c5)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "A_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p9",
                                                "ele_size": 4,
                                                "last_dim": "k_t1",
                                                "size": "i_t2*k_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_serialize",
                                            "type": "mark"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c3",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "j_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t1",
                                "size": "j_t2*k_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c4",
            "type": "for"
        },
        "B_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(k_t1/k_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_B.fifo_B_local.1.16.2(c0, c1, c2, p0, 0, c5, c6, c7, 0, 2 * p0 + 32 * c1 + c6, 32 * c2 + 2 * c5)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "B_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(j_t1/j_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p10",
                                                "ele_size": 4,
                                                "last_dim": "k_t1",
                                                "size": "j_t2*k_t1",
                                                "type": "array_tile"
                                            },
                                            "content": "access_serialize",
                                            "type": "mark"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L1_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "user_expr": "io_module.intra_inter.0.0()"
                                },
                                "type": "user"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t2*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t2*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "j_t2"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "i_t2"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "user_expr": "in_trans.fifo_C_drain_local.fifo_C_drain.1.2.1(c0, c1, 1, p0, p1, 15, c6, c7, 1, 2 * p1 + 32 * c0 + c7, 2 * p0 + 32 * c1 + c6)"
                                            },
                                            "type": "user"
                                        },
                                        "content": "hls_pipeline",
                                        "type": "mark"
                                    },
                                    "content": "simd",
                                    "type": "mark"
                                },
                                "iterator": "c6",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c7",
                        "type": "for"
                    },
                    "content": "latency",
                    "type": "mark"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p12",
                                                        "ele_size": 4,
                                                        "last_dim": "j_t2",
                                                        "size": "i_t2*j_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c3",
                                            "type": "for"
                                        },
                                        {
                                            "bounds": [
                                                "0",
                                                "(i_t1/i_t2)"
                                            ],
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p12",
                                                        "ele_size": 4,
                                                        "last_dim": "j_t2",
                                                        "size": "i_t2*j_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_coalesce",
                                                    "type": "mark"
                                                },
                                                "content": "io_L1",
                                                "type": "mark"
                                            },
                                            "iterator": "c3",
                                            "type": "for"
                                        }
                                    ],
                                    "type": "if"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p12",
                                                    "ele_size": 4,
                                                    "last_dim": "j_t2",
                                                    "size": "i_t2*j_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_serialize",
                                                "type": "mark"
                                            },
                                            "content": "access_coalesce",
                                            "type": "mark"
                                        },
                                        "content": "io_L1",
                                        "type": "mark"
                                    },
                                    "iterator": "c3",
                                    "type": "for"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "j_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "i_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": [
                                                            {
                                                                "child": {
                                                                    "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c5)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p1 + 32 * c1 + c6, 32 * c2 + 2 * c5)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "k_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "S_0(2 * p0 + 32 * c0 + c7, 2 * p1 + 32 * c1 + c6, 32 * c2 + 2 * c5 + c8)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_unroll",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c8",
                                                                    "type": "for"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            {
                                                                "child": [
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "out.fifo_C_drain.1.1(c0, c1, 1, p0, p1, 15, c6, c7, 2 * p0 + 32 * c0 + c7, 2 * p1 + 32 * c1 + c6)"
                                                                        },
                                                                        "type": "user"
                                                                    }
                                                                ],
                                                                "type": "if"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "out.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p1 + 32 * c1 + c6, 32 * c2 + 2 * c5)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "out.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c5)"
                                                                },
                                                                "type": "user"
                                                            }
                                                        ],
                                                        "type": "block"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c7",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c5",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L2_in": {
            "array": "A",
            "buf_size": "(i_t2*k_t1)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "B_IO_L2_in": {
            "array": "B",
            "buf_size": "(j_t2*k_t1)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "C_drain_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t2*j_t2)",
            "data_pack_factor_inter": "p12",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((j_t1/j_t2)*(i_t1/i_t2))"
        },
        "PE": {
            "array": "C",
            "buf_size": "(i_t2*j_t2)",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(j_t1/j_t2))"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,16),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t1,16),k_t2)"
            ],
            "divisors": [
                "k_t1"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,4),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,4),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p12",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel4_0.json
================================================
{
    "attr": {
        "A_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "A_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "B_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "C_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(i_t1/i_t2)",
                "(k_t1/k_t2)"
            ],
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(k_t1/k_t2))",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L1_in": {
            "dims": [
                "(k_t1/k_t2)",
                "(i_t1/i_t2)"
            ]
        },
        "A_IO_L2_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "A_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "B_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "C_IO_L2_out": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "C_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "C_drain_IO_L1_out": {
            "dims": [
                "(k_t1/k_t2)",
                "(i_t1/i_t2)"
            ]
        },
        "C_drain_IO_L2_out": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "C_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "A_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t2*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t2*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "A_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(j_t1/j_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "j_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "i_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "out_trans.fifo_A.fifo_A_local.1.2.2(c0, c1, c2, p0, p1, c5, c6, c7, 0, 2 * p1 + 32 * c0 + c7, 2 * p0 + 32 * c1)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c5",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(i_t1/i_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p9",
                                                            "ele_size": 4,
                                                            "last_dim": "k_t2",
                                                            "size": "i_t2*k_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c3",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(i_t1/i_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p9",
                                                            "ele_size": 4,
                                                            "last_dim": "k_t2",
                                                            "size": "i_t2*k_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c3",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "A_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(i_t1/i_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p9",
                                                    "ele_size": 4,
                                                    "last_dim": "k_t2",
                                                    "size": "i_t2*k_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c3",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(k_t1/k_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c4",
            "type": "for"
        },
        "B_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(j_t1/j_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_B.fifo_B_local.1.2.2(c0, c1, c2, p0, 0, c5, c6, c7, 0, 32 * c2 + 2 * c5 + c6, 2 * p0 + 32 * c1)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "B_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p10",
                                            "ele_size": 4,
                                            "last_dim": "k_t2",
                                            "size": "j_t1*k_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(j_t1/j_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_C.fifo_C_local.1.16.1(c0, 1, c2, p0, 0, c5, c6, c7, 0, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c5 + c6)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_out_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L2_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(j_t1/j_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "in_trans.fifo_C_local.fifo_C.1.16.1(c0, 0, c2, p0, 15, c5, c6, c7, 1, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c5 + c6)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t1",
                                        "size": "i_t2*j_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t1",
                                        "size": "i_t2*j_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L1_out": {
            "child": [
                {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((j/j_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.0()"
                                        },
                                        "type": "user"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c1",
                        "type": "for"
                    },
                    "iterator": "c0",
                    "type": "for"
                }
            ],
            "type": "if"
        },
        "C_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(j_t1/j_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "j_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "i_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "in_trans.fifo_C_drain_local.fifo_C_drain.1.4.1(c0, 1, c2, 15, p1, c5, c6, c7, 1, 2 * p1 + 32 * c0 + c7, 32 * c2 + 2 * c5 + c6)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c5",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "bounds": [
                                            "0",
                                            "(i_t1/i_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p12",
                                                    "ele_size": 4,
                                                    "last_dim": "j_t1",
                                                    "size": "i_t2*j_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c3",
                                        "type": "for"
                                    },
                                    {
                                        "bounds": [
                                            "0",
                                            "(i_t1/i_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p12",
                                                    "ele_size": 4,
                                                    "last_dim": "j_t1",
                                                    "size": "i_t2*j_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c3",
                                        "type": "for"
                                    }
                                ],
                                "type": "if"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p12",
                                            "ele_size": 4,
                                            "last_dim": "j_t1",
                                            "size": "i_t2*j_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c3",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(j_t1/j_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "j_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "i_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": [
                                                            {
                                                                "child": {
                                                                    "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c0 + c7, 2 * p1 + 32 * c1)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c2 + 2 * c5 + c6, 2 * p1 + 32 * c1)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": [
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "in.fifo_C.1.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c5 + c6)"
                                                                        },
                                                                        "type": "user"
                                                                    }
                                                                ],
                                                                "type": "if"
                                                            },
                                                            {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "k_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "S_0(2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c5 + c6, 2 * p1 + 32 * c1 + c8)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_unroll",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c8",
                                                                    "type": "for"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            {
                                                                "child": [
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "out.fifo_C_drain.1.1(c0, 1, c2, p0, 15, c5, c6, c7, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c5 + c6)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "out.fifo_C.1.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c0 + c7, 32 * c2 + 2 * c5 + c6)"
                                                                        },
                                                                        "type": "user"
                                                                    }
                                                                ],
                                                                "type": "if"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "out.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c2 + 2 * c5 + c6, 2 * p1 + 32 * c1)"
                                                                },
                                                                "type": "user"
                                                            }
                                                        ],
                                                        "type": "block"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c7",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c5",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L1_in": {
            "array": "A",
            "buf_size": "(i_t2*k_t2)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((k_t1/k_t2)*(i_t1/i_t2))"
        },
        "B_IO_L2_in": {
            "array": "B",
            "buf_size": "(j_t1*k_t2)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(k_t1/k_t2)"
        },
        "C_IO_L2_in": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "C_IO_L2_out": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "C_drain_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "data_pack_factor_inter": "p12",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,4),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,16),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t1,16),1)"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t1,4),1)"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "p12",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel4_1.json
================================================
{
    "attr": {
        "A_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "A_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "B_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "C_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(i_t1/i_t2)",
                "(k_t1/k_t2)"
            ],
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(k_t1/k_t2))",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L1_in": {
            "dims": [
                "(k_t1/k_t2)",
                "(i_t1/i_t2)"
            ]
        },
        "A_IO_L2_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "A_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "B_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L2_in": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "C_IO_L2_out": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "C_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "C_drain_IO_L1_out": {
            "dims": [
                "(k_t1/k_t2)",
                "(i_t1/i_t2)"
            ]
        },
        "C_drain_IO_L2_out": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "C_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "A_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t2*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t2*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "A_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(j_t1/j_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "j_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "i_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "out_trans.fifo_A.fifo_A_local.1.2.2(c0, c1, c2, p0, p1, c5, c6, c7, 0, 2 * p1 + 32 * c2 + c7, 2 * p0 + 32 * c0)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c5",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(i_t1/i_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p9",
                                                            "ele_size": 4,
                                                            "last_dim": "k_t2",
                                                            "size": "i_t2*k_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c3",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(i_t1/i_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p9",
                                                            "ele_size": 4,
                                                            "last_dim": "k_t2",
                                                            "size": "i_t2*k_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c3",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "A_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(i_t1/i_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p9",
                                                    "ele_size": 4,
                                                    "last_dim": "k_t2",
                                                    "size": "i_t2*k_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c3",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "B_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(k_t1/k_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c4",
            "type": "for"
        },
        "B_IO_L2_in_intra": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(j_t1/j_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "j_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "i_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "out_trans.fifo_B.fifo_B_local.1.2.2(c0, c1, c2, p0, 0, c5, c6, c7, 0, 32 * c1 + 2 * c5 + c6, 2 * p0 + 32 * c0)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c5",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "io_L1",
                            "type": "mark"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "io_L3",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(k_t1/k_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p10",
                                        "ele_size": 4,
                                        "last_dim": "k_t2",
                                        "size": "j_t1*k_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "C_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(j_t1/j_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_C.fifo_C_local.1.16.1(1, c1, c2, p0, 0, c5, c6, c7, 0, 2 * p0 + 32 * c2 + c7, 32 * c1 + 2 * c5 + c6)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L2_out_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L2_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(j_t1/j_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "in_trans.fifo_C_local.fifo_C.1.16.1(0, c1, c2, p0, 15, c5, c6, c7, 1, 2 * p0 + 32 * c2 + c7, 32 * c1 + 2 * c5 + c6)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t1",
                                        "size": "i_t2*j_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t1",
                                        "size": "i_t2*j_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_drain_IO_L1_out": {
            "child": [
                {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.0()"
                                        },
                                        "type": "user"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c0",
                        "type": "for"
                    },
                    "iterator": "c1",
                    "type": "for"
                }
            ],
            "type": "if"
        },
        "C_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t1",
                                "size": "i_t2*j_t1",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(j_t1/j_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "j_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "i_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "in_trans.fifo_C_drain_local.fifo_C_drain.1.4.1(1, c1, c2, 15, p1, c5, c6, c7, 1, 2 * p1 + 32 * c2 + c7, 32 * c1 + 2 * c5 + c6)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c5",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "bounds": [
                                            "0",
                                            "(i_t1/i_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p12",
                                                    "ele_size": 4,
                                                    "last_dim": "j_t1",
                                                    "size": "i_t2*j_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c3",
                                        "type": "for"
                                    },
                                    {
                                        "bounds": [
                                            "0",
                                            "(i_t1/i_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p12",
                                                    "ele_size": 4,
                                                    "last_dim": "j_t1",
                                                    "size": "i_t2*j_t1",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c3",
                                        "type": "for"
                                    }
                                ],
                                "type": "if"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p12",
                                            "ele_size": 4,
                                            "last_dim": "j_t1",
                                            "size": "i_t2*j_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c3",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(j_t1/j_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "j_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "i_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": [
                                                            {
                                                                "child": {
                                                                    "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c2 + c7, 2 * p1 + 32 * c0)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c1 + 2 * c5 + c6, 2 * p1 + 32 * c0)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": [
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "in.fifo_C.1.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c2 + c7, 32 * c1 + 2 * c5 + c6)"
                                                                        },
                                                                        "type": "user"
                                                                    }
                                                                ],
                                                                "type": "if"
                                                            },
                                                            {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "k_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "S_0(2 * p0 + 32 * c2 + c7, 32 * c1 + 2 * c5 + c6, 2 * p1 + 32 * c0 + c8)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_unroll",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c8",
                                                                    "type": "for"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            {
                                                                "child": [
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "out.fifo_C_drain.1.1(1, c1, c2, p0, 15, c5, c6, c7, 2 * p0 + 32 * c2 + c7, 32 * c1 + 2 * c5 + c6)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "out.fifo_C.1.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c2 + c7, 32 * c1 + 2 * c5 + c6)"
                                                                        },
                                                                        "type": "user"
                                                                    }
                                                                ],
                                                                "type": "if"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "out.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c1 + 2 * c5 + c6, 2 * p1 + 32 * c0)"
                                                                },
                                                                "type": "user"
                                                            }
                                                        ],
                                                        "type": "block"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c7",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c5",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L1_in": {
            "array": "A",
            "buf_size": "(i_t2*k_t2)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((k_t1/k_t2)*(i_t1/i_t2))"
        },
        "B_IO_L2_in": {
            "array": "B",
            "buf_size": "(j_t1*k_t2)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(k_t1/k_t2)"
        },
        "C_IO_L2_in": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "C_IO_L2_out": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        },
        "C_drain_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "data_pack_factor_inter": "p12",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,4),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,16),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t1,16),1)"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t1,4),1)"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "p12",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel4_2.json
================================================
{
    "attr": {
        "A_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "A_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "B_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(i_t1/i_t2)",
                "(k_t1/k_t2)"
            ],
            "ele_type": "float",
            "num": "((i_t1/i_t2)*(k_t1/k_t2))",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L1_in": {
            "dims": [
                "(k_t1/k_t2)",
                "(i_t1/i_t2)"
            ]
        },
        "A_IO_L2_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "A_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "B_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L2_out": {
            "dims": [
                "(i_t1/i_t2)"
            ]
        },
        "C_IO_L3_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "A_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(i_t1/i_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t2*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t2*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "A_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(j_t1/j_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "j_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "i_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "out_trans.fifo_A.fifo_A_local.1.2.2(c0, c1, c2, p0, p1, c5, c6, c7, 0, 2 * p1 + 32 * c0 + c7, 2 * p0 + 32 * c2)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c5",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(i_t1/i_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p9",
                                                            "ele_size": 4,
                                                            "last_dim": "k_t2",
                                                            "size": "i_t2*k_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c3",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(i_t1/i_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p9",
                                                            "ele_size": 4,
                                                            "last_dim": "k_t2",
                                                            "size": "i_t2*k_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c3",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "A_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(i_t1/i_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p9",
                                                        "ele_size": 4,
                                                        "last_dim": "k_t2",
                                                        "size": "i_t2*k_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_serialize",
                                                    "type": "mark"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c3",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(k_t1/k_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c4",
            "type": "for"
        },
        "B_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(j_t1/j_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "j_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_B.fifo_B_local.1.2.2(c0, c1, c2, p0, 0, c5, c6, c7, 0, 32 * c1 + 2 * c5 + c6, 2 * p0 + 32 * c2)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "B_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p10",
                                                "ele_size": 4,
                                                "last_dim": "k_t2",
                                                "size": "j_t1*k_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_serialize",
                                            "type": "mark"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": [
                        {
                            "child": {
                                "user_expr": "io_module.intra_inter.0.1()"
                            },
                            "type": "user"
                        },
                        {
                            "child": {
                                "user_expr": "io_module.state_handle()"
                            },
                            "type": "user"
                        }
                    ],
                    "type": "block"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_out_inter": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t1",
                                        "size": "i_t2*j_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t1",
                                        "size": "i_t2*j_t1",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                }
                            ],
                            "type": "if"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "content": "io_L3",
                "type": "mark"
            },
            "content": "array",
            "type": "mark"
        },
        "C_IO_L2_out_intra": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(j_t1/j_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "j_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "i_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "in_trans_reduce_+.fifo_C_local.fifo_C.1.16.1(c0, c1, c2, p0, 15, c5, c6, c7, 1, 2 * p0 + 32 * c0 + c7, 32 * c1 + 2 * c5 + c6)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c5",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "io_L1",
                            "type": "mark"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "io_L3",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c2",
            "type": "for"
        },
        "C_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(i_t1/i_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p11",
                                            "ele_size": 4,
                                            "last_dim": "j_t1",
                                            "size": "i_t2*j_t1",
                                            "type": "array_tile"
                                        },
                                        "content": "access_serialize",
                                        "type": "mark"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(j_t1/j_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "j_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "i_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": [
                                                            {
                                                                "child": {
                                                                    "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c0 + c7, 2 * p1 + 32 * c2)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c1 + 2 * c5 + c6, 2 * p1 + 32 * c2)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": [
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "in.fifo_C.1.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c0 + c7, 32 * c1 + 2 * c5 + c6)"
                                                                        },
                                                                        "type": "user"
                                                                    }
                                                                ],
                                                                "type": "if"
                                                            },
                                                            {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "k_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "S_0(2 * p0 + 32 * c0 + c7, 32 * c1 + 2 * c5 + c6, 2 * p1 + 32 * c2 + c8)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_unroll",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c8",
                                                                    "type": "for"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "out.fifo_C.1.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c0 + c7, 32 * c1 + 2 * c5 + c6)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "out.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c1 + 2 * c5 + c6, 2 * p1 + 32 * c2)"
                                                                },
                                                                "type": "user"
                                                            }
                                                        ],
                                                        "type": "block"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c7",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c5",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L1_in": {
            "array": "A",
            "buf_size": "(i_t2*k_t2)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((k_t1/k_t2)*(i_t1/i_t2))"
        },
        "B_IO_L2_in": {
            "array": "B",
            "buf_size": "(j_t1*k_t2)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(k_t1/k_t2)"
        },
        "C_IO_L2_out": {
            "array": "C",
            "buf_size": "(i_t2*j_t1)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(i_t1/i_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,4),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,16),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t1,16),1)"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel5_0.json
================================================
{
    "attr": {
        "A_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "B_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "B_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "C_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(j_t1/j_t2)",
                "(k_t1/k_t2)"
            ],
            "ele_type": "float",
            "num": "((j_t1/j_t2)*(k_t1/k_t2))",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L2_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "A_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L1_in": {
            "dims": [
                "(k_t1/k_t2)",
                "(j_t1/j_t2)"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "B_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L2_in": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_IO_L2_out": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "C_drain_IO_L1_out": {
            "dims": [
                "(k_t1/k_t2)",
                "(j_t1/j_t2)"
            ]
        },
        "C_drain_IO_L2_out": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "C_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "A_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(k_t1/k_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c4",
            "type": "for"
        },
        "A_IO_L2_in_intra": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "i_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "j_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "out_trans.fifo_A.fifo_A_local.1.2.2(c0, c1, c2, p0, 0, c5, c6, c7, 0, 32 * c0 + 2 * c5 + c6, 2 * p0 + 32 * c1)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c5",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "io_L1",
                            "type": "mark"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "io_L3",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c1",
            "type": "for"
        },
        "A_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(k_t1/k_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p9",
                                        "ele_size": 4,
                                        "last_dim": "k_t2",
                                        "size": "i_t1*k_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c4",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t2*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t2*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "B_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "i_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "out_trans.fifo_B.fifo_B_local.1.2.2(c0, c1, c2, p0, p1, c5, c6, c7, 0, 2 * p1 + 32 * c2 + c7, 2 * p0 + 32 * c1)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c5",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(j_t1/j_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p10",
                                                            "ele_size": 4,
                                                            "last_dim": "k_t2",
                                                            "size": "j_t2*k_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c3",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(j_t1/j_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p10",
                                                            "ele_size": 4,
                                                            "last_dim": "k_t2",
                                                            "size": "j_t2*k_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c3",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(j_t1/j_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p10",
                                                    "ele_size": 4,
                                                    "last_dim": "k_t2",
                                                    "size": "j_t2*k_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c3",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(i_t1/i_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "i_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "j_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_C.fifo_C_local.1.2.1(c0, 1, c2, p0, 0, c5, c6, c7, 0, 32 * c0 + 2 * c5 + c6, 2 * p0 + 32 * c2 + c7)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_out_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L2_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(i_t1/i_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "i_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "j_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "in_trans.fifo_C_local.fifo_C.1.2.1(c0, 0, c2, p0, 15, c5, c6, c7, 1, 32 * c0 + 2 * c5 + c6, 2 * p0 + 32 * c2 + c7)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t2",
                                        "size": "i_t1*j_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t2",
                                        "size": "i_t1*j_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L1_out": {
            "child": [
                {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((j/j_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.0()"
                                        },
                                        "type": "user"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c1",
                        "type": "for"
                    },
                    "iterator": "c0",
                    "type": "for"
                }
            ],
            "type": "if"
        },
        "C_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "i_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "in_trans.fifo_C_drain_local.fifo_C_drain.1.2.1(c0, 1, c2, 15, p1, c5, c6, c7, 1, 32 * c0 + 2 * c5 + c6, 2 * p1 + 32 * c2 + c7)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c5",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "bounds": [
                                            "0",
                                            "(j_t1/j_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p12",
                                                    "ele_size": 4,
                                                    "last_dim": "j_t2",
                                                    "size": "i_t1*j_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c3",
                                        "type": "for"
                                    },
                                    {
                                        "bounds": [
                                            "0",
                                            "(j_t1/j_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p12",
                                                    "ele_size": 4,
                                                    "last_dim": "j_t2",
                                                    "size": "i_t1*j_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c3",
                                        "type": "for"
                                    }
                                ],
                                "type": "if"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(j_t1/j_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p12",
                                            "ele_size": 4,
                                            "last_dim": "j_t2",
                                            "size": "i_t1*j_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c3",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((k/k_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "j_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": [
                                                            {
                                                                "child": {
                                                                    "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c0 + 2 * c5 + c6, 2 * p1 + 32 * c1)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c2 + c7, 2 * p1 + 32 * c1)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": [
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "in.fifo_C.1.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c0 + 2 * c5 + c6, 2 * p0 + 32 * c2 + c7)"
                                                                        },
                                                                        "type": "user"
                                                                    }
                                                                ],
                                                                "type": "if"
                                                            },
                                                            {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "k_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "S_0(32 * c0 + 2 * c5 + c6, 2 * p0 + 32 * c2 + c7, 2 * p1 + 32 * c1 + c8)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_unroll",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c8",
                                                                    "type": "for"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            {
                                                                "child": [
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "out.fifo_C_drain.1.1(c0, 1, c2, p0, 15, c5, c6, c7, 32 * c0 + 2 * c5 + c6, 2 * p0 + 32 * c2 + c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "out.fifo_C.1.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c0 + 2 * c5 + c6, 2 * p0 + 32 * c2 + c7)"
                                                                        },
                                                                        "type": "user"
                                                                    }
                                                                ],
                                                                "type": "if"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "out.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c0 + 2 * c5 + c6, 2 * p1 + 32 * c1)"
                                                                },
                                                                "type": "user"
                                                            }
                                                        ],
                                                        "type": "block"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c7",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c5",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c1",
                    "type": "for"
                },
                "iterator": "c2",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L2_in": {
            "array": "A",
            "buf_size": "(i_t1*k_t2)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(k_t1/k_t2)"
        },
        "B_IO_L1_in": {
            "array": "B",
            "buf_size": "(j_t2*k_t2)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((k_t1/k_t2)*(j_t1/j_t2))"
        },
        "C_IO_L2_in": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "C_IO_L2_out": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "C_drain_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "data_pack_factor_inter": "p12",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,16),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,4),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,16),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,4),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p12",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel5_1.json
================================================
{
    "attr": {
        "A_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "B_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "B_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_drain_IO_L1_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_inter": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L1_out_intra": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_drain_IO_L2_out": {
            "double_buffer": 0,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "C_drain_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(j_t1/j_t2)",
                "(k_t1/k_t2)"
            ],
            "ele_type": "float",
            "num": "((j_t1/j_t2)*(k_t1/k_t2))",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L2_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "A_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L1_in": {
            "dims": [
                "(k_t1/k_t2)",
                "(j_t1/j_t2)"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "B_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L2_in": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_IO_L2_out": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L3_out": {
            "dims": [
                "1"
            ]
        },
        "C_drain_IO_L1_out": {
            "dims": [
                "(k_t1/k_t2)",
                "(j_t1/j_t2)"
            ]
        },
        "C_drain_IO_L2_out": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "C_drain_IO_L3_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "A_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(k_t1/k_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c4",
            "type": "for"
        },
        "A_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(i_t1/i_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "i_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "j_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_A.fifo_A_local.1.2.2(c0, c1, c2, p0, 0, c5, c6, c7, 0, 32 * c2 + 2 * c5 + c6, 2 * p0 + 32 * c0)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "A_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p9",
                                            "ele_size": 4,
                                            "last_dim": "k_t2",
                                            "size": "i_t1*k_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "B_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "B_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t2*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t2*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "B_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "i_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "out_trans.fifo_B.fifo_B_local.1.2.2(c0, c1, c2, p0, p1, c5, c6, c7, 0, 2 * p1 + 32 * c1 + c7, 2 * p0 + 32 * c0)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c5",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(j_t1/j_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p10",
                                                            "ele_size": 4,
                                                            "last_dim": "k_t2",
                                                            "size": "j_t2*k_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c3",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(j_t1/j_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p10",
                                                            "ele_size": 4,
                                                            "last_dim": "k_t2",
                                                            "size": "j_t2*k_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c3",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "B_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(j_t1/j_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p10",
                                                    "ele_size": 4,
                                                    "last_dim": "k_t2",
                                                    "size": "j_t2*k_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c3",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        },
        "C_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.inter_intra.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(i_t1/i_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "i_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "j_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_C.fifo_C_local.1.2.1(1, c1, c2, p0, 0, c5, c6, c7, 0, 32 * c2 + 2 * c5 + c6, 2 * p0 + 32 * c1 + c7)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "user_expr": "io_module.intra_inter.0.1()"
                                    },
                                    "type": "user"
                                },
                                {
                                    "child": {
                                        "user_expr": "io_module.state_handle()"
                                    },
                                    "type": "user"
                                }
                            ],
                            "type": "block"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L2_out_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p11",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_IO_L2_out_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(i_t1/i_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "i_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "j_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "in_trans.fifo_C_local.fifo_C.1.2.1(0, c1, c2, p0, 15, c5, c6, c7, 1, 32 * c2 + 2 * c5 + c6, 2 * p0 + 32 * c1 + c7)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "C_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t2",
                                        "size": "i_t1*j_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t2",
                                        "size": "i_t1*j_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_drain_IO_L1_out": {
            "child": [
                {
                    "bounds": [
                        "0",
                        "ceil((j/j_t1))"
                    ],
                    "child": {
                        "bounds": [
                            "0",
                            "ceil((i/i_t1))"
                        ],
                        "child": {
                            "child": {
                                "child": {
                                    "child": {
                                        "child": {
                                            "user_expr": "io_module.intra_inter.0.0()"
                                        },
                                        "type": "user"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "content": "io_L3",
                                "type": "mark"
                            },
                            "content": "array",
                            "type": "mark"
                        },
                        "iterator": "c0",
                        "type": "for"
                    },
                    "iterator": "c1",
                    "type": "for"
                }
            ],
            "type": "if"
        },
        "C_drain_IO_L1_out_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p12",
                                "ele_size": 4,
                                "last_dim": "j_t2",
                                "size": "i_t1*j_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "C_drain_IO_L1_out_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "i_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "in_trans.fifo_C_drain_local.fifo_C_drain.1.2.1(1, c1, c2, 15, p1, c5, c6, c7, 1, 32 * c2 + 2 * c5 + c6, 2 * p1 + 32 * c1 + c7)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c5",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "C_drain_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "bounds": [
                                            "0",
                                            "(j_t1/j_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p12",
                                                    "ele_size": 4,
                                                    "last_dim": "j_t2",
                                                    "size": "i_t1*j_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c3",
                                        "type": "for"
                                    },
                                    {
                                        "bounds": [
                                            "0",
                                            "(j_t1/j_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "data_pack_factor": "p12",
                                                    "ele_size": 4,
                                                    "last_dim": "j_t2",
                                                    "size": "i_t1*j_t2",
                                                    "type": "array_tile"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c3",
                                        "type": "for"
                                    }
                                ],
                                "type": "if"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "C_drain_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((j/j_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((i/i_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(j_t1/j_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p12",
                                            "ele_size": 4,
                                            "last_dim": "j_t2",
                                            "size": "i_t1*j_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L1",
                                    "type": "mark"
                                },
                                "iterator": "c3",
                                "type": "for"
                            },
                            "content": "io_L2",
                            "type": "mark"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c0",
                "type": "for"
            },
            "iterator": "c1",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((i/i_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "j_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": [
                                                            {
                                                                "child": {
                                                                    "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c2 + 2 * c5 + c6, 2 * p1 + 32 * c0)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c1 + c7, 2 * p1 + 32 * c0)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": [
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "in.fifo_C.1.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c2 + 2 * c5 + c6, 2 * p0 + 32 * c1 + c7)"
                                                                        },
                                                                        "type": "user"
                                                                    }
                                                                ],
                                                                "type": "if"
                                                            },
                                                            {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "k_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "S_0(32 * c2 + 2 * c5 + c6, 2 * p0 + 32 * c1 + c7, 2 * p1 + 32 * c0 + c8)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_unroll",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c8",
                                                                    "type": "for"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            {
                                                                "child": [
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "out.fifo_C_drain.1.1(1, c1, c2, p0, 15, c5, c6, c7, 32 * c2 + 2 * c5 + c6, 2 * p0 + 32 * c1 + c7)"
                                                                        },
                                                                        "type": "user"
                                                                    },
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "out.fifo_C.1.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c2 + 2 * c5 + c6, 2 * p0 + 32 * c1 + c7)"
                                                                        },
                                                                        "type": "user"
                                                                    }
                                                                ],
                                                                "type": "if"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "out.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c2 + 2 * c5 + c6, 2 * p1 + 32 * c0)"
                                                                },
                                                                "type": "user"
                                                            }
                                                        ],
                                                        "type": "block"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c7",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c5",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c0",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c2",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L2_in": {
            "array": "A",
            "buf_size": "(i_t1*k_t2)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(k_t1/k_t2)"
        },
        "B_IO_L1_in": {
            "array": "B",
            "buf_size": "(j_t2*k_t2)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((k_t1/k_t2)*(j_t1/j_t2))"
        },
        "C_IO_L2_in": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "C_IO_L2_out": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        },
        "C_drain_IO_L1_out": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "data_pack_factor_inter": "p12",
            "data_pack_factor_intra": "1",
            "double_buffer": 0,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,16),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,4),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,16),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,4),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p12",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/designs_lib/gemm/kernel5_2.json
================================================
{
    "attr": {
        "A_IO_L2_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L2_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "A_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "B_IO_L1_in": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L1_in_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "B_IO_L2_in": {
            "double_buffer": 0,
            "filter": 1,
            "in": 1,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        },
        "B_IO_L3_in": {
            "double_buffer": 0,
            "filter": 0,
            "in": 1,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "C_IO_L2_out": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_out_inter": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L2_out_intra": {
            "double_buffer": 1,
            "filter": 1,
            "in": 0,
            "io": 1,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 1
        },
        "C_IO_L3_out": {
            "double_buffer": 0,
            "filter": 0,
            "in": 0,
            "io": 1,
            "serialize": 1,
            "to_dram": 1,
            "to_pe": 0
        },
        "PE": {
            "double_buffer": 0,
            "filter": 0,
            "in": -1,
            "io": 0,
            "serialize": 0,
            "to_dram": 0,
            "to_pe": 0
        }
    },
    "compute": {
        "PE": {
            "dims": [
                "(j_t1/j_t2)",
                "(k_t1/k_t2)"
            ],
            "ele_type": "float",
            "num": "((j_t1/j_t2)*(k_t1/k_t2))",
            "unroll_factor": "k_t2"
        }
    },
    "io": {
        "A_IO_L2_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "A_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "B_IO_L1_in": {
            "dims": [
                "(k_t1/k_t2)",
                "(j_t1/j_t2)"
            ]
        },
        "B_IO_L2_in": {
            "dims": [
                "(k_t1/k_t2)"
            ]
        },
        "B_IO_L3_in": {
            "dims": [
                "1"
            ]
        },
        "C_IO_L2_out": {
            "dims": [
                "(j_t1/j_t2)"
            ]
        },
        "C_IO_L3_out": {
            "dims": [
                "1"
            ]
        }
    },
    "latency": {
        "A_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": [
                                    {
                                        "child": {
                                            "user_expr": "io_module.inter_intra.0.1()"
                                        },
                                        "type": "user"
                                    },
                                    {
                                        "child": {
                                            "user_expr": "io_module.state_handle()"
                                        },
                                        "type": "user"
                                    }
                                ],
                                "type": "block"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "A_IO_L2_in_inter": {
            "bounds": [
                "0",
                "(k_t1/k_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p9",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "i_t1*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L2",
                "type": "mark"
            },
            "iterator": "c4",
            "type": "for"
        },
        "A_IO_L2_in_intra": {
            "child": {
                "child": {
                    "child": {
                        "bounds": [
                            "0",
                            "(i_t1/i_t2)"
                        ],
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "i_t2"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "j_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "user_expr": "out_trans.fifo_A.fifo_A_local.1.2.2(c0, c1, c2, p0, 0, c5, c6, c7, 0, 32 * c0 + 2 * c5 + c6, 2 * p0 + 32 * c2)"
                                                    },
                                                    "type": "user"
                                                },
                                                "content": "hls_pipeline",
                                                "type": "mark"
                                            },
                                            "content": "simd",
                                            "type": "mark"
                                        },
                                        "iterator": "c6",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c7",
                                "type": "for"
                            },
                            "content": "latency",
                            "type": "mark"
                        },
                        "iterator": "c5",
                        "type": "for"
                    },
                    "content": "pe",
                    "type": "mark"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "content": "io_L2",
            "type": "mark"
        },
        "A_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": {
                                            "child": {
                                                "data_pack_factor": "p9",
                                                "ele_size": 4,
                                                "last_dim": "k_t2",
                                                "size": "i_t1*k_t2",
                                                "type": "array_tile"
                                            },
                                            "content": "access_serialize",
                                            "type": "mark"
                                        },
                                        "content": "access_coalesce",
                                        "type": "mark"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L1_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "child": [
                                        {
                                            "child": {
                                                "user_expr": "io_module.inter_intra.0.1()"
                                            },
                                            "type": "user"
                                        },
                                        {
                                            "child": {
                                                "user_expr": "io_module.state_handle()"
                                            },
                                            "type": "user"
                                        }
                                    ],
                                    "type": "block"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L1_in_inter": {
            "bounds": [
                "0",
                "(j_t1/j_t2)"
            ],
            "child": {
                "child": {
                    "child": [
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t2*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        },
                        {
                            "child": {
                                "data_pack_factor": "p10",
                                "ele_size": 4,
                                "last_dim": "k_t2",
                                "size": "j_t2*k_t2",
                                "type": "array_tile"
                            },
                            "content": "access_coalesce",
                            "type": "mark"
                        }
                    ],
                    "type": "if"
                },
                "content": "io_L1",
                "type": "mark"
            },
            "iterator": "c3",
            "type": "for"
        },
        "B_IO_L1_in_intra": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(i_t1/i_t2)"
                    ],
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "i_t2"
                            ],
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "j_t2"
                                    ],
                                    "child": {
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "user_expr": "out_trans.fifo_B.fifo_B_local.1.2.2(c0, c1, c2, p0, p1, c5, c6, c7, 0, 2 * p1 + 32 * c1 + c7, 2 * p0 + 32 * c2)"
                                                },
                                                "type": "user"
                                            },
                                            "content": "hls_pipeline",
                                            "type": "mark"
                                        },
                                        "content": "simd",
                                        "type": "mark"
                                    },
                                    "iterator": "c6",
                                    "type": "for"
                                },
                                "content": "latency",
                                "type": "mark"
                            },
                            "iterator": "c7",
                            "type": "for"
                        },
                        "content": "latency",
                        "type": "mark"
                    },
                    "iterator": "c5",
                    "type": "for"
                },
                "content": "pe",
                "type": "mark"
            },
            "content": "io_L1",
            "type": "mark"
        },
        "B_IO_L2_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "child": [
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(j_t1/j_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p10",
                                                            "ele_size": 4,
                                                            "last_dim": "k_t2",
                                                            "size": "j_t2*k_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c3",
                                                "type": "for"
                                            },
                                            {
                                                "bounds": [
                                                    "0",
                                                    "(j_t1/j_t2)"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": {
                                                            "data_pack_factor": "p10",
                                                            "ele_size": 4,
                                                            "last_dim": "k_t2",
                                                            "size": "j_t2*k_t2",
                                                            "type": "array_tile"
                                                        },
                                                        "content": "access_coalesce",
                                                        "type": "mark"
                                                    },
                                                    "content": "io_L1",
                                                    "type": "mark"
                                                },
                                                "iterator": "c3",
                                                "type": "for"
                                            }
                                        ],
                                        "type": "if"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "B_IO_L3_in": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(k_t1/k_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "(j_t1/j_t2)"
                                        ],
                                        "child": {
                                            "child": {
                                                "child": {
                                                    "child": {
                                                        "data_pack_factor": "p10",
                                                        "ele_size": 4,
                                                        "last_dim": "k_t2",
                                                        "size": "j_t2*k_t2",
                                                        "type": "array_tile"
                                                    },
                                                    "content": "access_serialize",
                                                    "type": "mark"
                                                },
                                                "content": "access_coalesce",
                                                "type": "mark"
                                            },
                                            "content": "io_L1",
                                            "type": "mark"
                                        },
                                        "iterator": "c3",
                                        "type": "for"
                                    },
                                    "content": "io_L2",
                                    "type": "mark"
                                },
                                "iterator": "c4",
                                "type": "for"
                            },
                            "content": "io_L3",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": [
                        {
                            "child": {
                                "user_expr": "io_module.intra_inter.0.1()"
                            },
                            "type": "user"
                        },
                        {
                            "child": {
                                "user_expr": "io_module.state_handle()"
                            },
                            "type": "user"
                        }
                    ],
                    "type": "block"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "C_IO_L2_out_inter": {
            "child": {
                "child": {
                    "bounds": [
                        "0",
                        "(j_t1/j_t2)"
                    ],
                    "child": {
                        "child": {
                            "child": [
                                {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t2",
                                        "size": "i_t1*j_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                {
                                    "child": {
                                        "data_pack_factor": "p11",
                                        "ele_size": 4,
                                        "last_dim": "j_t2",
                                        "size": "i_t1*j_t2",
                                        "type": "array_tile"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                }
                            ],
                            "type": "if"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "iterator": "c3",
                    "type": "for"
                },
                "content": "io_L3",
                "type": "mark"
            },
            "content": "array",
            "type": "mark"
        },
        "C_IO_L2_out_intra": {
            "bounds": [
                "0",
                "ceil((k/k_t1))"
            ],
            "child": {
                "child": {
                    "child": {
                        "child": {
                            "child": {
                                "child": {
                                    "bounds": [
                                        "0",
                                        "(i_t1/i_t2)"
                                    ],
                                    "child": {
                                        "child": {
                                            "bounds": [
                                                "0",
                                                "i_t2"
                                            ],
                                            "child": {
                                                "child": {
                                                    "bounds": [
                                                        "0",
                                                        "j_t2"
                                                    ],
                                                    "child": {
                                                        "child": {
                                                            "child": {
                                                                "child": {
                                                                    "user_expr": "in_trans_reduce_+.fifo_C_local.fifo_C.1.2.1(c0, c1, c2, p0, 15, c5, c6, c7, 1, 32 * c0 + 2 * c5 + c6, 2 * p0 + 32 * c1 + c7)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            "content": "hls_pipeline",
                                                            "type": "mark"
                                                        },
                                                        "content": "simd",
                                                        "type": "mark"
                                                    },
                                                    "iterator": "c6",
                                                    "type": "for"
                                                },
                                                "content": "latency",
                                                "type": "mark"
                                            },
                                            "iterator": "c7",
                                            "type": "for"
                                        },
                                        "content": "latency",
                                        "type": "mark"
                                    },
                                    "iterator": "c5",
                                    "type": "for"
                                },
                                "content": "pe",
                                "type": "mark"
                            },
                            "content": "io_L1",
                            "type": "mark"
                        },
                        "content": "io_L2",
                        "type": "mark"
                    },
                    "content": "io_L3",
                    "type": "mark"
                },
                "content": "array",
                "type": "mark"
            },
            "iterator": "c2",
            "type": "for"
        },
        "C_IO_L3_out": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "child": {
                        "child": {
                            "bounds": [
                                "0",
                                "(j_t1/j_t2)"
                            ],
                            "child": {
                                "child": {
                                    "child": {
                                        "child": {
                                            "data_pack_factor": "p11",
                                            "ele_size": 4,
                                            "last_dim": "j_t2",
                                            "size": "i_t1*j_t2",
                                            "type": "array_tile"
                                        },
                                        "content": "access_serialize",
                                        "type": "mark"
                                    },
                                    "content": "access_coalesce",
                                    "type": "mark"
                                },
                                "content": "io_L2",
                                "type": "mark"
                            },
                            "iterator": "c3",
                            "type": "for"
                        },
                        "content": "io_L3",
                        "type": "mark"
                    },
                    "content": "array",
                    "type": "mark"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        },
        "PE": {
            "bounds": [
                "0",
                "ceil((i/i_t1))"
            ],
            "child": {
                "bounds": [
                    "0",
                    "ceil((j/j_t1))"
                ],
                "child": {
                    "bounds": [
                        "0",
                        "ceil((k/k_t1))"
                    ],
                    "child": {
                        "child": {
                            "child": {
                                "bounds": [
                                    "0",
                                    "(i_t1/i_t2)"
                                ],
                                "child": {
                                    "child": {
                                        "bounds": [
                                            "0",
                                            "i_t2"
                                        ],
                                        "child": {
                                            "child": {
                                                "bounds": [
                                                    "0",
                                                    "j_t2"
                                                ],
                                                "child": {
                                                    "child": {
                                                        "child": [
                                                            {
                                                                "child": {
                                                                    "user_expr": "in.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c0 + 2 * c5 + c6, 2 * p1 + 32 * c2)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "in.fifo_B.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 2 * p0 + 32 * c1 + c7, 2 * p1 + 32 * c2)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": [
                                                                    {
                                                                        "child": {
                                                                            "user_expr": "in.fifo_C.1.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c0 + 2 * c5 + c6, 2 * p0 + 32 * c1 + c7)"
                                                                        },
                                                                        "type": "user"
                                                                    }
                                                                ],
                                                                "type": "if"
                                                            },
                                                            {
                                                                "child": {
                                                                    "bounds": [
                                                                        "0",
                                                                        "k_t2"
                                                                    ],
                                                                    "child": {
                                                                        "child": {
                                                                            "child": {
                                                                                "user_expr": "S_0(32 * c0 + 2 * c5 + c6, 2 * p0 + 32 * c1 + c7, 2 * p1 + 32 * c2 + c8)"
                                                                            },
                                                                            "type": "user"
                                                                        },
                                                                        "content": "hls_unroll",
                                                                        "type": "mark"
                                                                    },
                                                                    "iterator": "c8",
                                                                    "type": "for"
                                                                },
                                                                "content": "simd",
                                                                "type": "mark"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "out.fifo_C.1.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c0 + 2 * c5 + c6, 2 * p0 + 32 * c1 + c7)"
                                                                },
                                                                "type": "user"
                                                            },
                                                            {
                                                                "child": {
                                                                    "user_expr": "out.fifo_A.2.1(c0, c1, c2, p0, p1, c5, c6, c7, 32 * c0 + 2 * c5 + c6, 2 * p1 + 32 * c2)"
                                                                },
                                                                "type": "user"
                                                            }
                                                        ],
                                                        "type": "block"
                                                    },
                                                    "content": "hls_pipeline",
                                                    "type": "mark"
                                                },
                                                "iterator": "c6",
                                                "type": "for"
                                            },
                                            "content": "latency",
                                            "type": "mark"
                                        },
                                        "iterator": "c7",
                                        "type": "for"
                                    },
                                    "content": "latency",
                                    "type": "mark"
                                },
                                "iterator": "c5",
                                "type": "for"
                            },
                            "content": "pe",
                            "type": "mark"
                        },
                        "content": "array",
                        "type": "mark"
                    },
                    "iterator": "c2",
                    "type": "for"
                },
                "iterator": "c1",
                "type": "for"
            },
            "iterator": "c0",
            "type": "for"
        }
    },
    "memory": {
        "A_IO_L2_in": {
            "array": "A",
            "buf_size": "(i_t1*k_t2)",
            "data_pack_factor_inter": "p9",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(k_t1/k_t2)"
        },
        "B_IO_L1_in": {
            "array": "B",
            "buf_size": "(j_t2*k_t2)",
            "data_pack_factor_inter": "p10",
            "data_pack_factor_intra": "k_t2",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "((k_t1/k_t2)*(j_t1/j_t2))"
        },
        "C_IO_L2_out": {
            "array": "C",
            "buf_size": "(i_t1*j_t2)",
            "data_pack_factor_inter": "p11",
            "data_pack_factor_intra": "1",
            "double_buffer": 1,
            "ele_size": 4,
            "ele_type": "float",
            "num": "(j_t1/j_t2)"
        }
    },
    "params": [
        {
            "attr": "loop_ub",
            "name": "i",
            "split_by": "i_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "j",
            "split_by": "j_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "loop_ub",
            "name": "k",
            "split_by": "k_t1",
            "tags": [
                "external"
            ],
            "tunable": false
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "j"
            ],
            "name": "j_t1",
            "split_by": "j_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "k"
            ],
            "name": "k_t1",
            "split_by": "k_t2",
            "tunable": true
        },
        {
            "attr": "array_part_tiling_factor",
            "bounds": [
                "1",
                "i"
            ],
            "name": "i_t1",
            "split_by": "i_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "j_t1"
            ],
            "divisors": [
                "j_t1"
            ],
            "name": "j_t2",
            "tunable": true
        },
        {
            "attr": "latency_tiling_factor",
            "bounds": [
                "1",
                "i_t1"
            ],
            "divisors": [
                "i_t1"
            ],
            "name": "i_t2",
            "tunable": true
        },
        {
            "attr": "SIMD_tiling_factor",
            "bounds": [
                "1",
                "min(k_t1,8)"
            ],
            "divisors": [
                "k_t1"
            ],
            "name": "k_t2",
            "tags": [
                "power_of_two"
            ],
            "tunable": true
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,16),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p9",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "k_t2",
                "max(min(k_t2,4),k_t2)"
            ],
            "divisors": [
                "k_t2"
            ],
            "multiples": [
                "k_t2"
            ],
            "name": "p10",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        },
        {
            "attr": "data_pack_factor",
            "bounds": [
                "1",
                "max(min(j_t2,16),1)"
            ],
            "divisors": [
                "j_t2"
            ],
            "name": "p11",
            "tags": [
                "power_of_two",
                "auto_infer"
            ],
            "tunable": false
        }
    ]
}


================================================
FILE: autosa_scripts/odyssey/explorer.py
================================================
import copy
import pprint
import numpy as np
import random

import utils
import tuners
from search_task import SingleTask, MultiTask

class ArchExplorer(object):
    """ Architecture explorer.
    """
    def __init__(self, cst, search_obj, max_epochs, max_time, search_config, designs, workloads):
        self.cst = cst
        self.search_obj = search_obj
        self.max_epochs = max_epochs
        self.max_time = max_time
        self.search_config = search_config
        self.designs = designs
        self.workloads = workloads

    def search(self):
        """ The gateway function to perform architecture search.
        The input is a list of design descriptions "designs"
        and a list of searching tasks "tasks".
        """
        best_record = utils.SearchRecord().reset()

        if self.search_config["explore_fusion"]:
            if self.search_config["explore_multi_acc"]:
                if self.search_config["method"] == "customized1":
                    best_record = self.search_fusion_multi_acc_customized1()
                elif self.search_config["method"] == "customized2":
                    best_record = self.search_fusion_multi_acc_customized2()
                    #best_record = self.search_fusion_multi_acc_customized2(design_idx=4)
            else:
                if self.search_config["method"] == "exhaustive":
                    best_record = self.search_fusion_single_acc_exhaustive() # TODO
                elif self.search_config["method"] == "customized1":
                    #best_record = self.search_fusion_single_acc_customized1(design_idx=4)
                    best_record = self.search_fusion_single_acc_customized1()
                elif self.search_config["method"] == "customized2":
                    best_record = self.search_fusion_single_acc_customized2()
                else:
                    raise NotImplementedError("Undefined multi-accelerator search method.")
        else:
            if self.search_config["explore_programmable"]:
                if self.search_config["method"] == "customized1":
                    best_record = self.search_programmable_single_acc_customized1() # TODO
                else:
                    raise NotImplementedError("Undefined single programmable accelerator search method.")
            else:
                if self.search_config["method"] == "customized1":
                    best_record = self.search_non_fusion_single_acc_customized1(design_idx=self.search_config["design_idx"])
                    #best_record = self.search_non_fusion_single_acc_customized1(design_idx=4)
                else:
                    raise NotImplementedError("Undefined single accelerator search method.")

        return best_record

    def tune(self, search_task, init_tasks=None, silent=0, use_cache=-1, meta=None):
        """ Call tuners for the searching task.
        init_tasks contains candidates for the initial population of the genetic search.
        meta contains additional information used during the tuning.
        """
        if use_cache == -1:
            use_cache = self.search_config['use_db']
        if use_cache:
            # Check if the search task has been searched
            if str(search_task) in self.search_config["search_records_db"]:
                return self.search_config["search_records_db"][str(search_task)]
                #return self.search_config["search_records_db"][str(search_task)], self.search_config["search_records_db"]

        if isinstance(search_task, SingleTask):
            if self.search_config['unit_task_method'] == "genetic":
                # Use genetic search
                search_record = tuners.genetic_search(search_task, self.cst, self.search_obj, self.max_epochs, self.max_time, \
                    n_worker=1, silent=silent, profiling=self.search_config["profiling"])
            elif self.search_config["unit_task_method"] == "random_pruning":
                search_record = tuners.random_search(search_task, self.cst, self.search_obj, self.max_epochs, self.max_time, \
                    n_worker=1, silent=silent, pruning=1, profiling=self.search_config["profiling"])
            elif self.search_config["unit_task_method"] == "random":
                search_record = tuners.random_search(search_task, self.cst, self.search_obj, self.max_epochs, self.max_time, \
                    n_worker=1, silent=silent, profiling=self.search_config["profiling"])
            elif self.search_config["unit_task_method"] == "exhaustive_pruning":                
                search_record = tuners.exhaustive_search(search_task, self.cst, self.search_obj, self.max_epochs, self.max_time, \
                    n_worker=1, silent=silent, pruning=1, profiling=self.search_config["profiling"])
            elif self.search_config["unit_task_method"] == "annealing":
                search_record = tuners.annealing_search(search_task, self.cst, self.search_obj, self.max_epochs, self.max_time, \
                    n_worker=1, silent=silent, profiling=self.search_config["profiling"])
            elif self.search_config["unit_task_method"] == "bayesian":
                search_record = tuners.bayesian_search(search_task, self.cst, self.search_obj, self.max_epochs, self.max_time, \
                    n_worker=1, silent=silent, profiling=self.search_config["profiling"])
            elif self.search_config["unit_task_method"] == "RL":
                search_record = tuners.RL_search(search_task, self.cst, self.search_obj, self.max_epochs, self.max_time, \
                    n_worker=1, silent=silent, profiling=self.search_config["profiling"])
            elif self.search_config["unit_task_method"] == "open_tuner":
                search_record = tuners.opentuner_search(search_task, self.cst, self.search_obj, self.max_epochs, self.max_time, \
                    n_worker=1, silent=silent, profiling=self.search_config["profiling"], args=self.search_config["args"])
            else:
                raise NotImplementedError("Undefined unit task method.")
        elif isinstance(search_task, MultiTask):
            if search_task.fuse == 0:
                if search_task.split == 0:
                    search_record = tuners.non_fuse_genetic_search(search_task, init_tasks, self.cst, self.search_obj, self.max_epochs, self.max_time, \
                        n_worker=self.search_config['n_worker'], silent=silent, population_size=self.search_config['genetic_params']['population_size'][1], meta=meta)
                else:
                    if self.search_config["method"] == "customized1":
                        search_record = tuners.multi_acc_search1(search_task, init_tasks, self.cst, self.search_obj, self.max_epochs, self.max_time, \
                            n_worker=self.search_config['n_worker'], silent=silent, population_size=self.search_config['genetic_params']['population_size'][1], \
                            meta=meta, explorer=self, profiling=self.search_config["profiling"])
                    elif self.search_config["method"] == "customized2":
                        search_record = tuners.multi_acc_search2(search_task, init_tasks, self.cst, self.search_obj, self.max_epochs, self.max_time, \
                            n_worker=self.search_config['n_worker'], silent=silent, population_size=self.search_config['genetic_params']['population_size'][1], \
                            meta=meta, explorer=self, profiling=self.search_config["profiling"])
            elif search_task.fuse == 1:
                search_record = tuners.fuse_genetic_search(search_task, init_tasks, self.cst, self.search_obj, self.max_epochs, self.max_time, \
                    n_worker=self.search_config['n_worker'], silent=silent, population_size=self.search_config['genetic_params']['population_size'][1], meta=meta, explorer=self)
            elif search_task.fuse == 2:
                search_record = tuners.all_fuse_genetic_search(search_task, init_tasks, self.cst, self.search_obj, self.max_epochs, self.max_time, \
                    n_worker=self.search_config['n_worker'], silent=silent, population_size=self.search_config['genetic_params']['population_size'][1], explorer=self)
            else:
                raise RuntimeError('Unknown search task type.')
        else:
            raise RuntimeError('Unknown search task type.')

        '''
        # Save the search results
        if str(search_task) in self.search_config["search_records_db"]:
            self.search_config["search_records_db"][str(search_task)].update(search_record)
        else:
            self.search_config["search_records_db"][str(search_task)] = search_record
        '''

        return search_record
        #return search_record, self.search_config["search_records_db"]

    def search_non_fusion_single_acc_exhaustive(self):
        raise NotImplementedError("Unimplemented single accelerator search method.")

    def search_non_fusion_single_acc_customized1(self, design_idx=-1, search_task_configs=None, early_stop=-1, silent=0, workload_idx=None, prev_array=None, one_gen=False):
        """ This function searches the best single accelerator for the search tasks.
        We assume the tasks are executed in sequence on the accelerator.
        The function first searches the best array configuration for each task.
        The results are served as the initial candidate pool to kick off the
        evolutionary search which searches for the best array configuration
        that maximizes the overall performance.
        Modify the search task configurations when the search_task_configs is valid.

        If early_stop is set (not equal to -1), the search will be terminated
        if the ideal latency is longer than the early_stop threshold.
        If URAM is used, we will run the non-fuse search for one time and identify the
        bottleneck of each layer. Following the increasing order of CTC ratio,
        we check three arrays: cin, cout, and w.
        If any of them is the bottleneck, we will try to store them on-chip.
        This process stops until there is no more URAM available on-chip.

        "prev_array" is used for the TGPA-style multi-array setting.
        When prev_array is set, when searching the solution of the current array,
        the latency of each workload is adjusted to consider the setup latency.
        """
        design_list = self.designs
        if design_idx != -1:
            # Only search a certain design
            design_list = [self.designs[design_idx]]

        if workload_idx:
            workloads = [self.workloads[i] for i in workload_idx]
        else:
            workloads = self.workloads

        # Test1: Fix r-axis to one
        #search_task_configs = {}
        #for i in range(len(self.workloads)):
        #    search_task_configs[i] = {'fix_param': [['r', 1]]}

        # Test2: Equate c_t1 = r_t1
        #search_task_configs = {}
        #for i in range(len(self.workloads)):
        #    search_task_configs[i] = {'equate_params': [['r_t1', 'c_t1']]}

        def est_URAM(width, depth):
            """ Estimate URAM usage.
            """
            mem = np.ceil(width / 72) * np.ceil(depth / 4096)
            return mem

        def modify_task_configs_uram(layer_infos, workloads, configs):
            """ Pick, layer by layer, which cin/cout/w buffers get a URAM allocation.

            Layers are visited in the order of "layer_infos". For each layer the
            cin/cout/w entries of its "latency_main" breakdown are compared with
            the PE latency; every candidate that is slower than the PE is given a
            URAM buffer if the estimated total still fits below
            self.cst.hw_cst["URAM"], and the matching read/write mode is set.

            Args:
                layer_infos: list of dicts with "idx" (layer index) and
                    "reward_meta" (dict that holds the "latency_main" breakdown).
                workloads: list of workload dicts; "params" supplies i/o/r/c/p/q.
                configs: per-layer config dict keyed by layer index. When empty or
                    None a fresh one with all modes set to 0 is created; otherwise
                    it is updated in place.

            Returns:
                (configs, uram): the updated configs and the estimated URAM usage.
            """
            if not configs:
                # Start every layer with all three modes set to 0.
                configs = {}
                for layer_idx in range(len(layer_infos)):
                    configs[layer_idx] = {"cin_read_mode": 0, "w_read_mode": 0, "cout_write_mode": 0}
            # URAM reserved for the feature maps of each layer, indexed by layer idx.
            c_mem = []
            for layer_idx in range(len(layer_infos)):
                c_mem.append([0, 0]) # input, output
            # URAM reserved for weights; accumulated over all layers.
            w_mem = 0
            # NOTE(review): take_item is not referenced anywhere in this function.
            def take_item(elem):
                return elem["item"]
            def take_value(elem):
                return elem["value"]
            def cal_c_mem(c_mem):
                # Feature-map usage is the largest input+output sum over all layers.
                total_c_mem = [m[0] + m[1] for m in c_mem]
                return max(total_c_mem)
            for layer_info in layer_infos:
                workload = workloads[layer_info["idx"]]
                # Stop as soon as the URAM budget is used up.
                if cal_c_mem(c_mem) + w_mem >= self.cst.hw_cst["URAM"]:
                    break
                PE_latency = layer_info["reward_meta"]["latency_main"]["PE_latency"]
                # Collect the latency entries per array (matched by key prefix) and
                # sort them in increasing order, so [-1] is the largest entry.
                cin_latency = [{"item": x, "value": layer_info["reward_meta"]["latency_main"][x]} for x in layer_info["reward_meta"]["latency_main"] if x.startswith("cin")]
                cin_latency.sort(key=take_value)
                cout_latency = [{"item": x, "value": layer_info["reward_meta"]["latency_main"][x]} for x in layer_info["reward_meta"]["latency_main"] if x.startswith("cout")]
                cout_latency.sort(key=take_value)
                w_latency = [{"item": x, "value": layer_info["reward_meta"]["latency_main"][x]} for x in layer_info["reward_meta"]["latency_main"] if x.startswith("w")]
                w_latency.sort(key=take_value)
                # An array is only a candidate when its largest entry differs from
                # its second largest one.
                # NOTE(review): indexing [-2] assumes at least two entries per
                # prefix - confirm against the latency model.
                bottlenecks = []
                if cin_latency[-1]['value'] != cin_latency[-2]['value']:
                    bottlenecks.append({"item": "cin", "value": cin_latency[-1]['value']})
                if cout_latency[-1]['value'] != cout_latency[-2]['value']:
                    bottlenecks.append({"item": "cout", "value": cout_latency[-1]['value']})
                if w_latency[-1]['value'] != w_latency[-2]['value']:
                    bottlenecks.append({"item": "w", "value": w_latency[-1]['value']})
                # Handle the slowest candidate first.
                bottlenecks.sort(key=take_value, reverse=True)
                for b in bottlenecks:
                    # The remaining candidates are no slower than the PEs.
                    if b["value"] <= PE_latency:
                        break
                    if b["item"] == "w":
                        # Compute the uram for w
                        datapack = 8
                        dw = 4 # Four bytes by default
                        width = dw * 8 * datapack
                        depth = workload["params"]["o"] * workload["params"]["i"] * \
                                workload["params"]["p"] * workload["params"]["q"] / datapack
                        uram = est_URAM(width, depth)
                        if cal_c_mem(c_mem) + w_mem + uram < self.cst.hw_cst["URAM"]:
                            configs[layer_info["idx"]]["w_read_mode"] = 1
                            w_mem += uram
                    if b["item"] == "cin" and layer_info["idx"] > 0:
                        # Compute the uram for cin
                        datapack = 8
                        dw = 4 # Four bytes by default
                        width = dw * 8 * datapack
                        depth = workload["params"]["i"] * (workload["params"]["r"] + workload["params"]["p"] - 1) * \
                                (workload["params"]["c"] + workload["params"]["q"] - 1) / datapack
                        uram = est_URAM(width, depth)
                        # Tentatively book the buffer as this layer's input and the
                        # previous layer's output; roll back if it does not fit.
                        old_c_mem = copy.deepcopy(c_mem)
                        c_mem[layer_info["idx"]][0] = max(c_mem[layer_info["idx"]][0], uram)
                        c_mem[layer_info["idx"] - 1][1] = max(c_mem[layer_info["idx"] - 1][1], uram)
                        if cal_c_mem(c_mem) + w_mem < self.cst.hw_cst["URAM"]:
                            configs[layer_info["idx"]]["cin_read_mode"] = 3
                            configs[layer_info["idx"] - 1]["cout_write_mode"] = 1
                        else:
                            c_mem = old_c_mem
                    if b["item"] == "cout" and layer_info["idx"] < len(workloads) - 1:
                        # Compute the uram for cout
                        datapack = 8
                        dw = 4
                        width = dw * 8 * datapack
                        depth = workload["params"]["o"] * workload["params"]["r"] * workload["params"]["c"] / datapack
                        uram = est_URAM(width, depth)
                        # Tentatively book the buffer as this layer's output and the
                        # next layer's input; roll back if it does not fit.
                        old_c_mem = copy.deepcopy(c_mem)
                        c_mem[layer_info["idx"]][1] = max(c_mem[layer_info["idx"]][1], uram)
                        c_mem[layer_info["idx"] + 1][0] = max(c_mem[layer_info["idx"] + 1][0], uram)
                        if cal_c_mem(c_mem) + w_mem < self.cst.hw_cst["URAM"]:
                            configs[layer_info["idx"]]["cout_write_mode"] = 1
                            configs[layer_info["idx"] + 1]["cin_read_mode"] = 3
                        else:
                            c_mem = old_c_mem
            return configs, cal_c_mem(c_mem) + w_mem

        def modify_task_configs_prev_array(prev_array, configs):
            """ Attach the results of the previous array to the per-layer configs.

            Args:
                prev_array: dict with "workloads" (indices into self.workloads)
                    and "record" (an object exposing "task_sols").
                configs: per-layer config dict keyed by layer position. When
                    empty or None a fresh one is created; otherwise it is
                    updated in place.

            Returns:
                The configs, where every layer position that also exists in the
                previous array carries "prev_workload", "prev_sol" and
                "prev_latency".
            """
            prev_layer_ids = prev_array['workloads']
            prev_record = prev_array['record']
            n_layers = len(workloads)
            if not configs:
                configs = {
                    pos: {"prev_sol": None, "prev_workload": None, "prev_latency": None}
                    for pos in range(n_layers)
                }
            # Only positions present in both arrays are filled in.
            for pos in range(min(n_layers, len(prev_layer_ids))):
                prev_workload = self.workloads[prev_layer_ids[pos]]
                entry = configs[pos]
                entry['prev_workload'] = prev_workload
                prev_task_sol = prev_record.task_sols[pos]
                entry['prev_sol'] = prev_task_sol['sol']
                entry['prev_latency'] = prev_task_sol['latency']
            return configs

        def one_pass(workloads, design_list, silent, early_stop, search_task_configs):
            """ Run one complete search on design_list[i] ("i" is the loop index of
            the enclosing method).

            Every workload is first tuned on its own, in two rounds, to seed the
            initial population. Afterwards all workloads are tuned together as a
            non-fused MultiTask.

            Args:
                workloads: workload dicts to search.
                design_list: candidate designs; only design_list[i] is used.
                silent: forwarded to self.tune. When it is 0 the single-workload
                    jobs are still silenced if more than one worker is configured.
                early_stop: latency threshold, or -1 to disable the check.
                search_task_configs: optional per-task configs, indexed by the
                    position of the task.

            Returns:
                The record of the multi-workload tuning, or the enclosing
                "best_record" when the ideal latency exceeds "early_stop".
            """
            def make_single_tasks():
                # One SingleTask per workload; optional configs are attached by position.
                tasks = []
                for workload in workloads:
                    tasks.append(SingleTask(design_list[i], workload, self.cst))
                if search_task_configs:
                    for task_idx, task in enumerate(tasks):
                        task.configs = search_task_configs[task_idx]
                return tasks

            # Search the best config for each task, two rounds per task.
            # NOTE(review): the previous "avoid duplicate task" scan never skipped
            # anything (its `continue` only applied to the inner scan loop), so
            # every task is enqueued here exactly as before.
            job_list = []
            for repeat_iter in range(2):
                search_tasks = make_single_tasks()
                # Silent the tuner if the #worker is greater than 1
                local_silent = silent
                if silent == 0:
                    local_silent = 1 if self.search_config["n_worker"] > 1 else 0
                for t in search_tasks:
                    job_list.append({
                        'job_hash': f'{str(t)}_{repeat_iter}',
                        'func': self.tune,
                        'args': [t, None, local_silent, 0]
                    })

            pool = utils.MyExecutor(self.search_config['n_worker'])
            results = pool.exec(job_list)
            init_tasks = [results[key] for key in results if results[key].valid]

            # Search the single array architecture
            if early_stop != -1:
                # Test if the ideal latency is longer than the early stop threshold.
                ideal_latency = utils.compute_tasks_latency(search_tasks, init_tasks)
                if ideal_latency > early_stop:
                    return best_record

            # Build the multi-workload search task
            multi_task = MultiTask(design_list[i], make_single_tasks(), self.cst, fuse=0)
            meta = {"one_gen": one_gen, "xgb_params": self.search_config["xgb_params"]}
            return self.tune(multi_task, init_tasks, silent=silent, meta=meta)

        best_record = utils.SearchRecord().reset()
        if prev_array:
            search_task_configs = modify_task_configs_prev_array(prev_array, search_task_configs)
        for i in range(len(design_list)):
            if len(self.workloads) == 1:
                # Single task workload
                search_task = SingleTask(design_list[i], workloads[0], self.cst)
                if search_task_configs:
                    search_task.configs = search_task_configs[0]
                search_record = self.tune(search_task)
                if search_record.valid:
                    search_record.arch_sol = search_record.task_sols[0]['sol']
                    if prev_array:
                        total_latency = 0
                        for task_sol in search_record.task_sols:
                            task_sol['latency'] = task_sol['reward_meta']['latency']['latency_orig']
                            total_latency += task_sol['latency']
                        search_record.latency = total_latency
                best_record.update(search_record, save=1)
            else:
                search_record = one_pass(workloads, design_list, silent, early_stop, search_task_configs)
                if prev_array:
                    total_latency = 0
                    for task_sol in search_record.task_sols:
                        task_sol['latency'] = task_sol['reward_meta']['latency']['latency_orig']
                        total_latency += task_sol['latency']
                    search_record.latency = total_latency
                    if search_record.metric == "latency":
                        search_record.reward = 1 / total_latency
                best_record.update(search_record, save=1)
                if self.search_config['use_uram'] == 1 and "conv" in workloads[0]["tags"]:
                    import logging
                    logger = logging.getLogger('AutoSA-Tuner')
                    logger.info("Search again with URAM...")
                    # For CNN we test if any buffers can be fit on-chip
                    layer_info = []
                    for task_idx in range(len(search_record.task_sols)):
                        task_sol = search_record.task_sols[task_idx]
                        layer_info.append({
                            "idx": task_idx,
                            "CTC": task_sol["CTC"],
                            "reward_meta": task_sol["reward_meta"]["latency"]
                        })
                    # Sort them by CTC ratio
                    def getCTC(elem):
                        return elem["CTC"]
                    layer_info.sort(key=getCTC)
                    #pprint.pprint(layer_info)
                    #exit(0)
                    search_task_configs, uram = modify_task_configs_uram(layer_info, workloads, search_task_configs)
                    # Run the search again with updated search configs
                    search_record = one_pass(workloads, design_list, silent, early_stop, search_task_configs)
                    search_record.cst["URAM"] = uram
                    if prev_array:
                        total_latency = 0
                        for task_sol in search_record.task_sols:
                            task_sol['latency'] = task_sol['reward_meta']['latency']['latency_orig']
                            total_latency += task_sol['latency']
                        search_record.latency = total_latency
                        if search_record.metric == "latency":
                            search_record.reward = 1 / total_latency
                    best_record.update(search_record, save=1)

        return best_record

    def search_fusion_single_acc_customized1(self, design_idx=-1, search_task_configs=None):
        """ Search the best single-accelerator configuration with task fusion.

        Note: A linear dependence between the layers of the network is assumed.
        For every candidate design there are two steps.
        Step 1: Build a candidate pool of all the sub-graphs of interest and
        search the best array configurations of these tasks.
        Step 2: Use the candidate tasks of the previous step to kick off the
        evo search. For each array config, use the DP to find the best fusion scheme.

        Args:
            design_idx: index into self.designs, or -1 to search every design.
            search_task_configs: optional configs assigned to the first task
                (used for the multi-acc search).

        Returns:
            The best record found over all the searched designs.
        """
        import logging
        logger = logging.getLogger('AutoSA-Tuner')

        # Note: Consider FP32 only at 200MHz with 3 DDR ports
        params = {
            "thres_CTC": self.cst.hw_cst["DSP"] / 5 * 2 * 0.2 / (12.8 * 3)
        }

        best_record = utils.SearchRecord().reset()

        design_list = self.designs
        # Either a single requested design or all of them.
        design_idx_list = [design_idx] if design_idx != -1 else list(range(len(self.designs)))

        def layer_signature(layer_ids):
            # Concatenate all the param names/values and tags of the given layers.
            pieces = []
            for layer_id in layer_ids:
                layer = self.workloads[layer_id]
                pieces.extend(f"{k}{v}" for k, v in layer["params"].items())
                pieces.extend(layer["tags"])
            return "".join(pieces)

        def matching_windows(layer_ids):
            # Find all the consecutive layer groups in the network with the same
            # workload config as "layer_ids".
            target = layer_signature(layer_ids)
            width = len(layer_ids)
            matches = []
            for start in range(len(self.workloads) - (width - 1)):
                window = list(range(start, start + width))
                if layer_signature(window) == target:
                    matches.append({
                        "idx": window,
                        "names": [self.workloads[w]["name"] for w in window]
                    })
            return matches

        for i in design_idx_list:
            fusion_candidates = []

            # Enqueue the single-workload tasks, round after round, until the
            # next round would exceed the upper bound of the population size.
            # NOTE(review): the previous "avoid duplicate task" scan never skipped
            # anything (its `continue` only applied to the inner scan loop), so
            # every task is enqueued here exactly as before.
            job_list = []
            repeat_iter = 0
            while True:
                search_tasks = [SingleTask(design_list[i], workload, self.cst)
                                for workload in self.workloads]
                # Modify the first search task, used for multi-acc search
                if search_task_configs:
                    search_tasks[0].configs = search_task_configs
                # Silent the tuner if the #worker is greater than 1
                silent = 1 if self.search_config["n_worker"] > 1 else 0
                for t in search_tasks:
                    job_list.append({
                        'job_hash': f'{str(t)}_{repeat_iter}',
                        'func': self.tune,
                        'args': [t, None, silent, 0]
                    })
                repeat_iter += 1
                # Fill in enough tasks for the initial population
                if len(job_list) + len(search_tasks) > self.search_config['genetic_params']['population_size'][1]:
                    break

            pool = utils.MyExecutor(self.search_config['n_worker'])
            results = pool.exec(job_list)
            init_tasks = [results[key] for key in results if results[key].valid]

            # Merge the records that share the hash of their first task solution.
            network_best_records = {}
            for record in init_tasks:
                record_hash = record.task_sols[0]['hash']
                if record_hash in network_best_records:
                    network_best_records[record_hash].update(record)
                else:
                    network_best_records[record_hash] = record

            # Sort the tasks based on the CTC ratio
            network_best_records_sorted = sorted(network_best_records.values(),
                                                 key=lambda rec: rec.ctc)

            # Raise the threshold over the leading records with a DSP efficiency
            # below 0.5; stop at the first record that reaches it.
            CTC_thres = params["thres_CTC"]
            for record in network_best_records_sorted:
                if not record.dsp_eff < 0.5:
                    break
                CTC_thres = max(CTC_thres, record.ctc)
            comm_bound_ops = [record for record in network_best_records_sorted
                              if record.ctc <= CTC_thres]

            # Enqueue the multi-workload tasks
            comm_bound_layers = []
            for layer_idx, layer in enumerate(self.workloads):
                for op in comm_bound_ops:
                    if layer["name"] in op.task_names:
                        comm_bound_layers.append({"ctc": op.ctc, "layers": [layer_idx]})

            searched_layers = set()

            def try_fuse(layer_ids):
                # Search the fusion of "layer_ids" unless an equivalent group was
                # searched before, and keep it when it beats the unfused latency.
                unfused_latency = 0
                for layer_id in layer_ids:
                    layer = self.workloads[layer_id]
                    for record in network_best_records_sorted:
                        if layer["name"] in record.task_names:
                            unfused_latency += record.latency
                            break
                layer_hash = layer_signature(layer_ids)
                if layer_hash in searched_layers:
                    return
                searched_layers.add(layer_hash)
                search_record = self.search_fusion_single_acc_customized2(layer_ids, design_idx=i, search_task_configs=search_task_configs)
                if not search_record.valid:
                    return
                if not search_record.latency < unfused_latency:
                    return
                pairs = matching_windows(layer_ids)
                for pair in pairs:
                    fusion_candidates.append(pair["names"])
                init_tasks.insert(0, search_record)
                # A fused group that is still communication bound is queued again.
                if search_record.ctc < CTC_thres:
                    for pair in pairs:
                        comm_bound_layers.append({"ctc": search_record.ctc, "layers": pair["idx"]})

            while len(comm_bound_layers) > 0:
                # Sort the list based on the increasing order of CTC
                comm_bound_layers.sort(key=lambda elem: elem["ctc"])

                # Start with the task with the lowest CTC
                op_to_fuse = comm_bound_layers[0]
                first_layer = op_to_fuse['layers'][0]
                last_layer = op_to_fuse['layers'][-1]

                # Fuse it with neighbor layers
                if first_layer > 0:
                    try_fuse([first_layer - 1, first_layer])
                if last_layer < len(self.workloads) - 1:
                    try_fuse([last_layer, last_layer + 1])

                # Pop out the op
                comm_bound_layers.pop(0)

            # Kick off the local search
            search_tasks = [SingleTask(design_list[i], workload, self.cst)
                            for workload in self.workloads]
            # Modify the first search task, used for multi-acc search
            if search_task_configs:
                search_tasks[0].configs = search_task_configs
            search_task = MultiTask(design_list[i], search_tasks, self.cst, fuse=1)
            logger.info(f"fusion candidates: {fusion_candidates}")

            # Each candidate is handed over as the concatenation of its layer names.
            fusion_candidates = [''.join(names) for names in fusion_candidates]
            meta = {'fusion_candidates': fusion_candidates}
            search_record = self.tune(search_task, init_tasks, meta=meta)

            best_record.update(search_record, save=1)

        return best_record

    def search_fusion_single_acc_customized2(self, workload_idx=None, design_idx=-1, search_task_configs=None, silent=0):
        """ Search the best single-accelerator configuration with all the
        selected layers fused.

        Note: A linear dependence between the layers of the network is assumed.
        For every candidate design there are two steps.
        Step 1: Build a candidate pool of all the sub-graphs of interest and
        search the best array configurations of these tasks.
        Step 2: Use the candidate tasks of the previous step to kick off the
        evo search.

        Args:
            workload_idx: indices into self.workloads of the layers to fuse.
                When None or empty, all the workloads are used.
            design_idx: index into self.designs, or -1 to search every design.
            search_task_configs: optional configs assigned to the first task
                (used for the multi-acc search).
            silent: forwarded to self.tune. When it is 0 the single-workload
                jobs are still silenced if more than one worker is configured.

        Returns:
            The best record found over all the searched designs.
        """
        best_record = utils.SearchRecord().reset()

        design_list = self.designs
        if design_idx != -1:
            # Only search a certain design
            design_idx_list = [design_idx]
        else:
            design_idx_list = list(range(len(self.designs)))

        # The default (None) used to raise a TypeError here; fall back to the
        # whole network, like the non-fusion single-array search does.
        if workload_idx:
            workloads = [self.workloads[idx] for idx in workload_idx]
        else:
            workloads = self.workloads

        use_uram = self.search_config["use_uram"]

        for i in design_idx_list:
            # Enqueue the single-workload tasks, round after round, until the
            # next round would exceed the upper bound of the population size.
            # NOTE(review): the previous "avoid duplicate task" scan never skipped
            # anything (its `continue` only applied to the inner scan loop), so
            # every task is enqueued here exactly as before. A real de-duplication
            # might drop "last_task" if it shares its string form with the task
            # it is copied from - confirm before adding one.
            job_list = []
            repeat_iter = 0
            repeat = True
            while repeat:
                search_tasks = []
                for workload in workloads:
                    search_tasks.append(SingleTask(design_list[i], workload, self.cst))
                # Modify the first search task, used for multi-acc search
                if search_task_configs:
                    search_tasks[0].configs = search_task_configs

                # Add a variant of the last layer that is set up as the last
                # layer of a fused group.
                last_task = copy.deepcopy(search_tasks[-1])
                last_task.fuse = 1
                last_task.last_fuse = 1
                last_task.use_uram = use_uram
                last_task.configs['cin_read_mode'] = 3 if last_task.use_uram else 2
                last_task.configs['cout_write_mode'] = 0
                last_task.set_aux_func('update_cin_latency', 'update_cin_latency_last')
                if last_task.use_uram == 0:
                    last_task.set_aux_func('update_cin_buf', 'update_cin_buf_bram_last')
                else:
                    last_task.set_aux_func('update_cin_buf', 'update_cin_buf_uram_last')
                search_tasks.append(last_task)

                # Silent the tuner if the #worker is greater than 1
                local_silent = silent
                if silent == 0:
                    local_silent = 1 if self.search_config["n_worker"] > 1 else 0
                for t in search_tasks:
                    job_list.append({
                        'job_hash': f'{str(t)}_{repeat_iter}',
                        'func': self.tune,
                        'args': [t, None, local_silent, 0]
                    })
                # Fill in enough tasks for the initial population
                if len(job_list) + len(search_tasks) > self.search_config['genetic_params']['population_size'][1]:
                    repeat = False
                repeat_iter += 1

            pool = utils.MyExecutor(self.search_config['n_worker'])
            results = pool.exec(job_list)
            init_tasks = [results[key] for key in results if results[key].valid]

            # Local search
            search_tasks = []
            for workload in workloads:
                search_tasks.append(SingleTask(design_list[i], workload, self.cst))
            # Modify the first search task, used for multi-acc search
            if search_task_configs:
                search_tasks[0].configs = search_task_configs
            search_task = MultiTask(design_list[i], search_tasks, self.cst, fuse=2, use_uram=use_uram)
            search_record = self.tune(search_task, init_tasks, silent=silent)

            best_record.update(search_record)

        return best_record

    def search_fusion_multi_acc_customized1(self, design_idx=-1, search_task_configs=None, silent=0):
        """ This function searches the best multi-array configuration.
        Run the single array search first.
        Then explore different partitions schemes by setting different DSP utilization threshold.
        For certain threshold, all the layers that achieve beyond the threshold are mapped
        to a homogeneous systolic array. The remaining layers are mapped to separate
        single systolic arrays.
        """
        best_record = utils.SearchRecord().reset()

        params = {
            "non_fuse_repeat": 1, # Run the single-array search for multiple times to stablelize the results
            "n_designs": 4, # Only select the top-k designs for consideration
            "util_interval": 0.1, # DSP utilization interval for generating partition candidates
            "n_partition_candidates": 3, # Only consider the top-k partitioning candidates
            "n_array_max": self.search_config["max_n_array"] # At most #arrays are supported
        }

        import logging
        logger = logging.getLogger('AutoSA-Tuner')

        design_list = self.designs
        if design_idx != -1:
            # Only search a certain design
            design_idx_list = [design_idx]
        else:
            design_idx_list = list(range(len(self.designs)))
        
        '''
        # Single array search        
        design_history = []
        single_array_record = utils.SearchRecord().reset()
        for i in design_idx_list:
            local_record = utils.SearchRecord().reset()
            for repeat in range(params["non_fuse_repeat"]):
                #local_record.update(self.search_non_fusion_single_acc_customized1(design_idx=i, silent=silent, one_gen=True))
                local_record.update(self.search_non_fusion_single_acc_customized1(design_idx=i, silent=silent))
            design_history.append({"idx": i, "record": local_record})
            single_array_record.update(local_record)
        single_array_record.throughput = 1 / single_array_record.latency
        '''
        
        import pickle
        #pickle.dump(design_history, open(f'tmp/design_history_{self.search_config["workload"]}', 'wb'))
        #pickle.dump(single_array_record, open(f'tmp/single_array_record_{self.search_config["workload"]}', 'wb'))
        design_history = pickle.load(open(f'tmp/design_history_{self.search_config["workload"]}', 'rb'))
        single_array_record = pickle.load(open(f'tmp/single_array_record_{self.search_config["workload"]}', 'rb'))        

        '''
        # For the scalability issue, we will only select the top-4 designs
        # as the candidate dataflows for further exploration.
        def take_record_latency(elem):
            return elem["record"].latency
        design_history.sort(key=take_record_latency)
        design_history = design_history[:min(params["n_designs"], len(design_history))]
        design_idx_list = [h["idx"] for h in design_history]                
        logger.info(f"Selected design idx: {design_idx_list}")
        design_list = [self.designs[i] for i in design_idx_list]
        '''

        # Partition initialization        
        # Setting 1: Parition the first x layers to single arrays, and place the rest on a single array        
        # Setting 2: Group layers that are similar together        
        def hash_partition(partition):
            ret = ""
            for p in partition:
                ret += "|"
                ret += ''.join(str(p))
                ret += "|"
            return ret

        partition_candidates = []    

        # Setting 1
        '''
        layer_sols = single_array_record.task_sols
        dsp_eff_list = [sol["DSP_eff"] for sol in layer_sols]
        max_dsp_eff = max(dsp_eff_list)
        op_list = [sol["ops"] for sol in layer_sols]
        total_ops = np.sum(op_list)
        for split_pos in range(1, len(layer_sols)):
            latency_list = []
            # SL array
            for sl_idx in range(split_pos):
                dsp_eff = max_dsp_eff
                t = op_list[sl_idx] / total_ops * dsp_eff
                lat = op_list[sl_idx] / t
                latency_list.append(lat)
            # ML array
            dsp_eff = np.mean(dsp_eff_list[split_pos:])
            t = np.sum(op_list[split_pos:]) / total_ops * dsp_eff
            lat = np.sum(op_list[split_pos:]) / t
            latency_list.append(lat)
            T = 1 / max(latency_list)
            partition = []
            for sl_idx in range(split_pos):
                partition.append([sl_idx])
            partition.append(list(range(split_pos, len(layer_sols))))
            if len(partition) > params["n_array_max"]:
                continue
            partition_candidates.append({
                "idx": len(partition_candidates),
                "partition": partition,
                "hash": hash_partition(partition),
                "throughput": T,
                "n_arrays": len(partition)
            })
        # Sort the partition candidates by throughput
        def take_throughput(elem):
            return elem["throughput"]
        partition_candidates.sort(key=take_throughput, reverse=True)
        logger.info(f"Partition candidates:\n{pprint.pformat(partition_candidates, indent=2)}")
        init_partition_candidates = [i for i in range(min(params["n_partition_candidates"], len(partition_candidates)))]
        '''
        
        # Setting 2
        import statistics
        layer_sols = single_array_record.task_sols
        dsp_eff_list = [sol["DSP_eff"] for sol in layer_sols]
        op_list = [sol["ops"] for sol in layer_sols]
        for i in range(len(dsp_eff_list)):
            print(i, dsp_eff_list[i])   
        import csv
        with open("dsp_eff.csv", "w") as f:
            columns = ["layer", "dsp_eff"]
            writer = csv.DictWriter(f, fieldnames=columns)
            writer.writeheader()
            for i in range(len(dsp_eff_list)):
                data = {
                    "layer": i + 1,
                    "dsp_eff": dsp_eff_list[i]
                }
                writer.writerow(data)

        split_pos_list = []
        # Always split the first layer, therefore start from the second layer
        window = [layer_sols[1]["DSP_eff"], layer_sols[2]["DSP_eff"]]
        stdev_cur = statistics.stdev(window)
        for i in range(3, len(self.workloads)):
            if len(window) > 2 and (
                dsp_eff_list[i] > max(window) * 1.1 or dsp_eff_list[i] * 1.15 < min(window)):
                #print(i, max(window))
                split_pos_list.append(i) # Split before i-th layer
                window = [layer_sols[i]["DSP_eff"]]
            else:
                window.append(layer_sols[i]["DSP_eff"])        
        split_pos_list.insert(0, 1) # Always split the first layer
        split_pos_list.append(len(self.workloads))        
        print(split_pos_list)
        #exit(0)
        max_min_list = [max(dsp_eff_list), min(dsp_eff_list)] 
        #print(max_min_list)
        stdev_max = statistics.stdev(max_min_list)
        #print(stdev_max)
        
        # Compute the mean and stdev        
        def profile_partition(split_pos_list, dsp_eff_list):
            stdev_list = []
            mean_list = []
            mean_ratio_list = []
            for i in range(1, len(split_pos_list)):
                window = [dsp_eff_list[d] for d in range(split_pos_list[i - 1], split_pos_list[i])]                
                mean_list.append(np.mean(window))
                if len(window) > 1:
                    stdev_list.append(statistics.stdev(window))
                else:
                    stdev_list.append(0)
            for i in range(1, len(mean_list)):
                ratio = abs((mean_list[i] - mean_list[i - 1]) / mean_list[i - 1])
                mean_ratio_list.append(ratio)
            return mean_list, stdev_list, mean_ratio_list
        def estimate_partition_throughput(partition, dsp_eff_list, op_list):
            latency = []
            max_dsp_eff = max(dsp_eff_list)
            for p in partition:
                ops = 0                
                for i in p:
                    ops += op_list[i]
                if len(p) == 1:
                    dsp_eff = max_dsp_eff
                else:
                    #dsp_eff = np.mean([dsp_eff_list[i] for i in p])
                    stdev_cur = statistics.stdev([dsp_eff_list[i] for i in p])
                    dsp_eff = (min(dsp_eff_list) - max(dsp_eff_list)) / 2 * (stdev_cur / stdev_max) + max(dsp_eff_list)
                    #dsp_eff = (min(dsp_eff_list) - max(dsp_eff_list)) * (stdev_cur / stdev_max) + max(dsp_eff_list)
                throughput_cur = ops / np.sum(op_list) * dsp_eff
                latency_cur = ops / throughput_cur
                latency.append(latency_cur)
            return 1 / max(latency)
        
        split_pos_list_old = copy.deepcopy(split_pos_list)        
        # Merge 
        mean_list, stdev_list, mean_ratio_list = profile_partition(split_pos_list, dsp_eff_list)
        cur_n_array = len(mean_list)            
        while cur_n_array > 0:
            if cur_n_array <= params["n_array_max"] - 1:
                partition = [[0]]
                for i in range(len(split_pos_list) - 1):
                    partition += [list(range(split_pos_list[i], split_pos_list[i + 1]))]
                throughput = estimate_partition_throughput(partition, dsp_eff_list, op_list)
                duplicate = False
                for p_tmp in partition_candidates:
                    if p_tmp["hash"] == hash_partition(partition):
                        duplicate = True
                        break
                if not duplicate:
                    partition_candidates.append({
                        "idx": len(partition_candidates),
                        "partition": partition,
                        "hash": hash_partition(partition),
                        "throughput": throughput,
                        "n_arrays": len(partition)
                    })
            # Sort the mean_ratio_list and merge the adjacent one with the smallest ratio                
            if cur_n_array > 1:                       
                sort_index = np.argsort(mean_ratio_list)
                array_to_merge_idx = sort_index[0]
                del(split_pos_list[array_to_merge_idx + 1])
                mean_list, stdev_list, mean_ratio_list = profile_partition(split_pos_list, dsp_eff_list)    
                cur_n_array = len(mean_list)
            else:
                cur_n_array -= 1           

        # Split
        split_pos_list = split_pos_list_old
        mean_list, stdev_list, mean_ratio_list = profile_partition(split_pos_list, dsp_eff_list)
        cur_n_array = len(mean_list)
        while cur_n_array <= params["n_array_max"] - 1:
            partition = [[0]]
            for i in range(len(split_pos_list) - 1):
                partition += [list(range(split_pos_list[i], split_pos_list[i + 1]))]
            throughput = estimate_partition_throughput(partition, dsp_eff_list, op_list)
            duplicate = False
            for p_tmp in partition_candidates:
                if p_tmp["hash"] == hash_partition(partition):
                    duplicate = True
                    break
            if not duplicate:
                partition_candidates.append({
                    "idx": len(partition_candidates),
                    "partition": partition,
                    "hash": hash_partition(partition),
                    "throughput": throughput,
                    "n_arrays": len(partition)
                })
            
            #print(stdev_list)
            sort_index = np.argsort(stdev_list)                
            array_to_split_index = sort_index[-1]
            if stdev_list[array_to_split_index] == 0:                    
                break
            # Try different positions
            #print(split_pos_list)
            #print(stdev_list)
            #print(array_to_split_index)
            if split_pos_list[array_to_split_index + 1] - split_pos_list[array_to_split_index] > 2:
                stdev_tmp_list = []                
                for i in range(split_pos_list[array_to_split_index], split_pos_list[array_to_split_index + 1]):
                    dsp_eff_tmp_list = dsp_eff_list[split_pos_list[array_to_split_index]: split_pos_list[array_to_split_index + 1]]                    
                    del(dsp_eff_tmp_list[i - split_pos_list[array_to_split_index]])                    
                    if len(dsp_eff_tmp_list) > 1:
                        stdev_tmp_list.append(statistics.stdev(dsp_eff_tmp_list))
                    else:
                        stdev_tmp_list.append(0)
                        break
                sort_index = np.argsort(stdev_tmp_list)      
                insert = 1
                if sort_index[0] > 0:
                    split_pos_list.insert(array_to_split_index + insert, split_pos_list[array_to_split_index] + sort_index[0])  
                    insert += 1
                if sort_index[0] < len(stdev_tmp_list) - 1:
                    split_pos_list.insert(array_to_split_index + insert, split_pos_list[array_to_split_index] + sort_index[0] + 1)  
                #split_pos_list.insert(array_to_split_index + 1, split_pos_list[array_to_split_index] + sort_index[0] + 1)
            else:
                split_pos_list.insert(array_to_split_index + 1, split_pos_list[array_to_split_index] + 1)
            mean_list, stdev_list, mean_ratio_list = profile_partition(split_pos_list, dsp_eff_list)    
            cur_n_array = len(mean_list)          
        
        #if len(mean_list) >= params["n_array_max"] - 1:
        #    # If the current #array is grater than the maximal array, merge them
        #    cur_n_array = len(mean_list)            
        #    while cur_n_array > 0:
        #        if cur_n_array <= params["n_array_max"] - 1:
        #            partition = [[0]]
        #            for i in range(len(split_pos_list) - 1):
        #                partition += [list(range(split_pos_list[i], split_pos_list[i + 1]))]
        #            throughput = estimate_partition_throughput(partition, dsp_eff_list, op_list)
        #            partition_candidates.append({
        #                "idx": len(partition_candidates),
        #                "partition": partition,
        #                "hash": hash_partition(partition),
        #                "throughput": throughput,
        #                "n_arrays": len(partition)
        #            })
        #        # Sort the mean_ratio_list and merge the adjacent one with the smallest ratio                
        #        if cur_n_array > 1:                       
        #            sort_index = np.argsort(mean_ratio_list)
        #            array_to_merge_idx = sort_index[0]
        #            del(split_pos_list[array_to_merge_idx + 1])
        #            mean_list, stdev_list, mean_ratio_list = profile_partition(split_pos_list, dsp_eff_list)    
        #            cur_n_array = len(mean_list)
        #        else:
        #            cur_n_array -= 1                                
        #else:
        #    # Else, split the array with the highest stdev
        #    cur_n_array = len(mean_list)
        #    while cur_n_array <= params["n_array_max"] - 1:
        #        partition = [[0]]
        #        for i in range(len(split_pos_list) - 1):
        #            partition += [list(range(split_pos_list[i], split_pos_list[i + 1]))]
        #        throughput = estimate_partition_throughput(partition, dsp_eff_list, op_list)
        #        partition_candidates.append({
        #            "idx": len(partition_candidates),
        #            "partition": partition,
        #            "hash": hash_partition(partition),
        #            "throughput": throughput,
        #            "n_arrays": len(partition)
        #        })
        #        
        #        #print(stdev_list)
        #        sort_index = np.argsort(stdev_list)                
        #        array_to_split_index = sort_index[-1]
        #        if stdev_list[array_to_split_index] == 0:                    
        #            break
        #        # Try different positions
        #        if split_pos_list[array_to_split_index + 1] - split_pos_list[array_to_split_index] > 2:
        #            stdev_tmp_list = []                
        #            for i in range(split_pos_list[array_to_split_index], split_pos_list[array_to_split_index + 1]):
        #                dsp_eff_tmp_list = dsp_eff_list[split_pos_list[array_to_split_index]: split_pos_list[array_to_split_index + 1]]                    
        #                del(dsp_eff_tmp_list[i - split_pos_list[array_to_split_index]])                    
        #                if len(dsp_eff_tmp_list) > 1:
        #                    stdev_tmp_list.append(statistics.stdev(dsp_eff_tmp_list))
        #                else:
        #                    stdev_tmp_list.append(0)
        #                    break
        #            sort_index = np.argsort(stdev_tmp_list)      
        #            insert = 1
        #            if sort_index[0] > 0:
        #                split_pos_list.insert(array_to_split_index + insert, split_pos_list[array_to_split_index] + sort_index[0])  
        #                insert += 1
        #            if sort_index[0] < len(stdev_tmp_list) - 1:
        #                split_pos_list.insert(array_to_split_index + insert, split_pos_list[array_to_split_index] + sort_index[0] + 1)  
        #            #split_pos_list.insert(array_to_split_index + 1, split_pos_list[array_to_split_index] + sort_index[0] + 1)
        #        else:
        #            split_pos_list.insert(array_to_split_index + 1, split_pos_list[array_to_split_index] + 1)
        #        mean_list, stdev_list, mean_ratio_list = profile_partition(split_pos_list, dsp_eff_list)    
        #        cur_n_array = len(mean_list)                
        
        #def take_n_array(elem):
        #    return elem["n_arrays"]
        #partition_candidates.sort(key=take_n_array, reverse=True)
        def take_throughput(elem):
            return elem["throughput"]
        partition_candidates.sort(key=take_throughput, reverse=True)
        logger.info(f"Partition candidates:\n{pprint.pformat(partition_candidates, indent=2)}")
        init_partition_candidates = [i for i in range(min(params["n_partition_candidates"], len(partition_candidates)))]
        #pprint.pprint(partition_candidates)        
        #exit(0)

        '''        
        # Internal testing
        partition_candidates = []
        partition = []
        if self.search_config["workload"] == "vgg16":            
            partition.append([0])
            partition.append([1])
            partition.append([2])
            partition.append([3])
            partition.append([4])
            partition.append(list(range(5, len(self.workloads))))            
        elif self.search_config["workload"] == "resnet50":
            partition = []
            partition.append([0])
            partition.append(list(range(1, 10)))  # 2-10
            partition.append(list(range(10, 23))) # 11-23
            partition.append(list(range(23, 40)))  # 24-40
            partition.append(list(range(40, len(self.workloads)))) # 41-end                    
        elif self.search_config["workload"] == "mobilenetv2":
            partition = []
            partition.append([0])            
            partition.append(list(range(1, 2)))
            partition.append(list(range(2, 3)))        
            partition.append(list(range(3, 4)))
            partition.append(list(range(4, 8)))
            partition.append(list(range(8, 14)))        
            partition.append(list(range(14, 22)))
            partition.append(list(range(22, 28)))                
            partition.append(list(range(28, len(self.workloads))))            
        init_partition_candidates = [0]
        partition_candidates.append({
            "idx": len(partition_candidates),
            "partition": partition,            
            "hash": hash_partition(partition),
            "n_arrays": len(partition)
            })
        '''
        
        design_idx_list = [4, 5, 6, 8]
        design_list = [self.designs[i] for i in design_idx_list]

        # Collect the init tasks
        init_tasks = []
        for i in range(len(design_list)):
            job_list = []
            local_silent = silent
            if silent == 0:
                local_silent = 1 if self.search_config["n_worker"] > 1 else 0

            for repeat in range(params["non_fuse_repeat"]):
                search_tasks = []
                for workload in self.workloads:
                    search_task = SingleTask(design_list[i], workload, self.cst)
                    search_tasks.append(search_task)
                for t in search_tasks:
                    for job in job_list:
                        if job['job_hash'] == f'{str(t)}_{repeat}':
                            # Avoid duplicate task
                            continue
                    job_list.append(
                        {'job_hash': f'{str(t)}_{repeat}', 'func': self.tune, \
                         'args': [t, None, local_silent, 0]})

            pool = utils.MyExecutor(self.search_config['n_worker'])
            results = pool.exec(job_list)
            for r in results:
                if results[r].valid:
                    init_tasks.append(results[r])

        # Local search
        search_tasks = []
        for workload in self.workloads:
            search_task = SingleTask(design_list[0], workload, self.cst)
            search_tasks.append(search_task)
        if search_task_configs:
            search_tasks[0].configs = search_task_configs
        search_task = MultiTask(design_list, search_tasks, self.cst, split=1)
        meta = {'partition_candidates': partition_candidates,
                'design_idx_list': design_idx_list,
                'init_partition_candidates': init_partition_candidates,
                "batch_size": self.search_config["batch_size"],
                "use_uram_all": self.search_config["use_uram_all"]}

        '''
        # For internal testing
        import pickle
        #pickle.dump(meta, open('tmp/meta', 'wb'))
        #pickle.dump(init_tasks, open('tmp/init_tasks', 'wb'))
        #exit(0)
        meta = pickle.load(open('tmp/meta', 'rb'))
        init_tasks = pickle.load(open('tmp/init_tasks', 'rb'))
        meta['init_partition_candidates'] = [5]
        design_list = [self.designs[i] for i in meta['design_idx_list']]
        search_tasks = []
        for workload in self.workloads:
            search_task = SingleTask(design_list[0], workload, self.cst)
            search_tasks.append(search_task)
        if search_task_configs:
            search_tasks[0].configs = search_task_configs
        search_task = MultiTask(design_list, search_tasks, self.cst, split=1)
        '''

        search_record = self.tune(search_task, init_tasks, silent=silent, meta=meta)

        best_record.update(search_record)

        return best_record

    def search_fusion_multi_acc_customized2(self, design_idx=-1, search_task_configs=None, silent=0):
        """ Search the best multi-array configuration with round-robin scheduling.

        Layers are assigned periodically to the systolic arrays: layer ``i`` is
        placed on array ``i % n_array``. For every array count from 2 up to
        ``min(#layers, max_n_array)`` an ideal throughput is estimated from the
        single-array search results; the candidates are ranked by that estimate
        and the top ones seed the final multi-task search.

        Args:
            design_idx: Index of the only design to use for the single-array
                search, or -1 to search all the designs in ``self.designs``.
            search_task_configs: Currently ignored (see the NOTE below).
            silent: Verbosity flag forwarded to the underlying searches.

        Returns:
            The search record updated with the result of the multi-array search.
        """
        import logging
        import os
        import pickle

        best_record = utils.SearchRecord().reset()

        params = {
            "non_fuse_repeat": 1, # Run the single-array search multiple times to stabilize the results
            "n_designs": 4, # Only select the top-k designs for consideration
            "n_partition_candidates": 3, # Only consider the top-k partitioning candidates
            "n_array_max": self.search_config["max_n_array"] # Maximal number of arrays supported
        }

        logger = logging.getLogger('AutoSA-Tuner')

        if design_idx != -1:
            # Only search a certain design
            design_idx_list = [design_idx]
        else:
            design_idx_list = list(range(len(self.designs)))

        # Single array search
        design_history = []
        single_array_record = utils.SearchRecord().reset()
        # NOTE(review): the caller-supplied `search_task_configs` is discarded
        # here, so the `if search_task_configs:` check further down never fires.
        # Kept as-is to preserve the existing behavior - confirm whether intended.
        search_task_configs = {}
        for i in design_idx_list:
            local_record = utils.SearchRecord().reset()
            for repeat in range(params["non_fuse_repeat"]):
                local_record.update(
                    self.search_non_fusion_single_acc_customized1(design_idx=i, silent=silent, one_gen=True))
            design_history.append({"idx": i, "record": local_record})
            single_array_record.update(local_record)
        single_array_record.throughput = 1 / single_array_record.latency

        # For internal testing: dump the single-array results so that they can be reloaded later.
        # Create the directory first and close the files deterministically.
        os.makedirs('tmp', exist_ok=True)
        with open(f'tmp/design_history_{self.search_config["workload"]}', 'wb') as dump_f:
            pickle.dump(design_history, dump_f)
        with open(f'tmp/single_array_record_{self.search_config["workload"]}', 'wb') as dump_f:
            pickle.dump(single_array_record, dump_f)

        # Try all different #array combinations and rank based on the total ideal latency
        def hash_partition(partition):
            # Each group is rendered as "|<list repr>|" and the pieces are concatenated.
            return "".join(f"|{p}|" for p in partition)

        # Per-layer statistics from the single-array search; they do not depend on #arrays.
        layer_sols = single_array_record.task_sols
        dsp_eff_list = [sol["DSP_eff"] for sol in layer_sols]
        op_list = [sol["ops"] for sol in layer_sols]
        total_ops = np.sum(op_list)
        batch_size = self.search_config["batch_size"]
        n_layers = len(self.workloads)

        partition_candidates = []
        for n_array in range(2, min(n_layers, params["n_array_max"]) + 1):
            # Round-robin assignment: layer i goes to array (i % n_array).
            partition = [list(range(array_idx, n_layers, n_array)) for array_idx in range(n_array)]

            # Estimated latency of every layer on the array it is mapped to.
            # Each array is given a throughput proportional to its share of the
            # total ops, scaled by the mean DSP efficiency of its layers.
            record_latency = []
            for layer_ids in partition:
                ops_cur = [op_list[p] for p in layer_ids]
                dsp_eff_cur = np.mean([dsp_eff_list[p] for p in layer_ids])
                throughput_cur = np.sum(ops_cur) / total_ops * dsp_eff_cur
                record_latency.append([op_cur / throughput_cur for op_cur in ops_cur])

            # Accumulate the latency round by round. In one round every array
            # processes (at most) one of its layers.
            design_latency = 0
            max_round = max(len(p) for p in partition)
            for round_idx in range(max_round):
                array_latency = [record_latency[0][round_idx] * batch_size]
                setup_latency = [0]
                for array_idx in range(1, n_array):
                    if round_idx >= len(partition[array_idx]):
                        # The trailing arrays have no layer left in the last round.
                        break
                    # Setup cost is modeled as 20% of the previous array's layer latency.
                    setup_latency.append(record_latency[array_idx - 1][round_idx] * 0.2)
                    # An array cannot finish earlier than the array feeding it.
                    array_latency.append(max(record_latency[array_idx][round_idx] * batch_size,
                                             array_latency[array_idx - 1]))
                design_latency += (sum(setup_latency) + array_latency[-1])
            design_throughput = 1 / design_latency * batch_size

            partition_candidates.append({
                "idx": len(partition_candidates),
                "partition": partition,
                "hash": hash_partition(partition),
                "throughput": design_throughput,
                "n_arrays": len(partition)
            })

        # Best estimated throughput first
        partition_candidates.sort(key=lambda elem: elem["throughput"], reverse=True)
        logger.info(f"Partition candidates:\n{pprint.pformat(partition_candidates, indent=2)}")
        init_partition_candidates = list(range(min(params["n_partition_candidates"], len(partition_candidates))))

        # NOTE(review): the candidate designs are hard-coded; this requires
        # self.designs to hold at least 9 designs - confirm against the design directory.
        design_idx_list = [4, 5, 6, 8]
        design_list = [self.designs[i] for i in design_idx_list]

        # Silence the workers when several of them run in parallel.
        local_silent = silent
        if silent == 0:
            local_silent = 1 if self.search_config["n_worker"] > 1 else 0

        # Collect the init tasks
        init_tasks = []
        for design in design_list:
            job_list = []
            seen_job_hashes = set()
            for repeat in range(params["non_fuse_repeat"]):
                search_tasks = [SingleTask(design, workload, self.cst) for workload in self.workloads]
                for t in search_tasks:
                    job_hash = f'{str(t)}_{repeat}'
                    if job_hash in seen_job_hashes:
                        # Avoid duplicate task
                        continue
                    seen_job_hashes.add(job_hash)
                    job_list.append(
                        {'job_hash': job_hash, 'func': self.tune,
                         'args': [t, None, local_silent, 0]})

            pool = utils.MyExecutor(self.search_config['n_worker'])
            results = pool.exec(job_list)
            for r in results:
                if results[r].valid:
                    init_tasks.append(results[r])

        # Local search
        search_tasks = [SingleTask(design_list[0], workload, self.cst) for workload in self.workloads]
        if search_task_configs:
            search_tasks[0].configs = search_task_configs
        search_task = MultiTask(design_list, search_tasks, self.cst, split=1)
        meta = {'partition_candidates': partition_candidates,
                'design_idx_list': design_idx_list,
                'init_partition_candidates': init_partition_candidates,
                "batch_size": batch_size}
        search_record = self.tune(search_task, init_tasks, silent=silent, meta=meta)

        best_record.update(search_record)

        return best_record


================================================
FILE: autosa_scripts/odyssey/main.py
================================================
import argparse
from datetime import datetime
import logging
import numpy as np
import os
import pickle
import concurrent.futures
import json
import pprint

import utils
from tuners import Constraint
from design import Design
from explorer import ArchExplorer

if __name__ == "__main__":
    # Entry point of the tuner: parse the options, load the constraints,
    # workloads and designs, run the architecture search, and persist the
    # search database and the search history.
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default="outdir", help="output directory")
    parser.add_argument('--db', type=str, default="db", help="search database")
    parser.add_argument('--use-db', type=int, default=1, help="use database")
    parser.add_argument('--objective', type=str, default="latency", help="optimization target [latency, off_chip_comm, energy, dsp_num]")
    parser.add_argument('--cst', type=str, default="hw_cst", help="hardware constraint")
    parser.add_argument('--stop-after-epochs', type=int, default=-1, help="number of epochs of the unit searching task")
    parser.add_argument('--stop-after-time', type=int, default=-1, help="time budget (in seconds) of the unit searching task")
    parser.add_argument('--n-worker', type=int, default=8, help="number of workers for multi-processing")
    parser.add_argument('--designs', type=str, default="designs", help="systolic array design directory")
    parser.add_argument('--design-idx', type=int, default=-1, help="systolic array design index")
    parser.add_argument('--workload', type=str, required=True, help="searching workload")
    # Architecture specific options
    parser.add_argument('--explore-fusion', action="store_true", help="explore layer fusion in a single accelerator")
    parser.add_argument('--explore-multi-acc', action="store_true", help="explore using multiple accelerators")
    parser.add_argument('--explore-programmable', action="store_true", help="explore programmable systolic array")
    parser.add_argument('--multi-array-mode', type=int, default=0, help="execution mode of the generic array in the multi-acc setting")
    parser.add_argument('--use-uram', type=int, default=0, help="use URAM for the intermediate data in the fused array")
    parser.add_argument('--use-uram-all', action="store_true", help="use URAM for all the arrays in the multi-array system")
    parser.add_argument('--method', type=str, default="customized1", help="searching method")
    parser.add_argument('--unit-task-method', type=str, default="genetic", help="unit task searching method")
    parser.add_argument('--batch-size', type=int, default=1, help="use multiple batches in the multi-acc array")
    parser.add_argument('--profiling', action="store_true", help="profiling")
    parser.add_argument('--max-n-array', type=int, default=8, help="maximal number of arrays")
    # Algorithm specific options
    parser.add_argument('--xgb-n-gens', type=int, default=5)
    parser.add_argument('--xgb-thres', type=float, default=0.6)
    parser.add_argument('--xgb-thres-adjust', type=float, default=0.4)

    args = parser.parse_args()

    search_obj = args.objective

    # Set up the working directory. Every run gets its own sub-directory named
    # after the objective, the workload, the exploration flags and a timestamp.
    now = datetime.now()
    outdir = args.outdir
    os.makedirs(outdir, exist_ok=True)
    explore_config = ""
    explore_config += "f1" if args.explore_fusion else "f0"
    explore_config += "ma1" if args.explore_multi_acc else "ma0"
    explore_config += "p1" if args.explore_programmable else "p0"
    explore_config += f"mam{args.multi_array_mode}"
    explore_config += f"u{args.use_uram}"
    exp_name = f"O_{args.objective}-W_{args.workload}-C_{explore_config}-T_{now.date()}-{now.time()}"
    outdir = f"{outdir}/{exp_name}"
    os.makedirs(outdir, exist_ok=True)
    logger = utils.init_logger(outdir)

    # Load the hardware constraints
    cst = Constraint(f'cst/{args.cst}.json')

    # Load the workloads
    with open(f'workload/{args.workload}.json') as f:
        data = json.load(f)
    workloads = list(data['workloads'])

    # Load the designs: every *.json file in the design directory describes one design
    design_dir = args.designs
    os.makedirs(f"{design_dir}/register", exist_ok=True)
    designs = []
    for f in os.listdir(design_dir):
        if f.endswith(".json"):
            with open(f'{design_dir}/{f}', 'r') as json_f:
                desp = json.load(json_f)
            design = Design(f.split(".")[0])
            design.register(desp, f"{design_dir}/register/{design.name}.py")
            designs.append(design)
    # Sort the designs by names so that the design indices are reproducible
    designs.sort(key=lambda elem: elem.name)
    if len(designs) == 0:
        raise RuntimeError("No systolic array design was found.")

    # Update the search stop criteria. The epoch limit takes precedence over
    # the time limit; without either one the search runs for 60 seconds.
    max_epochs = -1
    max_time = -1
    if args.stop_after_epochs > 0:
        max_epochs = args.stop_after_epochs
    elif args.stop_after_time > 0:
        max_time = args.stop_after_time
    else:
        max_time = 60 # 60 seconds by default

    # Load the search database if existed.
    # NOTE: unpickling can execute arbitrary code - only use trusted database files.
    db_file = f'{args.db}/{str(cst)}.db'
    if os.path.exists(db_file) and args.use_db:
        with open(db_file, 'rb') as db_f:
            search_db = pickle.load(db_f)
        logger.info('Found existing tuning database!')
    else:
        search_db = None

    # Start search
    counter = utils.PerfCounter(logger)
    counter.init_counter("total_search_time")

    search_config = {
        "method": args.method, # [customized1, customized2, exhaustive]
        "n_worker": args.n_worker,
        "unit_task_method": args.unit_task_method, # [exhaustive_pruning, random, sa, bayesian, opentuner, RL]
        "profiling": args.profiling,
        "workload": args.workload,
        "design_idx": args.design_idx,
        "genetic_params": {"population_size": [200, 20]},
        "args": args,
        "search_records_db": {} if search_db is None else search_db,
        "explore_fusion": args.explore_fusion,
        "explore_multi_acc": args.explore_multi_acc,
        "explore_programmable": args.explore_programmable,
        "multi_array_mode": args.multi_array_mode,
        "use_db": args.use_db,
        "use_uram": args.use_uram,
        "use_uram_all": args.use_uram_all,
        "batch_size": args.batch_size,
        "max_n_array": args.max_n_array,
        "xgb_params": {
            "n_gens": args.xgb_n_gens,
            "thres": args.xgb_thres,
            "thres_adjust": args.xgb_thres_adjust
        }
    }

    explorer = ArchExplorer(cst, search_obj, max_epochs, max_time, search_config, designs, workloads)
    search_record = explorer.search()

    # Update the database: merge the new records into the database on disk (if any).
    search_db = explorer.search_config["search_records_db"]
    if os.path.exists(db_file):
        with open(db_file, 'rb') as db_f:
            merged_db = pickle.load(db_f)
        for search_task in search_db:
            if search_task in merged_db:
                merged_db[search_task].update(search_db[search_task])
            else:
                merged_db[search_task] = search_db[search_task]
    else:
        merged_db = search_db
    # Make sure the database directory exists; otherwise the results of the
    # whole search would be lost on the first run.
    db_dir = os.path.dirname(db_file)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)
    with open(db_file, 'wb') as db_f:
        pickle.dump(merged_db, db_f)

    counter.update_counter("total_search_time")
    counter.print_counter("total_search_time")

    # Display and dump out the search results
    logger.info(f'{search_record.to_str()}')
    with open(f'{outdir}/history.log', 'w') as f:
        f.write(search_record.to_str())


================================================
FILE: autosa_scripts/odyssey/requirements.txt
================================================
bayesian-optimization==1.1.0
certifi==2021.10.8
dill @ file:///home/conda/feedstock_root/build_artifacts/dill_1623610058511/work
joblib @ file:///tmp/build/80754af9/joblib_1635411271373/work
mkl-fft==1.3.1
mkl-random @ file:///tmp/build/80754af9/mkl_random_1626186066731/work
mkl-service==2.4.0
multiprocess @ file:///home/conda/feedstock_root/build_artifacts/multiprocess_1623774446079/work
numpy @ file:///tmp/build/80754af9/numpy_and_numpy_base_1634095651905/work
pathos @ file:///home/conda/feedstock_root/build_artifacts/pathos_1623937754918/work
pox @ file:///home/conda/feedstock_root/build_artifacts/pox_1623773830989/work
ppft @ file:///home/conda/feedstock_root/build_artifacts/ppft_1623774454681/work
scikit-learn @ file:///tmp/build/80754af9/scikit-learn_1635187048948/work
scipy @ file:///tmp/build/80754af9/scipy_1630606796912/work
six @ file:///tmp/build/80754af9/six_1623709665295/work
threadpoolctl @ file:///Users/ktietz/demo/mc3/conda-bld/threadpoolctl_1629802263681/work
xgboost==1.3.3


================================================
FILE: autosa_scripts/odyssey/scripts/compute_network_info.py
================================================
import csv
import json

# Column order of the generated CSV.  "throughput_free", "dsp_eff_free" and
# "kernel" are never assigned by this script, so they are written as empty cells.
csv_columns = ["Layer", "Name", "i", "o", "r", "c", "p", "q", "ops", "parallelism", "ai", "parallelism_norm", "ai_norm",
               "throughput_free", "dsp_eff_free", "kernel", "latency_fixed", "dsp_eff_fixed", "throughput", "throughput_norm"]
dict_data = []
with open("../workload/resnet50.json", "r") as f:
    network_data = json.load(f)

# Per-layer static metrics derived from the layer parameters only.
parallelism_min = float("inf")
ai_min = float("inf")
for idx, layer in enumerate(network_data["workloads"]):
    layer_params = layer["params"]
    i, o, r, c, p, q = layer_params["i"], layer_params["o"], layer_params["r"], layer_params["c"], \
                       layer_params["p"], layer_params["q"]
    ops = i*o*r*c*p*q
    dict_data.append({
        'Layer': idx + 1,
        'Name': layer["name"],
        'i': i, 'o': o, 'r': r, 'c': c, 'p': p, 'q': q,
        "ops": ops, "parallelism": o*r*c, "ai": ops/(i*(r+p-1)*(c+q-1)+o*r*c+i*o*p*q)
    })
    parallelism_min = min(parallelism_min, dict_data[-1]["parallelism"])
    ai_min = min(ai_min, dict_data[-1]["ai"])

# Normalize against the smallest value found across all layers.
for data in dict_data:
    data["parallelism_norm"] = data["parallelism"] / parallelism_min
    data["ai_norm"] = data["ai"] / ai_min

# Load the tuning log.  Every line containing "DSP_eff" is attributed to the next
# layer in order; the latency is read from the line two positions below it.
log_file = "/home/jaywang/AutoSA_Tuner/refactor2/outdir/O_latency-W_resnet50-C_f0ma0p0mam0u0-T_2021-07-02-11:52:32.005352/tuning.log"
throughput_min = float("inf")
with open(log_file, "r") as f:
    lines = f.readlines()
    total_layer = 0
    for line_idx, line in enumerate(lines):
        if line.find("DSP_eff") != -1:
            entry = dict_data[total_layer]
            dsp_eff = float(line.strip().split(":")[-1].strip(","))
            entry["dsp_eff_fixed"] = dsp_eff
            latency = float(lines[line_idx + 2].strip().split(":")[-1].strip(","))
            entry["latency_fixed"] = latency
            entry["throughput"] = entry["ops"] / entry["latency_fixed"]
            throughput_min = min(throughput_min, entry["throughput"])
            total_layer += 1
            if total_layer >= len(dict_data):
                break

# Normalize the throughput against the slowest layer.
for data in dict_data:
    data["throughput_norm"] = data["throughput"] / throughput_min

# newline="" is what the csv module requires for files it writes; without it,
# platforms that translate "\n" end up with an extra blank line after each row.
with open("../tmp/resnet_info.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    writer.writeheader()
    for data in dict_data:
        writer.writerow(data)

================================================
FILE: autosa_scripts/odyssey/scripts/grid_search_xgb_params.py
================================================
import os
import subprocess
import re
import pprint

'''
for model_gens in [5, 10, 20, 50]:
    for xgb_thres in [0.2, 0.4, 0.6, 0.8]:
        #for xgb_thres_adjust in [0.2, 0.4, 0.6, 0.8]:
        # data1
        #for xgb_thres_adjust in [0.2, 0.4]: 
        # data2
        for xgb_thres_adjust in [0.6, 0.8]:
            # Call the python command
            cmd = f"python main.py --workload=vgg16 --stop-after-time=10 --use-db=0 --n-worker=32 --design-idx=4 --xgb-n-gens={model_gens} --xgb-thres={xgb_thres} --xgb-thres-adjust={xgb_thres_adjust}"
            #os.system(f"python main.py --workload=vgg16 --stop-after-time=10 --use-db=0 --n-worker=32 --design-idx=4 --xgb-n-gens={model_gens} --xgb-thres={xgb_thres} --xgb-thres-adjust={xgb_thres_adjust}")
            #print(cmd)
            process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
            output, error = process.communicate()
'''

# Collect the best reward of every run found under basepath.  The sorted project
# directories are matched to the parameter grid purely by position, so the grid
# below has to be iterated in the same order the runs were launched.
basepath = "./outdir/"
prjs = os.listdir(basepath)
prjs.sort()
#print(prjs)

results = []
prj_idx = 0
for model_gens in [5, 10, 20, 50]:
    for xgb_thres in [0.2, 0.4, 0.6, 0.8]:
        for xgb_thres_adjust in [0.6, 0.8]:
            # Build the path from basepath so the listing and the read can never
            # point at different directories.
            with open(os.path.join(basepath, prjs[prj_idx], "tuning.log")) as f:
                lines = f.readlines()
                rewards = []
                for line in lines:
                    if line.find("new best reward") != -1:
                        epoch = re.search(r"Epoch (.+?):", line).group(1)
                        latency = re.search(r"\((.+?)\)", line).group(1)
                        rewards.append({"epoch": int(epoch), "latency": float(latency)})
                results.append({"configs": [model_gens, xgb_thres, xgb_thres_adjust], "rewards": rewards, "prj": prjs[prj_idx]})
            prj_idx += 1

# Sort the results
def takeBestReward(elem):
    """Sort key: latency of the last recorded "new best reward" of a run.

    Runs whose log contains no such line have an empty reward list; they are
    ranked last instead of aborting the whole report with an IndexError.
    """
    if not elem["rewards"]:
        return float("inf")
    return elem["rewards"][-1]["latency"]
results.sort(key=takeBestReward)
pprint.pprint(results)

================================================
FILE: autosa_scripts/odyssey/scripts/img2col.py
================================================
import json

# Rewrites every layer of a workload file into GEMM form ("i", "j", "k") and
# tags it as "gemm".  Switch networks by swapping the active open() lines.
#with open('workload/vgg16.json') as f:
#with open('workload/resnet50.json') as f:
with open('workload/mobilenetv2.json') as src:
    data = json.load(src)

for layer in data["workloads"]:
    conv = layer["params"]
    i, o, r, c, p, q = conv["i"], conv["o"], conv["r"], conv["c"], conv["p"], conv["q"]
    # GEMM dimensions: i <- o, j <- r * c, k <- i * p * q.
    layer["params"] = {"i": o, "j": r * c, "k": i * p * q}
    layer["tags"] = ["gemm"]


#with open("workload/vgg16_img2col.json", "w") as f:
#with open("workload/resnet50_img2col.json", "w") as f:
with open("workload/mobilenetv2_img2col.json", "w") as dst:
    json.dump(data, dst, indent=2)

================================================
FILE: autosa_scripts/odyssey/scripts/run_arch1.sh
================================================
#!/bin/bash
# Runs main.py on the vgg16, resnet50 and mobilenetv2 workloads for each listed
# design index, then copies everything under outdir/ into tmp/.
# Expected to be started from the scripts/ directory.

# Stop if the directory change fails: the rm -rf calls below must never run
# from an unexpected working directory.
cd .. || exit 1
rm -rf outdir/*
rm -rf tmp/*
for design_idx in 1 4 7 10 13 16 19 22 25 28
do
    python main.py --workload=vgg16 --stop-after-time=10 --use-db=0 --n-worker=32 --design-idx="$design_idx"
    python main.py --workload=resnet50 --stop-after-time=10 --use-db=0 --n-worker=32 --design-idx="$design_idx"
    python main.py --workload=mobilenetv2 --stop-after-time=10 --use-db=0 --n-worker=32 --design-idx="$design_idx"
done
cp -r outdir/* tmp/
cd -


================================================
FILE: autosa_scripts/odyssey/scripts/run_arch1_free.sh
================================================
#!/bin/bash
# Runs main.py on every single-layer mobilenetv2 (1..36) and vgg16 (1..13)
# workload for each listed design index, then copies outdir/ into tmp/.
# Expected to be started from the scripts/ directory.

# Stop if the directory change fails: the rm -rf calls below must never run
# from an unexpected working directory.
cd .. || exit 1
rm -rf outdir/*
rm -rf tmp/*
for design_idx in 1 4 7 10 13 16 19 22 25 28
do
    #for layer_idx in {1..49}
    #do
    #    python main.py --workload=resnet50_$layer_idx --stop-after-time=10 --use-db=0 --design-idx=$design_idx
    #done
    for layer_idx in {1..36}
    do
        python main.py --workload="mobilenetv2_$layer_idx" --stop-after-time=10 --use-db=0 --design-idx="$design_idx"
    done
    for layer_idx in {1..13}
    do
        python main.py --workload="vgg16_$layer_idx" --stop-after-time=10 --use-db=0 --design-idx="$design_idx"
    done
done
cp -r outdir/* tmp/
cd -


================================================
FILE: autosa_scripts/odyssey/scripts/run_arch1_ml_cmp.sh
================================================
#!/bin/bash
# Runs main.py on the resnet50 workload for each listed design index, then
# copies outdir/ into tmp/ (tmp/ is not cleared beforehand).
# Expected to be started from the scripts/ directory.

# Stop if the directory change fails: the rm -rf below must never run from an
# unexpected working directory.
cd .. || exit 1
rm -rf outdir/*
#rm -rf tmp/*
for design_idx in 1 4 7 10 13 16 19 22 25 28
do
    #python main.py --workload=vgg16 --stop-after-time=10 --use-db=0 --n-worker=32 --design-idx=$design_idx
    python main.py --workload=resnet50 --stop-after-time=15 --use-db=0 --n-worker=32 --design-idx="$design_idx"
done
cp -r outdir/* tmp/
cd -


================================================
FILE: autosa_scripts/odyssey/scripts/run_arch2.sh
================================================
#!/bin/bash
# Runs main.py with --explore-multi-acc --explore-fusion --method=customized1
# for --max-n-array values 8, 16 and 24 (vgg16 is disabled for 24), then copies
# outdir/ into tmp/.
# Expected to be started from the scripts/ directory.

# Stop if the directory change fails: the rm -rf calls below must never run
# from an unexpected working directory.
cd .. || exit 1
rm -rf outdir/*
rm -rf tmp/*

python main.py --workload=vgg16 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=8
python main.py --workload=resnet50 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=8
python main.py --workload=mobilenetv2 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=8

python main.py --workload=vgg16 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=16
python main.py --workload=resnet50 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=16
python main.py --workload=mobilenetv2 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=16

#python main.py --workload=vgg16 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=24
python main.py --workload=resnet50 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=24
python main.py --workload=mobilenetv2 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=24

cp -r outdir/* tmp/
cd -


================================================
FILE: autosa_scripts/odyssey/scripts/run_arch3.sh
================================================
#!/bin/bash
# Runs main.py with --explore-multi-acc --explore-fusion --method=customized2
# --max-n-array=8 --batch-size=16 on vgg16, resnet50 and mobilenetv2, then
# copies outdir/ into tmp/.
# Expected to be started from the scripts/ directory.

# Stop if the directory change fails: the rm -rf calls below must never run
# from an unexpected working directory.
cd .. || exit 1
rm -rf outdir/*
rm -rf tmp/*

#python main.py --workload=vgg16 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized2 --max-n-array=8
#python main.py --workload=resnet50 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized2 --max-n-array=8
#python main.py --workload=mobilenetv2 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized2 --max-n-array=8

python main.py --workload=vgg16 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized2 --max-n-array=8 --batch-size=16
python main.py --workload=resnet50 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized2 --max-n-array=8 --batch-size=16
python main.py --workload=mobilenetv2 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized2 --max-n-array=8 --batch-size=16

cp -r outdir/* tmp/
cd -


================================================
FILE: autosa_scripts/odyssey/scripts/run_arch4.sh
================================================
#!/bin/bash
# Runs the currently enabled main.py multi-accelerator/fusion experiments
# (three of them with --use-uram-all), then copies outdir/ into tmp/
# (tmp/ is not cleared beforehand).
# Expected to be started from the scripts/ directory.

# Stop if the directory change fails: the rm -rf below must never run from an
# unexpected working directory.
cd .. || exit 1
rm -rf outdir/*
#rm -rf tmp/*

#python main.py --workload=vgg16 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=8
#python main.py --workload=resnet50 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=8
#python main.py --workload=mobilenetv2 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=8

#python main.py --workload=vgg16 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=16
#python main.py --workload=resnet50 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=16
#python main.py --workload=mobilenetv2 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=16

#python main.py --workload=vgg16 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=24
#python main.py --workload=resnet50 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=24
#python main.py --workload=mobilenetv2 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=24

python main.py --workload=resnet50 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=16 --use-uram-all
python main.py --workload=resnet50 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=8 --use-uram-all
python main.py --workload=mobilenetv2 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=16 --use-uram-all
python main.py --workload=vgg16 --stop-after-time=10 --use-db=0 --n-worker=32 --explore-multi-acc --explore-fusion --method=customized1 --max-n-array=16

cp -r outdir/* tmp/
cd -


================================================
FILE: autosa_scripts/odyssey/scripts/run_dataflow_cmp_cnn.sh
================================================
#!/bin/bash
# Runs main.py with --unit-task-method=genetic --profiling on the single-layer
# vgg16 workloads 1..13 for each listed design index, then copies outdir/ into
# tmp/.
# Expected to be started from the scripts/ directory.

# Stop if the directory change fails: the rm -rf calls below must never run
# from an unexpected working directory.
cd .. || exit 1
rm -rf outdir/*
rm -rf tmp/*
#for design_idx in {0..29}
for design_idx in 6 7 8 15 16 17 27 28 29
do
    for layer_idx in {1..13}
    do
        python main.py --workload="vgg16_$layer_idx" --stop-after-time=10 --use-db=0 --unit-task-method=genetic --design-idx="$design_idx" --profiling
    done
done
cp -r outdir/* tmp/
cd -


================================================
FILE: autosa_scripts/odyssey/scripts/run_dataflow_cmp_mm.sh
================================================
#!/bin/bash
# Runs main.py on the mm workload with --unit-task-method=genetic --profiling
# for the selected design indices, then copies outdir/ into tmp/.
# Expected to be started from the scripts/ directory.

# Stop if the directory change fails: the rm -rf calls below must never run
# from an unexpected working directory.
cd .. || exit 1
rm -rf outdir/*
rm -rf tmp/*
#for design_idx in {0..17}
for design_idx in 0
#for design_idx in {14..14}
#for design_idx in 6 7 8 12 13 14 15 16 17
do
    #python main.py --workload=mm --stop-after-time=10 --use-db=0 --unit-task-method=genetic --design-idx=$design_idx --profiling
    # Solver cmp
    python main.py --workload=mm --stop-after-time=20 --use-db=0 --unit-task-method=genetic --design-idx="$design_idx" --profiling
    # Imperfect pruning
    #python main.py --workload=mm --stop-after-time=10 --use-db=0 --unit-task-method=genetic --design-idx=$design_idx --profiling --objective=off_chip_comm
done

cp -r outdir/* tmp/
cd -


================================================
FILE: autosa_scripts/odyssey/scripts/run_dataflow_cmp_mm_energy.sh
================================================
#!/bin/bash
# Runs main.py on the mm workload with --objective=energy for design indices
# 0..17, then copies outdir/ into tmp/.
# Expected to be started from the scripts/ directory.

# Stop if the directory change fails: the rm -rf calls below must never run
# from an unexpected working directory.
cd .. || exit 1
rm -rf outdir/*
rm -rf tmp/*
for design_idx in {0..17}
do
    python main.py --workload=mm --stop-after-time=10 --use-db=0 --unit-task-method=genetic --design-idx="$design_idx" --objective=energy --profiling
done

cp -r outdir/* tmp/
cd -


================================================
FILE: autosa_scripts/odyssey/scripts/run_img2col_single.sh
================================================
#!/bin/bash
# Runs main.py on the img2col variants of the vgg16, resnet50 and mobilenetv2
# workloads.
# Expected to be started from the scripts/ directory.

# Stop if the directory change fails, so main.py is never looked up in the
# wrong directory.
cd .. || exit 1
python main.py --workload=vgg16_img2col --stop-after-time=10 --use-db=0 --n-worker=32
python main.py --workload=resnet50_img2col --stop-after-time=10 --use-db=0 --n-worker=32
python main.py --workload=mobilenetv2_img2col --stop-after-time=10 --use-db=0 --n-worker=32
cd -


================================================
FILE: autosa_scripts/odyssey/scripts/run_method_cmp.sh
================================================
#!/bin/bash
# Runs main.py on the mm workload once per --unit-task-method (genetic, random,
# random_pruning, annealing, bayesian, open_tuner, RL) for design indices 0..17,
# then copies outdir/ into tmp/.
# Expected to be started from the scripts/ directory.

# Stop if the directory change fails: the rm -rf calls below must never run
# from an unexpected working directory.
cd .. || exit 1
rm -rf outdir/*
rm -rf tmp/*
for design_idx in {0..17}
do
    python main.py --workload=mm --stop-after-time=300 --use-db=0 --unit-task-method=genetic --profiling --design-idx="$design_idx"
    python main.py --workload=mm --stop-after-time=300 --use-db=0 --unit-task-method=random --profiling --design-idx="$design_idx"
    python main.py --workload=mm --stop-after-time=300 --use-db=0 --unit-task-method=random_pruning --profiling --design-idx="$design_idx"
    python main.py --workload=mm --stop-after-epoch=150000 --use-db=0 --unit-task-method=annealing --profiling --design-idx="$design_idx"
    python main.py --workload=mm --stop-after-epoch=300 --use-db=0 --unit-task-method=bayesian --profiling --design-idx="$design_idx"
    python main.py --workload=mm --stop-after-time=300 --use-db=0 --unit-task-method=open_tuner --profiling --design-idx="$design_idx"
    python main.py --workload=mm --stop-after-epoch=50000 --use-db=0 --unit-task-method=RL --profiling --design-idx="$design_idx"
done
cp -r outdir/* tmp/
cd -


================================================
FILE: autosa_scripts/odyssey/scripts/run_metric_cmp.sh
================================================
#!/bin/bash
# Runs the currently enabled main.py experiment (mm workload, design index 3,
# --objective=dsp_num), then copies outdir/ into tmp/.
# Expected to be started from the scripts/ directory.

# Stop if the directory change fails: the rm -rf calls below must never run
# from an unexpected working directory.
cd .. || exit 1
rm -rf outdir/*
rm -rf tmp/*
#python main.py --workload=mm --stop-after-time=20 --use-db=0 --unit-task-method=genetic --profiling --design-idx=0
#python main.py --workload=mm --stop-after-time=20 --use-db=0 --unit-task-method=genetic --profiling --design-idx=1
#python main.py --workload=mm --stop-after-time=20 --use-db=0 --unit-task-method=genetic --profiling --design-idx=2
#python main.py --workload=mm --stop-after-time=20 --use-db=0 --unit-task-method=genetic --profiling --design-idx=3 --objective=off_chip_comm
python main.py --workload=mm --stop-after-time=20 --use-db=0 --unit-task-method=genetic --profiling --design-idx=3 --objective=dsp_num
#python main.py --workload=mm --stop-after-time=20 --use-db=0 --unit-task-method=genetic --profiling --design-idx=4
#python main.py --workload=mm --stop-after-time=20 --use-db=0 --unit-task-method=genetic --profiling --design-idx=5
cp -r outdir/* tmp/
cd -


================================================
FILE: autosa_scripts/odyssey/scripts/run_mutation_cmp.sh
================================================
#!/bin/bash
# Runs main.py on the mm workload (design index 3, genetic unit-task method,
# profiling enabled), then copies outdir/ into tmp/.
# Expected to be started from the scripts/ directory.

# Use solver by default
# Set epsilon to 0 when only using the factorization mutation

# Stop if the directory change fails: the rm -rf calls below must never run
# from an unexpected working directory.
cd .. || exit 1
rm -rf outdir/*
rm -rf tmp/*
python main.py --workload=mm --stop-after-time=20 --use-db=0 --unit-task-method=genetic --design-idx=3 --profiling
cp -r outdir/* tmp/
cd -

================================================
FILE: autosa_scripts/odyssey/scripts/split_cnn_layers.py
================================================
import csv
import json

# Splits a network workload file into one single-layer workload file per layer.
#network = "resnet50"
network = "mobilenetv2"
with open(f"../workload/{network}.json", "r") as f:
    network_data = json.load(f)

# Output files are numbered from 1 in the order the layers appear.
for layer_idx, layer in enumerate(network_data["workloads"], start=1):
    with open(f"../workload/{network}_{layer_idx}.json", "w") as f:
        json.dump({"workloads": [layer]}, f, indent=4)

================================================
FILE: autosa_scripts/odyssey/search_task.py
================================================
import json
import random
import numpy as np
import bisect

import utils
from design import Design

class SingleTask(object):
    """ Single workload searching task.
    """
    def __init__(self, design, workload, hw_cst):
        self.design = design
        self.workload = workload

        self.hw_cst = hw_cst
        self.fre = 300 # 300 MHz
        self.dw = 4 # bytes
        self.dt = "float"
        self.fuse = 0
        self.last_fuse = 0 # the last fusion task in the network
        self.use_uram = 0
        self.serialize = 0
        # Fixed architecture solution
        self.arch_sol = None
        self.arch_cst = None
        self.arch_feature = None
        self.fixed = 0        
        # Other configs
        self.configs = {}
        self.aux_funcs = {}

    def __repr__(self):
        #ret = f't_{self.workload["name"]}_'
        ret = ""
        for param in self.workload["params"]:
            ret += param            
            ret += "_"
            ret += f'{self.workload["params"][param]}'
        ret += f'_d_{self.design.name}'
        ret += f'_cst_{self.hw_cst}'
        ret += f'_f_{self.fuse}{self.last_fuse}'
        ret += f'_u_{self.use_uram}'
        ret += f'_s_{self.serialize}'
        if self.fixed == 1:
            ret += f'_fixed_'
            for k, v in self.arch_sol.items():
                ret += f'{k}{v}'
        if len(self.configs) > 0:
            ret += f'_config_'
            for k, v in self.configs.items():
                if k == "fix_param":
                    ret += "fix_param_"
                    for p_pair in v:
                        ret += p_pair[0]
                        ret += "_"
                        ret += str(p_pair[1])
                elif k == "equate_params":
                    ret += "equate_params_"
                    for p_pair in v:
                        ret += p_pair[0]
                        ret += "_"
                        ret += p_pair[1]
                elif k == "prev_workload":
                    ret += "prev_workload_"
                    ret += self.configs['prev_workload']['name']
                elif k == "prev_sol":
                    ret += "prev_sol_"
                    for p in self.configs['prev_sol']:
                        ret += p
                        ret += "_"
                        ret += str(self.configs['prev_sol'][p])
                elif k == "prev_latency":
                    ret += "prev_latency_"
                    ret += str(self.configs['prev_latency'])
                else:
                    ret += f'{k}{v}'

        return ret

    def adjust_params(self, params):
        """ Adjust the parameters based on its contraints.
        """
        def filter_non_power_of_two(x):
            if np.log2(x) != int(np.log2(x)):
                return True
            return False
        
        # Making all factors to be even numbers to have more divisors
        #for p, param in self.design.params_config["tunable"].items():
        #    params[p] = int(np.ceil(params[p] / 2) * 2)        
        for p in params:
            params[p] = int(params[p])

        # Making all divisor factors to be divisors of the dependent variable
        for p, param in self.design.params_config["tunable"].items():
            #print(param)
            if "divisors" in param:
                if "tags" in param and "power_of_two" in param["tags"]:
                    choices = utils.get_divisors(int(params[param["divisors"][0]]), filter_non_power_of_two)
                else:
                    choices = utils.get_divisors(int(params[param["divisors"][0]]), None)
                idx = bisect.bisect(choices, params[p])
                if idx >= len(choices):
                    idx -= 1
                if idx > 1:
                    if abs(choices[idx - 1] - params[p]) < abs(choices[idx] - params[p]):
                        idx -= 1
                #print(params[param["divisors"][0]])
                #print("idx", idx)
                #print("len", len(choices))
                params[p] = choices[idx]

        # Adjust the fixed parameters        
        if 'fix_param' in self.configs:
            for fix_p in self.configs['fix_param']:
                for p, param in self.design.params_config["tunable"].items():                
                    if p.startswith(fix_p[0]):
                        params[p] = int(fix_p[1])
        if 'equate_params' in self.configs:
            for p_pair in self.configs['equate_params']:
                params[p_pair[1]] = params[p_pair[0]]

        return params

    def generate_random_sample(self):
        """ Generate a random sample in the design space.
        """
        workload_params = {}
        for param in self.workload["params"]:
            workload_params[param] = self.workload["params"][param]
        return self.design.random_sampling(workload_params)        

    def check_arch_legality(self, arch_features):
        """ Check if the current architecture is legal.
        """
        if self.fixed == 0:
            return True
        # dims
        for idx in range(len(arch_features['dims'])):
            if arch_features['dims'][idx] > self.arch_cst['dims'][idx]:
                return False
        # SIMD
        if arch_features['SIMD'] > self.arch_cst['SIMD']:
            return False
        # data pack
        for arr in arch_features['data_pack']:
            for idx in range(len(arch_features['data_pack'][arr])):
                if arch_features['data_pack'][arr][idx] > self.arch_cst['data_pack'][arr][idx]:
                    return False
        # resource usage
        for module in arch_features['resource']:
            if module.endswith("unit_memory"):
                if arch_features["resource"][module] > self.arch_cst['resource'][module]:
                    return False

        return True

    def adjust_latency_buffer(self, latency, latency_meta, params):
        """ Adjust latency and for customized search tasks.
        cin_read_mode:
        0: normal ping-pong mode, no need to adjust
        1: load cin one time from the external memory
        2: load cin from on-chip BRAM buffer
        3: load cin from on-chip URAM buffer
        cout_write_mode:
        0: write to external memory
        1: write to on-chip buffer
        w_read_mode:
        0: normal ping-pong mode, no need to adjust
        1: load w from on-chip URAM buffer
        Note: Only works for kernel4
        """
        if ('cin_read_mode' not in self.configs) or ('cout_write_mode' not in self.configs):
            return latency, latency_meta

        """
        Latency prologue
        """        
        w_latency_list = []
        for item, value in latency_meta["latency_prologue"].items():
            if item.startswith('w'):
                w_latency_list.append({"item": item, "value": value})
        cin_latency_list = []
        for item, value in latency_meta['latency_prologue'].items():
            if item.startswith('cin'):
                cin_latency_list.append({"item": item, "value": value})
        # Sort the latency list by item names
        def take_item(elem):
            return elem['item']
        w_latency_list.sort(key=take_item)
        cin_latency_list.sort(key=take_item)

        w_latency = 0
        if 'w_read_mode' not in self.configs or (self.configs['w_read_mode'] == 0):
            for w in w_latency_list:
                w_latency = max(w_latency, w['value'])        
        elif self.configs['w_read_mode'] == 1:
            w_latency_list = w_latency_list[:-1]
            for w in w_latency_list:
                w_latency = max(w_latency, w['value'])

        cin_latency = 0
        if self.configs['cin_read_mode'] == 0:            
            for cin in cin_latency_list:
                cin_latency = max(cin_latency, cin['value'])
        if self.configs['cin_read_mode'] == 1:
            # Modify the cin latency            
            for cin in cin_latency_list:
                cin_latency = max(cin_latency, cin['value'])                            
            cin_latency = self.call_aux_func('update_cin_latency')(cin_latency, self, params)            
        elif self.configs['cin_read_mode'] == 2:                        
            pass
        elif self.configs['cin_read_mode'] == 3:
            # Peel off the last one accessing the DRAM
            cin_latency_list = cin_latency_list[:-1]            
            for cin in cin_latency_list:
                cin_latency = max(cin_latency, cin['value'])        
        latency_prologue = max(w_latency, cin_latency)

        """
        Latency main
        """
        cout_latency_list = []
        for item, value in latency_meta['latency_main'].items():
            if item.startswith('cout'):
                cout_latency_list.append({"item": item, "value": value})        
        w_latency_list = []
        for item, value in latency_meta['latency_main'].items():
            if item.startswith('w'):
                w_latency_list.append({"item": item, "value": value})                        
        cin_latency_list = []
        for item, value in latency_meta['latency_main'].items():
            if item.startswith('cin'):
                cin_latency_list.append({"item": item, "value": value})        
        cout_latency_list.sort(key=take_item)  
        w_latency_list.sort(key=take_item)  
        cin_latency_list.sort(key=take_item)  

        #latency_main = max(latency_meta['latency_main']['PE_latency'], w_latency)
        latency_main = latency_meta['latency_main']['PE_latency']
        w_latency = 0
        if 'w_read_mode' not in self.configs or (self.configs['w_read_mode'] == 0):
            for w in w_latency_list:
                w_latency = max(w_latency, w['value'])            
        else:
            w_latency_list = w_latency_list[:-1]
            for w in w_latency_list:
                w_latency = max(w_latency, w['value'])

        cin_latency = 0
        if self.configs['cin_read_mode'] == 0:            
            for cin in cin_latency_list:
                cin_latency = max(cin_latency, cin['value'])            
        elif self.configs['cin_read_mode'] == 1:
            pass
        elif self.configs['cin_read_mode'] == 2:
            pass
        elif self.configs['cin_read_mode'] == 3:
            # Peel off the last one accessing the DRAM
            cin_latency_list = cin_latency_list[:-1]            
            for cin in cin_latency_list:
                cin_latency = max(cin_latency, cin['value'])
        
        cout_latency = 0        
        if self.configs['cout_write_mode'] == 0:            
            for cout in cout_latency_list:
                cout_latency = max(cout_latency, cout['value'])            
        elif self.configs['cout_write_mode'] == 1:
            # Peel off the last one accessing the DRAM
            cout_latency_list = cout_latency_list[:-1]            
            for cout in cout_latency_list:
                cout_latency = max(cout_latency, cout['value'])
        latency_main = max(latency_main, cin_latency, w_latency, cout_latency)
        
        """
        Latency epilogue
        """
        cout_latency_list = []
        for item, value in latency_meta['latency_epilogue'].items():
            if item.startswith('cout'):
                cout_latency_list.append({"item": item, "value": value})        
        cout_latency_list.sort(key=take_item)

        cout_latency = 0
        if self.configs['cout_write_mode'] == 0:            
            for cout in cout_latency_list:
                cout_latency = max(cout_latency, cout['value'])           
        elif self.configs['cout_write_mode'] == 1:
            # Peel off the last one accessing the DRAM
            cout_latency_list = cout_latency_list[:-1]        
            for cout in cout_latency_list:
                cout_latency = max(cout_latency, cout['value'])
        latency_epilogue = cout_latency

        #print(latency_prologue, latency_main, latency_epilogue)
        if self.fuse == 1 and self.last_fuse == 1:            
            n_iter = np.ceil(self.workload['params']['r'] / params['r_t1']) * \
                     np.ceil(self.workload['params']['c'] / params['c_t1'])
            latency = n_iter * (latency_prologue + latency_main / n_iter + latency_epilogue) * n_iter
        else:
            latency = latency_prologue + latency_main + latency_epilogue
        
        latency_meta = {
            "latency_prologue": latency_prologue,
            "latency_main": latency_main,
            "latency_epilogue": latency_epilogue
        }

        return latency, latency_meta

    def adjust_latency_multi_acc(self, latency, latency_meta, params):
        """ Adjust latency for multi-acc setting.

        Adds a setup latency derived from the previous accelerator's results,
        which are read from self.configs ('prev_workload', 'prev_sol' and
        'prev_latency').

        Args:
            latency: latency estimated for the current task.
            latency_meta: latency breakdown matching `latency`.
            params: tiling parameters of the current task ('r_t1', 'c_t1').

        Returns:
            A (latency, latency_meta) tuple. When any of the three 'prev_*'
            entries is missing the inputs are returned untouched; otherwise
            the latency includes the setup term and the meta only records the
            original latency under 'latency_orig'.
        """
        required = ('prev_workload', 'prev_sol', 'prev_latency')
        if any(key not in self.configs for key in required):
            # The caller unpacks two values, so always hand back a pair.
            return latency, latency_meta

        prev_workload = self.configs['prev_workload']
        prev_sol = self.configs['prev_sol']
        prev_latency = self.configs['prev_latency']

        # Tile sizes of the previous task, clamped to its problem size.
        tr1 = min(prev_sol['r_t1'], prev_workload["params"]['r'])
        tc1 = min(prev_sol['c_t1'], prev_workload["params"]['c'])
        # Tile sizes after every "maxpool_<stride>" tag has shrunk the output.
        tr1_post = tr1
        tc1_post = tc1
        for tag in prev_workload["tags"]:
            if tag.startswith("maxpool"):
                stride = int(tag.split('_')[-1])
                tr1_post /= stride
                tc1_post /= stride
        tr1_post = max(int(tr1_post), 1)
        tc1_post = max(int(tc1_post), 1)

        # Tile sizes of the current task, clamped to its problem size.
        tr2 = min(params["r_t1"], self.workload["params"]["r"])
        tc2 = min(params["c_t1"], self.workload["params"]["c"])
        k = self.workload["params"]["p"]

        # Number of previous (post-pooling) tiles needed to cover one current
        # tile extended by k - 1 in each spatial dimension.
        c0 = np.ceil((tr2 + k - 1) / tr1_post)
        c1 = np.ceil((tc2 + k - 1) / tc1_post)
        trp = min(c0 * tr1, prev_workload["params"]["r"])
        tcp = min(c1 * tc1, prev_workload["params"]["c"])
        # Share of the previous latency spent on one such region.
        setup = prev_latency / (np.ceil(prev_workload["params"]["r"] / trp) *
                                np.ceil(prev_workload["params"]["c"] / tcp))

        latency_meta = {
            "latency_orig": latency
        }

        return latency + setup, latency_meta

    def adjust_latency(self, latency, latency_meta, params):
        """ Apply the latency adjustments requested through self.configs.

        The buffer adjustment runs when any read/write mode key is configured,
        the multi-acc adjustment when any 'prev_*' key is configured. Returns
        the (possibly updated) latency together with its meta information.
        """
        buffer_keys = ('cin_read_mode', 'cout_write_mode', 'w_read_mode')
        multi_acc_keys = ('prev_workload', 'prev_sol', 'prev_latency')

        if any(name in self.configs for name in buffer_keys):
            latency, latency_meta = self.adjust_latency_buffer(latency, latency_meta, params)

        if any(name in self.configs for name in multi_acc_keys):
            latency, latency_meta = self.adjust_latency_multi_acc(latency, latency_meta, params)

        return latency, latency_meta
    
    def adjust_resource(self, resource, resource_meta, params):
        """ Update the cin buffer for fused design.

        Only acts when an 'update_cin_buf' auxiliary function is registered.
        The first module in resource_meta whose name starts with "cin" is
        taken as the cin buffer.

        Args:
            resource: resource usage dict; updated in place and returned.
            resource_meta: per-module buffer information ('ele_size',
                'buf_size', 'data_pack_factor', 'num').
            params: tiling parameters forwarded to the auxiliary function.

        Returns:
            The resource dict. It is returned unchanged when no auxiliary
            function is registered or when resource_meta has no cin module.
        """
        if 'update_cin_buf' not in self.aux_funcs:
            return resource

        cin_name = next((name for name in resource_meta if name.startswith("cin")), None)
        if cin_name is None:
            # Nothing to resize; previously this path hit an unbound local.
            return resource
        cin_meta = resource_meta[cin_name]

        def est_BRAM18K(ele_size, ele_num, pack):
            # Buffer modelled as 36-bit wide, 512-entry deep blocks.
            return np.ceil(ele_size * 8 * pack / 36) * np.ceil(ele_num / pack / 512)

        if self.use_uram == 0:
            # Swap the estimated cin buffer for the (never smaller) size
            # reported by the auxiliary function.
            cin_buf_size = est_BRAM18K(cin_meta['ele_size'], cin_meta['buf_size'],
                                       cin_meta['data_pack_factor'])
            cin_buf_num = cin_meta['num']
            resource["BRAM18K"] -= (cin_buf_size * cin_buf_num)
            width = cin_meta['ele_size'] * 8 * cin_meta['data_pack_factor']
            depth = cin_meta['buf_size'] / cin_meta['data_pack_factor']
            updated = self.call_aux_func('update_cin_buf')(self, params, width, depth)
            cin_buf_size = max(updated, cin_buf_size)
            resource["BRAM18K"] += (cin_buf_size * cin_buf_num)
        else:
            # URAM usage is at least twice what the auxiliary function reports.
            uram = resource["URAM"]
            data_pack = cin_meta['data_pack_factor']
            uram = max(self.call_aux_func('update_cin_buf')(self, params, data_pack) * 2, uram)
            resource["URAM"] = uram

        return resource

    def compute_arch_cst(self, params):
        """ Compute the architecture constraints for the given parameters.

        Returns the constraints reported by the design, extended with the
        estimated (and, when configs are set, adjusted) resource usage.
        Returns None when the parameters cannot be inferred or fail the
        bound check.
        """
        constraints = self.design.compute_arch_cst(params)
        inferred = self.design.infer_params(params)
        if not inferred:
            return None
        if not self.design.bound_check(inferred):
            return None

        usage, usage_meta = self.design.est_resource(inferred)
        if len(self.configs) > 0:
            usage = self.adjust_resource(usage, usage_meta, inferred)
        constraints['resource'] = usage
        return constraints

    def evaluate(self, params, metric="latency"):
        """ Score a parameter set under the requested metric.

        Supported metrics are "latency", "off_chip_comm", "energy" and
        "dsp_num"; anything else raises RuntimeError.

        Returns a (score, resource, meta) tuple where meta holds the latency
        breakdown and the activity estimate. (0, None, None) is returned when
        the parameters are invalid, violate the fixed architecture
        constraints, or the quantity behind the metric is empty/zero.
        """
        if metric not in ["latency", "off_chip_comm", "energy", "dsp_num"]:
            raise RuntimeError(f"Not supported metric: {metric}")

        params = self.design.infer_params(params)
        if not params or not self.design.bound_check(params):
            return 0, None, None

        latency, latency_meta = self.design.est_latency(params)
        if len(self.configs) > 0:
            latency, latency_meta = self.adjust_latency(latency, latency_meta, params)

        if self.fixed == 1:
            # A fixed architecture must still accommodate this parameter set.
            if not self.check_arch_legality(self.compute_arch_cst(params)):
                return 0, None, None
            resource = self.arch_cst['resource']
        else:
            resource, resource_meta = self.design.est_resource(params)
            if len(self.configs) > 0:
                resource = self.adjust_resource(resource, resource_meta, params)

        # Compute the other activity
        activity = self.design.est_activity(params)

        if metric == "latency":
            if not latency:
                return 0, None, None
            return 1 / latency, resource, {'latency': latency_meta, 'activity': activity}

        # The remaining metrics all depend on the activity estimate.
        if not activity:
            return 0, None, None
        latency_meta['latency'] = latency
        if metric == "off_chip_comm":
            score = 1 / activity["off_chip_acc_num"]
        elif metric == "energy":
            score = 1 / self.compute_energy(activity)
        else:
            score = resource["DSP"]
        return score, resource, {'latency': latency_meta, 'activity': activity}

    def compute_energy(self, activity):
        """ Estimate the energy consumption of the design.

        Weights the access counts in `activity` with per-access costs and
        returns the sum of on-chip and off-chip energy divided by 1e9.
        """
        # Eyeriss model (normalized) per-access costs.
        # Alternative Interstellar model (pJ), currently not used:
        #   RF 0.03, ALU 0.075, GlobalBuf 6, DRAM 200, hop 0.035
        # An FPGA power-based model (BRAM18K/DSP unit power) was used here
        # before and is likewise disabled.
        unit_cost = {
            "RF": 1,
            "ALU": 1,
            "GlobalBuf": 6
        }
        dram_cost = 200
        hop_cost = 2

        on_chip = unit_cost["ALU"] * activity["compute_stmt_call_num"]
        buffer_energy = unit_cost["GlobalBuf"] * activity["io_module_mem_acc_num"] + \
                        unit_cost["GlobalBuf"] * activity["pe_module_mem_acc_num"] + \
                        unit_cost["RF"] * activity["pe_module_reg_acc_num"]
        on_chip += buffer_energy
        on_chip += hop_cost * activity["noc_hop_num"]
        off_chip = dram_cost * activity["off_chip_acc_num"]

        total = on_chip + off_chip
        return total / 1e9

    def compute_dsp_eff(self, latency, dsp):
        """ Compute the DSP efficiency of the current design.
        Note: Only works for FP32 on Xilinx FPGA

        The workload's operation count is divided by `dsp / 5 * 2` and then
        by `latency`.
        """
        ops_per_cycle_peak = dsp / 5 * 2
        return (self.compute_ops() / ops_per_cycle_peak) / latency

    def compute_ops(self):
        """ Compute the total amount of operations of the workload.

        The result is twice the product of the loop dimensions: i, j, k for
        "gemm" workloads and i, o, r, c, p, q for "conv" workloads. Any other
        workload raises RuntimeError.
        """
        loop_dims = (
            ("gemm", ("i", "j", "k")),
            ("conv", ("i", "o", "r", "c", "p", "q")),
        )
        for tag, axes in loop_dims:
            if tag in self.workload["tags"]:
                sizes = self.workload["params"]
                product = sizes[axes[0]]
                for axis in axes[1:]:
                    product = product * sizes[axis]
                return product * 2
        raise RuntimeError(f"Not supported workload: {self.workload['name']}")

    def compute_bw(self, params):
        """ Compute the bandwidth requirement of the task.
        Note: Only works for 32-bit data

        Returns the off-chip traffic (transactions times self.dw bytes)
        divided by the run time, in GB/s. The run time is the estimated
        latency divided by the clock rate self.fre (MHz).
        """
        cycles, _meta = self.design.est_latency(params)
        transactions = self.est_off_chip_trans(params)
        seconds = cycles / (self.fre * 1e6)

        return transactions * self.dw / seconds / 1e9 # GB/s

    def est_off_chip_trans(self, params):
        """ Estimate the number of off-chip transactions of the task.

        For "conv" workloads the per-module counts reported by the design are
        summed for the cin, w and cout modules, and an array's traffic is
        dropped when the matching config says so:
          - 'cin_read_mode' equal to 2 or 3 removes the cin traffic,
          - 'cout_write_mode' equal to 1 removes the cout traffic,
          - 'w_read_mode' equal to 1 removes the w traffic.
        Every other workload returns the design's total 'off_chip_acc_num'.
        """
        activity = self.design.est_activity(params)
        if "conv" not in self.workload["tags"]:
            return activity["off_chip_acc_num"]

        cin_trans = 0
        w_trans = 0
        cout_trans = 0
        # When several modules share a prefix the last one listed wins.
        for module, count in activity['off_chip_acc_num_meta'].items():
            if module.startswith("cin"):
                cin_trans = count
            if module.startswith("w"):
                w_trans = count
            if module.startswith("cout"):
                cout_trans = count

        if self.configs.get("cin_read_mode") in (2, 3):
            cin_trans = 0
        if self.configs.get("cout_write_mode") == 1:
            cout_trans = 0
        # The lookup key must match the 'w_read_mode' key that is configured.
        if self.configs.get("w_read_mode") == 1:
            w_trans = 0

        return cin_trans + w_trans + cout_trans

    def compute_ctc(self, params):
        """ Compute the compute-to-communication ratio of the task.

        The ratio is the workload's operation count divided by the off-chip
        traffic in bytes (transactions times self.dw).
        """
        total_ops = self.compute_ops()
        traffic_bytes = self.est_off_chip_trans(params) * self.dw
        return total_ops / traffic_bytes

    def set_arch_cst(self, arch_cst):
        """ Mark the architecture as fixed and store a shallow copy of its constraints.
        """
        self.fixed = 1
        snapshot = arch_cst.copy()
        self.arch_cst = snapshot
    
    def clear_arch_cst(self):
        """ Drop the stored architecture constraints and unmark the architecture as fixed.
        """
        self.fixed, self.arch_cst = 0, None

    def set_arch_sol(self, sol):
        """ Store the architecture solution (kept by reference, not copied).
        """
        setattr(self, "arch_sol", sol)

    def set_aux_func(self, tag, func_name):
        """ Register an auxiliary function.

        tag is the slot the function is looked up under;
        func_name is the name of one of the pre-defined functions.
        """
        self.aux_funcs.update({tag: func_name})

    def call_aux_func(self, tag):
        """ Return the preset function registered under `tag`.

        self.aux_funcs[tag] holds the name of one of the presets defined
        below. RuntimeError is raised when that name matches none of them.
        The returned function is not bound to the task: callers pass the task
        explicitly.
        """
        def update_cin_latency_last(lat, task, sol):
            # Scale the cin load latency by the number of input-channel tiles.
            channel_tiles = np.ceil(task.workload["params"]['i'] / sol['i_t1'])
            lat *= channel_tiles
            return lat

        def update_cin_buf_bram_last(task, sol, width, depth):
            # BRAM blocks for a cin buffer deepened by the input-channel tiles.
            channel_tiles = np.ceil(task.workload["params"]['i'] / sol['i_t1'])
            depth *= channel_tiles
            return np.ceil(width / 36) * np.ceil(depth / 512)

        def update_cin_buf_uram_last(task, sol, data_pack):
            # URAM blocks for all input channels of one r_t1 x c_t1 tile.
            depth = task.workload["params"]['i'] * sol['r_t1'] * sol['c_t1']
            return np.ceil(task.dw * 8 * data_pack / 72) * np.ceil(depth / data_pack / 4096)

        def update_cin_latency(lat, task, sol):
            # Scale the cin load latency by the tile counts along i, r and c.
            tile_count = (np.ceil(task.workload["params"]['i'] / sol['i_t1']) * \
                          np.ceil(task.workload["params"]['r'] / sol['r_t1']) * \
                          np.ceil(task.workload["params"]['c'] / sol['c_t1']))
            lat *= tile_count
            return lat

        def update_cin_buf_bram(task, sol, width, depth):
            # BRAM blocks for a cin buffer deepened by the tile counts along i, r and c.
            tile_count = (np.ceil(task.workload["params"]['i'] / sol['i_t1']) * \
                          np.ceil(task.workload["params"]['r'] / sol['r_t1']) * \
                          np.ceil(task.workload["params"]['c'] / sol['c_t1']))
            depth *= tile_count
            return np.ceil(width / 36) * np.ceil(depth / 512)

        def update_cin_buf_uram(task, sol, data_pack):
            # URAM blocks for the full i x r x c input.
            depth = task.workload["params"]['i'] * task.workload["params"]['r'] * task.workload["params"]['c']
            return np.ceil(task.dw * 8 * data_pack / 72) * np.ceil(depth / data_pack / 4096)

        presets = (
            update_cin_latency_last,
            update_cin_buf_bram_last,
            update_cin_buf_uram_last,
            update_cin_latency,
            update_cin_buf_bram,
            update_cin_buf_uram,
        )
        requested = self.aux_funcs[tag]
        for preset in presets:
            if requested == preset.__name__:
                return preset
        raise RuntimeError(f'Not supported function: {tag}')

    def clear_aux_func(self):
        """ Forget every registered auxiliary function by installing a fresh empty mapping.
        """
        self.aux_funcs = dict()

class MultiTask(object):
    """ Search task object used by the tuner.

    Bundles several search tasks that are evaluated on one design, or on a
    collection of designs when `design` is not a single Design instance.
    # TODO: To be modified
    """
    def __init__(self, design, search_tasks, hw_cst, fuse=0, max_latency=-1, split=0, use_uram=0):
        """ Store the settings and propagate `use_uram` to every sub-task.

        When `design` is a single Design, self.workload is initialised with
        the largest value of every external parameter across the sub-tasks
        (at least 1).
        """
        self.design = design
        self.tasks = search_tasks

        self.hw_cst = hw_cst
        self.fre = 200 # 200 MHz
        self.dw = 4 # bytes
        self.dt = "float"
        self.fuse = fuse
        self.max_latency = max_latency
        self.split = split
        self.use_uram = use_uram
        for task in self.tasks:
            task.use_uram = use_uram
        # Fixed architecture solution
        self.fixed = 0
        self.arch_sol = None
        self.arch_cst = None
        # Other configs
        self.configs = {}
        if isinstance(self.design, Design):
            # Initialize the external params, using the largest dimensions
            self.workload = {"params": {}}
            for p, param in self.design.params_config["external"].items():
                self.workload["params"][param["name"]] = 1
            for task in self.tasks:
                for p, param in self.design.params_config["external"].items():
                    self.workload["params"][param["name"]] = max(self.workload["params"][param["name"]], task.workload["params"][param["name"]])

    def __repr__(self):
        """ Build an identifier from the sub-tasks, design name(s) and settings.
        """
        ret = ""
        for task in self.tasks:
            ret += str(task)
        if isinstance(self.design, Design):
            ret += f'_d_{self.design.name}'
        else:
            for design in self.design:
                ret += f'_d_{design.name}'
        ret += f'_cst_{self.hw_cst}'
        ret += f'_f_{self.fuse}'
        ret += f'_s_{self.split}'
        ret += f'_u_{self.use_uram}'
        if len(self.configs) > 0:
            ret += f'_config_'
            # Iterate key/value pairs; iterating the dict itself yields keys only.
            for k, v in self.configs.items():
                ret += f'{k}{v}'

        return ret

    def generate_random_sample(self):
        """ Generate a random sample in the design space.
        """
        workload_params = {}
        for param in self.workload["params"]:
            workload_params[param] = self.workload["params"][param]
        return self.design.random_sampling(workload_params)

    def compute_dsp_eff(self, latency, dsp):
        """ Compute the DSP efficiency of the current design.
        Note: Only works for FP32 on Xilinx FPGA
        """
        return (self.compute_ops() / (dsp / 5 * 2)) / latency

    def compute_ops(self):
        """ Compute the total amount of operations of the task.

        Sums the operation counts of all sub-tasks; raises RuntimeError for a
        sub-task that is neither "gemm" nor "conv".
        """
        total_ops = 0
        for task in self.tasks:
            if "gemm" in task.workload["tags"]:
                total_ops += task.workload["params"]["i"] * task.workload["params"]["j"] * task.workload["params"]["k"] * 2
            elif "conv" in task.workload["tags"]:
                total_ops += task.workload["params"]["i"] * task.workload["params"]["o"] * task.workload["params"]["r"] * task.workload["params"]["c"] * task.workload["params"]["p"] * task.workload["params"]["q"] * 2
            else:
                raise RuntimeError(f"Not supported workload: {task.workload['tags']}")
        return total_ops

    def compute_arch_cst(self, params):
        """ Compute the architecture constraints.

        Merges the constraints of all sub-tasks by taking the element-wise
        maximum of the dims, SIMD, data pack factors and "*unit_memory"
        resources.
        """
        # NOTE(review): a sub-task may return None for infeasible params; a
        # None after the first usable result raises a TypeError below.
        # Confirm whether that case should yield None instead.
        arch_cst = None
        for task in self.tasks:
            cur_arch_cst = task.compute_arch_cst(params)
            # Take the one with looser constraints
            if not arch_cst:
                arch_cst = cur_arch_cst
            else:
                # dims
                for idx in range(len(arch_cst['dims'])):
                    arch_cst['dims'][idx] = max(arch_cst['dims'][idx], cur_arch_cst['dims'][idx])
                # SIMD
                arch_cst["SIMD"] = max(arch_cst["SIMD"], cur_arch_cst["SIMD"])
                # data pack
                for arr in arch_cst['data_pack']:
                    for idx in range(len(arch_cst['data_pack'][arr])):
                        arch_cst['data_pack'][arr][idx] = max(arch_cst['data_pack'][arr][idx], cur_arch_cst['data_pack'][arr][idx])
                # resource
                for module in arch_cst['resource']:
                    if module.endswith("unit_memory"):
                        arch_cst["resource"][module] = max(arch_cst["resource"][module], cur_arch_cst["resource"][module])

        return arch_cst

    def set_arch_cst(self, arch_cst):
        """ Set the architecture constraints.

        Each sub-task receives its own shallow copy.
        """
        self.fixed = 1
        self.arch_cst = arch_cst.copy()
        # Set the subtasks
        for task in self.tasks:
            task.set_arch_cst(arch_cst.copy())

    def clear_arch_cst(self):
        """ Clear the architecture constraints here and on every sub-task.
        """
        self.fixed = 0
        self.arch_cst = None
        for task in self.tasks:
            task.clear_arch_cst()

    def set_arch_sol(self, sol):
        """ Mark the architecture as fixed and share the solution with every sub-task.
        """
        self.fixed = 1
        self.arch_sol = sol
        for task in self.tasks:
            task.set_arch_sol(sol)

================================================
FILE: autosa_scripts/odyssey/solver.py
================================================
from subprocess import Popen, PIPE
import tempfile
import shutil

def off_chip_solver_gemm(search_task, cst, fixed_params=None, save=0):
    """ Solve for GEMM tiling factors with AMPL.

    Writes an AMPL model, data file and run script into a temporary
    directory, runs `ampl` on the script and parses the displayed values.

    Args:
        search_task: task providing `workload["params"]` (i, j, k), `dw` and
            `design.name`; the name's "kernel0".."kernel5" prefix selects the
            compute term of the objective and the DSP constraint.
        cst: constraints object whose `hw_cst` holds "DSP" and "BRAM18K".
        fixed_params: If any parameter found in fixed_params, this parameter
            will not be tiled. (Currently not used by the generated model.)
        save: when 1, the generated files are also stored under `solver/`.

    Returns:
        [i1, j1, k1, i2, j2, k2] rounded to the nearest integer, or None when
        the solver output does not contain all six values.
    """
    # (design-name prefix, compute term of the objective, DSP usage).
    # The factor 5 in the DSP usage only holds for FP32.
    kernel_models = (
        ("kernel0", "ci2*k2", "ci2*1*k2*5"),
        ("kernel1", "cj2*k2", "cj2*1*k2*5"),
        ("kernel2", "k1", "k1*5"),
        ("kernel3", "ci2*cj2*k2", "ci2*cj2*k2*5"),
        ("kernel4", "ci2*k1", "ci2*k1*5"),
        ("kernel5", "cj2*k1", "cj2*k1*5"),
    )
    design_name = search_task.design.name
    compute_term = None
    dsp_usage = None
    for prefix, term, usage in kernel_models:
        if design_name.startswith(prefix):
            compute_term, dsp_usage = term, usage
            break

    def run_script(model_dir):
        # AMPL script that loads the model/data from model_dir and solves it.
        return ('option solver ipopt;\n'
                'reset;\n'
                f'model {model_dir}/tmp.mod;\n'
                f'data {model_dir}/tmp.dat;\n'
                'solve;\n'
                'display target,i1,j1,k1,i2,j2,k2;\n')

    with tempfile.TemporaryDirectory() as tmpdirname:
        # Generate the model file
        with open(f'{tmpdirname}/tmp.mod', 'w') as f:
            for p in ["i", "j", "k"]:
                f.write(f'param {p};\n')
            f.write('param dsp_bound;\n')
            f.write('param bram_bound;\n')
            f.write('param data_w;\n')

            # Level-1 tiles, level-2 tiles and their tile counts (c*).
            for p in ["i", "j", "k"]:
                f.write(f'var {p}1 integer >= 1, <= {p};\n')
            for p in ["i", "j"]:
                f.write(f'var {p}2 integer >= 1, <= {p};\n')
            for p in ["k"]:
                f.write(f'var {p}2 integer >= 1, <= {32/search_task.dw};\n')

            for p in ["i", "j", "k"]:
                f.write(f'var c{p}1 integer >= 1, <= {p};\n')
                f.write(f'var c{p}2 integer >= 1, <= {p};\n')
            for p in ["k"]:
                f.write(f'var c{p}3 integer >= 1, <= {p};\n')

            # Objective: off-chip traffic minus the compute term.
            f.write('minimize target:\n')
            f.write('\t(i*cj1*k+ci1*j*k+i*j)-\n')
            if compute_term is not None:
                f.write(f'\t({compute_term});\n\n')

            if dsp_usage is not None:
                f.write('subject to DSP_cst:\n')
                f.write(f'\t0 <= {dsp_usage} <= dsp_bound;\n\n')

            # One buffer term per array (i1*k1, j1*k1, i1*j1), each counted
            # twice; every term multiplies its width and depth block counts.
            f.write('subject to BRAM_cst:\n')
            f.write('\tceil(data_w/18)*ceil(i1*k1/1024)*2+\n')
            f.write('\tceil(data_w/18)*ceil(j1*k1/1024)*2+\n')
            f.write('\tceil(data_w/18)*ceil(i1*j1/1024)*2 <= bram_bound;\n\n')

            # Tiles must evenly divide the enclosing level.
            for p in ["i", "j", "k"]:
                f.write(f'subject to c{p}1_cst:\n')
                f.write(f'\t{p} = c{p}1*{p}1;\n\n')
            for p in ["i", "j", "k"]:
                f.write(f'subject to c{p}2_cst:\n')
                f.write(f'\t{p}1 = c{p}2*{p}2;\n\n')
            for p in ["k"]:
                f.write(f'subject to c{p}3_cst:\n')
                f.write(f'\t{p}2 = c{p}3*2;\n\n') # even number

            if design_name.startswith(("kernel0", "kernel1", "kernel3")):
                f.write('subject to latency_hiding_cst:\n')
                f.write('\ti2*j2 >= 8*k2;\n\n') # Only for FP32

        with open(f'{tmpdirname}/tmp.dat', 'w') as f:
            for p in ["i", "j", "k"]:
                f.write(f'param {p} := {search_task.workload["params"][p]};\n')
            f.write(f'param dsp_bound := {int(cst.hw_cst["DSP"])};\n')
            f.write(f'param bram_bound := {int(cst.hw_cst["BRAM18K"])};\n')
            f.write(f'param data_w := 32;\n') # Only for FP32

        # Generate the AMPL script. It must load the files written above
        # rather than whatever an earlier run left under ./solver.
        with open(f'{tmpdirname}/tmp.run', 'w') as f:
            f.write(run_script(tmpdirname))

        # Call the solver
        cmd = ["ampl", f"{tmpdirname}/tmp.run"]
        pipe = Popen(cmd, stdout=PIPE, stderr=PIPE)
        text = pipe.communicate()[0].decode('ascii')

        # Collect the results
        var_names = ["i1", "j1", "k1", "i2", "j2", "k2"]
        opt_dims = [1, 1, 1, 1, 1, 1]
        update = 0
        for line in text.split('\n'):
            for idx, name in enumerate(var_names):
                if line.startswith(f"{name} = "):
                    # Round the solver's real-valued answer to the nearest integer.
                    opt_dims[idx] = int(float(line.split('=')[-1].strip()) + 0.5)
                    update += 1

        if update != len(opt_dims):
            # The solver isn't finished correctly.
            opt_dims = None

        if save == 1:
            shutil.copyfile(f'{tmpdirname}/tmp.mod', 'solver/tmp.mod')
            shutil.copyfile(f'{tmpdirname}/tmp.dat', 'solver/tmp.dat')
            # The saved script points at the saved copies so it stays
            # runnable after the temporary directory is removed.
            with open('solver/tmp.run', 'w') as f:
                f.write(run_script('./solver'))

    return opt_dims

def off_chip_solver_conv(search_task, cst, fixed_params=None, save=0):
    """ Run the AMPL/ipopt solver to pick the CONV tiling factors that minimize
    the off-chip data communication.

    Args:
        search_task: Task description. Reads `design.name`, `dw` and
            `workload["params"]` (keys i, o, r, c, p, q).
        cst: Constraint object. Reads `hw_cst["DSP"]` and `hw_cst["BRAM18K"]`.
        fixed_params: Intended to list parameters that should not be tiled.
            It is accepted for interface compatibility but is not used yet.
        save: If 1, keep a copy of the generated model/data/run files under
            `solver/` (relative to the current working directory).

    Returns:
        [i1, o1, r1, c1, i2, o2, r2, c2], each rounded to the nearest integer,
        or None if the solver output did not report all eight variables.

    Raises:
        RuntimeError: If the design name does not start with kernel0..kernel9.
    """
    design_name = search_task.design.name

    # Expression that multiplies i2 in both the objective denominator and the
    # DSP constraint, selected by the design name prefix.
    pe_exprs = {
        "kernel0": "co2",
        "kernel1": "cr2",
        "kernel2": "cc2",
        "kernel3": "ci2",
        "kernel4": "co2*cr2",
        "kernel5": "co2*cc2",
        "kernel6": "co2*ci2",
        "kernel7": "cr2*cc2",
        "kernel8": "cr2*ci2",
        "kernel9": "cc2*ci2",
    }
    pe_expr = None
    for prefix, expr in pe_exprs.items():
        if design_name.startswith(prefix):
            pe_expr = expr
            break
    if pe_expr is None:
        raise RuntimeError(f"Not supported design by the solver: {search_task.design.name}")

    # TODO: Add other dataflows
    latency_hiding_kernels = ("kernel0", "kernel1", "kernel2", "kernel4", "kernel5", "kernel7")

    # Order in which the solved variables are returned.
    var_names = ["i1", "o1", "r1", "c1", "i2", "o2", "r2", "c2"]

    def write_run_script(path, model_dir):
        """ Write the AMPL script that loads tmp.mod/tmp.dat from model_dir. """
        with open(path, 'w') as f:
            f.write('option solver ipopt;\n')
            f.write('reset;\n')
            f.write(f'model {model_dir}/tmp.mod;\n')
            f.write(f'data {model_dir}/tmp.dat;\n')
            f.write('solve;\n')
            f.write('display target,i1,o1,r1,c1,i2,o2,r2,c2;\n')

    with tempfile.TemporaryDirectory() as tmpdirname:
        # Generate the model file
        with open(f'{tmpdirname}/tmp.mod', 'w') as f:
            for p in ["i", "o", "r", "c", "p", "q"]:
                f.write(f'param {p};\n')
            f.write('param dsp_bound;\n')
            f.write('param bram_bound;\n')
            f.write('param data_w;\n')

            for p in ["i", "o", "r", "c"]:
                f.write(f'var {p}1 integer >= 1, <= {p};\n')
            for p in ["o", "r", "c"]:
                f.write(f'var {p}2 integer >= 1, <= {p};\n')
            f.write(f'var i2 integer >= 1, <= {32/search_task.dw};\n')

            for p in ["i", "o", "r", "c"]:
                f.write(f'var c{p}1 integer >= 1, <= {p};\n')
                f.write(f'var c{p}2 integer >= 1, <= {p};\n')
            f.write('var ci3 integer >= 1, <= i;\n')

            f.write('minimize target:\n')
            # off_chip/DSP
            # Ignore the padded data
            f.write('\t(i*r*c*co1+i*o*p*q*cr1*cc1+o*r*c*ci1)/\n')
            f.write(f'\t({pe_expr}*i2);\n\n')

            f.write('subject to DSP_cst:\n')
            f.write(f'\t0 <= {pe_expr}*i2*5 <= dsp_bound;\n\n')

            f.write('subject to BRAM_cst:\n')
            f.write('\t0 <= (data_w*i1*r1*c1)/(18*1024)*2+\n')
            f.write('\t     (data_w*i1*o1*p*q)/(18*1024)*2+\n')
            f.write('\t     (data_w*o1*r1*c1)/(18*1024)*2 <= bram_bound;\n\n')

            # Tie the tile counts (c*) to the tile sizes. These must be
            # f-strings, otherwise a literal "{p}" ends up in the model.
            for p in ["i", "o", "r", "c"]:
                f.write(f'subject to c{p}1_cst:\n')
                f.write(f'\t{p} = c{p}1*{p}1;\n\n')
            for p in ["i", "o", "r", "c"]:
                f.write(f'subject to c{p}2_cst:\n')
                f.write(f'\t{p}1 = c{p}2*{p}2;\n\n')
            f.write('subject to ci3_cst:\n')
            f.write('\ti2 = ci3*2;\n\n') # even number

            if design_name.startswith(latency_hiding_kernels):
                f.write('subject to latency_hiding_cst:\n')
                f.write('\to2*r2*c2 >= 8*i2;\n\n') # Only for FP32

        # Generate the data file. Every param declared in the model needs a
        # value here, including the kernel sizes p and q.
        with open(f'{tmpdirname}/tmp.dat', 'w') as f:
            for p in ["i", "o", "r", "c", "p", "q"]:
                f.write(f'param {p} := {search_task.workload["params"][p]};\n')
            f.write(f'param dsp_bound := {int(cst.hw_cst["DSP"])};\n')
            f.write(f'param bram_bound := {int(cst.hw_cst["BRAM18K"])};\n')
            f.write('param data_w := 32;\n') # Only for FP32

        # Generate the AMPL script. It must load the files generated above,
        # not whatever was left in ./solver by an earlier run.
        write_run_script(f'{tmpdirname}/tmp.run', tmpdirname)

        # Call the solver
        cmd = ["ampl", f"{tmpdirname}/tmp.run"]
        pipe = Popen(cmd, stdout=PIPE, stderr=PIPE)
        text = pipe.communicate()[0].decode('ascii')

        # Collect the results
        opt_dims = [1] * len(var_names)
        update = 0
        for line in text.split('\n'):
            for idx, name in enumerate(var_names):
                if line.startswith(f"{name} = "):
                    # Round the relaxed solution to the nearest integer
                    opt_dims[idx] = int(float(line.split('=')[-1].strip()) + 0.5)
                    update += 1

        if update != len(opt_dims):
            # The solver isn't finished correctly.
            opt_dims = None

        if save == 1:
            shutil.copyfile(f'{tmpdirname}/tmp.mod', 'solver/tmp.mod')
            shutil.copyfile(f'{tmpdirname}/tmp.dat', 'solver/tmp.dat')
            # The saved script refers to the saved copies so that it can be
            # re-run after the temporary directory is gone.
            write_run_script('solver/tmp.run', './solver')

    return opt_dims

def off_chip_solver(search_task, cst, fixed_params=None, save=0):
    """ Run the solver to minimize the off-chip data communication.

    Dispatches on the workload tags: "gemm" is checked first, then "conv".

    Args:
        search_task: Task description. Reads `workload["tags"]` and, for the
            error message, `workload["name"]`.
        cst: Constraint object forwarded to the workload-specific solver.
        fixed_params: Forwarded unchanged to the workload-specific solver.
        save: Forwarded unchanged to the workload-specific solver.

    Returns:
        Whatever the selected workload-specific solver returns.

    Raises:
        RuntimeError: If the workload is tagged neither "gemm" nor "conv".
    """
    tags = search_task.workload["tags"]
    if "gemm" in tags:
        return off_chip_solver_gemm(search_task, cst, fixed_params, save)
    if "conv" in tags:
        return off_chip_solver_conv(search_task, cst, fixed_params, save)
    # The exception used to be constructed without `raise`, which silently
    # returned None for unsupported workloads.
    raise RuntimeError(f"Not supported task: {search_task.workload['name']}")

================================================
FILE: autosa_scripts/odyssey/tuners.py
================================================
import json
import numpy as np
import xgboost as xgb
import random
import sys
import shutil
import copy
import pprint
from bayes_opt import BayesianOptimization
import itertools
import csv
from scipy import optimize
import math
import time
from datetime import datetime
from collections import deque

import utils
from solver import off_chip_solver
from search_task import MultiTask, SingleTask

#import opentuner
#from opentuner import ConfigurationManipulator
#from opentuner import IntegerParameter
#from opentuner import MeasurementInterface
#from opentuner import Result
#from opentuner.search.manipulator import PowerOfTwoParameter
#
#from RL_utils import RLAgent, RLEnv

class Constraint(object):
    """ Hardware resource budget loaded from a JSON file.

    The JSON maps each resource name to an object with "total" and "ratio".
    For every resource `res`, `hw_cst[res]` holds total * ratio (the usable
    budget) and `hw_cst[f"{res}_total"]` holds the raw total.
    """
    def __init__(self, cst_path):
        with open(cst_path) as fp:
            raw = json.load(fp)
        # Update the constraints
        self.hw_cst = {}
        for name in raw:
            entry = raw[name]
            total = entry["total"]
            budget = total * entry["ratio"]
            self.hw_cst[name] = budget
            self.hw_cst[f'{name}_total'] = total

    def __repr__(self):
        """ Compact tag such as "b100d50u10" built from the truncated
        BRAM18K, DSP and URAM budgets.
        """
        fields = (("b", "BRAM18K"), ("d", "DSP"), ("u", "URAM"))
        return "".join(f"{tag}{int(self.hw_cst[key])}" for tag, key in fields)

class Tuner(object):
    """ Base class holding the state shared by the tuners in this file:
    the task, the resource constraints, the search budget and the best
    solution found so far.
    """
    def __init__(self, search_task, cst, search_obj, max_epoch, max_time, n_worker=1, silent=0, max=1):
        # Search configuration
        self.search_task = search_task
        self.cst = cst
        self.search_obj = search_obj
        self.max_epoch = max_epoch
        self.max_time = max_time
        self.max = max

        # Best-so-far bookkeeping. The starting reward is 0 when max == 1
        # and +inf otherwise.
        self.best_reward = 0 if self.max == 1 else float('inf')
        self.best_reward_meta = None
        self.best_rewards = []
        self.best_rewards_time = []
        self.best_sol = None
        self.best_sol_cst = None
        self.last_update_epoch = -1
        self.best_search_record = utils.SearchRecord().reset()
        self.converge_time = 0

        # Logging / parallelism
        self.silent = silent
        self.n_worker = n_worker
        # If multi-processing, silent the sub tasks
        self.sub_task_silent = 1 if n_worker > 1 else silent

    def log(self, str, force=0):
        """ Send a message to the 'AutoSA-Tuner' logger at INFO level.
        Nothing is logged when the tuner is silent, unless force is set.
        """
        if self.silent and not force:
            return
        import logging
        logging.getLogger('AutoSA-Tuner').info(str)
        sys.stdout.flush()

    def overuse_constraint(self, used_cst):
        """ Return True if the design should be excluded: either no resource
        usage is available, or BRAM18K, DSP or URAM exceeds the budget.
        """
        if not used_cst:
            # If constraint doesn't exist, return True to exclude this design
            return True

        budget = self.cst.hw_cst
        return any(used_cst[res] > budget[res] for res in ('BRAM18K', 'DSP', 'URAM'))

def exhaustive_search(search_task, cst, search_obj, max_epochs, max_time, n_worker=1, silent=0, time_out=-1, pruning=0, profiling=0):
    """ Run the ExhaustiveTuner and return the best search record.

    With profiling enabled the search is repeated three times and the
    best-reward history of every repeat is written to
    tmp/tuning_rewards_exhaustive[_pruning]_<design>_r<repeat>.csv.
    `time_out` is accepted but not used here.
    """
    repeat_num = 3 if profiling else 1

    tuner_params = {
        "pruning": pruning,
        "DSP_thres": [0.95, 1.0]
    }

    best_record = utils.SearchRecord().reset()
    for repeat in range(repeat_num):
        tuner = ExhaustiveTuner(search_task, cst, search_obj, max_epochs, max_time, tuner_params, n_worker, silent)
        tuner.search()
        best_record.update(tuner.best_search_record)

        if not profiling:
            continue

        # Dump the reward trace of this repeat
        name_parts = ["_exhaustive"]
        if pruning:
            name_parts.append("_pruning")
        name_parts.append(f"_{search_task.design.name}")
        name_parts.append(f"_r{repeat}")
        config_str = "".join(name_parts)
        with open(f'tmp/tuning_rewards{config_str}.csv', "w", newline='') as f:
            writer = csv.DictWriter(f, fieldnames=['epoch', 'reward', 'time'])
            writer.writeheader()
            for epoch, reward in enumerate(tuner.best_rewards):
                writer.writerow({'epoch': epoch, 'reward': reward, 'time': tuner.best_rewards_time[epoch]})

    return best_record

class ExhaustiveTuner(Tuner):
    """ Tuner that enumerates the GEMM tiling factors (i_t1, j_t1, k_t1,
    i_t2, j_t2, k_t2) and evaluates every candidate that passes the filters.

    `params` is a dict with:
        "pruning": if truthy, only a hand-restricted sub-space is enumerated.
        "DSP_thres": [low, high] bounds on the estimated DSP utilization,
            used only when pruning.
    """
    def __init__(self, search_task, cst, obj, max_epoch, max_time, params, n_worker=1, silent=0):
        super().__init__(search_task, cst, obj, max_epoch, max_time, n_worker=n_worker, silent=silent)
        self.params = params
        self.epoch = 0
        if max_epoch > 0:
            self.stop_criteria = "epoch"
            self.max_epoch = max_epoch
        else:
            self.stop_criteria = "time"
            self.max_time = max_time
        self.counter = utils.PerfCounter()
        self.param_idx_map = {} # Maps parameter name to its index in the sample
        self.idx_param_map = {} # Maps the index to the parameter name
        self.params_history = []

    def _evaluate_candidate(self, task_params):
        """ Evaluate one tiling configuration and record it if it beats the
        best reward seen so far.
        """
        task_params = self.search_task.adjust_params(task_params)
        reward, used_constraint, reward_meta = self.search_task.evaluate(task_params, self.search_obj)
        if self.overuse_constraint(used_constraint):
            # Designs that do not fit are never selected
            reward = 0
        if reward > self.best_reward:
            self.best_reward = reward
            self.best_reward_meta = reward_meta
            self.best_sol_cst = used_constraint
            self.best_sol = task_params
            self.log(f'Epoch {self.epoch}: new best reward: {self.best_reward} ({1/self.best_reward:.0f})')
            self.best_search_record = utils.SearchRecord().extract_from_tuner_single_acc(self)

    def search(self):
        """ This tuner only works for GEMM (kernel3)

        NOTE: neither branch checks the epoch/time budget or updates
        self.epoch and the best-reward history; the enumeration always runs
        to completion.
        """
        self.counter.init_counter('time')
        self.counter.init_counter('converge_time')
        self.epoch = 0

        def filter_non_power_of_two(x):
            """ Return True for values that are not a power of two. """
            return bool(np.log2(x) != int(np.log2(x)))

        workload = self.search_task.workload["params"]
        i, j, k = workload["i"], workload["j"], workload["k"]

        if not self.params["pruning"]:
            for i_t1 in range(1, i + 1):
                for j_t1 in range(1, j + 1):
                    for k_t1 in range(1, k + 1):
                        for i_t2 in utils.get_divisors(int(i_t1), None):
                            for j_t2 in utils.get_divisors(int(j_t1), None):
                                for k_t2 in utils.get_divisors(int(min(k_t1, 8)), filter_non_power_of_two):
                                    latency_factors = i_t2 * j_t2
                                    simd_factor = k_t2
                                    # Skip designs that cannot hide the
                                    # latency. This test used to be inverted
                                    # compared to the pruning branch below.
                                    if latency_factors < 8 * simd_factor:
                                        continue
                                    self._evaluate_candidate({
                                        "i": i, "j": j, "k": k,
                                        "i_t1": i_t1, "j_t1": j_t1, "k_t1": k_t1,
                                        "i_t2": i_t2, "j_t2": j_t2, "k_t2": k_t2,
                                    })
        else:
            # NOTE(review): the ranges below are hard-coded and look tuned for
            # one specific problem size; confirm before reusing for others.
            dsp_low, dsp_high = self.params["DSP_thres"][0], self.params["DSP_thres"][1]
            # Only even tiling factors are considered
            for i_t1 in range(200, 270, 2):
                for j_t1 in range(200, 270, 2):
                    for k_t1 in range(16, 64, 2):
                        # The SIMD factor is fixed to 8 below, so it has to
                        # divide the (capped) k tile.
                        if 8 not in utils.get_divisors(int(min(k_t1, 8)), None):
                            continue
                        for i_t2 in utils.get_divisors(int(i_t1), None):
                            if i_t2 % 2 != 0:
                                continue
                            for j_t2 in utils.get_divisors(int(j_t1), None):
                                if j_t2 % 2 != 0:
                                    continue
                                # Keep the (i_t1/i_t2) x (j_t1/j_t2) product
                                # within [200, 240]
                                num_pe = (i_t1 / i_t2) * (j_t1 / j_t2)
                                if num_pe < 200 or num_pe > 240:
                                    continue
                                for k_t2 in [8]:
                                    latency_factors = i_t2 * j_t2
                                    simd_factor = k_t2
                                    if latency_factors < 8 * simd_factor:
                                        continue

                                    dsp_usage = num_pe * k_t2 * 5
                                    dsp_ratio = dsp_usage / self.cst.hw_cst["DSP"]
                                    if dsp_ratio < dsp_low or dsp_ratio > dsp_high:
                                        continue

                                    self._evaluate_candidate({
                                        "i": i, "j": j, "k": k,
                                        "i_t1": i_t1, "j_t1": j_t1, "k_t1": k_t1,
                                        "i_t2": i_t2, "j_t2": j_t2, "k_t2": k_t2,
                                    })

def random_search(search_task, cst, search_obj, max_epochs, max_time, n_worker=1, silent=0, time_out=-1, pruning=0, profiling=0):
    """ Run the RandomTuner and return the best search record.

    With profiling enabled the search is repeated three times and the
    best-reward history of every repeat is written to
    tmp/tuning_rewards_random[_pruning]_<design>_r<repeat>.csv.
    `time_out` is accepted but not used here.
    """
    repeat_num = 3 if profiling else 1

    tuner_params = {
        "pruning": pruning,
        "DSP_thres": [0.6, 1.0]
    }

    best_record = utils.SearchRecord().reset()
    for repeat in range(repeat_num):
        tuner = RandomTuner(search_task, cst, search_obj, max_epochs, max_time, tuner_params, n_worker, silent)
        tuner.search()
        best_record.update(tuner.best_search_record)

        if not profiling:
            continue

        # Dump the reward trace of this repeat
        name_parts = ["_random"]
        if pruning:
            name_parts.append("_pruning")
        name_parts.append(f"_{search_task.design.name}")
        name_parts.append(f"_r{repeat}")
        config_str = "".join(name_parts)
        with open(f'tmp/tuning_rewards{config_str}.csv', "w", newline='') as f:
            writer = csv.DictWriter(f, fieldnames=['epoch', 'reward', 'time'])
            writer.writeheader()
            for epoch, reward in enumerate(tuner.best_rewards):
                writer.writerow({'epoch': epoch, 'reward': reward, 'time': tuner.best_rewards_time[epoch]})

    return best_record

class RandomTuner(Tuner):
    """ Tuner that evaluates randomly drawn samples of the design space.

    `params` is a dict with:
        "pruning": if truthy, samples are rejected before evaluation unless
            k_t2 == 8 and the estimated DSP utilization lies within DSP_thres.
        "DSP_thres": [low, high] bounds on the estimated DSP utilization.
    """
    # Number of extra draws allowed when a sample was already visited
    MAX_DUPLICATE_RETRIES = 20

    def __init__(self, search_task, cst, obj, max_epoch, max_time, params, n_worker=1, silent=0):
        super().__init__(search_task, cst, obj, max_epoch, max_time, n_worker=n_worker, silent=silent)
        self.params = params
        self.epoch = 0
        if max_epoch > 0:
            self.stop_criteria = "epoch"
            self.max_epoch = max_epoch
        else:
            self.stop_criteria = "time"
            self.max_time = max_time
        self.counter = utils.PerfCounter()
        self.param_idx_map = {} # Maps parameter name to its index in the sample
        self.idx_param_map = {} # Maps the index to the parameter name
        self.params_history = []

    def generate_random_sample(self):
        """ Generate a random sample from the design space.
        We bookkeeping all the searched params to avoid duplicated search.

        Returns the sample produced by the search task. If only duplicates
        are drawn, the last (duplicated) sample is returned after
        MAX_DUPLICATE_RETRIES extra attempts. A falsy sample from the search
        task is passed through unchanged.
        """
        task_params = None
        for _ in range(self.MAX_DUPLICATE_RETRIES + 1):
            task_params = self.search_task.generate_random_sample()
            if not task_params:
                # Nothing to serialize; let the caller decide what to do
                return task_params
            # Serialize the params. The delimiter keeps e.g. (1, 23) and
            # (12, 3) from producing the same key.
            params_hash = ",".join(str(v) for v in task_params.values())
            if params_hash not in self.params_history:
                self.params_history.append(params_hash)
                break

        return task_params

    def _log_epoch(self):
        """ Count one drawn sample and record the best reward and elapsed time. """
        self.epoch += 1
        self.best_rewards.append(self.best_reward)
        self.counter.update_counter('time')
        self.best_rewards_time.append(self.counter.get_counter('time'))

    def _budget_exhausted(self):
        """ Return True once the epoch or time budget has been used up. """
        if self.stop_criteria == "epoch":
            return self.epoch > self.max_epoch
        self.counter.update_counter('time')
        return self.counter.get_counter('time') > self.max_time

    def _draw_pruned_sample(self):
        """ Draw samples until one passes the pruning filter.

        Returns None if the search task stops producing samples or the
        search budget runs out before a sample is accepted.
        """
        dsp_low, dsp_high = self.params["DSP_thres"][0], self.params["DSP_thres"][1]
        while True:
            task_params = self.generate_random_sample()
            if not task_params:
                return None
            self._log_epoch()

            task_params = self.search_task.adjust_params(task_params)
            task_params = self.search_task.design.infer_params(task_params)
            # Closed-form DSP estimate from the GEMM tiling factors
            # (i_t1, i_t2, j_t1, j_t2, k_t2).
            dsp_usage = task_params["i_t1"] / task_params["i_t2"] * task_params["j_t1"] / task_params["j_t2"] * task_params["k_t2"] * 5
            dsp_ratio = dsp_usage / self.cst.hw_cst["DSP"]
            if task_params["k_t2"] == 8 and dsp_ratio >= dsp_low and dsp_ratio <= dsp_high:
                return task_params
            # Rejected samples count against the budget too; without this
            # check the loop never ends when no sample passes the filter.
            if self._budget_exhausted():
                return None

    def search(self):
        """ Evaluate random samples until the design space or the epoch/time
        budget is exhausted, keeping track of the best reward.
        """
        self.counter.init_counter('time')
        self.counter.init_counter('converge_time')
        self.epoch = 0

        while True:
            if self.params["pruning"]:
                task_params = self._draw_pruned_sample()
            else:
                task_params = self.generate_random_sample()
                self._log_epoch()
            if not task_params:
                # Design space is exhausted
                break

            task_params = self.search_task.adjust_params(task_params)
            reward, used_constraint, reward_meta = self.search_task.evaluate(task_params, self.search_obj)
            if self.overuse_constraint(used_constraint):
                reward = 0
            if reward > self.best_reward:
                self.best_reward = reward
                self.best_reward_meta = reward_meta
                self.best_sol_cst = used_constraint
                self.best_sol = task_params
                self.log(f'Epoch {self.epoch}: new best reward: {self.best_reward} ({1/self.best_reward:.0f})')
                self.last_update_epoch = self.epoch
                self.counter.update_counter('converge_time')
                self.converge_time = self.counter.get_counter('converge_time')
                self.best_search_record = utils.SearchRecord().extract_from_tuner_single_acc(self)

            if self._budget_exhausted():
                break

def annealing_search(search_task, cst, search_obj, max_epochs, max_time, n_worker=1, silent=0, time_out=-1, profiling=0):
    """ Run the AnnealingTuner and return the best search record.

    With profiling enabled the search is repeated three times and the
    best-reward history of every repeat is written to
    tmp/tuning_rewards_annealing_<design>_r<repeat>.csv.
    `time_out` is accepted but not used here.
    """
    repeat_num = 3 if profiling else 1

    tuner_params = {
        "T": 200,
        "stepsize": 16,
        "mutation_probability": 1.0,
        "epsilon": 0.1,
        "mutation_probs": [0.2, 0.8, 0],
        # Value the tuner reports for invalid designs
        "max_latency": search_task.compute_ops()*10
    }

    best_record = utils.SearchRecord().reset()
    for repeat in range(repeat_num):
        tuner = AnnealingTuner(search_task, cst, search_obj, max_epochs, max_time, tuner_params, n_worker, silent)
        tuner.search()
        best_record.update(tuner.best_search_record)

        if not profiling:
            continue

        # Dump the reward trace of this repeat
        config_str = "".join(["_annealing", f"_{search_task.design.name}", f"_r{repeat}"])
        with open(f'tmp/tuning_rewards{config_str}.csv', "w", newline='') as f:
            writer = csv.DictWriter(f, fieldnames=['epoch', 'reward', 'time'])
            writer.writeheader()
            for epoch, reward in enumerate(tuner.best_rewards):
                writer.writerow({'epoch': epoch, 'reward': reward, 'time': tuner.best_rewards_time[epoch]})

    return best_record

class AnnealingTuner(Tuner):
    """ Annealing-style tuner driven by scipy.optimize.basinhopping.

    A sample is a flat vector holding one value per tunable parameter.
    param_idx_map / idx_param_map (filled in by search()) translate between
    parameter names and positions in that vector.
    """
    def __init__(self, search_task, cst, obj, max_epoch, max_time, params, n_worker=1, silent=0):
        """
        params: dict of tuner settings. This class reads "max_latency",
            "stepsize", "T", "mutation_probability", "epsilon" and
            "mutation_probs".
        max_epoch: when positive, the stop criterion is "epoch"; otherwise
            the stop criterion is "time" and max_time is recorded.
        """
        super().__init__(search_task, cst, obj, max_epoch, max_time, n_worker=n_worker, silent=silent)
        self.params = params
        self.epoch = 0
        if max_epoch > 0:
            self.stop_criteria = "epoch"
            self.max_epoch = max_epoch
        else:
            self.stop_criteria = "time"
            self.max_time = max_time
        self.counter = utils.PerfCounter()
        self.param_idx_map = {} # Maps parameter name to its index in the sample
        self.idx_param_map = {} # Maps the index to the parameter name

    def _sample_to_params(self, sample):
        """ Translate a sample vector into a {name: value} dict and add the
        external parameters taken from the workload.
        """
        params_config = self.search_task.design.params_config
        task_params = {}
        for param in params_config["tunable"].values():
            task_params[param["name"]] = sample[self.param_idx_map[param["name"]]]
        for param in params_config["external"].values():
            task_params[param["name"]] = self.search_task.workload["params"][param["name"]]
        return task_params

    def _build_split_chains(self, task_params):
        """ Build one factor chain per external parameter.

        Each chain follows the "split_by" links of the parameters:
        [{"params": [p0, p3, p7], "factors": [ceil(p0/p3), p3/p7, p7]}, {}]
        Every factor is clamped to an integer >= 1.
        """
        params_config = self.search_task.design.params_config
        tunable = params_config["tunable"]
        split_chains = []
        for param in params_config["external"].values():
            chain = {"params": [param["name"]], "factors": []}
            cur_param = param
            while "split_by" in cur_param:
                child_name = cur_param["split_by"]
                child = tunable[child_name]
                # Round up when the child lists the current parameter among its "divisors"
                round_up = "divisors" in child and cur_param["name"] in child["divisors"]
                chain["params"].append(child_name)
                factor = task_params[cur_param["name"]] / task_params[child_name]
                if round_up:
                    factor = np.ceil(factor)
                chain["factors"].append(max(1, int(factor)))
                cur_param = child
            chain["factors"].append(max(1, int(task_params[cur_param["name"]])))
            split_chains.append(chain)
        return split_chains

    def update(self, args):
        """ Optimization function

        Returns the value minimized by the optimizer: 1/reward for a usable
        sample, or params["max_latency"] as a penalty when the sample contains
        NaN/inf/zero entries, evaluates to a zero reward, or overuses the
        constraints.
        """
        if (np.any(np.isnan(args))) or (np.any(np.isneginf(args))) or (np.any(np.isposinf(args))) or (np.any(args[:] == 0)):
            return self.params["max_latency"]

        task_params = self._sample_to_params(args)
        task_params = self.search_task.adjust_params(task_params)
        reward, used_constraint, reward_meta = self.search_task.evaluate(task_params, self.search_obj)
        # SA minimizes the opt target
        if reward == 0:
            reward = self.params["max_latency"]
        else:
            reward = 1 / reward
        if self.overuse_constraint(used_constraint):
            reward = self.params["max_latency"]

        return reward

    def bound_check(self, f_new, x_new, f_old, x_old):
        """ Check if the parameters are legal.

        Used as the accept_test of basinhopping. It also advances the epoch
        counter and appends the current best reward and the elapsed time to
        the reward trace. Only x_new is inspected.
        Returns the result of design.bound_check on the inferred parameters,
        or False when infer_params yields nothing.
        """
        self.epoch += 1
        self.best_rewards.append(self.best_reward)
        self.counter.update_counter('time')
        self.best_rewards_time.append(self.counter.get_counter('time'))

        task_params = self._sample_to_params(x_new)
        task_params = self.search_task.adjust_params(task_params)
        task_params = self.search_task.design.infer_params(task_params)
        if not task_params:
            return False
        return self.search_task.design.bound_check(task_params)

    def print_minimal(self, x, f, accepted):
        """ Update the rewards when a local minimum is found.

        Used as the callback of basinhopping. The sample x is re-evaluated
        and, if it beats the best reward seen so far, the best-solution
        bookkeeping and the search record are refreshed. f and accepted are
        not used.
        """
        task_params = self._sample_to_params(x)
        task_params = self.search_task.adjust_params(task_params)
        reward, used_constraint, reward_meta = self.search_task.evaluate(task_params, self.search_obj)
        if self.overuse_constraint(used_constraint):
            reward = 0
        if reward > self.best_reward:
            self.best_reward = reward
            self.best_reward_meta = reward_meta
            self.best_sol_cst = used_constraint
            self.best_sol = task_params
            self.log(f'Epoch {self.epoch}: new best reward: {self.best_reward} ({1/self.best_reward:.0f})')
            self.last_update_epoch = self.epoch
            self.counter.update_counter('converge_time')
            self.converge_time = self.counter.get_counter('converge_time')
            self.best_search_record = utils.SearchRecord().extract_from_tuner_single_acc(self)

    def take_step(self, x):
        """ Step-taking routine.
        Note: Only for gemm.

        Reuses the genetic search mutation method. With probability
        params["mutation_probability"] the sample is changed: with probability
        params["epsilon"] it is replaced by a fresh random sample, otherwise
        every split chain is mutated according to params["mutation_probs"].
        x is modified in place and returned.
        """
        if random.random() >= self.params["mutation_probability"]:
            return x

        if random.random() < self.params["epsilon"]:
            task_params = self.search_task.generate_random_sample()
            for i in range(len(x)):
                x[i] = task_params[self.idx_param_map[i]]
            return x

        task_params = self._sample_to_params(x)
        split_chains = self._build_split_chains(task_params)

        # Cumulative thresholds of the three mutation policies:
        # [random redistribution, factorization, random shrink]
        mutation_policy_probs = np.cumsum(self.params["mutation_probs"])

        # Mutation
        for chain in split_chains:
            if len(chain["factors"]) <= 1:
                continue
            # Avoid mutating the fixed parameters: skip the whole chain.
            # (The skip has to apply to the chain loop, not to the loop over
            # the fixed parameters.)
            if 'fix_param' in self.search_task.configs and \
                    any(fix_p[0] == chain['params'][0] for fix_p in self.search_task.configs['fix_param']):
                continue
            src_idx, dst_idx = random.sample(range(0, len(chain["factors"])), 2)
            select_prob = random.random()
            if select_prob < mutation_policy_probs[0]:
                # Random: shrink the source factor and enlarge the destination
                # factor so that their product does not decrease
                if chain["factors"][dst_idx] == 1:
                    continue
                src = max(1, int(chain["factors"][src_idx] * random.random() * 1.0))
                dst = max(1, math.ceil(chain["factors"][src_idx] * chain["factors"][dst_idx] / src))
                chain["factors"][src_idx] = src
                chain["factors"][dst_idx] = dst
            elif select_prob < mutation_policy_probs[1]:
                # Factorization: move one divisor from the source factor to
                # the destination factor
                factor = chain["factors"][src_idx]
                if factor == 1:
                    continue
                divs = utils.factorization(factor)
                div = random.choice(divs)
                # Integer division keeps the factors integral
                chain["factors"][src_idx] //= div
                chain["factors"][dst_idx] *= div
            else:
                # Random: shrink the source factor only
                chain["factors"][src_idx] = max(1, int(chain["factors"][src_idx] * random.random() * 1.0))

        # Revert to the params
        # [{"params": [p0, p3, p7], "factors": [ceil(p0/p3), p3/p7, p7]}, {}]
        for chain in split_chains:
            factor = chain["factors"][-1]
            param = chain["params"][-1]
            if param in self.param_idx_map:
                x[self.param_idx_map[param]] = factor
            # Walk towards the head of the chain, accumulating the product
            for idx in range(len(chain["factors"]) - 2, -1, -1):
                param = chain["params"][idx]
                factor *= chain["factors"][idx]
                if param in self.param_idx_map:
                    x[self.param_idx_map[param]] = factor

        return x

    def search(self):
        """ Run the annealing search.

        Builds the name/index maps, picks the best of five random samples as
        the initial guess and hands the problem to basinhopping.
        NOTE(review): only niter=self.max_epoch is passed to basinhopping; the
        "time" stop criterion is not enforced here, and this class does not
        assign self.max_epoch in that mode.
        """
        self.counter.init_counter('time')
        self.counter.init_counter('converge_time')
        self.epoch = 0

        for idx, param in enumerate(self.search_task.design.params_config["tunable"].values()):
            self.param_idx_map[param["name"]] = idx
            self.idx_param_map[idx] = param["name"]

        # Init guess
        init_reward = 0
        init_params = None
        for i in range(5):
            task_params = self.search_task.generate_random_sample()
            task_params = self.search_task.adjust_params(task_params)
            reward, used_constraint, reward_meta = self.search_task.evaluate(task_params, self.search_obj)
            if self.overuse_constraint(used_constraint):
                reward = 0
            if reward > init_reward:
                init_reward = reward
                init_params = task_params
        if init_params is None:
            # No sample produced a positive reward; start from the last one
            init_params = task_params

        # Start from the best sample found above rather than the last one drawn
        param_arr = []
        for param in self.search_task.design.params_config["tunable"].values():
            param_arr.append(init_params[param["name"]])
        x0 = np.array(param_arr)
        # Search
        optimize.basinhopping(self.update, x0, niter=self.max_epoch, \
                accept_test=self.bound_check,
                stepsize=self.params['stepsize'],
                T=self.params['T'], callback=self.print_minimal,
                take_step=self.take_step)

        return

def bayesian_search(search_task, cst, search_obj, max_epochs, max_time, n_worker=1, silent=0, time_out=-1, profiling=0):
    """ Bayesian search
    Runs a BayesianTuner on the search task and returns the search record
    accumulated over all runs.
    If profiling is enabled, the search is repeated three times and the
    best-reward trace of every run is dumped to
    tmp/tuning_rewards_bayesian_<design name>_r<repeat>.csv.
    Note: time_out is accepted for interface compatibility but is not used here.
    """
    import os

    repeat_num = 3 if profiling else 1

    tuner_params = {
        "init_points": 10,
        "mutation_probability": 1.0,
        "epsilon": 0.1,
        "mutation_probs": [0.2, 0.8, 0],
        "max_latency": search_task.compute_ops()*10
    }

    best_record = utils.SearchRecord().reset()
    for repeat in range(repeat_num):
        tuner = BayesianTuner(search_task, cst, search_obj, max_epochs, max_time, tuner_params, n_worker, silent)
        tuner.search()

        best_record.update(tuner.best_search_record)

        if not profiling:
            continue

        # The output directory may not exist yet; create it so that a finished
        # search is not lost to a FileNotFoundError while dumping the trace.
        os.makedirs('tmp', exist_ok=True)
        config_str = f"_bayesian_{search_task.design.name}_r{repeat}"
        with open(f'tmp/tuning_rewards{config_str}.csv', "w", newline='') as f:
            writer = csv.DictWriter(f, fieldnames=['epoch', 'reward', 'time'])
            writer.writeheader()
            for epoch, (reward, elapsed) in enumerate(zip(tuner.best_rewards, tuner.best_rewards_time)):
                writer.writerow({'epoch': epoch, 'reward': reward, 'time': elapsed})

    return best_record

class BayesianTuner(Tuner):
    """ Tuner that explores the tiling factors with BayesianOptimization.

    The objective and the bounds are written for the six parameters
    i_t1, j_t1, k_t1, i_t2, j_t2, k_t2 (only tested for the mm task).
    """
    def __init__(self, search_task, cst, obj, max_epoch, max_time, params, n_worker=1, silent=0):
        """
        params: dict of tuner settings; search() reads "init_points".
        max_epoch: when positive, the stop criterion is "epoch"; otherwise
            the stop criterion is "time" and max_time is recorded.
        """
        super().__init__(search_task, cst, obj, max_epoch, max_time, n_worker=n_worker, silent=silent)
        self.params = params
        self.epoch = 0
        epoch_limited = max_epoch > 0
        self.stop_criteria = "epoch" if epoch_limited else "time"
        if epoch_limited:
            self.max_epoch = max_epoch
        else:
            self.max_time = max_time
        self.counter = utils.PerfCounter()
        # Parameter name -> position in the sample
        self.param_idx_map = {}
        # Position in the sample -> parameter name
        self.idx_param_map = {}

    def black_box_function(self, i_t1, j_t1, k_t1, i_t2, j_t2, k_t2):
        """ Objective maximized by the optimizer.

        The six values are truncated to integers, completed with the external
        parameters of the workload, adjusted and inferred. Returns 0 when the
        parameters cannot be inferred, fail the design bound check or overuse
        the constraints; otherwise returns the evaluated reward and records
        the epoch in the reward trace.
        """
        tiling = (
            ("i_t1", i_t1), ("j_t1", j_t1), ("k_t1", k_t1),
            ("i_t2", i_t2), ("j_t2", j_t2), ("k_t2", k_t2),
        )
        task_params = {name: int(value) for name, value in tiling}

        for ext_param in self.search_task.design.params_config["external"].values():
            ext_name = ext_param["name"]
            task_params[ext_name] = self.search_task.workload["params"][ext_name]
        task_params = self.search_task.adjust_params(task_params)
        task_params = self.search_task.design.infer_params(task_params)
        # Illegal samples are rejected before the epoch bookkeeping below
        if not task_params:
            return 0
        if not self.search_task.design.bound_check(task_params):
            return 0

        reward, used_constraint, reward_meta = self.search_task.evaluate(task_params, self.search_obj)
        if self.overuse_constraint(used_constraint):
            return 0

        improved = reward > self.best_reward
        if improved:
            self.best_reward = reward
            self.best_reward_meta = reward_meta
            self.best_sol_cst = used_constraint
            self.best_sol = task_params
            self.log(f'Epoch {self.epoch}: new best reward: {self.best_reward} ({1/self.best_reward:.0f})')
            self.last_update_epoch = self.epoch
            self.counter.update_counter('converge_time')
            self.converge_time = self.counter.get_counter('converge_time')
            self.best_search_record = utils.SearchRecord().extract_from_tuner_single_acc(self)

        # Trace of the best reward and elapsed time per accepted evaluation
        self.best_rewards.append(self.best_reward)
        self.counter.update_counter('time')
        self.best_rewards_time.append(self.counter.get_counter('time'))
        self.epoch += 1

        return reward

    def search(self):
        """ Run the Bayesian optimization.

        Spends params["init_points"] evaluations on initial exploration and
        the remainder of max_epoch on optimization iterations.
        """
        self.counter.init_counter('time')
        self.counter.init_counter('converge_time')
        self.epoch = 0

        for position, tunable in enumerate(self.search_task.design.params_config["tunable"].values()):
            self.param_idx_map[tunable["name"]] = position
            self.idx_param_map[position] = tunable["name"]

        init_points = self.params["init_points"]
        # Only test for mm task
        dims = self.search_task.workload["params"]
        pbounds = {
            'i_t1': (1, dims["i"]),
            'j_t1': (1, dims["j"]),
            'k_t1': (1, dims["k"]),
            'i_t2': (1, dims["i"]),
            'j_t2': (1, dims["j"]),
            'k_t2': (1, min(256 // self.search_task.dw, 64, dims["k"])),
        }

        optimizer = BayesianOptimization(
            f=self.black_box_function,
            pbounds=pbounds,
            random_state=1,
        )
        optimizer.maximize(
            init_points=init_points,
            n_iter=self.max_epoch - init_points,
        )

        return

'''
def opentuner_search(search_task, cst, search_obj, max_epochs, max_time, solver=1, fixed_params=None, n_worker=1, silent=0, time_out=-1, profiling=0, args=None):
    if profiling:
        repeat_num = 3
    else:
        repeat_num = 1

    tuner_params = {
        "args": args
    }

    best_record = utils.SearchRecord().reset()
    for repeat in range(repeat_num):
        tuner = OpenTunerInterface(search_task, cst, search_obj, max_epochs, max_time, tuner_params, n_worker, silent)
        tuner.search()

        search_record = tuner.best_search_record
        best_record.update(search_record)

        if profiling:
            config_str = "_opentuner"
            
            config_str += f"_{search_task.design.name}"
            config_str += f"_r{repeat}"
            with open(f'tmp/tuning_rewards{config_str}.csv', "w", newline='') as f:
                fieldnames = ['epoch', 'reward', 'time']
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                for epoch in range(len(tuner.best_rewards)):
                    writer.writerow({'epoch': epoch, 'reward': tuner.best_rewards[epoch], 'time': tuner.best_rewards_time[epoch]})

    return best_record

class OpenTunerInterface(Tuner):
    def __init__(self, search_task, cst, obj, max_epoch, max_time, params, n_worker=1, silent=0):
        super().__init__(search_task, cst, obj, max_epoch, max_time, n_worker=n_worker, silent=silent)
        self.params = params
        self.epoch = 0
        if max_epoch > 0:
            self.stop_criteria = "epoch"
            self.max_epoch = max_epoch
        else:
            self.stop_criteria = "time"
            self.max_time = max_time
        self.counter = utils.PerfCounter()
        self.param_idx_map = {} # Maps parameter name to its index in the sample
        self.idx_param_map = {} # Maps the index to the parameter name

    def init_args(self, args):
        args.bail_threshold = 500
        args.database = None
        args.display_frequency = None                
        args.generate_bandit_technique = False
        args.label = None
        args.list_techniques = False
        args.machine_class = None
        args.no_dups = True
        args.parallel_compile = False
        args.parallelism = 4
        args.pipelining = 0
        args.print_params = False
        args.print_search_space_size = False
        args.quiet = True
        args.results_log = None
        args.results_log_details = None
        args.seed_configuration = []
        if self.stop_criteria == "time":
            args.stop_after = self.max_time
        else:
            args.stop_after = None
        args.technique = None
        args.test_limit = 5000

        return args

    def search(self):
        self.counter.init_counter('time')
        self.counter.init_counter('converge_time')
        self.epoch = 0

        idx = 0
        for p, param in self.search_task.design.params_config["tunable"].items():
            self.param_idx_map[param["name"]] = idx
            self.idx_param_map[idx] = param["name"]
            idx += 1
    
        opentuner_args = self.init_args(self.params["args"])
        opentuner = OpenTunerInstance(opentuner_args, self)
        opentuner.main(opentuner_args, self)

        return

class OpenTunerInstance(MeasurementInterface):
    def __init__(self, args, tuner):
        super().__init__(args)
        self.tuner = tuner

    def manipulator(self):
        """
        Define the search space by creating a
        ConfigurationManipulator
        """
        manipulator = ConfigurationManipulator()
        tuner = self.tuner

        manipulator.add_parameter(
            IntegerParameter('i_t1', 1, tuner.search_task.workload["params"]["i"]))
        manipulator.add_parameter(
            IntegerParameter('j_t1', 1, tuner.search_task.workload["params"]["j"]))
        manipulator.add_parameter(
            IntegerParameter('k_t1', 1, tuner.search_task.workload["params"]["k"]))
        manipulator.add_parameter(
            IntegerParameter('i_t2', 1, tuner.search_task.workload["params"]["i"]))
        manipulator.add_parameter(
            IntegerParameter('j_t2', 1, tuner.search_task.workload["params"]["j"]))
        manipulator.add_parameter(
            PowerOfTwoParameter('k_t2', 1, min(256 // tuner.search_task.dw, 64, tuner.search_task.workload["params"]["k"])))

        return manipulator

    def run(self, desired_result, input, limit):
        """
        Compile and run a given configuration then
        return performance
        """
        cfg = desired_result.configuration.data
        tuner = self.tuner

        x = [int(cfg['i_t1']), int(cfg['j_t1']), int(cfg['k_t1']),\
             int(cfg['i_t2']), int(cfg['j_t2']), int(cfg['k_t2'])]
        
        task_params = {}
        for p, param in tuner.search_task.design.params_config["tunable"].items():
            task_params[param["name"]] = x[tuner.param_idx_map[param["name"]]]
        for p, param in tuner.search_task.design.params_config["external"].items():
            task_params[param["name"]] = tuner.search_task.workload["params"][param["name"]]
        task_params = tuner.search_task.adjust_params(task_params)
        task_params = tuner.search_task.design.infer_params(task_params)
        if task_params:
            status = tuner.search_task.design.bound_check(task_params)            
            if not status:
                return Result(state='ERROR', time=float('inf'))
        else:
            return Result(state='ERROR', time=float('inf'))

        reward, used_constraint, reward_meta = tuner.search_task.evaluate(task_params, tuner.search_obj)
        if tuner.overuse_constraint(used_constraint):
            return Result(state='ERROR', time=float('inf'))
        result = Result(time=1/reward)
        if reward > tuner.best_reward:
            tuner.best_reward = reward
            tuner.best_reward_meta = reward_meta
            tuner.best_sol_cst = used_constraint
            tuner.best_sol = task_params
            tuner.log(f'Epoch {tuner.epoch}: new best reward: {tuner.best_reward} ({1/tuner.best_reward:.0f})')
            tuner.last_update_epoch = tuner.epoch
            tuner.counter.update_counter('converge_time')
            tuner.converge_time = tuner.counter.get_counter('converge_time')
            tuner.best_search_record = utils.SearchRecord().extract_from_tuner_single_acc(tuner)
        tuner.best_rewards.append(tuner.best_reward)
        tuner.counter.update_counter('time')
        tuner.best_rewards_time.append(tuner.counter.get_counter('time'))
        tuner.epoch += 1

        return result

def RL_search(search_task, cst, search_obj, max_epochs, max_time, n_worker=1, silent=0, time_out=-1, profiling=0):
    if profiling:
        repeat_num = 3
    else:
        repeat_num = 1

    tuner_params = {                
        "eps": 0.0,
        "temperature": 1,
        "batch": 200
    }

    best_record = utils.SearchRecord().reset()
    for repeat in range(repeat_num):
        tuner = RLTuner(search_task, cst, search_obj, max_epochs, max_time, tuner_params, n_worker, silent)
        tuner.search()

        search_record = tuner.best_search_record
        best_record.update(search_record)

        if profiling:
            config_str = "_RL"
            
            config_str += f"_{search_task.design.name}"
            config_str += f"_r{repeat}"
            with open(f'tmp/tuning_rewards{config_str}.csv', "w", newline='') as f:
                fieldnames = ['epoch', 'reward', 'time']
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                for epoch in range(len(tuner.best_rewards)):
                    writer.writerow({'epoch': epoch, 'reward': tuner.best_rewards[epoch], 'time': tuner.best_rewards_time[epoch]})

    return best_record    

class RLTuner(Tuner):
    def __init__(self, search_task, cst, obj, max_epoch, max_time, params, n_worker=1, silent=0):
        super().__init__(search_task, cst, obj, max_epoch, max_time, n_worker=n_worker, silent=silent)
        self.params = params
        self.epoch = 0
        if max_epoch > 0:
            self.stop_criteria = "epoch"
            self.max_epoch = max_epoch
        else:
            self.stop_criteria = "time"
            self.max_time = max_time
        self.counter = utils.PerfCounter()
        self.param_idx_map = {} # Maps parameter name to its index in the sample
        self.idx_param_map = {} # Maps the index to the parameter name

        self.agent = None
        self.env = None

    def policy_gradient(self, n_episodes=100000, max_t=1000, print_every=10, eps=0, temperature=1):
        """
        n_episodes: number of training episodes
        print_every: maximal number of episodes to keep the record
        """
        best_score = -2**20
        scores_window = deque(maxlen=print_every)
        scores = []
        has_succeed_history = False
        for i_episode in range(n_episodes):
            # Adjust learning rate
            if i_episode % 100 == 0 and has_succeed_history:
                eps /= 1.2
                temperature /= 1.01
                temperature = max(temperature, 1)
                self.agent.adjust_lr(ratio=0.8, min_lr=1e-6)
            
            score = 0
            state, infos = self.env.reset()
            # Max number of attempts in one episode
            for t in range(max_t):
                # Generate one action
                action, log_prob = self.agent.act(state, infos, eps, temperature)
                # Get rewards from the env
                next_state, reward, done, infos, sig, impt = self.env.step(action)
                # Update the agent
                self.agent.step(state, action, log_prob, reward, next_state, done, sig, impt, infos)
                state = next_state
                score += infos["reward_raw"]
                if done:
                    break
            
            scores.append(score)
            if infos["succeed"]:
                has_succeed_history = True
                if score > self.best_reward:
                    self.best_reward = score
                    self.best_reward_meta = infos["reward_meta"]
                    self.best_sol_cst = infos["cst"]
                    self.best_sol = infos["sol"]
                    self.log(f'Epoch {self.epoch}: new best reward: {self.best_reward} ({1/self.best_reward:.0f})')
                    self.last_update_epoch = self.epoch
                    self.counter.update_counter('converge_time')
                    self.converge_time = self.counter.get_counter('converge_time')
                    self.best_search_record = utils.SearchRecord().extract_from_tuner_single_acc(self)                
            self.best_rewards.append(self.best_reward)
            self.counter.update_counter('time')
            self.best_rewards_time.append(self.counter.get_counter('time'))
            self.epoch += 1

        return scores

    def search(self):
        self.counter.init_counter('time')
        self.counter.init_counter('converge_time')
        self.epoch = 0

        idx = 0
        for p, param in self.search_task.design.params_config["tunable"].items():
            self.param_idx_map[param["name"]] = idx
            self.idx_param_map[idx] = param["name"]
            idx += 1

        # Dimension of the problem space (i, j, k)
        dim_size = 3
        # Dimension of the action vector (i_t1, j_t1, k_t1, i_t2, j_t2, k_t2)
        n_action_steps = 6
        # Level of each action step
        action_size = max(self.search_task.workload["params"]["i"], 
                          self.search_task.workload["params"]["j"], 
                          self.search_task.workload["params"]["k"])
        # Initialize agent and environment 
        self.agent = RLAgent(dim_size=dim_size, n_action_steps=n_action_steps, action_size=action_size, seed=random.randint(0, 2**63), batch=self.params['batch'])
        self.env = RLEnv(self.search_task, self.cst, self.param_idx_map, self.idx_param_map, self.search_obj,
                         dim_size=dim_size, n_action_steps=n_action_steps, action_size=action_size)                
        state = self.env.reset()
        self.agent.reset()

        scores = self.policy_gradient(n_episodes=self.max_epoch, eps=self.params['eps'], temperature=self.params['temperature'])

        return
'''

def genetic_search(search_task, cst, search_obj, max_epochs, max_time, solver=1, fixed_params=None, n_worker=1, silent=0, time_out=-1, profiling=0):
    """ Genetic search

    If solver is enabled, we will first call IPOPT solver to generate the initial params to
    kick off the genetic search. The solver output is legalized before it is used as the
    ancestor of the population:
      - every first-level tiling factor is rounded down to a multiple of 2 (at least 2),
      - every tunable parameter that declares "divisors" is snapped to the nearest
        divisor of the parameter it has to divide.

    Args:
        search_task: the task to tune (provides design.params_config and workload).
        cst: hardware constraints, forwarded to the solver and the tuner.
        search_obj: search objective, forwarded to the tuner.
        max_epochs: epoch budget forwarded to GeneticTuner.
        max_time: time budget forwarded to GeneticTuner.
        solver: 1 to seed the search with the off-chip solver result.
        fixed_params: forwarded to the solver and stored in the tuner params.
        n_worker: forwarded to GeneticTuner.
        silent: forwarded to GeneticTuner.
        time_out: stall time-out stored in the tuner params ("time_out").
        profiling: if set, the solver is forced on, the search is repeated 3 times and
            the reward curve of each run is dumped to tmp/tuning_rewards*.csv.

    Returns:
        The best utils.SearchRecord found across all repeats.
    """
    import bisect
    import os

    if profiling:
        solver = 1
        repeat_num = 3
    else:
        repeat_num = 1

    init_params = None
    if solver == 1:
        # Call IPOPT solver
        init_params = off_chip_solver(search_task, cst, fixed_params, save=1)

    if init_params:
        tunable = search_task.design.params_config["tunable"]
        external = search_task.design.params_config["external"]

        # The solver result is ordered like the tunable parameters.
        task_params = {}
        for idx, (p, param) in enumerate(tunable.items()):
            task_params[param["name"]] = init_params[idx]
        for p, param in external.items():
            task_params[param["name"]] = search_task.workload["params"][param["name"]]

        # Fix the first-level: make them multiple of 2.
        # Clamp to 2 so that a solver value below 2 does not collapse to an
        # invalid tile size of 0.
        for p, param in external.items():
            split_by_param = param["split_by"]
            task_params[split_by_param] = max(2, int(task_params[split_by_param] / 2) * 2)

        # Fix the second-level: snap to the nearest legal divisor.
        def filter_non_power_of_two(x):
            if np.log2(x) != int(np.log2(x)):
                return True
            return False

        def nearest_choice(choices, value):
            """ Return the entry of the sorted list "choices" closest to "value".
            Ties are resolved towards the larger entry.
            """
            pos = bisect.bisect(choices, value)
            if pos >= len(choices):
                pos -= 1
            # "pos > 0" (not "pos > 1"): the smallest divisor must be selectable too.
            if pos > 0 and abs(choices[pos - 1] - value) < abs(choices[pos] - value):
                pos -= 1
            return choices[pos]

        for p, param in tunable.items():
            if "divisors" not in param:
                continue
            dividend = int(task_params[param["divisors"][0]])
            if "tags" in param and "power_of_two" in param["tags"]:
                choices = utils.get_divisors(dividend, filter_non_power_of_two)
            else:
                choices = utils.get_divisors(dividend, None)
            if len(choices) == 0:
                # Nothing to snap to; keep the solver value.
                continue
            task_params[p] = nearest_choice(choices, task_params[p])

        init_params = []
        for p, param in tunable.items():
            init_params.append(task_params[param["name"]])

    # Hand-picked starting points used for experiments:
    # comm
    #init_params = [1024, 1024, 256, 128, 128, 4] # [1024, 1024, 320, 128, 128, 4]
    # -comp
    #init_params = [512, 512, 256, 32, 32, 4] # [520, 520, 320, 26, 26, 4]
    # comm-comp
    #init_params = [1024, 1024, 256, 64, 64, 4] # [1024, 1024, 320, 64, 64, 4]
    # imperfect pruning
    #init_params = [512, 1024, 8, 512, 512, 8]

    # Cumulative split between the mutation policies of GeneticTuner.mutation:
    # [random, factorization, random (single)]
    mutation_probs_list = [
        [0, 1, 0],
        [0.2, 0.8, 0],
        [0.4, 0.6, 0],
        [0.6, 0.4, 0],
        [0.8, 0.2, 0],
        [1, 0, 0],
        #[0, 0.8, 0.2],
        #[0, 0.6, 0.4],
        #[0, 0.4, 0.6],
        #[0, 0.2, 0.8],
        #[0, 0, 1]
    ]

    tuner_params = {
        "population_size": 200,
        "mutation_probability": 0.5,
        "parents_ratio": 0.3,
        "epsilon": 0.1,
        #"epsilon": 0,
        "ancestor": init_params,
        "fixed_params": fixed_params,
        "time_out": time_out,
        "mutation_probs": mutation_probs_list[1]
        #"mutation_probs": mutation_probs_list[0]
    }

    best_record = utils.SearchRecord().reset()
    for repeat in range(repeat_num):
        tuner = GeneticTuner(search_task, cst, search_obj, max_epochs, max_time, tuner_params, n_worker, silent)
        tuner.search()

        search_record = tuner.best_search_record
        best_record.update(search_record)

        if profiling:
            # Mutation methods
            #config_str = ""
            #for p in tuner_params["mutation_probs"]:
            #    config_str += "_"
            #    config_str += str(p)

            # Solver
            #config_str = "_comm_div_comp"
            #config_str = "_no_solver"
            #config_str = "_comm"
            #config_str = "_comp"
            config_str = "_comm_minus_comp"

            # Hardware Model
            #config_str = "_baseline"
            #config_str = "_divisor_only"
            #config_str = "_simplified_model"

            # Search Method
            #config_str = "_genetic"
            #config_str += f"_{search_task.design.name}"

            # Dataflow
            #config_str = f"_{search_task.workload['name']}_{search_obj}_{search_task.design.name}"

            config_str += f"_r{repeat}"
            # The output directory is not guaranteed to exist.
            os.makedirs('tmp', exist_ok=True)
            with open(f'tmp/tuning_rewards{config_str}.csv', "w", newline='') as f:
                fieldnames = ['epoch', 'reward', 'time']
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                for epoch in range(len(tuner.best_rewards)):
                    writer.writerow({'epoch': epoch, 'reward': tuner.best_rewards[epoch], 'time': tuner.best_rewards_time[epoch]})

    return best_record

class GeneticTuner(Tuner):
    """ Tuner that explores the tiling parameters of a single task with a genetic algorithm.

    Each individual is a vector holding one value per tunable parameter, ordered like
    design.params_config["tunable"]. The algorithm is configured through "params":
    @ population_size: the number of trial solutions in each generation.
    @ mutation_probability: the chance of a child to be mutated.
    @ epsilon: the chance of a mutated child to be replaced by a random sample.
    @ mutation_probs: probabilities of the mutation policies
      [random, factorization, random (single)].
    @ parents_ratio: the ratio of population filled by the members of the previous
      generation.
    @ ancestor (optional): initial tiling factors used to seed the whole population.
    @ time_out (optional): if positive, stop once the best reward has not improved
      for longer than this amount of time.
    """
    def __init__(self, search_task, cst, obj, max_epoch, max_time, params, n_worker=1, silent=0):
        super().__init__(search_task, cst, obj, max_epoch, max_time, n_worker=n_worker, silent=silent)
        self.params = params
        self.epoch = 0
        # A positive epoch budget takes precedence over the time budget.
        if max_epoch > 0:
            self.stop_criteria = "epoch"
            self.max_epoch = max_epoch
        else:
            self.stop_criteria = "time"
            self.max_time = max_time
        self.counter = utils.PerfCounter()
        self.param_idx_map = {} # Maps parameter name to its index in the sample
        self.idx_param_map = {} # Maps the index to the parameter name

    def select_parents(self, population, fitness, num_parents):
        """ Select "num_parents" parents with the highest fitness score.
        """
        fitness_idx_sorted = np.argsort(-fitness)
        parents = population[fitness_idx_sorted[:num_parents]][:]
        return parents

    def crossover(self, pool, num_children):
        """ Perform crossover on parameter pairs.

        Every tunable parameter with "divisors" is paired with the parameter it divides,
        and each pair is copied as a unit from one of two parents. The first parent is
        picked round-robin from the pool and the second one randomly.

        Raises:
            RuntimeError: if the tunable parameters can not be fully covered by such pairs.
        """
        children = np.empty((num_children, len(self.search_task.design.params_config["tunable"])))
        # Build the parameter dependency chain
        param_deps = {} # ["param": "dependent_param (multiple of this parameter)"]
        param_cnt = 0
        for p, param in self.search_task.design.params_config["tunable"].items():
            if "divisors" in param:
                param_deps[param["name"]] = param["divisors"][0]
                param_cnt += 2
        if param_cnt != len(self.search_task.design.params_config["tunable"]):
            raise RuntimeError("Not all tuning parameters can be handled by crossover")
        for i in range(num_children):
            parents_idx = [i % pool.shape[0], np.random.randint(0, pool.shape[0])]
            for param in param_deps:
                # Inherit the (divisor, dividend) pair from the same parent.
                idx = np.random.randint(0, 2)
                children[i][self.param_idx_map[param]] = pool[parents_idx[idx]][self.param_idx_map[param]]
                children[i][self.param_idx_map[param_deps[param]]] = pool[parents_idx[idx]][self.param_idx_map[param_deps[param]]]

        return children

    def mutation(self, pool):
        """ Perform mutation

        Each individual is mutated with probability "mutation_probability". A mutated
        individual is either replaced by a random sample (probability "epsilon") or has
        the tiling factors of its split chains perturbed by one of three policies
        selected through "mutation_probs". Chains whose external parameter is listed in
        search_task.configs['fix_param'] are left untouched.

        The pool is modified in place and returned.
        """
        for p_idx in range(pool.shape[0]):
            if random.random() < self.params["mutation_probability"]:
                if random.random() < self.params["epsilon"]:
                    task_params = self.search_task.generate_random_sample()
                    for i in range(pool.shape[1]):
                        pool[p_idx][i] = task_params[self.idx_param_map[i]]
                else:
                    idv = pool[p_idx][:]
                    task_params = {}
                    for p, param in self.search_task.design.params_config["tunable"].items():
                        task_params[param["name"]] = idv[self.param_idx_map[param["name"]]]
                    for p, param in self.search_task.design.params_config["external"].items():
                        task_params[param["name"]] = self.search_task.workload["params"][param["name"]]
                    # Build the chains
                    # [{"params": [p0, p3, p7], "factors": [ceil(p0/p3), p3/p7, p7]}, {}]
                    split_chains = []
                    for p, param in self.search_task.design.params_config["external"].items():
                        chain = {"params": [param["name"]], "factors": []}
                        cur_param = param
                        while "split_by" in cur_param:
                            if "divisors" in self.search_task.design.params_config["tunable"][cur_param["split_by"]] \
                                and cur_param["name"] in self.search_task.design.params_config["tunable"][cur_param["split_by"]]["divisors"]:
                                div = 1
                            else:
                                div = 0
                            chain["params"].append(cur_param["split_by"])
                            if div:
                                factor = np.ceil(task_params[cur_param["name"]] / task_params[cur_param["split_by"]])
                            else:
                                factor = task_params[cur_param["name"]] / task_params[cur_param["split_by"]]
                            chain["factors"].append(max(1, int(factor)))
                            cur_param = self.search_task.design.params_config["tunable"][cur_param["split_by"]]
                        chain["factors"].append(max(1, int(task_params[cur_param["name"]])))
                        split_chains.append(chain)

                    # Mutation
                    for chain in split_chains:
                        if len(chain["factors"]) <= 1:
                            continue
                        # Avoid mutating the fixed parameters. The check has to skip
                        # the whole chain, not just one entry of the fix_param list.
                        if 'fix_param' in self.search_task.configs and \
                            any(fix_p[0] == chain['params'][0] for fix_p in self.search_task.configs['fix_param']):
                            continue
                        src_idx, dst_idx = random.sample(range(0, len(chain["factors"])), 2)
                        #src_idx, dst_idx = random.sample(range(1, len(chain["factors"])), 2)
                        mutation_policy_probs = self.params["mutation_probs"]
                        mutation_policy_probs = np.cumsum(mutation_policy_probs)
                        select_prob = random.random()
                        if select_prob < mutation_policy_probs[0]:
                            # Random: shrink the source factor randomly and grow the
                            # destination factor so that the product is still covered.
                            if chain["factors"][dst_idx] == 1:
                                continue
                            src = max(1, int(chain["factors"][src_idx] * random.random() * 1.0))
                            dst = max(1, math.ceil(chain["factors"][src_idx] * chain["factors"][dst_idx] / src))
                            chain["factors"][src_idx] = src
                            chain["factors"][dst_idx] = dst
                        elif select_prob < mutation_policy_probs[1]:
                            # Factorization: move one factor of the source to the destination.
                            factor = chain["factors"][src_idx]
                            if factor == 1:
                                continue
                            divs = utils.factorization(factor)
                            div = random.choice(divs)
                            chain["factors"][src_idx] /= div
                            chain["factors"][dst_idx] *= div
                        else:
                            # Random (single)
                            chain["factors"][src_idx] = max(1, int(chain["factors"][src_idx] * random.random() * 1.0))

                    # Revert to the params
                    # [{"params": [p0, p3, p7], "factors": [ceil(p0/p3), p3/p7, p7]}, {}]
                    # The innermost parameter equals the last factor; every outer
                    # parameter is the running product of the factors below it.
                    for chain in split_chains:
                        factor = chain["factors"][-1]
                        param = chain["params"][-1]
                        if param in self.param_idx_map:
                            pool[p_idx][self.param_idx_map[param]] = factor
                        for idx in range(len(chain["factors"]) - 2, -1, -1):
                            param = chain["params"][idx]
                            factor *= chain["factors"][idx]
                            if param in self.param_idx_map:
                                pool[p_idx][self.param_idx_map[param]] = factor

        return pool

    def search(self):
        """ Search the design space using genetic algorithms.

        The population is seeded from the "ancestor" entry of the params when it is
        given and from random samples otherwise. Every evaluated individual counts as
        one epoch. The search stops when the epoch or time budget is exhausted, when no
        parents can be selected, or when the optional stall time-out expires. The stop
        conditions are checked once per generation.

        The best solution is recorded in self.best_reward, self.best_sol,
        self.best_sol_cst, self.best_reward_meta and self.best_search_record.
        """
        self.counter.init_counter('time')
        self.counter.init_counter('converge_time')
        self.epoch = 0

        # Init the stats
        num_pop = int(self.params["population_size"])
        num_gen = int(self.max_epoch // num_pop)
        num_parents = int(num_pop * self.params["parents_ratio"])
        self.log(f'Number of generations: {num_gen}')
        self.log(f'Number of population: {num_pop}')
        self.log(f'Number of parents: {num_parents}')
        # A missing "time_out" entry disables the stall time-out.
        time_out = self.params.get('time_out', -1)

        # Init the population
        population = np.empty((num_pop, len(self.search_task.design.params_config["tunable"])), dtype=int)
        if "ancestor" in self.params and self.params["ancestor"] is not None:
            # Initialize the population with the ancestor
            ancestor = self.params["ancestor"]
            task_params = {}
            idx = 0
            # The ancestor lists all first-level factors followed by all second-level ones.
            for p, param in self.search_task.design.params_config["external"].items():
                task_params[param["split_by"]] = ancestor[idx]
                idx += 1
            # Note: We assume only up to two-level tiling
            for p, param in self.search_task.design.params_config["external"].items():
                task_params[self.search_task.design.params_config["tunable"][param["split_by"]]["split_by"]] = ancestor[idx]
                idx += 1
            task_params = self.search_task.adjust_params(task_params)
            param_arr = []
            for p, param in self.search_task.design.params_config["tunable"].items():
                param_arr.append(task_params[param["name"]])
            for i in range(num_pop):
                population[i] = np.array(param_arr, dtype=int)
        else:
            # Initialize the population randomly
            pop_cnt = 0
            while pop_cnt < num_pop:
                task_params = self.search_task.generate_random_sample()
                param_arr = []
                for p, param in self.search_task.design.params_config["tunable"].items():
                    param_arr.append(task_params[param["name"]])
                population[pop_cnt] = np.array(param_arr, dtype=int)
                pop_cnt += 1
        idx = 0
        for p, param in self.search_task.design.params_config["tunable"].items():
            self.param_idx_map[param["name"]] = idx
            self.idx_param_map[idx] = param["name"]
            idx += 1

        fitness = np.empty(num_pop, dtype=float)

        terminate = False
        while True:
            if self.epoch > 0:
                # Select the parents
                parents = self.select_parents(population, fitness, num_parents)
                if parents.shape[0] == 0:
                    break
                # Crossover
                children = self.crossover(parents, num_pop - parents.shape[0])
                # Mutation
                children = self.mutation(children)
                # Compose the new generation
                population[0:parents.shape[0], :] = parents
                population[parents.shape[0]:, :] = children

            # Update the fitness
            for i in range(num_pop):
                idv = population[i]
                task_params = {}
                for p, param in self.search_task.design.params_config["tunable"].items():
                    task_params[param["name"]] = idv[self.param_idx_map[param["name"]]]
                for p, param in self.search_task.design.params_config["external"].items():
                    task_params[param["name"]] = self.search_task.workload["params"][param["name"]]
                task_params = self.search_task.adjust_params(task_params)
                reward, used_constraint, reward_meta = self.search_task.evaluate(task_params, self.search_obj)
                # Designs that exceed the constraints get no reward.
                if self.overuse_constraint(used_constraint):
                    reward = 0

                fitness[i] = reward
                # Update the record
                if reward > self.best_reward:
                    self.best_reward = reward
                    self.best_reward_meta = reward_meta
                    self.best_sol_cst = used_constraint
                    self.best_sol = task_params
                    self.log(f'Epoch {self.epoch}: new best reward: {self.best_reward} ({1/self.best_reward:.3f})')
                    self.last_update_epoch = self.epoch
                    self.counter.update_counter('converge_time')
                    self.converge_time = self.counter.get_counter('converge_time')
                    self.best_search_record = utils.SearchRecord().extract_from_tuner_single_acc(self)
                self.best_rewards.append(self.best_reward)
                self.counter.update_counter('time')
                self.best_rewards_time.append(self.counter.get_counter('time'))

                self.epoch += 1
                self.counter.update_counter('time')
                if time_out > 0:
                    if self.counter.get_counter('time') - self.counter.get_counter('converge_time') > time_out:
                        # If the results are not improved after certain period of time, timeout
                        terminate = True
            if self.stop_criteria == "epoch" and self.epoch > self.max_epoch:
                break
            if self.stop_criteria == "time":
                self.counter.update_counter('time')
                if self.counter.get_counter('time') > self.max_time:
                    break
            if terminate:
                break

        return

def non_fuse_genetic_search(search_task, init_tasks, cst, search_obj, max_epochs, max_time, \
                            n_worker=1, silent=0, population_size=20, policy=0, meta=None):
    """ This function finds the best array architecture for a list of tasks.
    Init_tasks include the search records for each single task.
    Policy 0: Allocate the init population based on the achieved throughput of each task.
    Policy 1: Allocate the init population uniformly.

    Args:
        search_task: the multi-task search task (search_task.tasks lists the tasks).
        init_tasks: search records of the individual tasks, used to seed the population.
        cst: hardware constraints, forwarded to the tuner.
        search_obj: search objective, forwarded to the tuner.
        max_epochs: epoch budget; if not positive, the time budget is used instead.
        max_time: time budget per unit; scaled by the number of tasks and the
            population size and capped at 180 when no epoch budget is given.
        n_worker: forwarded to the tuner.
        silent: set to 0 to log the progress.
        population_size: minimal population size.
        policy: init population allocation policy (see above).
        meta: dict with the XGBoost settings meta["xgb_params"] ("n_gens", "thres",
            "thres_adjust") and the optional flag meta["one_gen"]. Required.

    Returns:
        The best search record found by the tuner.

    Raises:
        TypeError: if meta is not provided.
    """
    import logging
    logger = logging.getLogger('AutoSA-Tuner')

    # The XGBoost settings are read unconditionally below, so fail early with a
    # readable message instead of crashing after the init population is built.
    if meta is None:
        raise TypeError("non_fuse_genetic_search requires 'meta' with the 'xgb_params' settings")

    if silent == 0:
        logger.info("Performing cross layer non-fusion genetic search...")

    # Internal use for profiling the init population
    #logger.info('Init tasks')
    #policy = 1
    #for task in init_tasks:
    #    logger.info(f'{task.to_str()}')

    #import pickle
    #pickle.dump(init_tasks, open(f'tmp/{search_task.design.name}_init_tasks', 'wb'))
    #init_tasks = pickle.load(open(f'tmp/{search_task.design.name}_init_tasks', 'rb'))

    # Extract the init population allocation information
    init_pop_record = []
    for record in init_tasks:
        init_pop_record.append({
            'latency': record.latency,
            'ops': record.task_sols[0]['ops'],
            'params': record.task_sols[0]['sol'],
            'flops': record.task_sols[0]['ops'] / record.latency
        })

    best_latency = utils.compute_tasks_latency(search_task.tasks, init_tasks)
    if silent == 0:
        logger.info(f'Cross-layer non-fusion ideal latency: {best_latency}')

    if policy == 0:
        # Sort the records by flops and prune the ones with low throughput.
        # The heuristic here is that the arch solution with higher throughput
        # can potentially deliver the best performance for the entire network.
        thres = 0.5
        def takeFLOPS(elem):
            return elem['flops']
        init_pop_record.sort(key=takeFLOPS, reverse=True)
        prune_idx = len(init_pop_record)
        prune_flops = init_pop_record[0]['flops'] * thres
        for i in range(len(init_pop_record)):
            if init_pop_record[i]['flops'] < prune_flops:
                prune_idx = i
                break
        init_pop_record = init_pop_record[:prune_idx]
    elif policy == 1:
        random.shuffle(init_pop_record)

    tuner_params = {
        "population_size": max(population_size, len(init_pop_record)),
        "mutation_probability": 0.7,
        "parents_ratio": 0.3,
        "hw_parents_ratio": 0.1, # Maintain the best parents found by the hw models
        "epsilon": 0.05,
        "mutation_probs": [0.2, 0.8, 0],
        "policy": policy,
        "init_pop": init_pop_record,
        "unit_max_epoch": 0,
        "unit_max_time": max_time,
        "best_reward": 1 / best_latency,
        "best_reward_thres": 0.95, # Terminate if the reward is within xx% compared to the best reward
        "use_ml_model": 1,
        "model_gens": meta["xgb_params"]["n_gens"], # Switch to real estimates after every x gens
        "prune_params": {
            "reward_thres": 10, # Prune parents that is x worse than the best
            "xgb_n_turns": population_size, # Use XGBoost model after x epochs
            "xgb_thres": meta["xgb_params"]["thres"], # Prune designs below x of the ideal reward
            "xgb_thres_adjust": meta["xgb_params"]["thres_adjust"] # Adjust the updated threshold by x
        },
        "one_gen": meta.get("one_gen", False) # Only explore for one generation
    }

    if max_epochs <= 0:
        # Time-based search: scale the unit budget to the whole problem.
        max_time *= (len(search_task.tasks) * tuner_params["population_size"] * 3)
        max_time = min(max_time, 180) # 3 min at most

    # Uncomment below if profiling the cost model
    #tuner_params["best_reward_thres"] = 1
    #tuner_params["prune_params"]["xgb_thres"] = 0
    #tuner_params["policy"] = 2
    #tuner_params["one_gen"] = 0
    #max_time = 1800 # 30min

    # Uncomment below if comparing methods
    #tuner_params["best_reward_thres"] = 2
    #tuner_params["use_ml_model"] = 1
    #max_time = 180 # 3min

    tuner = MultiWorkloadArrayGeneticTuner(search_task, cst, search_obj, max_epochs, max_time, tuner_params, n_worker, silent)
    tuner.search()

    # Uncomment below if profiling the cost model
    #np.savetxt('tmp/cost_model_samples.csv', tuner.bst_data['data'], delimiter=',')

    search_record = tuner.best_search_record
    # Internal use for method comparison
    #config_str= "thrpt_init"
    #if tuner_params["use_ml_model"]:
    #    config_str += "_ml_"
    #    #config_str += f"{meta['xgb_params']['n_gens']}_{meta['xgb_params']['thres']}_{meta['xgb_params']['thres_adjust']}"
    #else:
    #    config_str += "_no_ml_"
    #config_str += f"{search_task.design.name}"

    #with open(f"tmp/tuning_rewards_{config_str}.csv", "w", newline='') as f:
    #    fieldnames = ['epoch', 'reward', 'time']
    #    writer = csv.DictWriter(f, fieldnames=fieldnames)
    #    writer.writeheader()
    #    for epoch in range(len(tuner.best_rewards)):
    #        writer.writerow({'epoch': epoch, 'reward': tuner.best_rewards[epoch], 'time': tuner.best_rewards_time[epoch]})

    return search_record

class MultiWorkloadArrayGeneticTuner(GeneticTuner):
    def __init__(self, search_task, cst, obj, max_epoch, max_time, params, n_worker=1, silent=0):
        """ Initialize the base genetic tuner, then the tuner-local search state.
        """
        super().__init__(search_task, cst, obj, max_epoch, max_time, params, n_worker=n_worker, silent=silent)
        # Generation counter and the list of best hardware solutions
        self.gen = 0
        self.best_hw_sols = []
        # Samples seen so far, keyed by their hashed parameters (avoid searching duplicates)
        self.search_cache = dict()
        # Boost tree and its bookkeeping: sample count, validity flag, training matrix
        self.bst = None
        self.bst_data = dict(num=0, valid=0, data=None)

    def xgboost_add_sample(self, sol, cst, reward):
        """ Append one (features, reward) row to the cost model training set.

        The row holds the tunable parameter values taken from `sol`, then the
        entries of `cst` (dims, SIMD, BRAM18K, DSP and every data pack factor),
        and finally `reward` as the last column.
        """
        tunable = self.search_task.design.params_config['tunable']
        row = [sol[cfg['name']] for cfg in tunable.values()]
        row.extend(cst['dims'])
        row += [cst['SIMD'], cst['resource']['BRAM18K'], cst['resource']['DSP']]
        for factors in cst['data_pack'].values():
            row.extend(factors)
        row.append(reward)

        sample = np.array([row])
        if self.bst_data['num'] == 0:
            # First sample: it becomes the training matrix
            self.bst_data['data'] = sample
        else:
            # Later samples are stacked below the existing rows
            self.bst_data['data'] = np.concatenate((self.bst_data['data'], sample), axis=0)

        self.bst_data['num'] += 1

    def xgboost_train(self):
        """ Fit the XGBoost model on every sample collected so far.

        Does nothing when no sample has been added. After training, the model
        is flagged as valid once the number of samples reaches "xgb_n_turns".
        """
        if self.bst_data['num'] == 0:
            return

        # The last column stores the reward (label), the other columns the features
        samples = self.bst_data['data']
        label_col = samples.shape[1] - 1
        features = samples[:, :label_col]
        targets = samples[:, label_col].flatten()
        if len(targets) == 0:
            return

        booster_cfg = {'objective': 'reg:squarederror', 'nthread': 1}
        self.bst = xgb.train(booster_cfg, xgb.DMatrix(features, label=targets), 10)

        # Disable it when profiling the cost model
        enough_samples = self.bst_data['num'] >= self.params['prune_params']['xgb_n_turns']
        if enough_samples:
            self.bst_data['valid'] = 1

    def xgboost_predict(self, sol, cst):
        """ Predict the reward of a solution with the XGBoost model.

        The feature row is laid out like the training rows (without the
        reward column). Returns None when no model has been trained yet.
        """
        if not self.bst:
            return None

        tunable = self.search_task.design.params_config['tunable']
        row = [sol[cfg['name']] for cfg in tunable.values()]
        row.extend(cst['dims'])
        row += [cst['SIMD'], cst['resource']['BRAM18K'], cst['resource']['DSP']]
        for factors in cst['data_pack'].values():
            row.extend(factors)

        # Single-row matrix in, first (only) prediction out
        query = xgb.DMatrix(np.array([row]))
        return self.bst.predict(query)[0]

    def xgboost_prune(self, sol, cst):
        """ Prune the solution by XGBoost model.

        Returns True only when the model has been marked valid, a prediction
        is available, and that prediction is below the "xgb_thres" pruning
        threshold. Returns False in every other case.
        """
        # Never prune (and do not bother predicting) before the model is valid
        if self.bst_data['valid'] != 1:
            return False

        pred = self.xgboost_predict(sol, cst)
        # Compare against None explicitly: a prediction of exactly 0 is falsy,
        # yet it is a real estimate and must be checked against the threshold.
        if pred is None:
            return False
        return bool(pred < self.params['prune_params']['xgb_thres'])

    def select_parents(self, population, fitness, num_parents, num_hw_parents):
        """ Select at most "num_parents" parents with the highest fitness score.

        The individuals are ranked by fitness and the top "num_parents" are
        filtered in three steps:
        1. only the leading run with strictly positive fitness is kept,
        2. only the leading run whose fitness is above the best fitness
           divided by "reward_thres" is kept,
        3. duplicated individuals are dropped (first occurrence wins).

        If num_hw_parents > 0, enlist the best hw solutions: up to that many
        of the most recent entries of self.best_hw_sols that are not selected
        already are placed in front of the parents, and the result is cut
        back to "num_parents" rows.

        Returns a 2-D array with one parent per row, using the dtype of
        "population". The array may have zero rows.
        """
        order = np.argsort(-fitness)[:num_parents]
        parents = population[order]
        ranked_fitness = fitness[order]

        # Remove illegal parents
        n_legal = 0
        while n_legal < len(parents) and ranked_fitness[n_legal] > 0:
            n_legal += 1
        parents = parents[:n_legal]

        # Remove parents with low performance
        n_good = 0
        while n_good < len(parents) and \
              ranked_fitness[n_good] > ranked_fitness[0] / self.params['prune_params']['reward_thres']:
            n_good += 1
        parents = parents[:n_good]

        # Remove redundant parents. Individuals with tied fitness are not
        # guaranteed to be adjacent after sorting, so track every row seen.
        seen = set()
        keep = []
        for idx in range(parents.shape[0]):
            key = tuple(parents[idx].tolist())
            if key not in seen:
                seen.add(key)
                keep.append(idx)
        parents = parents[np.array(keep, dtype=int)]

        if num_hw_parents > 0:
            num_hw_parents = min(num_hw_parents, len(self.best_hw_sols))
            hw_rows = []
            # Walk the recorded hw solutions from the most recent one
            for i in range(num_hw_parents):
                idv = np.asarray(self.best_hw_sols[-1 - i]["idv"])
                key = tuple(idv.tolist())
                if key in seen:
                    continue
                seen.add(key)
                hw_rows.append(idv)
            if hw_rows:
                # Keep the dtype of the population so that the parent matrix
                # is not silently promoted to float.
                hw_parents = np.array(hw_rows, dtype=parents.dtype)
                parents = np.concatenate((hw_parents, parents))
            parents = parents[:num_parents]

        return parents

    def init_population(self, num_pop):
        """ Build the first generation from the seed solutions in "init_pop".

        The seeds are reused in round-robin order until "num_pop" individuals
        are filled. Only policies 0 and 1 are supported; any other policy
        raises a RuntimeError.
        """
        tunable = self.search_task.design.params_config["tunable"]
        population = np.empty((num_pop, len(tunable)), dtype=int)
        if self.params["policy"] not in [0, 1]:
            raise RuntimeError("Unknown policy number.")

        for row in range(num_pop):
            seeds = self.params["init_pop"]
            seed = seeds[row % len(seeds)]["params"]
            # One gene per tunable parameter, in the config order
            genes = [seed[cfg["name"]] for cfg in tunable.values()]
            population[row] = np.array(genes, dtype=int)

        return population

    def hash_params(self, sol):
        """ Turn a parameter dict into a string key.

        Every key is followed directly by its value, in the dict's
        iteration order.
        """
        return "".join(f'{name}{value}' for name, value in sol.items())

    def search_design(self, arch_sol, use_model=0, bst=None):
        """ Search the optimal task configuration in the fixed array.

        Every task of the workload is tuned with `genetic_search` under the
        architecture constraints derived from "arch_sol", and the per-task
        records are merged into one record that is returned.
        "use_model" and "bst" are accepted but not used in this method.
        """
        merged_record = utils.SearchRecord(self.max).reset()
        # Update the hardware constraints on a private copy of the search task
        local_task = copy.deepcopy(self.search_task)
        arch_cst = local_task.compute_arch_cst(arch_sol)
        local_task.set_arch_cst(arch_cst)
        local_task.set_arch_sol(arch_sol)

        # One tuning job per task
        jobs = [
            {
                'job_hash': str(task), 'func': genetic_search,
                'args': [task, self.cst, self.search_obj, self.params["unit_max_epoch"], self.params["unit_max_time"], 1, None, 1, self.sub_task_silent]
            }
            for task in local_task.tasks
        ]
        executor = utils.MyExecutor(max(int(self.n_worker/2), 2))
        job_results = executor.exec(jobs)

        # Merge the per-task records in task order
        for task in local_task.tasks:
            merged_record = merged_record.append(job_results[str(task)])

        merged_record.cst = copy.deepcopy(arch_cst["resource"])

        return merged_record

    def search(self):
        """ Search the design space using genetic algorithms.

        Every generation selects parents from the previous population, fills
        the remaining slots with children produced by crossover and mutation,
        and scores each individual. Individuals are scored either by running
        `search_design` through a worker pool ("real" generations) or by the
        XGBoost prediction alone (cost-model generations).

        The method returns nothing; the outcome is stored on the tuner
        (best_search_record, best_reward, best_sol, best_hw_sols, ...).

        Parameters read from self.params in this method:
        @ population_size: the number of trial solutions in each generation
        (four times as many in cost-model generations).
        @ parents_ratio: the ratio of population filled by the members of the
        previous generation.
        @ use_ml_model: enables the XGBoost cost model.
        @ hw_parents_ratio: the ratio of population that may be filled from
        best_hw_sols (only read when use_ml_model is set).
        @ model_gens: every "model_gens"-th generation is evaluated for real
        even when the cost model is valid.
        @ best_reward, best_reward_thres: fitness is reward / best_reward, and
        the search terminates once the best reward reaches
        best_reward * best_reward_thres.
        @ one_gen: stop after a single generation.
        @ prune_params: "xgb_thres" is rewritten after each training using
        "xgb_thres_adjust".
        Other entries such as mutation_probability are not read here; they
        are presumably consumed by self.crossover / self.mutation, which are
        defined outside this method.
        """
        self.counter.init_counter('time')
        self.counter.init_counter('converge_time')
        self.epoch = 0

        # Init the stats
        num_pop = int(self.params["population_size"])
        # num_gen is only reported in the log below
        num_gen = int(self.max_epoch // num_pop)
        num_parents = int(num_pop * self.params["parents_ratio"])
        if self.params["use_ml_model"]:
            num_hw_parents = int(num_pop * self.params["hw_parents_ratio"])
        else:
            num_hw_parents = 0
        self.log(f'Number of generations: {num_gen}')
        self.log(f'Number of population: {num_pop}')
        self.log(f'Number of parents: {num_parents}')
        self.log(f'Number of hw parents: {num_hw_parents}')

        # Map each tunable parameter name to its column in an individual, and back
        idx = 0
        for p, param in self.search_task.design.params_config["tunable"].items():
            self.param_idx_map[param["name"]] = idx
            self.idx_param_map[idx] = param["name"]
            idx += 1

        # Init the population
        population = self.init_population(num_pop)
        fitness = np.empty(num_pop, dtype=float)

        terminate = False
        while True:
            # Update the fitness
            # The cost model scores this generation only if it is enabled, marked
            # valid, and this is not one of the periodic real-evaluation generations.
            use_model = self.params["use_ml_model"] and self.bst_data['valid'] and (self.gen % self.params['model_gens'] != 0)
            if self.epoch > 0:
                # Resize the population: four times larger when scored by the model
                if use_model:
                    num_pop = int(self.params["population_size"]) * 4
                    population = np.resize(population, (num_pop, population.shape[1]))
                    fitness = np.resize(fitness, (num_pop))
                    num_parents = int(num_pop * self.params["parents_ratio"])
                else:
                    num_pop = int(self.params["population_size"])
                    population = np.resize(population, (num_pop, population.shape[1]))
                    fitness = np.resize(fitness, (num_pop))
                    num_parents = int(num_pop * self.params["parents_ratio"])
                # Best hw solutions are enlisted only in real-evaluation generations
                # once the cost model is valid
                if self.params["use_ml_model"] and not use_model and self.bst_data['valid']:
                    num_hw_parents = int(num_pop * self.params["hw_parents_ratio"])
                else:
                    num_hw_parents = 0

                # Select the parents
                parents = self.select_parents(population, fitness, num_parents, num_hw_parents)
                # No parent survived the selection: stop the search
                if parents.shape[0] == 0:
                    break
                # Crossover
                children = self.crossover(parents, num_pop - parents.shape[0])
                # Mutation
                children = self.mutation(children)
                # Compose the new generation
                population[0:parents.shape[0], :] = parents
                population[parents.shape[0]:, :] = children
                #if use_model:
                #    print("parents:")
                #    print(parents)
                #    print("children:")
                #    print(children)

            # Collect the evaluation jobs (real generations) or the predicted
            # rewards (cost-model generations) for every individual
            job_list = []
            results = {}
            for i in range(num_pop):
                idv = population[i]
                task_params = {}
                for p, param in self.search_task.design.params_config["tunable"].items():
                    task_params[param["name"]] = idv[self.param_idx_map[param["name"]]]
                # The hash covers the tunable parameters only; the external
                # parameters are added afterwards
                idv_hash = self.hash_params(task_params)
                for p, param in self.search_task.design.params_config["external"].items():
                    task_params[param["name"]] = self.search_task.workload["params"][param["name"]]
                # Note: XGBoost model has compatibility problem with multi-processing.
                search_task = copy.deepcopy(self.search_task)
                # Compute the architecture features
                arch_cst = search_task.compute_arch_cst(task_params)
                if not use_model:
                    # Already submitted or evaluated: nothing to schedule
                    if idv_hash in self.search_cache:
                        continue
                    else:
                        # Placeholder record used when the individual is not evaluated
                        search_record = utils.SearchRecord(self.max).reset()
                        if arch_cst:
                            if not self.xgboost_prune(task_params, arch_cst):
                                self.search_cache[idv_hash] = {'status': 'submit', 'value': None}
                                job_list.append({
                                    'job_hash': idv_hash,
                                    'func': self.search_design,
                                    'args': [task_params, use_model, copy.deepcopy(self.bst)]})
                            else:
                                # Pruned by the cost model
                                results[idv_hash] = search_record
                        else:
                            # No architecture constraints could be computed
                            results[idv_hash] = search_record
                else:
                    # Predicted reward; stays 0 when arch_cst is falsy
                    reward = 0
                    if arch_cst:
                        reward = self.xgboost_predict(task_params, arch_cst)
                    results[idv_hash] = reward

            # Run the submitted jobs and merge their records into the results
            if len(job_list) > 0:
                pool = utils.MyExecutor(self.n_worker)
                pool_results = pool.exec(job_list)
                for result in pool_results:
                    results[result] = pool_results[result]

            # Update the tuner results
            for i in range(num_pop):
                idv = population[i]
                task_params = {}
                for p, param in self.search_task.design.params_config["tunable"].items():
                    task_params[param["name"]] = idv[self.param_idx_map[param["name"]]]
                idv_hash = self.hash_params(task_params)
                if use_model:
                    fitness[i] = results[idv_hash]
                else:
                    # Reuse the cached fitness of an individual evaluated earlier
                    if idv_hash in self.search_cache and self.search_cache[idv_hash]['status'] == 'done':
                        fitness[i] = self.search_cache[idv_hash]['value']
                        continue
                    search_record = results[idv_hash]
                    # Designs that overuse the resources or are invalid get zero reward
                    if self.overuse_constraint(search_record.cst) or search_record.valid == 0:
                        search_record.reward = 0
                    #self.log(f'{search_record}')
                    if search_record.reward > 0:
                        # Without a latency target, or while the best reward is still
                        # below 1 / max_latency, a higher reward becomes the new best
                        if self.search_task.max_latency == -1 or \
                           (self.search_task.max_latency != -1 and (self.best_reward < 1 / self.search_task.max_latency)):
                           if search_record.reward > self.best_reward:
                                self.best_reward = search_record.reward
                                self.best_reward_meta = search_record.reward_meta
                                self.best_sol_cst = search_record.cst
                                self.best_sol = {"arch_sol": search_record.arch_sol, \
                                                 "task_sols": search_record.task_sols}
                                self.log(f'Epoch {self.epoch}: new best reward: {self.best_reward} ({1/self.best_reward:.0f})')
                                self.last_update_epoch = self.epoch
                                self.counter.update_counter('converge_time')
                                self.best_search_record = search_record
                                self.best_hw_sols.append({"idv": population[i], "reward": search_record.reward})
                                # Good enough compared to the ideal reward: stop the search
                                if self.best_reward >= self.params["best_reward"] * self.params["best_reward_thres"]:
                                    terminate = True
                        else:
                            # If max_latency is set, when the best search records
                            # fall less than the max_latency, the tuner will only
                            # update the records that use fewer memory resources.
                            if search_record.cst['BRAM18K'] < self.best_search_record.cst['BRAM18K']:
                                self.best_reward = search_record.reward
                                self.best_reward_meta = search_record.reward_meta
                                self.best_sol_cst = search_record.cst
                                self.best_sol = {"arch_sol": search_record.arch_sol, \
                                                 "task_sols": search_record.task_sols}
                                self.log(f'Epoch {self.epoch}: new best reward (less BRAM): {self.best_reward} ({1/self.best_reward:.0f})')
                                self.last_update_epoch = self.epoch
                                self.counter.update_counter('converge_time')
                                self.best_search_record = search_record
                                self.best_hw_sols.append({"idv": population[i], "reward": search_record.reward})
                                if self.best_reward >= self.params["best_reward"] * self.params["best_reward_thres"]:
                                    terminate = True

                    # Record the progress, then cache the normalized fitness
                    self.best_rewards.append(self.best_reward)
                    self.counter.update_counter('time')
                    self.best_rewards_time.append(self.counter.get_counter('time'))
                    fitness[i] = search_record.reward / self.params['best_reward']
                    self.search_cache[idv_hash] = {'status': 'done', 'value': fitness[i]}
                    if terminate:
                        break
                # Counts every individual handled above; cache hits skip this
                # through the `continue`
                self.epoch += 1

            #if use_model:
            #    print("fitness")
            #    print(fitness)

            if self.params["one_gen"]:
                break

            # Stop once the best record is below the latency target
            if self.search_task.max_latency != -1 and self.best_search_record.latency < self.search_task.max_latency:
                break

            # Add training samples
            if not use_model and self.params["use_ml_model"]:
                for result in results:
                    search_record = results[result]
                    if self.params["best_reward"] and search_record.valid:
                        arch_cst = self.search_task.compute_arch_cst(search_record.arch_sol)
                        # Rewards are normalized by the ideal reward; non-positive ones are stored as 0
                        if search_record.reward > 0:
                            self.xgboost_add_sample(search_record.arch_sol, arch_cst, search_record.reward / self.params['best_reward'])
                        else:
                            self.xgboost_add_sample(search_record.arch_sol, arch_cst, 0)

            # Train the cost model
            if not use_model and self.params["use_ml_model"]:
                self.xgboost_train()
                # Adjust the cost model threshold dynamically
                # (the prediction for the current best record, scaled by "xgb_thres_adjust")
                if self.best_search_record.valid:
                    arch_sol = self.best_search_record.arch_sol
                    arch_cst = self.search_task.compute_arch_cst(arch_sol)
                    pred = self.xgboost_predict(arch_sol, arch_cst)
                    self.params['prune_params']['xgb_thres'] = pred * self.params['prune_params']['xgb_thres_adjust']
                    self.log(f'Updated XGB pruning thres: {self.params["prune_params"]["xgb_thres"]}')

            self.gen += 1
            # Uncomment it if profiling the cost model
            #print(self.bst_data['num'])

            # Stop on the epoch budget, the time budget, or the early-termination flag
            if self.stop_criteria == "epoch" and self.epoch > self.max_epoch:
                break
            if self.stop_criteria == "time":
                self.counter.update_counter('time')
                if self.counter.get_counter('time') > self.max_time:
                    break
            if terminate:
                break

        return

def all_fuse_genetic_search(search_task, init_tasks, cst, search_obj, max_epochs, max_time, \
                            n_worker=1, silent=0, population_size=20, policy=0, explorer=None):
    """ This function finds the best array architecture for a list of tasks.
    Init_tasks include the search records for each single task.
    All the tasks are fused.
    Policy 0: We search the best config to minimize the latency of the last task, and
    use it as the array config to search for the best config for the rest of the layers.
    Then, we perform several epochs of genetic search on top of the arch config.

    Args:
        search_task: the multi-task search task. When search_task.fixed == 1 the
            architecture is fixed and search_task.arch_sol seeds the population.
        init_tasks: per-task search records used as the initial population when
            the architecture is not fixed.
        cst, search_obj: forwarded unchanged to AllFuseGeneticTuner.
        max_epochs: epoch budget. When it is not positive, max_time is scaled by
            the number of tasks and the population size and capped at 120.
        max_time: time budget; also used as the per-task "unit_max_time".
        n_worker: number of workers for the executor and the tuner.
        silent: 0 enables the log messages.
        population_size: lower bound of the population size.
        policy: stored in the tuner parameters as "policy".
        explorer: object whose `tune` method searches the last task; required
            when the architecture is fixed.

    Returns:
        The best search record found by the tuner.

    Raises:
        ValueError: if the architecture is fixed and no explorer is given.
    """
    import logging
    logger = logging.getLogger('AutoSA-Tuner')
    if silent == 0:
        logger.info("Performing cross layer all-fusion genetic search...")

    # If init_tasks are provided, use them as the initial population,
    # otherwise, the architecture is fixed. Use the fixed arch sol instead.
    init_pop_record = []
    best_latency = None
    if search_task.fixed == 1:
        # Fail early with a clear message instead of an AttributeError on None
        # after the last task has already been prepared.
        if explorer is None:
            raise ValueError("all_fuse_genetic_search requires an explorer when the architecture is fixed.")
        init_pop_record.append({
            'latency': -1, 'ops': -1, 'params': search_task.arch_sol
        })
        # Try to search for the last layer under the fixed constraints and add it
        # as the candidate sample
        last_task = copy.deepcopy(search_task.tasks[-1])
        last_task.fuse = 1
        last_task.last_fuse = 1
        if last_task.use_uram:
            last_task.configs['cin_read_mode'] = 3
        else:
            last_task.configs['cin_read_mode'] = 2
        last_task.configs['cout_write_mode'] = 0
        last_task.set_aux_func('update_cin_latency', 'update_cin_latency_last')
        if last_task.use_uram == 0:
            last_task.set_aux_func('update_cin_buf', 'update_cin_buf_bram_last')
        else:
            last_task.set_aux_func('update_cin_buf', 'update_cin_buf_uram_last')
        # Silence the sub-searches when several workers run in parallel
        local_silent = silent
        if silent == 0:
            local_silent = 1 if n_worker > 1 else 0
        # Tune the last task three times; every valid result becomes a seed
        job_list = []
        for repeat in range(3):
            job_list.append({'job_hash': f'{str(last_task)}_{repeat}', 'func': explorer.tune, \
                             'args': [last_task, None, local_silent, 0]})
        pool = utils.MyExecutor(n_worker)
        results = pool.exec(job_list)
        for r in results:
            if results[r].valid:
                init_pop_record.append({
                    'latency': -1, "ops": -1, 'params': results[r].task_sols[0]['sol']
                })
    else:
        for record in init_tasks:
            first_sol = record.task_sols[0]
            init_pop_record.append({
                'latency': record.latency,
                'ops': first_sol['ops'],
                'params': first_sol['sol'],
                'flops': first_sol['ops'] / record.latency
            })

        best_latency = utils.compute_tasks_latency(search_task.tasks, init_tasks)
        if silent == 0:
            logger.info(f'Cross-layer all-fusion ideal latency: {best_latency}')

    tuner_params = {
        "population_size": max(population_size, len(init_pop_record)),
        "mutation_probability": 1.0,
        "parents_ratio": 0.2,
        "epsilon": 0.1,
        "policy": policy,
        "init_pop": init_pop_record,
        "unit_max_epoch": 0,
        "unit_max_time": max_time,
        "arch_fixed": search_task.fixed,
        "best_reward": 1 / best_latency if best_latency else None,
        "best_reward_thres": 0.95, # Terminate if the reward is within xx% compared to the best reward
        "model_gens": 10, # Switch to real estimates after every x gens
        "prune_params": {
            "reward_thres": 10, # Prune parents that is x worse than the best
            "xgb_n_turns": population_size / 2, # Use XGBoost model after x epochs
            "xgb_thres": 0.5, # Prune designs below x of the ideal reward
            "xgb_thres_adjust": 0.8 # Adjust the updated threshold by x
        }
    }

    # Without an epoch budget the search is bounded by time only
    if max_epochs <= 0:
        max_time *= (len(search_task.tasks) * tuner_params["population_size"] * 3)
        max_time = min(max_time, 120) # 120 seconds at most

    tuner = AllFuseGeneticTuner(search_task, cst, search_obj, max_epochs, max_time, tuner_params, n_worker, silent)
    tuner.search()

    return tuner.best_search_record

class AllFuseGeneticTuner(MultiWorkloadArrayGeneticTuner):
    """ Genetic tuner for cross-layer all-fusion.

    Every individual of the population holds one value for each tunable parameter of
    the design. An individual is evaluated by searching the tasks backwards (from the
    last task to the first one): the tiling chosen for a latter task determines the
    workload dimensions of the task in front of it.

    The evaluation function is selected from the tuner parameters:
    - params["arch_fixed"] == 0 and params["policy"] == 0: search_design1
    - params["arch_fixed"] == 0 and params["policy"] != 0: search_design2
    - params["arch_fixed"] != 0: search_fixed_design
    """
    def __init__(self, search_task, cst, obj, max_epoch, max_time, params, n_worker=1, silent=0):
        super().__init__(search_task, cst, obj, max_epoch, max_time, params, n_worker=n_worker, silent=silent)

    def init_population(self, num_pop):
        """ Build the initial population from the seed solutions in params["init_pop"].

        The seeds are allocated uniformly (round-robin) over the num_pop individuals.
        Returns an integer array of shape (num_pop, number of tunable parameters); the
        columns follow the iteration order of params_config["tunable"].
        """
        tunable = self.search_task.design.params_config["tunable"]
        seeds = self.params["init_pop"]
        population = np.empty((num_pop, len(tunable)), dtype=int)
        # Allocate uniformly
        for i in range(num_pop):
            sol = seeds[i % len(seeds)]["params"]
            param_arr = [sol[param["name"]] for param in tunable.values()]
            population[i] = np.array(param_arr, dtype=int)

        return population

    def update_task_configs(self, tasks):
        """ Update the fusion task configurations.

        All the tasks are marked as fused (in place). The first task reads cin from
        off-chip memory and the other tasks read it from on-chip buffers; only the last
        task writes cout to off-chip memory. The auxiliary functions used for the cin
        latency/buffer updates are selected from the task position (last or not) and
        the buffer type (BRAM or URAM).
        """
        last_idx = len(tasks) - 1
        for task_idx, task in enumerate(tasks):
            is_last = (task_idx == last_idx)
            use_bram = (task.use_uram == 0)

            task.fuse = 1
            if is_last:
                task.last_fuse = 1

            if task_idx == 0:
                # 1: load one time, 0: load in ping-pong fashion
                task.configs['cin_read_mode'] = 1 if use_bram else 0
            else:
                # 2: load from on-chip BRAM buffers, 3: load from on-chip URAM buffers
                task.configs['cin_read_mode'] = 2 if use_bram else 3

            # 0: write to off-chip memory, 1: write to on-chip buffer
            task.configs['cout_write_mode'] = 0 if is_last else 1

            if is_last:
                task.set_aux_func('update_cin_latency', 'update_cin_latency_last')
                if use_bram:
                    task.set_aux_func('update_cin_buf', 'update_cin_buf_bram_last')
                else:
                    task.set_aux_func('update_cin_buf', 'update_cin_buf_uram_last')
            else:
                task.set_aux_func('update_cin_latency', 'update_cin_latency')
                if use_bram:
                    task.set_aux_func('update_cin_buf', 'update_cin_buf_bram')
                else:
                    task.set_aux_func('update_cin_buf', 'update_cin_buf_uram')

    def update_fused_task_dims(self, last_sol, last_task, cur_task, partial):
        """ Given the solution of the latter layer, update the workload dimensions of the
        current layer.
        For fused CNN, we have the or_t and oc_t from the latter layer.
        We will estimate the or_t' and oc_t' of the former layer by
        or_t' = or_t + k - 1
        oc_t' = oc_t + k - 1

        If partial is 1, or_t/oc_t are the tile sizes (r_t1/c_t1) of the latter layer
        solution, clamped to its workload r/c. Otherwise the full workload r/c of the
        latter layer are used.
        For each 'maxpool*' tag of the current layer, or_t/oc_t are scaled by the stride
        encoded after the last '_' of the tag.
        k is the workload parameter 'p' of the current layer (used for both r and c).

        cur_task is modified in place and returned.
        """
        if partial == 1:
            or_t = min(last_sol['r_t1'], last_task.workload['params']['r'])
            oc_t = min(last_sol['c_t1'], last_task.workload['params']['c'])
        else:
            or_t = last_task.workload['params']['r']
            oc_t = last_task.workload['params']['c']

        for tag in cur_task.workload['tags']:
            if tag.startswith('maxpool'):
                stride = int(tag.split('_')[-1])
                or_t *= stride
                oc_t *= stride
        k = cur_task.workload['params']['p']
        cur_task.workload['params']['r'] = or_t + k - 1
        cur_task.workload['params']['c'] = oc_t + k - 1

        return cur_task

    def est_latency(self, layer_stats, search_task, mode=0):
        """ Estimate the overall latency of the fused tasks.

        layer_stats holds one search record per task, in task order.
        The latency of one pass (one tile of the last task) is accumulated over all the
        tasks; the epilogue of a task is overlapped with the prologue of the next one
        (the last task wraps around to the first one). The main latency of the last task
        is divided by its number of r/c tiles to obtain the per-pass share.

        If mode is 1, the last task r/c were set to 1 for the search. They are restored
        (in place) from old_r/old_c before the number of passes is computed.
        """
        num_tasks = len(search_task.tasks)
        one_pass_latency = 0
        for task_id in range(num_tasks):
            task = search_task.tasks[task_id]
            latency = layer_stats[task_id].reward_meta['latency']
            nxt_latency = layer_stats[(task_id + 1) % num_tasks].reward_meta['latency']
            latency_main = latency['latency_main']
            if task_id == num_tasks - 1:
                sol = layer_stats[task_id].task_sols[0]['sol']
                latency_main = latency_main / \
                               np.ceil(task.workload['params']['r'] / sol['r_t1']) / \
                               np.ceil(task.workload['params']['c'] / sol['c_t1'])
            one_pass_latency += latency_main + \
                                max(nxt_latency['latency_prologue'], latency['latency_epilogue'])

        last_task = search_task.tasks[-1]
        if mode == 1:
            # Revert back
            last_task.workload["params"]['r'] = last_task.workload["params"]['old_r']
            last_task.workload["params"]['c'] = last_task.workload["params"]['old_c']

        last_sol = layer_stats[-1].task_sols[0]['sol']
        total_latency = np.ceil(last_task.workload['params']['r'] / last_sol['r_t1']) * \
                        np.ceil(last_task.workload['params']['c'] / last_sol['c_t1']) * \
                        one_pass_latency
        total_latency += layer_stats[0].reward_meta['latency']['latency_prologue']

        return total_latency

    def est_off_chip_trans(self, layer_stats, search_task, mode=0):
        """ Compute the total off-chip transactions.

        layer_stats holds one search record per task, in task order.
        For every task but the last one, the transactions of one pass are accumulated
        (cin + weights for the first task, weights only for the others) and multiplied
        by the number of passes, i.e., the number of r/c tiles of the last task.
        The weights read and the cout written by the last task are then added.

        The number of passes and the last-task terms are computed from the solution and
        the workload (p, q) of the last task, consistently with est_latency. This also
        makes the estimation work when there is a single task.

        If mode is 1, the last task r/c are restored (in place) from old_r/old_c first.
        """
        def n_tiles(task, sol, dim):
            # Number of tiles along the dimension dim
            return np.ceil(task.workload['params'][dim] / sol[dim + '_t1'])

        def weight_tile(task, sol):
            # Size of one weight tile
            return sol['i_t1'] * sol['o_t1'] * task.workload['params']['p'] * task.workload['params']['q']

        one_pass_trans = 0
        for task_id in range(len(search_task.tasks) - 1):
            task = search_task.tasks[task_id]
            sol = layer_stats[task_id].task_sols[0]['sol']
            if task_id == 0:
                # Read cin, weights off-chip, write cout on-chip
                tile_trans = sol['i_t1'] * sol['r_t1'] * sol['c_t1'] + weight_tile(task, sol)
            else:
                # Read cin on-chip, weights off-chip, write cout on-chip
                tile_trans = weight_tile(task, sol)
            one_pass_trans += n_tiles(task, sol, 'i') * n_tiles(task, sol, 'o') * \
                              n_tiles(task, sol, 'r') * n_tiles(task, sol, 'c') * \
                              tile_trans

        last_task = search_task.tasks[-1]
        if mode == 1:
            # Revert back
            last_task.workload["params"]['r'] = last_task.workload["params"]['old_r']
            last_task.workload["params"]['c'] = last_task.workload["params"]['old_c']

        # The number of passes is the number of r/c tiles of the last task
        last_sol = layer_stats[-1].task_sols[0]['sol']
        total_trans = n_tiles(last_task, last_sol, 'r') * n_tiles(last_task, last_sol, 'c') * one_pass_trans
        # Last task, read cin on-chip, weights off-chip, write cout off-chip
        total_trans += n_tiles(last_task, last_sol, 'i') * n_tiles(last_task, last_sol, 'o') * \
                       n_tiles(last_task, last_sol, 'r') * n_tiles(last_task, last_sol, 'c') * \
                       weight_tile(last_task, last_sol) + \
                       n_tiles(last_task, last_sol, 'o') * \
                       n_tiles(last_task, last_sol, 'r') * n_tiles(last_task, last_sol, 'c') * \
                       last_sol['o_t1'] * last_sol['r_t1'] * last_sol['c_t1']

        return total_trans

    def _search_layers_backward(self, search_task, first_idx, last_sol, last_task,
                                network_search_record, layer_stats):
        """ Search tasks[first_idx], tasks[first_idx - 1], ..., tasks[0] one by one.

        Every task but the last one of search_task first gets its workload dimensions
        updated from the solution of the task behind it (last_sol/last_task).
        The record of each task is appended to network_search_record and inserted at
        the front of layer_stats (in place). network_search_record.cst is replaced by
        the constraints of a task whenever that task uses more BRAM18K (or URAM when
        the task uses URAM).

        Returns (succeed, network_search_record). succeed is False as soon as one task
        has no valid solution; the remaining tasks are not searched in that case.
        """
        num_tasks = len(search_task.tasks)
        for task_idx in range(first_idx, -1, -1):
            task = search_task.tasks[task_idx]
            if task_idx < num_tasks - 1:
                # Update the task desp
                task = self.update_fused_task_dims(last_sol, last_task, task, 1 if task_idx == num_tasks - 2 else 0)
            search_record = genetic_search(task, self.cst, self.search_obj, self.params["unit_max_epoch"], self.params["unit_max_time"], 1, None, 1, self.sub_task_silent)
            if search_record.valid == 0:
                return False, network_search_record
            last_sol = search_record.task_sols[0]['sol']
            last_task = task
            network_search_record = network_search_record.append(search_record)
            # Update the resource constraints
            resource = "BRAM18K" if task.use_uram == 0 else "URAM"
            if search_record.cst[resource] > network_search_record.cst[resource]:
                network_search_record.cst = search_record.cst
            layer_stats.insert(0, search_record)

        return True, network_search_record

    def _search_arch_design(self, arch_sol, fix_last_tile):
        """ Evaluate the architecture arch_sol by searching all the tasks backwards.

        If fix_last_tile is set, the r/c of the last task are fixed to 1 for the search
        (the original values are saved as old_r/old_c and restored by the estimators).
        Returns the network search record, with valid set to 0 when any task fails.
        """
        network_search_record = utils.SearchRecord(self.max).reset()
        # Update the hardware constraints
        search_task = copy.deepcopy(self.search_task)
        arch_cst = search_task.compute_arch_cst(arch_sol)
        search_task.set_arch_cst(arch_cst)
        search_task.set_arch_sol(arch_sol)

        # Accumulated up front, before the workload r/c are rewritten below
        total_ops = 0
        for task in search_task.tasks:
            total_ops += task.compute_ops()

        # Update the task configs
        self.update_task_configs(search_task.tasks)

        if fix_last_tile:
            # Fix the r/c to 1
            last_params = search_task.tasks[-1].workload["params"]
            last_params['old_r'] = last_params['r']
            last_params['old_c'] = last_params['c']
            last_params['r'] = 1
            last_params['c'] = 1

        layer_stats = []
        succeed, network_search_record = self._search_layers_backward(
            search_task, len(search_task.tasks) - 1, None, None, network_search_record, layer_stats)

        network_search_record.fuse = 1
        if succeed:
            mode = 1 if fix_last_tile else 0
            total_latency = self.est_latency(layer_stats, search_task, mode=mode)
            total_off_chip_trans = self.est_off_chip_trans(layer_stats, search_task, mode=mode)
            network_search_record.reward = 1 / total_latency
            network_search_record.latency = total_latency
            network_search_record.ctc = total_ops / (total_off_chip_trans * search_task.dw)
        else:
            network_search_record.valid = 0

        return network_search_record

    def search_fixed_design(self, last_layer_sol, use_model=0, bst=None):
        """ This function takes a fixed array and the solution of the last layer,
        searches the config of the rest of the layers.

        last_layer_sol is extended in place with the workload parameters of the last
        task. use_model and bst are not used by this function.
        Returns an empty (reset) record if the last layer overuses the constraints.
        """
        network_search_record = utils.SearchRecord(self.max).reset()
        # Update the hardware constraints
        search_task = copy.deepcopy(self.search_task)
        # Update the task configs
        self.update_task_configs(search_task.tasks)

        # Update the workload parameters
        for p in search_task.tasks[-1].workload["params"]:
            last_layer_sol[p] = search_task.tasks[-1].workload["params"][p]

        last_task = search_task.tasks[-1]

        layer_stats = []
        # NOTE(review): total_ops is not consumed in this path (no ctc is reported here);
        # kept for parity with search_design1/search_design2.
        total_ops = 0
        for task in search_task.tasks:
            total_ops += task.compute_ops()
        # Build the record of the last layer
        reward, used_constraint, reward_meta = last_task.evaluate(last_layer_sol, self.search_obj)

        if self.overuse_constraint(used_constraint):
            return network_search_record
        record = utils.SearchRecord(self.max).reset()
        record.valid = 1
        record.metric = self.search_obj
        record.cst = used_constraint
        record.reward = reward
        record.reward_meta = reward_meta
        record.latency = 1 / reward
        record.ops = last_task.compute_ops()
        record.task_names = [last_task.workload["name"]]
        record.arch_sol = last_task.arch_sol
        record.task_sols = [{
            "name": last_task.workload["name"],
            "hash": str(last_task),
            "ops": last_task.compute_ops(),
            "sol": last_layer_sol,
            "latency": record.latency,
            "DSP_eff": 0,
            #"reward_meta": reward_meta,
            "BW": 0
        }]
        record.records = None
        layer_stats.append(record)
        network_search_record = network_search_record.append(record)

        # Search the rest of the layers, starting from the second to last one
        succeed, network_search_record = self._search_layers_backward(
            search_task, len(search_task.tasks) - 2, last_layer_sol, last_task,
            network_search_record, layer_stats)

        network_search_record.fuse = 1
        if succeed:
            total_latency = self.est_latency(layer_stats, search_task)
            network_search_record.reward = 1 / total_latency
            network_search_record.latency = total_latency
        else:
            network_search_record.valid = 0

        return network_search_record

    def search_design1(self, arch_sol, use_model=0, bst=None):
        """ This function searches from the last layer, and uses the
        solution from the latter layer to allocate the fusion task of the previous layer.
        It tends to allocate large tiles for the latter layers, which may
        lead to large tiles for the early layers, resulting in no solution.

        use_model and bst are not used by this function.
        """
        return self._search_arch_design(arch_sol, fix_last_tile=False)

    def search_design2(self, arch_sol, use_model=0, bst=None):
        """ This function searches from the last layer, and uses the
        solution from the latter layer to allocate the fusion task of the previous layer.
        The tile size of the last layer is fixed to 1x1.

        use_model and bst are not used by this function.
        """
        return self._search_arch_design(arch_sol, fix_last_tile=True)

    def search(self):
        """ Run the genetic search.

        Each generation, the individuals are either evaluated for real (submitted as
        jobs to the worker pool, with results cached by parameter hash) or scored by the
        XGBoost cost model. The best real record is kept in self.best_search_record.
        The search stops when no parents are selected, when the epoch/time budget is
        used up, or, if the architecture is not fixed, when the best reward reaches
        params["best_reward"] * params["best_reward_thres"].
        """
        self.counter.init_counter('time')
        self.counter.init_counter('converge_time')
        self.epoch = 0

        # Init the stats
        num_pop = int(self.params["population_size"])
        num_gen = int(self.max_epoch // num_pop)
        num_parents = int(num_pop * self.params["parents_ratio"])
        self.log(f'Number of generations: {num_gen}')
        self.log(f'Number of population: {num_pop}')
        self.log(f'Number of parents: {num_parents}')

        # Map the tunable parameter names to the columns of the population array
        idx = 0
        for p, param in self.search_task.design.params_config["tunable"].items():
            self.param_idx_map[param["name"]] = idx
            self.idx_param_map[idx] = param["name"]
            idx += 1

        # Init the population
        population = self.init_population(num_pop)
        fitness = np.empty(num_pop, dtype=float)

        terminate = False
        while True:
            if self.epoch > 0:
                # Select the parents
                parents = self.select_parents(population, fitness, num_parents)
                if parents.shape[0] == 0:
                    break
                # Crossover
                children = self.crossover(parents, num_pop - parents.shape[0])
                # Mutation
                children = self.mutation(children)
                # Compose the new generation
                population[0:parents.shape[0], :] = parents
                population[parents.shape[0]:, :] = children

            # Update the fitness
            # The cost model is used only if it is valid, and real estimates are
            # taken every params['model_gens'] generations.
            use_model = self.bst_data['valid'] and (self.gen % self.params['model_gens'] != 0)
            job_list = []
            results = {}
            for i in range(num_pop):
                idv = population[i]
                task_params = {}
                for p, param in self.search_task.design.params_config["tunable"].items():
                    task_params[param["name"]] = idv[self.param_idx_map[param["name"]]]
                # The hash only covers the tunable parameters
                idv_hash = self.hash_params(task_params)
                for p, param in self.search_task.design.params_config["external"].items():
                    task_params[param["name"]] = self.search_task.workload["params"][param["name"]]
                # Note: XGBoost model has compatibility problem with multi-processing.
                search_task = copy.deepcopy(self.search_task)
                # Compute the architecture features
                arch_cst = search_task.compute_arch_cst(task_params)
                if not use_model:
                    if idv_hash in self.search_cache:
                        # Already submitted or evaluated
                        continue
                    else:
                        search_record = utils.SearchRecord(self.max).reset()
                        if arch_cst:
                            if not self.xgboost_prune(task_params, arch_cst):
                                self.search_cache[idv_hash] = {'status': 'submit', 'value': None}
                                if self.params["arch_fixed"] == 0:
                                    job_func = self.search_design1 if self.params['policy'] == 0 else self.search_design2
                                else:
                                    job_func = self.search_fixed_design
                                job_list.append({
                                    'job_hash': idv_hash,
                                    'func': job_func,
                                    'args': [task_params, use_model, copy.deepcopy(self.bst)]})
                            else:
                                # Pruned by the cost model, use the empty record
                                results[idv_hash] = search_record
                        else:
                            # No architecture constraints computed, use the empty record
                            results[idv_hash] = search_record
                else:
                    reward = 0
                    if arch_cst:
                        reward = self.xgboost_predict(task_params, arch_cst)[0]
                    results[idv_hash] = reward

            if len(job_list) > 0:
                pool = utils.MyExecutor(self.n_worker)
                pool_results = pool.exec(job_list)
                for result in pool_results:
                    results[result] = pool_results[result]

            # Update the tuner results
            for i in range(num_pop):
                idv = population[i]
                task_params = {}
                for p, param in self.search_task.design.params_config["tunable"].items():
                    task_params[param["name"]] = idv[self.param_idx_map[param["name"]]]
                idv_hash = self.hash_params(task_params)
                if use_model:
                    fitness[i] = results[idv_hash]
                else:
                    if idv_hash in self.search_cache and self.search_cache[idv_hash]['status'] == 'done':
                        fitness[i] = self.search_cache[idv_hash]['value']
                        continue
                    search_record = results[idv_hash]
                    if search_record.valid == 0 or self.overuse_constraint(search_record.cst):
                        search_record.reward = 0
                    if search_record.reward > 0:
                        if search_record.reward > self.best_reward:
                            self.best_reward = search_record.reward
                            self.best_reward_meta = search_record.reward_meta
                            self.best_sol_cst = search_record.cst
                            self.best_sol = {"arch_sol": search_record.arch_sol, \
                                             "task_sols": search_record.task_sols}
                            self.log(f'Epoch {self.epoch}: new best reward: {self.best_reward} ({1/self.best_reward:.0f})')
                            self.last_update_epoch = self.epoch
                            self.counter.update_counter('converge_time')
                            self.best_search_record = search_record
                            if self.params["arch_fixed"] == 1:
                                # The first valid reward is the reference if none is given
                                if not self.params["best_reward"]:
                                    self.params["best_reward"] = search_record.reward
                            else:
                                # NOTE(review): params["best_reward"] is expected to be set
                                # here; a None value fails on this comparison.
                                if self.best_reward >= self.params["best_reward"] * self.params["best_reward_thres"]:
                                    terminate = True

                    self.best_rewards.append(self.best_reward)
                    if not self.params["best_reward"]:
                        if search_record.reward == 0:
                            fitness[i] = 0
                        else:
                            raise RuntimeError("Best reward is not set.")
                    else:
                        # The fitness is normalized by the reference reward
                        fitness[i] = search_record.reward / self.params['best_reward']
                    self.search_cache[idv_hash] = {'status': 'done', 'value': fitness[i]}
                    if terminate:
                        break
                self.epoch += 1

            # Add training samples
            if not use_model:
                for result in results:
                    search_record = results[result]
                    if self.params["best_reward"] and search_record.valid:
                        arch_cst = self.search_task.compute_arch_cst(search_record.arch_sol)
                        if search_record.reward > 0:
                            self.xgboost_add_sample(search_record.arch_sol, arch_cst, search_record.reward / self.params['best_reward'])
                        else:
                            self.xgboost_add_sample(search_record.arch_sol, arch_cst, 0)

            # Train the cost model
            if not use_model:
                self.xgboost_train()
                # Adjust the cost model threshold dynamically
                if self.best_search_record.valid:
                    arch_sol = self.best_search_record.arch_sol
                    arch_cst = self.search_task.compute_arch_cst(arch_sol)
                    pred = self.xgboost_predict(arch_sol, arch_cst)
                    self.params['prune_params']['xgb_thres'] = pred * self.params['prune_params']['xgb_thres_adjust']
                    self.log(f'Updated XGB pruning thres: {self.params["prune_params"]["xgb_thres"]}')

            self.gen += 1

            if self.stop_criteria == "epoch" and self.epoch > self.max_epoch:
                break
            if self.stop_criteria == "time":
                self.counter.update_counter('time')
                if self.counter.get_counter('time') > self.max_time:
                    break
            if terminate:
                break

        return

def fuse_genetic_search(search_task, init_tasks, cst, search_obj, max_epochs, max_time, \
                        n_worker=1, silent=0, population_size=20, policy=0, meta=None, explorer=None):
    """ This function finds the best fused array architecture for a list of tasks.
    Init_tasks include the search records for each single task.

    The initial population is seeded from the valid records of init_tasks:
    - Single-task records are sorted by FLOPS (ops / latency) and the ones below half
      of the best FLOPS are pruned.
    - Multi-task records are placed in front of them, with their arch_sol as params.
    If there is no valid single-task record, the pruning is skipped.

    If max_epochs is not positive, max_time is scaled by the number of tasks and the
    population size, and capped at 600 seconds.
    If meta is given, meta["fusion_candidates"] is passed to the tuner.

    Returns the best search record found by the tuner.
    """
    import logging
    logger = logging.getLogger('AutoSA-Tuner')
    if silent == 0:
        logger.info("Performing cross layer partial-fusion genetic search...")

    best_latency = utils.compute_tasks_latency(search_task.tasks, init_tasks)
    if silent == 0:
        logger.info(f'Cross-layer partial-fusion ideal latency: {best_latency}')

    thres = 0.5
    multi_task_records = []
    single_task_records = []
    for record in init_tasks:
        if record.valid == 0:
            continue
        if len(record.task_sols) > 1:
            multi_task_records.append(record)
        else:
            single_task_records.append(record)

    init_pop_record = [{
        'latency': record.latency,
        'ops': record.task_sols[0]['ops'],
        'params': record.task_sols[0]['sol'],
        'flops': record.task_sols[0]['ops'] / record.latency
    } for record in single_task_records]
    init_pop_record.sort(key=lambda elem: elem['flops'], reverse=True)
    if init_pop_record:
        # Keep the records in front of the first one below the FLOPS threshold
        prune_flops = init_pop_record[0]['flops'] * thres
        prune_idx = next((i for i, elem in enumerate(init_pop_record) if elem['flops'] < prune_flops),
                         len(init_pop_record))
        init_pop_record = init_pop_record[:prune_idx]

    for record in multi_task_records:
        init_pop_record.insert(0, {
            'latency': record.latency,
            'ops': 0,
            'params': record.arch_sol
        })

    tuner_params = {
        "population_size": max(population_size, len(init_pop_record)),
        "mutation_probability": 1.0,
        "parents_ratio": 0.2,
        "epsilon": 0.1,
        "policy": policy,
        "init_pop": init_pop_record,
        "unit_max_epoch": 0,
        "unit_max_time": max_time,
        "explorer": explorer,
        "best_reward": 1 / best_latency if best_latency else None,
        "best_reward_thres": 0.95, # Terminate if the reward is within xx% compared to the best reward
        "model_gens": 10, # Switch to real estimates after every x gens
        "prune_params": {
            "reward_thres": 10, # Prune parents that is x worse than the best
            "xgb_n_turns": population_size / 2, # Use XGBoost model after x epochs
            "xgb_thres": 0.5, # Prune designs below x of the ideal reward
            "xgb_thres_adjust": 0.8 # Adjust the updated threshold by x
        }
    }

    if meta:
        tuner_params["fusion_candidates"] = meta["fusion_candidates"]

    if max_epochs <= 0:
        max_time *= (len(search_task.tasks) * tuner_params["population_size"] * 3)
        max_time = min(max_time, 600) # 600 seconds at most

    tuner = FuseGeneticTuner(search_task, cst, search_obj, max_epochs, max_time, tuner_params, n_worker, silent)
    tuner.search()

    return tuner.best_search_record

class FuseDPTuner(object):
    """ Dynamic-programming search over how a sequence of tasks is cut into
    consecutive groups, where each group is either a single task or a fused
    multi-task job.

    The tuner runs in two passes (see search()): a "submit" pass that walks the
    recursion and registers every required job in config['search_jobs'], and an
    "aggregate" pass that walks it again and combines the finished job records.
    """
    def __init__(self, config, tasks, cst, n_worker=1):
        self.n_worker = n_worker
        self.cst = cst
        self.tasks = tasks
        self.config = config

    def hash_dp_task(self, tasks):
        """ Build the DP cache key of a task list by concatenating str(task).
        """
        return "".join(str(task) for task in tasks)

    def _solve_segment(self, segment, cut_idx):
        """ Return the record of a task segment, reusing config['DP_tasks'] when possible.

        A cached record is duplicated and relabelled with the names of the tasks
        in "segment"; otherwise the segment is solved recursively and memoized.
        """
        segment_hash = self.hash_dp_task(segment)
        if segment_hash in self.config['DP_tasks']:
            record = self.config['DP_tasks'][segment_hash].dup()
            # Update the task names
            record.task_names = [task.workload["name"] for task in segment]
            return record

        record = self.DP(segment, cut_idx)
        self.config['DP_tasks'][segment_hash] = record
        return record

    def DP(self, cur_tasks, cut_idx):
        """ Recursively evaluate "cur_tasks".

        - One task: look up (or register) the single-task job.
        - cut_idx == len(cur_tasks): look up (or register) the fused job that
          covers all tasks, subject to the optional fusion candidate filter.
        - Otherwise: try every split position and fold the candidates into the
          returned record.
        """
        num_tasks = len(cur_tasks)
        search_record = utils.SearchRecord().reset()

        if num_tasks == 1:
            single = copy.deepcopy(cur_tasks[0])
            single.set_arch_cst(copy.deepcopy(self.config['arch_cst']))
            single.set_arch_sol(single.arch_sol)
            single.fuse = 0
            job_key = str(single)
            jobs = self.config['search_jobs']
            if job_key in jobs and jobs[job_key]['done'] == 1:
                search_record = jobs[job_key]['search_record'].dup()
                # The cached record may carry another task's name; relabel it
                layer_name = cur_tasks[0].workload["name"]
                search_record.task_names = [layer_name]
                search_record.exec_model = [layer_name]
                search_record.records = None
            else:
                # Register the job so that exec() will run it
                jobs[job_key] = {'search_task': single, 'done': 0}
            return search_record

        if cut_idx == num_tasks:
            task_names = [task.workload["name"] for task in cur_tasks]
            exec_model = list(task_names)
            task_names_str = ''.join(task_names)
            # Only fuse the promising candidates when a filter is supplied
            if "fusion_candidates" in self.config and \
               task_names_str not in self.config['fusion_candidates']:
                return search_record

            fused_members = copy.deepcopy(cur_tasks)
            fused = MultiTask(fused_members[0].design, fused_members, self.cst, fuse=2,
                              use_uram=self.config['explorer'].search_config["use_uram"])
            fused.set_arch_cst(copy.deepcopy(self.config['arch_cst']))
            fused.set_arch_sol(fused_members[0].arch_sol)
            job_key = str(fused)
            jobs = self.config['search_jobs']
            if job_key in jobs and jobs[job_key]['done'] == 1:
                search_record = jobs[job_key]['search_record'].dup()
                # The cached record may carry other task names; relabel it
                search_record.task_names = task_names
                search_record.exec_model = exec_model
            else:
                jobs[job_key] = {'search_task': fused, 'done': 0}
            return search_record

        for split in range(1, num_tasks + 1):
            # Front part: the first "split" tasks
            front_record = self._solve_segment(cur_tasks[:split], split)

            needs_back = (split < num_tasks) and (self.mode == "submit" or \
               (self.mode == "aggregate" and front_record.valid == 1))
            if needs_back:
                # Back part: the remaining tasks
                back_record = self._solve_segment(cur_tasks[split:], split)
                candidate = utils.SearchRecord().reset().merge(front_record, back_record)
            else:
                candidate = front_record

            # The candidate always describes the full list of current tasks
            candidate.task_names = [task.workload["name"] for task in cur_tasks]
            search_record.update(candidate)

        return search_record

    def exec(self):
        """ Run every registered search job and store its record back in the job table.
        """
        # Reduce the maximal forked processes
        n_proc = max(int(self.n_worker / 2), 2)
        job_list = []
        for job_hash, job in self.config['search_jobs'].items():
            explorer = copy.deepcopy(self.config['explorer'])
            explorer.search_config['n_worker'] = n_proc
            job_list.append({
                'job_hash': job_hash,
                'func': explorer.tune,
                'args': [job['search_task'], None, 1, 0]
            })
        results = utils.MyExecutor(n_proc).exec(job_list)

        for job_hash, job in self.config['search_jobs'].items():
            job['done'] = 1
            job['search_record'] = results[job_hash]

    def search(self):
        """ Submit all jobs, execute them, then aggregate the results.
        """
        # Pass 1: walk the recursion to register all jobs
        self.mode = "submit"
        self.DP(self.tasks, -1)
        # Run the registered jobs
        self.exec()
        # The records memoized during submission hold no results; drop them
        self.config['DP_tasks'] = {}
        # Pass 2: walk the recursion again to collect the results
        self.mode = "aggregate"
        return self.DP(self.tasks, -1)

class FuseGeneticTuner(MultiWorkloadArrayGeneticTuner):
    """ Genetic tuner over the tunable architecture parameters in which every
    candidate architecture is scored by running a FuseDPTuner over the tasks of
    the search task (see search_design()).

    Parent selection, crossover, mutation, the XGBoost helpers and the state
    attributes used below (search_cache, best_reward, gen, bst, ...) are not
    defined in this class; they are expected to come from the base class.
    """
    def __init__(self, search_task, cst, obj, max_epoch, max_time, params, n_worker=1, silent=0):
        # All arguments are forwarded unchanged to the base class
        super().__init__(search_task, cst, obj, max_epoch, max_time, params, n_worker=n_worker, silent=silent)

    def init_population(self, num_pop):
        """ Build the initial population from the seeds in self.params["init_pop"].

        Returns an integer array of shape (num_pop, #tunable params). Row i is
        filled from seed i modulo the number of seeds, with the values ordered
        as in design.params_config["tunable"].
        """
        population = np.empty((num_pop, len(self.search_task.design.params_config["tunable"])), dtype=int)
        # Allocate uniformly
        for i in range(num_pop):
            # Cycle through the seeds when there are fewer seeds than individuals
            sol = self.params["init_pop"][i % len(self.params["init_pop"])]["params"]
            param_arr = []
            for p, param in self.search_task.design.params_config["tunable"].items():
                param_arr.append(sol[param["name"]])
            population[i] = np.array(param_arr, dtype=int)

        return population

    def search_design(self, arch_sol, use_model=0, bst=None):
        """ Evaluate one architecture solution with the dynamic-programming tuner.

        "use_model" and "bst" are accepted but not used in this method body.
        Returns the search record updated with the FuseDPTuner result.
        """
        network_search_record = utils.SearchRecord(self.max).reset()
        # Update the hardware constraints
        # Work on a private copy so that self.search_task is left untouched
        search_task = copy.deepcopy(self.search_task)
        arch_cst = search_task.compute_arch_cst(arch_sol)
        search_task.set_arch_cst(arch_cst)
        search_task.set_arch_sol(arch_sol)

        # Dynamic programming
        # "DP_tasks" and "search_jobs" start empty for every evaluated architecture
        dp_config = {
            "explorer": self.params["explorer"],
            "arch_cst": arch_cst,
            "DP_tasks": {},
            "search_jobs": {}
        }
        if "fusion_candidates" in self.params:
            dp_config["fusion_candidates"] = self.params["fusion_candidates"]

        DP_tuner = FuseDPTuner(dp_config, search_task.tasks, self.cst, self.n_worker)
        network_search_record.update(DP_tuner.search())

        return network_search_record

    def search(self):
        """ Run the genetic search loop.

        Each generation: breed a new population (skipped for the very first one),
        score every individual either with the cost model or by submitting
        search_design() jobs, update the best record, feed the real results to
        the cost model, and check the stop conditions. Nothing is returned; the
        outcome is left in attributes such as self.best_search_record.
        """
        self.counter.init_counter('time')
        self.counter.init_counter('converge_time')
        self.epoch = 0

        # Init the stats
        num_pop = int(self.params["population_size"])
        num_gen = int(self.max_epoch // num_pop)
        num_parents = int(num_pop * self.params["parents_ratio"])
        self.log(f'Number of generations: {num_gen}')
        self.log(f'Number of population: {num_pop}')
        self.log(f'Number of parents: {num_parents}')

        # Map every tunable parameter name to its column in the population array
        idx = 0
        for p, param in self.search_task.design.params_config["tunable"].items():
            self.param_idx_map[param["name"]] = idx
            self.idx_param_map[idx] = param["name"]
            idx += 1

        # Init the population
        population = self.init_population(num_pop)
        fitness = np.empty(num_pop, dtype=float)

        terminate = False
        while True:
            # self.epoch is still 0 only before the first evaluated individual,
            # so the seeded population is scored as-is once
            if self.epoch > 0:
                # Select the parents
                parents = self.select_parents(population, fitness, num_parents)
                if parents.shape[0] == 0:
                    break
                # Crossover
                children = self.crossover(parents, num_pop - parents.shape[0])
                # Mutation
                children = self.mutation(children)
                # Compose the new generation
                population[0:parents.shape[0], :] = parents
                population[parents.shape[0]:, :] = children

            # Update the fitness
            # The cost model is used only when bst_data['valid'] is truthy and the
            # generation index is not a multiple of params['model_gens']
            use_model = self.bst_data['valid'] and (self.gen % self.params['model_gens'] != 0)
            job_list = []
            results = {}
            for i in range(num_pop):
                idv = population[i]
                task_params = {}
                for p, param in self.search_task.design.params_config["tunable"].items():
                    task_params[param["name"]] = idv[self.param_idx_map[param["name"]]]
                # The hash covers the tunable parameters only (external ones are added afterwards)
                idv_hash = self.hash_params(task_params)
                for p, param in self.search_task.design.params_config["external"].items():
                    task_params[param["name"]] = self.search_task.workload["params"][param["name"]]
                # Note: XGBoost model has compatibility problem with multi-processing.
                search_task = copy.deepcopy(self.search_task)
                # Compute the architecture features
                arch_cst = search_task.compute_arch_cst(task_params)
                if not use_model:
                    if idv_hash in self.search_cache:
                        # Already evaluated or already submitted (also covers duplicates in this population)
                        continue
                    else:
                        # Empty record used for individuals that are not submitted
                        search_record = utils.SearchRecord(self.max).reset()
                        if arch_cst:
                            if not self.xgboost_prune(task_params, arch_cst):
                                self.search_cache[idv_hash] = {'status': 'submit', 'value': None}
                                job_list.append({
                                    'job_hash': idv_hash,
                                    'func': self.search_design,
                                    'args': [task_params, use_model, copy.deepcopy(self.bst)]})
                            else:
                                # Pruned by the cost model
                                results[idv_hash] = search_record
                        else:
                            # compute_arch_cst() returned a falsy value for this individual
                            results[idv_hash] = search_record
                else:
                    # Model-only generation: the predicted reward is the result
                    reward = 0
                    if arch_cst:
                        reward = self.xgboost_predict(task_params, arch_cst)[0]
                    results[idv_hash] = reward

            if len(job_list) > 0:
                pool = utils.MyExecutor(max(int(self.n_worker / 2), 2))
                pool_results = pool.exec(job_list)
                for result in pool_results:
                    results[result] = pool_results[result]

            # Update the tuner results
            for i in range(num_pop):
                idv = population[i]
                task_params = {}
                for p, param in self.search_task.design.params_config["tunable"].items():
                    task_params[param["name"]] = idv[self.param_idx_map[param["name"]]]
                idv_hash = self.hash_params(task_params)
                if use_model:
                    fitness[i] = results[idv_hash]
                else:
                    if idv_hash in self.search_cache and self.search_cache[idv_hash]['status'] == 'done':
                        # Reuse the cached fitness; note that this path skips the epoch increment below
                        fitness[i] = self.search_cache[idv_hash]['value']
                        continue
                    search_record = results[idv_hash]
                    # Invalid or over-budget designs get zero reward
                    if self.overuse_constraint(search_record.cst) or search_record.valid == 0:
                        search_record.reward = 0
                    if search_record.reward > 0:
                        if search_record.reward > self.best_reward:
                            self.best_reward = search_record.reward
                            self.best_reward_meta = search_record.reward_meta
                            self.best_sol_cst = search_record.cst
                            self.best_sol = {"arch_sol": search_record.arch_sol, \
                                             "task_sols": search_record.task_sols}
                            self.log(f'Epoch {self.epoch}: new best reward: {self.best_reward} ({1/self.best_reward:.0f})')
                            self.last_update_epoch = self.epoch
                            self.counter.update_counter('converge_time')
                            # Update the DSP eff
                            search_record.dsp_eff = self.search_task.compute_dsp_eff(search_record.latency, search_record.cst["DSP"])
                            self.best_search_record = search_record
                            # Early exit once the reward is within best_reward_thres of the reference reward
                            # NOTE(review): raises TypeError if params["best_reward"] is None; the
                            # sample collection below guards against a falsy value, this line does not.
                            if self.best_reward >= self.params["best_reward"] * self.params["best_reward_thres"]:
                                terminate = True

                    self.best_rewards.append(self.best_reward)
                    # Fitness is the reward normalized by the reference reward
                    fitness[i] = search_record.reward / self.params['best_reward']
                    self.search_cache[idv_hash] = {'status': 'done', 'value': fitness[i]}
                # One epoch per processed individual
                self.epoch += 1

            # Add training samples
            if not use_model:
                for result in results:
                    search_record = results[result]
                    if self.params["best_reward"] and search_record.valid:
                        arch_cst = self.search_task.compute_arch_cst(search_record.arch_sol)
                        if search_record.reward > 0:
                            self.xgboost_add_sample(search_record.arch_sol, arch_cst, search_record.reward / self.params['best_reward'])
                        else:
                            self.xgboost_add_sample(search_record.arch_sol, arch_cst, 0)

            # Train the cost model
            if not use_model:
                self.xgboost_train()
                # Adjust the cost model threshold dynamically
                if self.best_search_record.valid:
                    arch_sol = self.best_search_record.arch_sol
                    arch_cst = self.search_task.compute_arch_cst(arch_sol)
                    # NOTE(review): the prediction is used without [0] here, unlike in the
                    # fitness update above - confirm the threshold is meant to keep that form.
                    pred = self.xgboost_predict(arch_sol, arch_cst)
                    self.params['prune_params']['xgb_thres'] = pred * self.params['prune_params']['xgb_thres_adjust']
                    self.log(f'Updated XGB pruning thres: {self.params["prune_params"]["xgb_thres"]}')

            self.gen += 1

            # Stop conditions: epoch budget, time budget, or reward close enough to the reference
            if self.stop_criteria == "epoch" and self.epoch > self.max_epoch:
                break
            if self.stop_criteria == "time":
                self.counter.update_counter('time')
                if self.counter.get_counter('time') > self.max_time:
                    break
            if terminate:
                break

        return

def multi_acc_search1(search_task, init_tasks, cst, search_obj, max_epochs, max_time, \
                      n_worker=1, silent=0, population_size=20, policy=0, meta=None, explorer=None, profiling=0):
    """ This function finds the best multi-array architecture for a list of tasks.

    "meta" is required and must provide the keys "partition_candidates",
    "init_partition_candidates", "batch_size", "use_uram_all" and
    "design_idx_list". "explorer" must expose search_config['workload'], which
    names the reward log. "population_size", "policy" and "profiling" are
    accepted for signature compatibility but are not used here.

    When max_epochs is not positive the tuner runs in time mode with a fixed
    budget of 3600 seconds; the max_time argument is overridden in that case.

    Returns the best search record found by MultiAccTuner1. As a side effect,
    the reward history is written to tmp/tuning_rewards_<workload>_multi1.csv.

    Raises TypeError if "meta" is None.
    """
    import logging
    import os

    # Fail before doing any work instead of on the first subscript of None
    if meta is None:
        raise TypeError("multi_acc_search1() requires the 'meta' argument")

    logger = logging.getLogger('AutoSA-Tuner')
    if silent == 0:
        logger.info("Performing cross layer multi-accelerator genetic search...")

    best_latency = utils.compute_tasks_latency(search_task.tasks, init_tasks)
    if silent == 0:
        logger.info(f'Cross-layer multi-accelerator ideal latency: {best_latency}')

    # Resolve the log name up front so a missing explorer fails before the search
    config_str = f"_{explorer.search_config['workload']}_multi1"

    tuner_params = {
        "explorer": explorer,
        "probe_points": meta["init_partition_candidates"],
        "best_reward": 1 / best_latency if best_latency else None,
        "partition_candidates": meta["partition_candidates"],
        "batch_size": meta["batch_size"],
        "use_uram_all": meta["use_uram_all"],
        "dsp_eff_thres": 0.85, # If the DSP eff is greater than this thres, no fine-tuning is required.
        "latency_stdev_thres": 0.03,
        "reward_stdev_thres": 0.025,
        "max_trial": 3, # Terminate fine-tuning after more than x trials
        "design_idx_list": meta['design_idx_list']
    }

    if not (max_epochs > 0):
        max_time = 3600 # Time mode always uses a 60-minute budget

    tuner = MultiAccTuner1(search_task, cst, search_obj, max_epochs, max_time, tuner_params, n_worker, silent)
    tuner.search()

    search_record = tuner.best_search_record

    # For internal testing
    # Make sure the output directory exists so the finished search is not lost
    # to a FileNotFoundError while dumping the log.
    os.makedirs('tmp', exist_ok=True)
    with open(f'tmp/tuning_rewards{config_str}.csv', "w", newline='') as f:
        fieldnames = ['epoch', 'reward', 'time']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for epoch, reward in enumerate(tuner.bayopt_best_rewards):
            writer.writerow({'epoch': epoch, 'reward': reward, 'time': tuner.bayopt_best_rewards_time[epoch]})

    return search_record

class MultiAccTuner1(Tuner):
    def __init__(self, search_task, cst, obj, max_epoch, max_time, params, n_worker=1, silent=0):
        """ Set up the tuner state on top of the base Tuner.

        A positive max_epoch selects the epoch-based stop criterion; otherwise
        the time-based criterion with max_time is used.
        """
        super().__init__(search_task, cst, obj, max_epoch, max_time, n_worker=n_worker, silent=silent)
        self.params = params
        self.epoch = 0

        epoch_bound = max_epoch > 0
        self.stop_criteria = "epoch" if epoch_bound else "time"
        if epoch_bound:
            self.max_epoch = max_epoch
        else:
            self.max_time = max_time

        self.counter = utils.PerfCounter()

        # Bookkeeping of the Bayesian optimization progress
        self.bayopt_epoch = 0
        self.bayopt_best_rewards = []
        self.bayopt_best_rewards_time = []

        # Caches and logs
        self.search_cache = {} # Store search records
        self.search_cache_cst = {}
        self.bay_search_log = {} # Bayesian search log

    def resource_alloc(self, partition):
        """ Compute the initial DSP/BRAM18K budget of every array in "partition".

        DSPs are split in proportion to the operation count of each array, since
        the throughput is highest when all arrays have a similar latency:
        #DSPi = opsi / ops_total * #DSP_total
        Every array gets the same BRAM18K share: the full budget for a single
        array, otherwise the budget divided by 2^(floor(log2(#arrays)) + 1).
        The returned dict also carries the initial "state" 0.
        """
        dsp_budget = self.cst.hw_cst['DSP']
        bram_budget = self.cst.hw_cst['BRAM18K']

        # Operation count of every array (sum over the tasks mapped to it)
        ops_per_array = [
            sum(self.search_task.tasks[idx].compute_ops() for idx in group)
            for group in partition
        ]
        ops_sum = sum(ops_per_array)
        dsp_share = [int(ops / ops_sum * dsp_budget) for ops in ops_per_array]

        n_arrays = len(partition)
        divisor = 1 if n_arrays == 1 else pow(2, int(np.log2(n_arrays)) + 1)
        bram_share = [bram_budget / divisor for _ in ops_per_array]

        return {"DSP": dsp_share, "BRAM18K": bram_share, "state": 0}

    def est_URAM(self, records):
        """ Sum the URAM usage reported by all array records.
        """
        return sum(rec.cst["URAM"] for rec in records)

    def est_mem(self, partition, records, verbose=0):
        """ Estimate the on-chip memory usage of a multi-array design.

        BRAM18K: the sum of the BRAM18K usage of all array records.

        URAM, part 1 (only if params["use_uram_all"] is set): every array that
        does not hold exactly one task gets buffers for its intermediate
        results, sized by the largest o * r * c among its tasks.

        URAM, part 2: a streaming buffer between each pair of adjacent
        single-task arrays. With tiling factors [tr1, tc1] (after max pooling)
        and [tr2, tc2], and kernel size k of the second array:
        c0 = ceil((tr2 + k - 1) / tr1), tr' = min(c0 * tr1, r1)
        c1 = ceil((tc2 + k - 1) / tc1), tc' = min(c1 * tc1, c1_dim)
        The buffer holds min(tr' * c1_dim * o1, tc' * r1 * o1) elements, so the
        first array can keep filling while the second array consumes.

        Returns (usage, detail). "usage" holds the "BRAM18K" and "URAM" totals.
        "detail" is None unless verbose is non-zero, in which case it lists the
        per-array BRAM18K usage and the per-link streaming buffer sizes.
        """
        # Array buffers
        array_bufs = [rec.cst["BRAM18K"] for rec in records]
        bram_used = sum(array_bufs)
        uram_used = 0
        stream_bufs = []

        # Intermediate result buffers of multi-task arrays
        if self.params["use_uram_all"]:
            for arr_idx in range(len(records)):
                layers = partition[arr_idx]
                if len(layers) == 1:
                    continue
                peak = 0
                for layer_idx in layers:
                    dims = self.search_task.tasks[layer_idx].workload['params']
                    n_elems = dims['o'] * dims['r'] * dims['c']
                    pack = records[arr_idx].task_sols[0]['sol']['i_t2']
                    width_units = np.ceil(self.search_task.dw * pack * 8 / 72)
                    depth_units = np.ceil(n_elems / pack / 4096)
                    peak = max(peak, 2 * width_units * depth_units)
                uram_used += peak

        # Streaming buffers
        for nxt in range(1, len(records)):
            producer, consumer = records[nxt - 1], records[nxt]
            # Streaming buffers are only inserted between single-task arrays.
            if len(producer.task_names) > 1 or len(consumer.task_names) > 1:
                continue
            src_task = self.search_task.tasks[partition[nxt - 1][0]]
            dst_task = self.search_task.tasks[partition[nxt][0]]
            src_dims = src_task.workload['params']
            dst_dims = dst_task.workload['params']
            src_sol = producer.task_sols[0]['sol']
            dst_sol = consumer.task_sols[0]['sol']

            # Producer tile, shrunk by any max pooling applied to its output
            o1 = src_dims['o']
            tr1 = min(src_sol['r_t1'], src_dims['r'])
            tc1 = min(src_sol['c_t1'], src_dims['c'])
            for tag in src_task.workload['tags']:
                if tag.startswith('maxpool'):
                    stride = int(tag.split('_')[-1])
                    tr1 /= stride
                    tc1 /= stride
            tr1 = max(int(tr1), 1)
            tc1 = max(int(tc1), 1)

            # Consumer tile
            tr2 = min(dst_sol['r_t1'], dst_dims['r'])
            tc2 = min(dst_sol['c_t1'], dst_dims['c'])
            k = dst_dims['p']
            pack = dst_sol['i_t2']

            # Number of producer tiles needed to cover one consumer tile
            c0 = np.ceil((tr2 + k - 1) / tr1)
            c1 = np.ceil((tc2 + k - 1) / tc1)
            trp = min(c0 * tr1, src_dims['r'])
            tcp = min(c1 * tc1, src_dims['c'])
            n_elems = min(trp * src_dims['c'] * o1, tcp * src_dims['r'] * o1)
            buf = np.ceil(self.search_task.dw * pack * 8 / 72) * np.ceil(n_elems / pack / 4096)
            stream_bufs.append(buf)
            uram_used += buf

        usage = {"BRAM18K": bram_used, "URAM": uram_used}
        if verbose == 0:
            return usage, None
        return usage, {"array_bufs": array_bufs, "stream_bufs": stream_bufs}

    def overuse_resource(self, partition, records):
        """ Tell whether the design is infeasible.

        Returns True if any record is invalid, or if the estimated BRAM18K,
        URAM or DSP usage exceeds the corresponding hardware limit.
        """
        if any(rec.valid == 0 for rec in records):
            return True

        mem, _ = self.est_mem(partition, records)
        dsp_used = sum(rec.cst["DSP"] for rec in records)
        usage = (
            ("BRAM18K", mem["BRAM18K"]),
            ("URAM", mem["URAM"]),
            ("DSP", dsp_used),
        )
        # Checked in the order BRAM18K, URAM, DSP; stops at the first violation
        return any(used > self.cst.hw_cst[kind] for kind, used in usage)

    def est_resource(self, partition, records):
        """ Estimate the total resource usage of the design.

        Memory figures come from est_mem(); the DSP figure is the sum over all
        array records.
        """
        mem, _ = self.est_mem(partition, records)
        dsp_used = sum(rec.cst["DSP"] for rec in records)
        return {"DSP": dsp_used, "BRAM18K": mem["BRAM18K"], "URAM": mem["URAM"]}

    def est_latency(self, partition, records, in_place=0, adjust=0, verbose=0):
        """ Compute the latency and throughput of the design.

        Two adjacent single-task arrays are pipelined: the second array starts
        as soon as the first block of data is available, so only a fraction of
        the first array's latency is counted as setup time.
        If either array of a pair is a multi-task array, the second one waits
        until the previous array has finished.

        partition: list of task index lists, one per array.
        records: list of array records, one per array.
        in_place: if set, each record's latency is overwritten with its array latency.
        adjust: modelling the stall between arrays is not implemented; a
            non-zero value raises RuntimeError when a pair of single-task
            arrays is reached.
        verbose: unused.

        Returns (design_latency, throughput, None), where design_latency is the
        sum of all setup latencies plus the latency of the last array, and
        throughput is batch_size divided by the largest array latency.
        """
        batch_size = self.params["batch_size"]
        record_latency = [r.latency for r in records]
        array_latency = [records[0].latency * batch_size]
        setup_latency = [0]

        # Update array and setup latency
        for i in range(1, len(records)):
            array1, array2 = records[i - 1], records[i]
            if len(array1.task_names) > 1 or len(array2.task_names) > 1:
                # One of the arrays is a multi-task array:
                # start only after the previous one finishes
                setup_latency.append(array_latency[-1])
            else:
                # Both arrays are single-task arrays:
                # start as soon as the first block of data is ready
                layer1 = self.search_task.tasks[partition[i - 1][0]]
                layer2 = self.search_task.tasks[partition[i][0]]
                params1 = layer1.workload['params']
                params2 = layer2.workload['params']
                sol1 = array1.task_sols[0]['sol']
                sol2 = array2.task_sols[0]['sol']

                # Tile of array 1 before and after the optional max pooling
                tr1 = min(sol1['r_t1'], params1['r'])
                tc1 = min(sol1['c_t1'], params1['c'])
                tr1_post, tc1_post = tr1, tc1
                for tag in layer1.workload['tags']:
                    if tag.startswith('maxpool'):
                        stride = int(tag.split('_')[-1])
                        tr1_post /= stride
                        tc1_post /= stride
                tr1_post = max(int(tr1_post), 1)
                tc1_post = max(int(tc1_post), 1)

                # Tile of array 2
                tr2 = min(sol2['r_t1'], params2['r'])
                tc2 = min(sol2['c_t1'], params2['c'])
                k = params2['p']

                # Number of array-1 tiles needed before array 2 can start
                c0 = np.ceil((tr2 + k - 1) / tr1_post)
                c1 = np.ceil((tc2 + k - 1) / tc1_post)
                trp = min(c0 * tr1, params1['r'])
                tcp = min(c1 * tc1, params1['c'])

                # Setup latency: the share of array 1's latency spent on the first block
                if trp > tcp:
                    setup = record_latency[i - 1] / np.ceil(params1['c'] / tcp)
                else:
                    setup = record_latency[i - 1] / np.ceil(params1['r'] / trp)
                setup_latency.append(setup)

                if adjust:
                    raise RuntimeError("Array latency adjust for multi-array is not implemented.")

            # An array can never finish a batch faster than its predecessor
            array_latency.append(max(record_latency[i] * batch_size, array_latency[i - 1]))

        if in_place:
            # Update the array latency
            for record, latency in zip(records, array_latency):
                record.latency = latency

        # Compute the latency
        design_latency = sum(setup_latency) + array_latency[-1]

        # Throughput is bounded by the slowest array
        max_latency = max(0, *array_latency)
        throughput = 1 / max_latency * batch_size

        return design_latency, throughput, None

    def est_dsp_eff(self, throughput, cst):
        """ Compute the DSP efficiency: the achieved throughput divided by the
        peak throughput that the used DSPs could deliver for all tasks.
        """
        total_ops = sum(task.compute_ops() for task in self.search_task.tasks)
        # Note: Only works for FP32
        peak_throughput = cst["DSP"] / 5 * 2 / total_ops
        return throughput / peak_throughput

    def evaluate(self, partition, records, verbose=0):
        """ Evaluate a design: returns (latency, resource, throughput, meta).
        """
        design_latency, design_throughput, latency_meta = self.est_latency(
            partition, records, verbose=verbose)
        usage = self.est_resource(partition, records)
        return design_latency, usage, design_throughput, latency_meta

    def is_finetune_required(self, records, dsp_eff):
        """ Check if finetuning is required.

        Finetuning is skipped once the DSP efficiency reaches
        params["dsp_eff_thres"]. "records" is not consulted.
        """
        reached_target = dsp_eff >= self.params["dsp_eff_thres"]
        return False if reached_target else True

    def resource_alloc_adjust(self, partition, resource_alloc, records, overuse_mem):
        """ Adjust the resource allocation in place.

        State 0: Try to allocate all the available resource to the bottleneck design.
        If the resource allocation leads to memory overuse, reduce the resource allocated
        to the bottleneck design gradually until a legal one is found.
        Switch to state 0.5 afterwards.
        If the first attempt leads to a legal design while the bottleneck design remains
        the bottleneck, switch to state 1.

        State 0.5 (deprecated): Intermediate state. Simply try to increase the resource allocation.
        If succeeds, switch back to state 0, otherwise, switch to state 1.
        This state is set considering the instability of the search results, i.e.,
        re-running the search for more arrays might lead to a feasible solution.

        State 1: Borrow resource from the fastest designs for the bottleneck design.
        We keep a cache to store all the past records for different arrays with
        different resource allocation. This cache is prioritized when selecting
        the reduced resource allocation.
        In the case when no such option is found in the search log, simply reduce the
        resource usage by a fixed amount.

        "partition" is forwarded to "self.est_resource".
        "resource_alloc" is the allocation dict updated by this function; the keys
        read/written here are "state", "init", "slow_idx", "fast_idx", "decrease",
        "n_adjust", "step", "history", "DSP" and "BRAM18K".
        "records" is the best feasible array records found so far.
        "overuse_mem" indicates if the last attempt leads to memory overutilization.

        Returns True if the caller should try the adjusted allocation, False if
        tuning should stop.
        """
        hw_cst = self.cst.hw_cst
        slow_idx_list = resource_alloc["slow_idx"]
        fast_idx_list = resource_alloc["fast_idx"]
        slow_idx = slow_idx_list[0]

        # State transition
        if resource_alloc["state"] == 0:
            if resource_alloc["decrease"][0] == 1 and not overuse_mem:
                #resource_alloc["state"] = 0.5 # Stale state for one more attempt
                resource_alloc["state"] = 1
            if resource_alloc["n_adjust"][0] == 1 and not overuse_mem:
                # Allocating all the available resource is insufficient
                resource_alloc["state"] = 1
        elif resource_alloc["state"] == 0.5:
            if resource_alloc["decrease"][0] == 0 and not overuse_mem:
                resource_alloc["state"] = 0
            else:
                resource_alloc["state"] = 1

        if resource_alloc["state"] in [0, 0.5]:
            # In states 0 and 0.5 the spare resource is measured against the
            # initial allocation, which is also the base the increments below
            # are added to.
            # (The original membership test read "[0, 0,5]", so state 0.5 was
            # budgeted from the estimated usage of "records" instead.)
            available_dsp = hw_cst["DSP"] - resource_alloc['init']['DSP_total']
            available_bram = hw_cst["BRAM18K"] - resource_alloc['init']['BRAM18K_total']

            step = resource_alloc["step"][0]
            if resource_alloc["n_adjust"][0] > 1:
                if overuse_mem == 0:
                    # Increase the lower bound
                    step[0] = sum(step) / 2
                else:
                    # Decrease faster
                    step[1] = sum(step) / 4

            if resource_alloc["n_adjust"][0] == 0:
                # At the first attempt, allocate all the available resource to the bottleneck design
                ratio = step[1]
            else:
                ratio = sum(step) / 2
            inc_dsp = available_dsp
            inc_bram = int(available_bram * ratio)
            resource_alloc["DSP"][slow_idx] = resource_alloc['init']['DSP'][slow_idx] + inc_dsp
            resource_alloc["BRAM18K"][slow_idx] = resource_alloc['init']['BRAM18K'][slow_idx] + inc_bram

            # Track whether this attempt hands out less BRAM than the previous one.
            if resource_alloc["n_adjust"][0] > 0:
                if inc_bram > resource_alloc["history"][0]:
                    resource_alloc["decrease"][0] = 0
                else:
                    resource_alloc["decrease"][0] = 1
            resource_alloc["history"][0] = inc_bram
            if inc_bram == 0:
                # Nothing left to hand out; fall through to state 1 in this same call.
                resource_alloc["state"] = 1

        if resource_alloc["state"] == 1:
            # In state 1 the spare resource is measured against the estimated
            # usage of the current best records.
            resource = self.est_resource(partition, records)
            available_dsp = hw_cst["DSP"] - resource['DSP']
            available_bram = hw_cst["BRAM18K"] - resource['BRAM18K']

            slow_latency = records[slow_idx].latency
            dec_dsp = 0
            dec_bram = 0

            # Sort each fast array's cached records by latency once. The sort is
            # in place on the cache, so the indices selected below stay valid.
            for idx in fast_idx_list:
                self.search_cache[idx].sort(key=lambda record: record.latency)

            # Select, for each fast array, the cached record with the smallest
            # BRAM constraint whose latency stays within an upper bound. The
            # bound is loosened (cur_adjust_thres decreases) until enough
            # arrays have a candidate or the threshold is exhausted.
            cur_adjust_thres = len(fast_idx_list) * 2
            update_idx_select = {}
            while cur_adjust_thres > 0:
                total_adjust_num = 0 # Number of successfully adjusted arrays
                for idx in fast_idx_list:
                    history = self.search_cache[idx]
                    cur_latency = records[idx].latency
                    # Compute the latency upper bound to adjust this array
                    ub_latency = (slow_latency - cur_latency) / (cur_adjust_thres + 1) + cur_latency

                    # Decrease the memory allocation for this design to increase array latency up to ub_latency
                    min_mem = records[idx].cst["BRAM18K"]
                    update_idx = -1
                    for history_idx, r in enumerate(history):
                        if r.latency > slow_latency:
                            # Records are sorted; everything after is slower than the bottleneck.
                            break
                        if r.latency >= cur_latency and r.latency <= ub_latency:
                            if r.cst["BRAM18K"] < min_mem:
                                min_mem = r.cst["BRAM18K"]
                                update_idx = history_idx
                    if update_idx != -1:
                        total_adjust_num += 1
                    update_idx_select[idx] = update_idx
                if total_adjust_num >= min(len(fast_idx_list), 2):
                    # Adjust at least two arrays each time
                    break
                # If not enough candidate arrays are found, try to loosen the upper bound
                cur_adjust_thres -= 1

            # Apply the selected cached records and accumulate the freed resource.
            for idx in fast_idx_list:
                if update_idx_select[idx] == -1:
                    continue
                r = self.search_cache[idx][update_idx_select[idx]]
                dec_bram += records[idx].cst["BRAM18K"] - r.cst["BRAM18K"]
                dec_dsp += records[idx].cst["DSP"] - r.cst["DSP"]
                resource_alloc["DSP"][idx] = r.cst["DSP"]
                resource_alloc["BRAM18K"][idx] = r.cst["BRAM18K"]

            if dec_bram == 0:
                # No available records found in the search cache.
                # We will force fast designs to spare resource to the bottleneck design.
                dec_dsp = 0
                for idx in fast_idx_list:
                    limit_ratio = min((1 - records[idx].latency / slow_latency) / 8, resource_alloc["step"][1][0])
                    dec_bram_single = records[idx].cst["BRAM18K"] * limit_ratio
                    dec_dsp_single = records[idx].cst["DSP"] * limit_ratio / 2
                    resource_alloc["DSP"][idx] = records[idx].cst["DSP"] - dec_dsp_single
                    resource_alloc["BRAM18K"][idx] = records[idx].cst["BRAM18K"] - dec_bram_single
                    dec_bram += dec_bram_single
                    dec_dsp += dec_dsp_single

            # Hand the freed and the spare resource to the bottleneck design.
            resource_alloc["DSP"][slow_idx] = records[slow_idx].cst['DSP'] + (dec_dsp + available_dsp)
            resource_alloc["BRAM18K"][slow_idx] = records[slow_idx].cst['BRAM18K'] + (dec_bram + available_bram)
            if resource_alloc["n_adjust"][1] > 0:
                if resource_alloc["BRAM18K"][slow_idx] > resource_alloc["history"][1]:
                    resource_alloc["decrease"][1] = 0
                else:
                    resource_alloc["decrease"][1] = 1
            resource_alloc["history"][1] = resource_alloc["BRAM18K"][slow_idx]
            if resource_alloc["decrease"][1] == 1 and not overuse_mem:
                # Stop further tuning
                return False

        # States 0 and 0.5 share counter 0; state 1 uses counter 1.
        resource_alloc["n_adjust"][math.floor(resource_alloc["state"])] += 1
        # Only try at most 3 times for each state
        if resource_alloc["n_adjust"][0] > 3 or resource_alloc["n_adjust"][1] > 3:
            return False

        return True

    def update_bottleneck_idx(self, records):
        """ Locate the slowest and the fastest designs among "records".

        At most one slow design is reported. The fast list holds up to
        len(records) - 1 entries: the strictly fastest design, plus designs
        whose latency is within 5% of it; when more than two records exist and
        nothing else qualifies, the second fastest design is added as well.

        Returns (slow_idx_list, fast_idx_list).
        """
        slow_latency, slow_idx = 0, []
        fast_latency, fast_idx = float("inf"), []

        # Strict comparisons: on ties the earliest record wins.
        for pos, rec in enumerate(records):
            if rec.latency < fast_latency:
                fast_latency, fast_idx = rec.latency, [pos]
            if rec.latency > slow_latency:
                slow_latency, slow_idx = rec.latency, [pos]

        # Grow the fast list with near-fastest designs (within 5%).
        max_fast = len(records) - 1
        for pos, rec in enumerate(records):
            if pos in fast_idx:
                continue
            rel_gap = abs((rec.latency - fast_latency) / fast_latency)
            if rel_gap < 0.05 and len(fast_idx) < max_fast:
                fast_idx.append(pos)

        if len(fast_idx) == 1 and max_fast > 1:
            # Add one more into the list: the runner-up in latency.
            runner_up, runner_up_latency = -1, float("inf")
            for pos, rec in enumerate(records):
                if pos != fast_idx[0] and rec.latency < runner_up_latency:
                    runner_up, runner_up_latency = pos, rec.latency
            fast_idx.append(runner_up)

        # Same growth rule for the slow list (within 2%), capped at one entry.
        max_slow = 1
        for pos, rec in enumerate(records):
            if pos in slow_idx:
                continue
            rel_gap = abs((rec.latency - slow_latency) / slow_latency)
            if rel_gap < 0.02 and len(slow_idx) < max_slow:
                slow_idx.append(pos)

        return slow_idx, fast_idx

    def find_legal_config(self, partition, resource_alloc, old_records=None, adjust_func=None, fine_tune=0, skip_search=1):
        """ Find a legal configuration given the resource allocation.

        If "skip_search" is set to 1, only re-search the designs in the slow/fast idx list
        (arrays outside both lists that already have a usable record in the local
        history are not searched again).
        If "fine_tune" is set to 1, we will adjust the resource allocation using
        "adjust_func" until the bottleneck array changes or there is no valid resource
        allocation found.
        Otherwise, the current function will gradually reduce the BRAM allocation
        (halving it each round) until a valid design is found.

        "partition" is a list of arrays; each array is a list of task indices.
        Arrays holding one task and arrays holding several tasks are searched
        through different explorer entry points.
        "resource_alloc" holds the per-array "DSP"/"BRAM18K" budgets together with
        the "slow_idx"/"fast_idx" lists and (when fine tuning) "array_latency".
        It is updated in place.
        "old_records" are previously found legal records, used to seed the history
        in the first iteration.
        "adjust_func" is called as adjust_func(partition, resource_alloc, records, overuse_mem)
        and is expected to return a falsy value when tuning should stop.

        Returns the best legal records found, or "old_records" when nothing
        better is found.
        """
        legal_records = old_records
        best_throughput = 0
        is_first = True
        n_arrays = len(partition)
        # Maintain a list of several best designs for each array
        history = [[] for i in range(n_arrays)]
        history_thres = 2
        if n_arrays > 10:
            # Avoid storing too many configs
            history_thres = 1

        # Split the arrays by the number of tasks they hold.
        single_task_arrays = []
        multi_task_arrays = []
        for i in range(n_arrays):
            if len(partition[i]) == 1:
                single_task_arrays.append(i)
            else:
                multi_task_arrays.append(i)

        # Each iteration searches designs under the current "resource_alloc",
        # picks the best legal combination and then either stops or adjusts
        # the allocation for another round.
        while True:
            # For internal testing
            #pprint.pprint(resource_alloc)
            records = []
            skip_idx = []
            job_list = []
            tasks = []
            # single task arrays
            for i in single_task_arrays:
                # Update the history: keep only the records that still fit the
                # current allocation of this array.
                history_tmp = []
                for record in history[i]:
                    if record.cst["BRAM18K"] <= resource_alloc["BRAM18K"][i] and \
                       record.cst["DSP"] <= resource_alloc["DSP"][i]:
                       history_tmp.append(record)
                # In the first iteration, seed the history with the old legal record.
                if legal_records and is_first:
                    if legal_records[i].cst["BRAM18K"] <= resource_alloc["BRAM18K"][i] and \
                       legal_records[i].cst["DSP"] <= resource_alloc["DSP"][i]:
                       history_tmp.append(legal_records[i])
                       self.search_cache[i].append(legal_records[i])
                history[i] = history_tmp
                if skip_search == 1:
                    if (i not in resource_alloc["slow_idx"]) and (i not in resource_alloc["fast_idx"]) and len(history[i]) > 0:
                        skip_idx.append(i)
                        continue
                # Submit the search job
                explorer_tmp = copy.deepcopy(self.params["explorer"])
                # Update the constraints
                explorer_tmp.cst.hw_cst["DSP"] = resource_alloc["DSP"][i]
                explorer_tmp.cst.hw_cst["BRAM18K"] = resource_alloc["BRAM18K"][i]
                array_tasks = []
                for design_idx in self.params["design_idx_list"]:
                    search_task = SingleTask(explorer_tmp.designs[design_idx], self.search_task.tasks[partition[i][0]].workload, explorer_tmp.cst)
                    # Update the task configs
                    # The read/write modes depend on the position of the array
                    # and on whether the neighboring array holds several tasks.
                    if i == 0:
                        # Load from DRAM
                        search_task.configs["cin_read_mode"] = 0
                    elif (i > 0 and len(partition[i - 1]) > 1):
                        if self.params["use_uram_all"]:
                            search_task.configs["cin_read_mode"] = 2
                        else:
                            search_task.configs["cin_read_mode"] = 0
                    else:
                        # Access on-chip streaming buffers
                        search_task.configs["cin_read_mode"] = 2
                    if i == len(partition) - 1:
                        # Write to DRAM
                        search_task.configs["cout_write_mode"] = 0
                    elif (i < len(partition) - 1 and len(partition[i + 1]) > 1):
                        if self.params["use_uram_all"]:
                            search_task.configs["cout_write_mode"] = 1
                        else:
                            search_task.configs["cout_write_mode"] = 0
                    else:
                        search_task.configs["cout_write_mode"] = 1
                    # Run it for multiple times
                    # (currently a single repetition per task)
                    for repeat in range(1):
                        job_list.append(
                            {
                                "job_hash": f"{str(search_task)}_{repeat}",
                                "func": explorer_tmp.tune,
                                "args": [search_task, None, self.sub_task_silent, 1]
                            })
                    array_tasks.append(search_task)
                tasks.append(array_tasks)

            # Run all the single-task search jobs collected above.
            pool = utils.MyExecutor(self.n_worker)
            results = pool.exec(job_list)

            # Collect the results. "idx" walks "tasks", which only has entries
            # for the arrays that were not skipped.
            idx = 0
            for i in single_task_arrays:
                if i in skip_idx:
                    continue
                history_local = history[i]
                array_tasks = tasks[idx]
                for task in array_tasks:
                    for result in results:
                        # Job hashes were built as "<str(task)>_<repeat>".
                        if result.startswith(str(task)):
                            record = results[result]
                            if record.valid:
                                record.arch_sol = record.task_sols[0]
                                history_local.append(record)
                                self.search_cache[i].append(record)
                if len(history_local) == 0:
                    # No usable design for this array: give up and return
                    # whatever legal records are known so far.
                    return legal_records
                def take_latency(record):
                    return record.latency
                history_local.sort(key=take_latency)
                # Only take up to 2 designs for scalability issues
                # (the actual cap is "history_thres")
                history_local = history_local[:min(len(history_local), history_thres)]
                history[i] = history_local
                idx += 1

            # multi-task array
            for i in multi_task_arrays:
                # Update the history
                history_tmp = []
                for record in history[i]:
                    if record.cst["BRAM18K"] <= resource_alloc["BRAM18K"][i] and \
                       record.cst["DSP"] <= resource_alloc["DSP"][i]:
                       history_tmp.append(record)
                if legal_records and is_first:
                    if legal_records[i].cst["BRAM18K"] <= resource_alloc["BRAM18K"][i] and \
                       legal_records[i].cst["DSP"] <= resource_alloc["DSP"][i]:
                        history_tmp.append(legal_records[i])
                        self.search_cache[i].append(legal_records[i])
                history[i] = history_tmp
                if skip_search == 1:
                    if (i not in resource_alloc["slow_idx"]) and (i not in resource_alloc["fast_idx"]) and len(history[i]) > 0:
                        continue
                explorer_tmp = copy.deepcopy(self.params["explorer"])
                # Update the constraints
                explorer_tmp.cst.hw_cst["DSP"] = resource_alloc["DSP"][i]
                explorer_tmp.cst.hw_cst["BRAM18K"] = resource_alloc["BRAM18K"][i]
                # NOTE(review): "early_stop" is only referenced by the commented-out
                # sequential version below; the parallel jobs pass -1 directly.
                early_stop = -1
                if self.params["use_uram_all"]:
                    # Per-task configs: only the first task of the first array
                    # and the last task of the last array get mode 0.
                    search_task_configs = {}
                    for task_idx in range(len(partition[i])):
                        search_task_configs[task_idx] = {'cin_read_mode': 2, 'cout_write_mode': 1}
                    if i == 0:
                        search_task_configs[0]["cin_read_mode"] = 0
                    if i == n_arrays - 1:
                        search_task_configs[len(partition[i]) - 1]["cout_write_mode"] = 0
                else:
                    search_task_configs = None
                job_list = []
                for design_idx in self.params["design_idx_list"]:
                    # Parallel version
                    job_list.append(
                        {
                            "job_hash": f"{design_idx}",
                            "func": explorer_tmp.search_non_fusion_single_acc_customized1,
                            "args": [design_idx, search_task_configs, -1, self.sub_task_silent, partition[i], None, True]
                        })
                    # Sequential version
                    #search_record = explorer_tmp.search_non_fusion_single_acc_customized1(\
                    #    design_idx=design_idx, silent=self.sub_task_silent, \
                    #    workload_idx=partition[i], early_stop=early_stop, one_gen=True)
                    #if search_record.valid:
                    #    early_stop = search_record.latency
                    #    history[i].append(search_record)
                    #    self.search_cache[i].append(search_record)
                # The multi-task searches get a smaller worker pool (at least 4).
                pool = utils.MyExecutor(max(int(self.n_worker/8), 4))
                results = pool.exec(job_list)
                for design_idx in self.params["design_idx_list"]:
                    search_record = results[f"{design_idx}"]
                    if search_record.valid:
                        history[i].append(search_record)
                        self.search_cache[i].append(search_record)
                def take_latency(record):
                    return record.latency
                history[i].sort(key=take_latency)
                history[i] = history[i][:min(len(history[i]), history_thres)]

            # Find the array combination that satisfies the memory usage
            # Enumerate every combination of the kept per-array candidates and
            # keep the non-overusing one with the highest throughput.
            choices_tmp = [list(range(len(h))) for h in history]
            choices = list(itertools.product(*choices_tmp))
            max_bram_tmp = 0
            min_bram_tmp = float("inf")
            best_throughput_tmp = 0
            for choice in choices:
                records_tmp = []
                for i in range(n_arrays):
                    records_tmp.append(history[i][choice[i]])
                latency, throughput, _ = self.est_latency(partition, records_tmp)
                if not self.overuse_resource(partition, records_tmp):
                    if throughput > best_throughput_tmp:
                        records = records_tmp
                        best_throughput_tmp = throughput

            # Search for several designs with fewer resource for tuning
            # These extra records are only stored in "self.search_cache".
            if records and fine_tune == 1:
                # single-task array
                max_attempt = 3
                n_attempt_list = [max_attempt for i in range(n_arrays)]
                for i in multi_task_arrays:
                    n_attempt_list[i] = 0
                last_record = [None for i in range(n_arrays)]
                # Repeat until every single-task array has used up its attempts.
                while any(y > 0 for y in n_attempt_list):
                    job_list = []
                    skip_idx = []
                    tasks = []
                    for i in single_task_arrays:
                        if i not in resource_alloc["fast_idx"]:
                            skip_idx.append(i)
                            n_attempt_list[i] = 0
                            continue
                        if int(resource_alloc["BRAM18K"][i]) in self.search_cache_cst[i]:
                            # Candidate search has been done for this one before
                            skip_idx.append(i)
                            n_attempt_list[i] = 0
                            continue
                        array_tasks = []
                        unit_dec_bram = 4 # Decrease by 4 each time
                        # Continue below the BRAM constraint of the last record
                        # found for this array, if any.
                        if last_record[i]:
                            dec_bram = records[i].cst["BRAM18K"]- last_record[i].cst["BRAM18K"] + unit_dec_bram
                        else:
                            dec_bram = unit_dec_bram
                        slow_idx_list = resource_alloc["slow_idx"]
                        fast_idx_list = resource_alloc["fast_idx"]
                        # NOTE(review): "ub_latency" is computed here but not used
                        # in the single-task branch.
                        ub_latency = (records[slow_idx_list[0]].latency - records[i].latency) / (len(fast_idx_list) + 1) + records[i].latency
                        n_attempt = n_attempt_list[i]
                        if n_attempt == max_attempt:
                            # First attempt for this allocation: remember it so
                            # later calls skip the candidate search.
                            self.search_cache_cst[i].append(int(resource_alloc["BRAM18K"][i]))
                        if n_attempt > 0:
                            explorer_tmp = copy.deepcopy(self.params["explorer"])
                            explorer_tmp.cst.hw_cst["DSP"] = resource_alloc["DSP"][i]
                            explorer_tmp.cst.hw_cst["BRAM18K"] = records[i].cst["BRAM18K"] - dec_bram
                            # Look up the index of the design used by the current record.
                            # NOTE(review): "cur_design_idx" stays unbound (or keeps a
                            # value from an earlier array) if no design name matches.
                            for design_idx in self.params["design_idx_list"]:
                                design = explorer_tmp.designs[design_idx]
                                if design.name == records[i].design:
                                    cur_design_idx = design_idx
                            # Reuse a cached record searched under the same constraints.
                            search_record = None
                            for r_c in self.search_cache[i]:
                                if r_c.cst["BRAM18K"] == explorer_tmp.cst.hw_cst["BRAM18K"] and \
                                   r_c.cst["DSP"] == explorer_tmp.cst.hw_cst["DSP"] and \
                                   r_c.design == explorer_tmp.designs[cur_design_idx].name:
                                    search_record = r_c
                                    last_record[i] = search_record
                                    break
                            if not search_record:
                                search_task = SingleTask(explorer_tmp.designs[cur_design_idx], self.search_task.tasks[partition[i][0]].workload, explorer_tmp.cst)
                                # Update the task configs
                                if i == 0:
                                    # Load from DRAM
                                    search_task.configs["cin_read_mode"] = 0
                                elif (i > 0 and len(partition[i - 1]) > 1):
                                    if self.params["use_uram_all"]:
                                        search_task.configs["cin_read_mode"] = 2
                                    else:
                                        search_task.configs["cin_read_mode"] = 0
                                else:
                                    # Access on-chip streaming buffers
                                    search_task.configs["cin_read_mode"] = 2
                                if i == len(partition) - 1:
                                    # Write to DRAM
                                    search_task.configs["cout_write_mode"] = 0
                                elif (i < len(partition) - 1 and len(partition[i + 1]) > 1):
                                    if self.params["use_uram_all"]:
                                        search_task.configs["cout_write_mode"] = 1
                                    else:
                                        search_task.configs["cout_write_mode"] = 0
                                else:
                                    search_task.configs["cout_write_mode"] = 1
                                for repeat in range(1):
                                    job_list.append(
                                    {
                                        "job_hash": f"{str(search_task)}_{repeat}",
                                        "func": explorer_tmp.tune,
                                        "args": [search_task, None, self.sub_task_silent, 0]
                                    })
                                array_tasks.append(search_task)
                        tasks.append(array_tasks)

                    pool = utils.MyExecutor(self.n_worker)
                    results = pool.exec(job_list)

                    idx = 0
                    for i in single_task_arrays:
                        if i in skip_idx:
                            continue
                        array_tasks = tasks[idx]
                        no_valid_record = True
                        for task in array_tasks:
                            for result in results:
                                if result.startswith(str(task)):
                                    record = results[result]
                                    if record.valid:
                                        record.arch_sol = record.task_sols[0]
                                        self.search_cache[i].append(record)
                                        last_record[i] = record
                                        no_valid_record = False

                        idx += 1
                        # Stop trying this array as soon as a round yields no
                        # new valid record (this includes rounds served from
                        # the cache, where no job was submitted).
                        if no_valid_record:
                            n_attempt_list[i] = 0
                        else:
                            n_attempt_list[i] -= 1

                # multi-task array
                for i in multi_task_arrays:
                    if i not in resource_alloc["fast_idx"]:
                        continue
                    if int(resource_alloc["BRAM18K"][i]) in self.search_cache_cst[i]:
                        continue
                    unit_dec_bram = 16 # Start with 16
                    dec_bram = unit_dec_bram
                    slow_idx_list = resource_alloc["slow_idx"]
                    fast_idx_list = resource_alloc["fast_idx"]
                    ub_latency = (records[slow_idx_list[0]].latency - records[i].latency) / (len(fast_idx_list) + 1) + records[i].latency
                    n_attempt = 2 # Search two designs for multi-task array
                    while n_attempt > 0:
                        explorer_tmp = copy.deepcopy(self.params["explorer"])
                        explorer_tmp.cst.hw_cst["DSP"] = resource_alloc["DSP"][i]
                        explorer_tmp.cst.hw_cst["BRAM18K"] = records[i].cst["BRAM18K"] - dec_bram
                        # NOTE(review): same "cur_design_idx" caveat as in the
                        # single-task branch above.
                        for design_idx in self.params["design_idx_list"]:
                            design = explorer_tmp.designs[design_idx]
                            if design.name == records[i].design:
                                cur_design_idx = design_idx
                        search_record = None
                        for r_c in self.search_cache[i]:
                            if r_c.cst["BRAM18K"] == explorer_tmp.cst.hw_cst["BRAM18K"] and \
                               r_c.cst["DSP"] == explorer_tmp.cst.hw_cst["DSP"] and \
                               r_c.design == explorer_tmp.designs[cur_design_idx].name:
                                search_record = r_c
                                break
                        if not search_record:
                            if self.params["use_uram_all"]:
                                search_task_configs = {}
                                for task_idx in range(len(partition[i])):
                                    search_task_configs[task_idx] = {'cin_read_mode': 2, 'cout_write_mode': 1}
                                if i == 0:
                                    search_task_configs[0]["cin_read_mode"] = 0
                                if i == n_arrays - 1:
                                    search_task_configs[len(partition[i]) - 1]["cout_write_mode"] = 0
                            else:
                                search_task_configs = None
                            search_record = explorer_tmp.search_non_fusion_single_acc_customized1(\
                                design_idx=cur_design_idx, search_task_configs=search_task_configs, \
                                silent=self.sub_task_silent, \
                                workload_idx=partition[i], one_gen=True)
                            if search_record.valid:
                                self.search_cache[i].append(search_record)
                        if search_record.valid:
                            if n_attempt == 2 and search_record.latency > ub_latency:
                                # The first, coarse step was too aggressive:
                                # retry with a finer step.
                                unit_dec_bram = 4
                                dec_bram = unit_dec_bram
                            else:
                                dec_bram = records[i].cst["BRAM18K"]- search_record.cst["BRAM18K"] + unit_dec_bram
                        else:
                            break
                        n_attempt -= 1
                    self.search_cache_cst[i].append(int(resource_alloc["BRAM18K"][i]))

            is_first = False
            if fine_tune:
                # From now on only the slow/fast arrays are re-searched.
                skip_search = 1
                if len(records) == 0:
                    # No legal combination: ask for an adjustment, flagging
                    # the memory overuse.
                    if not adjust_func(partition, resource_alloc, legal_records, 1):
                        break
                else:
                    if best_throughput_tmp > best_throughput:
                        legal_records = copy.deepcopy(records)
                        best_throughput = best_throughput_tmp

                    # Remember the previous bottleneck before refreshing the
                    # slow/fast lists and the per-array latencies.
                    old_slow_idx = resource_alloc["slow_idx"][0]
                    old_slow_record_latency = resource_alloc["array_latency"][old_slow_idx]
                    slow, fast = self.update_bottleneck_idx(records)
                    resource_alloc["slow_idx"] = slow
                    resource_alloc["fast_idx"] = fast
                    resource_alloc["array_latency"] = [record.latency for record in records]

                    # For internal testing
                    #print("****************** Tuning ******************")
                    #latency_list = [r.latency for r in records]
                    #dsp_list = [r.cst["DSP"] for r in records]
                    #bram_list = [r.cst["BRAM18K"] for r in records]
                    #dsp_eff_list = [r.dsp_eff for r in records]
                    #print("max latency: ", 1 / best_throughput_tmp)
                    #print("latency list: ", latency_list)
                    #print("bram list: ", bram_list)
                    #print("dsp list: ", dsp_list)
                    #print("dsp eff: ", dsp_eff_list)
                    #print("****************** Tuning ******************")

                    if resource_alloc["slow_idx"][0] == old_slow_idx:
                        # NOTE(review): "i" is whatever value the last executed
                        # "for i in ..." loop above left behind, not an index chosen
                        # here. Presumably the bottleneck array (old_slow_idx) was
                        # intended - TODO confirm.
                        if records[i].latency <= old_slow_record_latency:
                            break
                        if not adjust_func(partition, resource_alloc, records, 0):
                            break
                    else:
                        # The bottleneck moved to another array: stop here.
                        break
            else:
                if len(records) == 0:
                    # No legal combination: halve every BRAM budget and retry.
                    # NOTE(review): no lower bound is visible for this halving.
                    # Termination appears to rely on the early return above when
                    # a single-task array yields no valid design - TODO confirm
                    # for partitions made of multi-task arrays only.
                    resource_alloc["BRAM18K"] = [n / 2 for n in resource_alloc["BRAM18K"]]
                    #resource_alloc["DSP"] = [n / 2 for n in resource_alloc["DSP"]]
                else:
                    legal_records = records
                    break

        return legal_records

    def search_design(self, partition_idx):
        """ Tune one partition candidate and return the best reward found for it.

        This method is used as the objective function of the Bayesian optimizer
        created in search(). `partition_idx` is truncated to an int and used as an
        index into self.params['partition_candidates']. Results are memoized in
        self.bay_search_log so a partition is tuned at most once.

        The tuning loop evaluates the current records, updates the global and
        local best rewards, and then either stops (convergence, no improvement,
        time-out, no fine-tuning required, or no legal allocation left) or
        re-allocates resources and searches for a new legal configuration.

        Returns the best local reward (the throughput reported by self.evaluate),
        or 0 if no legal configuration was found for the partition.
        """
        partition_idx = int(partition_idx)
        # Reuse the result if this partition has already been tuned
        if partition_idx in self.bay_search_log:
            return self.bay_search_log[partition_idx]
        partition = self.params['partition_candidates'][partition_idx]['partition']
        self.log(f"Partition {partition_idx}: {partition}")
        rewards_window = []
        self.counter.init_counter('local_time')
        local_best_reward = 0
        n_arrays = len(partition)
        # Store all the search records for each array
        for i in range(n_arrays):
            self.search_cache[i] = []
        # Store the resource constraint used for each search to avoid redundant search
        for i in range(n_arrays):
            self.search_cache_cst[i] = []

        # Initialize resource allocation
        resource_alloc = self.resource_alloc(partition)

        # Find a legal config
        records = self.find_legal_config(partition, resource_alloc, skip_search=0)
        if records:
            self.local_epoch = 0
            self.last_update_epoch = 0
            last_slow_idx = -1
            while True:
                latency, used_constraints, throughput, meta = self.evaluate(partition, records)
                dsp_eff = self.est_dsp_eff(throughput, used_constraints)
                reward = throughput
                search_record = utils.SearchRecord().extract_from_tuner_multi_acc(records, reward, latency, used_constraints, throughput, dsp_eff, partition=partition)
                # Update global reward
                if reward > self.best_reward:
                    self.best_reward = reward
                    self.best_search_record = search_record
                    self.log(f'Global Epoch {self.epoch} - Partition {partition_idx} - #Array {n_arrays}: new global best reward: {self.best_reward} (latency: {latency:.0f}, throughput: {throughput}, DSP eff: {dsp_eff:.2f}, BRAM: {used_constraints["BRAM18K"]:.2f}, DSP: {used_constraints["DSP"]:.2f}, URAM: {used_constraints["URAM"]:.2f}, BW: {search_record.bw:.2f})')
                self.best_rewards.append(self.best_reward)
                self.counter.update_counter('time')
                self.best_rewards_time.append(self.counter.get_counter('time'))
                # Update local reward
                if reward > local_best_reward:
                    local_best_reward = reward
                    # Report the local best here; the global best may be larger
                    # and is reported by the message above.
                    self.log(f'Local Epoch {self.local_epoch} - Partition {partition_idx} - #Array {n_arrays}: new local best reward: {local_best_reward} (latency: {latency:.0f}, throughput: {throughput}, DSP eff: {dsp_eff:.2f}, BRAM: {used_constraints["BRAM18K"]:.2f}, DSP: {used_constraints["DSP"]:.2f}, URAM: {used_constraints["URAM"]:.2f}, BW: {search_record.bw:.2f})')
                    self.last_update_epoch = self.local_epoch
                rewards_window.append(reward)

                # Converged: the relative spread of the last three rewards is small.
                # NOTE(review): the window is fixed at 3 while the trigger uses
                # params["max_trial"]; they coincide only when max_trial == 3.
                if len(rewards_window) > self.params["max_trial"]:
                    stdev_percent = np.std(rewards_window[-3:]) / np.mean(rewards_window[-3:])
                    if stdev_percent < self.params["reward_stdev_thres"]:
                        self.log(f'Minimal improvement after {self.params["max_trial"]} rounds, terminated')
                        break
                if self.local_epoch - self.last_update_epoch > self.params["max_trial"]:
                    self.log(f'No improvement after {self.params["max_trial"]} rounds, terminated')
                    break
                # If the tuning time is too long, kill it
                self.counter.update_counter('local_time')
                if self.counter.get_counter("local_time") > self.max_time:
                    self.log('Time out, terminated')
                    break

                # Fine-tuning
                if self.is_finetune_required(records, dsp_eff):
                    # Find fastest/slowest design index
                    slow, fast = self.update_bottleneck_idx(records)
                    # Update resource alloc to reflect the current usage
                    for i in range(len(records)):
                        resource_alloc['DSP'][i] = np.ceil(records[i].cst['DSP'])
                        resource_alloc['BRAM18K'][i] = np.ceil(records[i].cst['BRAM18K'])
                    # Adjust resource alloc
                    resource_alloc["init"] = {"DSP": copy.deepcopy(resource_alloc['DSP']),
                                              "BRAM18K": copy.deepcopy(resource_alloc['BRAM18K']),
                                              "DSP_total": used_constraints['DSP'],
                                              "BRAM18K_total": used_constraints['BRAM18K']}
                    resource_alloc["array_latency"] = [record.latency for record in records]
                    # State 1 marks that the slowest array is the same as in the previous round
                    resource_alloc["state"] = 0
                    if slow[0] == last_slow_idx:
                        resource_alloc["state"] = 1
                    resource_alloc["slow_idx"] = slow
                    resource_alloc["fast_idx"] = fast
                    resource_alloc["step"] = [[0, 1], [0.025]] # step for resource adjustment
                    resource_alloc["n_adjust"] = [0, 0] # number of attempts at each state
                    resource_alloc["decrease"] = [-1, -1] # indicate if the allocation of bram decreases in the previous round
                    resource_alloc["history"] = [0, 0] # bram allocation in the last round
                    last_slow_idx = slow[0]
                    if not self.resource_alloc_adjust(partition, resource_alloc, records, 0):
                        self.log('No valid resource allocation found, terminated')
                        break
                    records = self.find_legal_config(partition, resource_alloc, old_records=records, adjust_func=self.resource_alloc_adjust, fine_tune=1, skip_search=0)
                    if not records:
                        self.log('No valid records found, terminated')
                        break
                else:
                    self.log('Fine-tuning not required, terminated')
                    break

                self.epoch += 1
                self.local_epoch += 1

        # Memoize the result and record the progress of the outer Bayesian search
        self.bay_search_log[partition_idx] = local_best_reward
        self.bayopt_epoch += 1
        self.bayopt_best_rewards.append(self.best_reward)
        self.counter.update_counter('time')
        self.bayopt_best_rewards_time.append(self.counter.get_counter('time'))
        return local_best_reward

    def search(self):
        """ Drive the partition exploration with a Bayesian optimizer.

        The optimizer maximizes self.search_design over a single parameter,
        `partition_idx`, bounded by the index range of
        self.params["partition_candidates"]. Every entry of
        self.params['probe_points'] is queued as a lazy probe before the
        optimizer runs 10 iterations with no random initial points.

        Raises RuntimeError if the search task holds fewer than two layers.
        """
        layer_count = len(self.search_task.tasks)
        self.n_layers = layer_count
        if layer_count < 2:
            raise RuntimeError("Multi-acc exploration requires at least two conv layers.")
        self.counter.init_counter('time')

        # Bayesian Tuner; the upper bound is the last valid candidate index
        last_candidate_idx = len(self.params["partition_candidates"]) - 1
        optimizer = BayesianOptimization(
            f=self.search_design,
            pbounds={'partition_idx': (0, last_candidate_idx)},
            random_state=1
        )
        # Queue the user-selected partitions so they are evaluated first
        for probe_idx in self.params['probe_points']:
            optimizer.probe(params=[probe_idx], lazy=True)
        optimizer.maximize(init_points=0, n_iter=10)

def multi_acc_search2(search_task, init_tasks, cst, search_obj, max_epochs, max_time, \
                      n_worker=1, silent=0, population_size=20, policy=0, meta=None, explorer=None, profiling=0):
    """ This function finds the best multi-array architecture for a list of tasks.

    Per the original author's description: instead of restraining the resource of
    each array and searching the best config for each one in isolation, the arrays
    are searched in sequence; when searching the next array the previous one is
    taken into account, and configs resulting in a longer overall latency
    (setup + array latency) are penalized.
    NOTE(review): the original text named multi_acc_search2 on both sides of this
    comparison; the counterpart is presumably multi_acc_search1 - confirm.

    Arguments:
    - search_task, cst, search_obj, n_worker, silent: forwarded to MultiAccTuner2.
    - init_tasks: used with search_task.tasks to compute the ideal latency.
    - max_epochs, max_time: forwarded to the tuner; if max_epochs is not positive,
      max_time is overridden to 1800 seconds.
    - meta: required dict providing "partition_candidates",
      "init_partition_candidates", "batch_size" and "design_idx_list".
    - explorer: required; stored in the tuner parameters, and
      explorer.search_config['workload'] names the output file.
    - population_size, policy, profiling: accepted but unused here.

    The history of best rewards is written to
    tmp/tuning_rewards_<workload>_multi2.csv (the directory is created if needed).

    Returns tuner.best_search_record.
    Raises ValueError if meta or explorer is missing.
    """
    import logging
    import os

    # Both are dereferenced unconditionally below (explorer only after the whole
    # search has finished), so reject missing values before any work is done.
    if meta is None or explorer is None:
        raise ValueError("multi_acc_search2 requires both 'meta' and 'explorer'.")

    logger = logging.getLogger('AutoSA-Tuner')
    if silent == 0:
        logger.info("Performing cross layer multi-accelerator genetic search...")

    best_latency = utils.compute_tasks_latency(search_task.tasks, init_tasks)
    if silent == 0:
        logger.info(f'Cross-layer multi-accelerator ideal latency: {best_latency}')

    tuner_params = {
        "explorer": explorer,
        "probe_points": meta["init_partition_candidates"],
        "best_reward": 1 / best_latency if best_latency else None,
        "partition_candidates": meta["partition_candidates"],
        "batch_size": meta["batch_size"],
        "dsp_eff_thres": 0.85, # If the DSP eff is greater than this thres, no fine-tuning is required.
        "latency_stdev_thres": 0.03,
        "reward_stdev_thres": 0.025,
        "max_trial": 3, # Terminate fine-tuning after more than x trials
        "design_idx_list": meta['design_idx_list']
    }

    if max_epochs <= 0:
        max_time = 1800 # 30 minutes at most

    tuner = MultiAccTuner2(search_task, cst, search_obj, max_epochs, max_time, tuner_params, n_worker, silent)
    tuner.search()

    search_record = tuner.best_search_record

    config_str = f"_{explorer.search_config['workload']}_multi2"
    # Make sure the output directory exists so a finished search is not lost
    os.makedirs('tmp', exist_ok=True)
    with open(f'tmp/tuning_rewards{config_str}.csv', "w", newline='') as f:
        fieldnames = ['epoch', 'reward', 'time']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for epoch, (reward, elapsed) in enumerate(zip(tuner.best_rewards, tuner.best_rewards_time)):
            writer.writerow({'epoch': epoch, 'reward': reward, 'time': elapsed})

    return search_record

class MultiAccTuner2(MultiAccTuner1):
    def __init__(self, search_task, cst, obj, max_epoch, max_time, params, n_worker=1, silent=0):
        """ Forward every argument unchanged to the MultiAccTuner1 initializer. """
        super().__init__(
            search_task, cst, obj, max_epoch, max_time, params,
            n_worker=n_worker,
            silent=silent
        )

    def est_mem(self, partition, records, verbose=0):
        """ Estimate the on-chip memory usage of a multi-array design.

        Two kinds of memory are accounted for:
        - BRAM18K: the sum of record.cst["BRAM18K"] over all arrays.
        - URAM: the streaming buffers between adjacent arrays.

        For two adjacent arrays with row/column tiling factors (tr1, tc1) and
        (tr2, tc2), where tr1/tc1 are divided by the stride of any 'maxpool_<s>'
        tag of the producer layer and k is the consumer's 'p' parameter:
        c0 = ceil((tr2 + k - 1) / tr1),  tr' = min(c0 * tr1, r1)
        c1 = ceil((tc2 + k - 1) / tc1),  tc' = min(c1 * tc1, c1_dim)
        The buffer holds min(tr' * c * o1, tc' * r * o1) elements, where r, c, o1
        are the producer layer's 'r', 'c', 'o' parameters. The buffer of array i
        is the maximum over all rounds in which both array i - 1 and array i have
        a layer to run.
        NOTE(review): the constants 72 and 4096 presumably describe the width
        (bits) and depth of one URAM block, and search_task.dw the data width in
        bytes - confirm.

        Returns (usage, detail): usage is {"BRAM18K": ..., "URAM": ...}; detail is
        None when verbose is 0, otherwise {"array_bufs": ..., "stream_bufs": ...}
        with the per-array numbers.
        """
        tasks = self.search_task.tasks
        n_records = len(records)

        # array bufs
        array_bufs = [rec.cst["BRAM18K"] for rec in records]
        BRAM18K_total = sum(array_bufs)

        # streaming buffers; entry i is the buffer between array i - 1 and array i
        stream_bufs = [0] * n_records
        for round_idx in range(len(partition[0])):
            for i in range(1, n_records):
                # Both neighbours need a layer in this round
                if round_idx >= len(partition[i - 1]) or round_idx >= len(partition[i]):
                    continue
                producer_workload = tasks[partition[i - 1][round_idx]].workload
                consumer_workload = tasks[partition[i][round_idx]].workload
                producer_params = producer_workload['params']
                consumer_params = consumer_workload['params']
                producer_sol = records[i - 1].task_sols[round_idx]['sol']
                consumer_sol = records[i].task_sols[round_idx]['sol']

                # Extract parameters of array 1
                o1 = producer_params['o']
                tr1 = min(producer_sol['r_t1'], producer_params['r'])
                tc1 = min(producer_sol['c_t1'], producer_params['c'])
                for tag in producer_workload['tags']:
                    if not tag.startswith('maxpool'):
                        continue
                    # Pooling shrinks the tile produced for the next array
                    stride = int(tag.split('_')[-1])
                    tr1 /= stride
                    tc1 /= stride
                tr1 = max(int(tr1), 1)
                tc1 = max(int(tc1), 1)

                # Extract parameters of array 2
                tr2 = min(consumer_sol['r_t1'], consumer_params['r'])
                tc2 = min(consumer_sol['c_t1'], consumer_params['c'])
                k = consumer_params['p']
                data_pack = consumer_sol['i_t2']

                # Compute the buffer size
                c0 = np.ceil((tr2 + k - 1) / tr1)
                c1 = np.ceil((tc2 + k - 1) / tc1)
                trp = min(c0 * tr1, producer_params['r'])
                tcp = min(c1 * tc1, producer_params['c'])
                ele_num = min(trp * producer_params['c'] * o1, tcp * producer_params['r'] * o1)

                n_wide = np.ceil(self.search_task.dw * data_pack * 8 / 72)
                n_deep = np.ceil(ele_num / data_pack / 4096)
                buffer = n_wide * n_deep
                stream_bufs[i] = max(stream_bufs[i], buffer)

        usage = {"BRAM18K": BRAM18K_total, "URAM": np.sum(stream_bufs)}
        if verbose == 0:
            return usage, None
        return usage, {"array_bufs": array_bufs, "stream_bufs": stream_bufs}

    #def overuse_resource(self, partition, records):
    #    for record in records:
    #        if record.valid == 0:
    #            return True
    #    mem, meta = self.est_mem(partition, records)
    #    DSP = 0
    #    for record in records:
    #        DSP += record.cst["DSP"]
    #    BRAM18K = mem["BRAM18K"]
    #    URAM = mem["URAM"]
    #    if BRAM18K > self.cst.hw_cst["BRAM18K"]:
    #        return True
    #    if URAM > self.cst.hw_cst["URAM"]:
    #        return True
    #    if DSP > self.cst.hw_cst["DSP"]:
    #        return True
#
    #    return False

    #def est_resource(self, partition, records):
    #    mem, meta = self.est_mem(partition, records)
    #    DSP = 0
    #    for record in records:
    #        DSP += record.cst["DSP"]
#
    #    return {"DSP": DSP, "BRAM18K": mem["BRAM18K"], "URAM": mem["URAM"]}

    def est_latency(self, partition, records, in_place=0, adjust=0, verbose=0):
        """ Compute the latency of the design.

        The execution model is that at each round, each array will execute the layer at the head of
        its partition list. Between arrays, there are streaming buffers that make sure the computation
        gets started as soon as the data are available from the previous array.
        The next round starts once all the arrays have finished their tasks.

        Per round, array 0 runs for its task latency times the batch size. Every
        following array that has a layer in the round (and whose predecessor has
        one too) adds a setup latency - the predecessor's task latency divided by
        the number of tr'/tc' blocks along one dimension - and runs for the
        maximum of its own batched latency and the predecessor's array latency.
        The round latency is the sum of all setup latencies plus the array
        latency of the last processed array.

        Arguments:
        - in_place: if set, records[i].latency is overwritten with the accumulated
          latency of array i for every i >= 1 (records[0] is left untouched).
        - adjust: modelling of stalls between arrays; not implemented, raises
          RuntimeError when set and a pair of adjacent arrays is processed.
        - verbose: accepted for interface compatibility; the return value is the
          same for every value.

        Returns (design_latency, throughput, meta) where throughput is
        batch_size / design_latency and meta holds "total_latency" (per array)
        and "round_info" (per round: latency, setup latencies, off-chip traffic
        and the raw task latency of each array).
        """
        batch_size = self.params["batch_size"]
        tasks = self.search_task.tasks
        n_records = len(records)

        record_latency = [[s["latency"] for s in r.task_sols] for r in records]

        total_latency = [0 for _ in range(n_records)]
        for latency in record_latency[0]:
            total_latency[0] += (latency * batch_size)
        # Store the setup/array latency of each array at each round
        round_info = []

        design_latency = 0
        for round_idx in range(len(partition[0])):
            setup_latency = [0]
            array_latency = [record_latency[0][round_idx] * batch_size]

            # Update the array and setup latency
            for i in range(1, n_records):
                # Both neighbours need a layer in this round
                if round_idx >= len(partition[i - 1]) or round_idx >= len(partition[i]):
                    continue
                array1_workload = tasks[partition[i - 1][round_idx]].workload
                array1_params = array1_workload['params']
                array2_params = tasks[partition[i][round_idx]].workload['params']
                array1_sol = records[i - 1].task_sols[round_idx]['sol']
                array2_sol = records[i].task_sols[round_idx]['sol']

                # Extract the parameters of array 1
                tr1 = min(array1_sol['r_t1'], array1_params['r'])
                tc1 = min(array1_sol['c_t1'], array1_params['c'])
                # Tile size seen by the next array after any max pooling
                tr1_post = tr1
                tc1_post = tc1
                for tag in array1_workload['tags']:
                    if tag.startswith('maxpool'):
                        stride = int(tag.split('_')[-1])
                        tr1_post /= stride
                        tc1_post /= stride
                tr1_post = max(int(tr1_post), 1)
                tc1_post = max(int(tc1_post), 1)
                # Extract parameters of array 2
                tr2 = min(array2_sol['r_t1'], array2_params['r'])
                tc2 = min(array2_sol['c_t1'], array2_params['c'])
                k = array2_params['p']

                # Number of array-1 tiles needed before array 2 can start
                c0 = np.ceil((tr2 + k - 1) / tr1_post)
                c1 = np.ceil((tc2 + k - 1) / tc1_post)
                trp = min(c0 * tr1, array1_params['r'])
                tcp = min(c1 * tc1, array1_params['c'])
                # Set up latency
                if trp > tcp:
                    setup = record_latency[i - 1][round_idx] / np.ceil(array1_params['c'] / tcp)
                else:
                    setup = record_latency[i - 1][round_idx] / np.ceil(array1_params['r'] / trp)
                setup_latency.append(setup)

                # Adjust the array latency
                if adjust:
                    # A fine-grained produce/consume model (fill latency of the
                    # previous array vs. consume latency of this array per block)
                    # was drafted here but never enabled.
                    raise RuntimeError("Array latency adjust for multi-array is not implemented.")
                # Simply compute the max
                array_latency.append(max(record_latency[i][round_idx] * batch_size, array_latency[i - 1]))

                for prev_i in range(i + 1):
                    total_latency[i] += setup_latency[prev_i]
                total_latency[i] += array_latency[i]

            local_round_latency = sum(setup_latency) + array_latency[-1]
            design_latency += local_round_latency

            # Off-chip traffic: only the first array reads cin from DRAM and only
            # the last array writes cout to DRAM; weights are counted for all.
            total_off_chip_trans = 0
            for i in range(n_records):
                if round_idx >= len(partition[i]):
                    break
                off_chip_acc_num_meta = records[i].task_sols[round_idx]["reward_meta"]["activity"]["off_chip_acc_num_meta"]
                cin_trans = 0
                w_trans = 0
                cout_trans = 0
                for module in off_chip_acc_num_meta:
                    if module.startswith("cin"):
                        cin_trans = off_chip_acc_num_meta[module]
                    if module.startswith("w"):
                        w_trans = off_chip_acc_num_meta[module]
                    if module.startswith("cout"):
                        cout_trans = off_chip_acc_num_meta[module]
                if i >= 1:
                    cin_trans = 0
                if i < n_records - 1:
                    cout_trans = 0
                total_off_chip_trans += (cin_trans + w_trans + cout_trans)

            round_info.append({"latency": local_round_latency, "setup": setup_latency,
                               "total_off_chip_trans": total_off_chip_trans,
                               "array": [records[i].task_sols[round_idx]["latency"]
                                         for i in range(n_records)
                                         if round_idx < len(partition[i])],
                               "sol": [], "params": []})

        if in_place:
            for i in range(1, n_records):
                records[i].latency = total_latency[i]

        # Throughput
        throughput = 1 / design_latency * batch_size

        return design_latency, throughput, {"total_latency": total_latency, "round_info": round_info}

    #def evaluate(self, partition, records, verbose=0):
    #    latency, throughput, meta = self.est_latency(partition, records, verbose=verbose)
    #    resource = self.est_resource(partition, records)
    #    return latency, resource, throughput, meta

    def find_legal_config(self, partition, resource_alloc, old_records=None, adjust_func=None, fine_tune=0, skip_search=1):
        legal_records = old_records
        best_throughput = 0
        is_first = True
        n_arrays = len(partition)
        # Maintain a list of several best designs for each array
        history = [[] for i in range(n_arrays)]
        history_thres = 2
        #history_thres = 1
        if n_arrays > 10:
            # Avoid storing too many configs
            history_thres = 1

        while True:
            ## For internal testing
            #print("****************** Allocation ******************")
            #pprint.pprint(resource_alloc)
            #print("****************** Allocation ******************")
            #start  = time.time()

            records = []
            skip_idx = []
            job_list = []
            tasks = []
            for i in range(n_arrays):
                # Update the history
                history_tmp = []
                for record in history[i]:
                    if record.cst["BRAM18K"] <= resource_alloc["BRAM18K"][i] and \
                       record.cst["DSP"] <= resource_alloc["DSP"][i]:
                       history_tmp.append(record)
                if legal_records and is_first:
                    if legal_records[i].cst["BRAM18K"] <= resource_alloc["BRAM18K"][i] and \
                       legal_records[i].cst["DSP"] <= resource_alloc["DSP"][i]:
                       history_tmp.append(legal_records[i])
                       self.search_cache[i].append(legal_records[i])
                history[i] = history_tmp
                if skip_search == 1:
                    if (i in resource_alloc["fast_idx"] and len(history[i]) > 0) or \
                       ((i not in resource_alloc["fast_idx"]) and (i not in resource_alloc["slow_idx"]) and len(history[i]) > 0):
                    #if ((i not in resource_alloc["slow_idx"]) and (i not in resource_alloc["fast_idx"])) or len(history[i]) > 0:
                        #if resource_alloc["state"] == 0:
                        #    if i < min(resource_alloc["slow_idx"]):
                        #        skip_idx.append(i)
                        #        continue
                        #elif resource_alloc["state"] == 1:
                        #    if i < min(resource_alloc["slow_idx"]) and i < min(resource_alloc["fast_idx"]):
                        #        skip_idx.append(i)
                        #        continue

                        skip_idx.append(i)
                        continue

                #print("skipped: ", skip_idx)
                #job_list = []
                for design_idx in self.params["design_idx_list"]:
                    # Submit the search job
                    #local_start = time.time()
                    explorer_tmp = copy.deepcopy(self.params["explorer"])
                    #local_end = time.time()
                    #print("copy time: ", local_end - local_start)
                    # Update the constraints
                    explorer_tmp.cst.hw_cst["DSP"] = resource_alloc["DSP"][i]
                    explorer_tmp.cst.hw_cst["BRAM18K"] = resource_alloc["BRAM18K"][i]
                    early_stop = -1
                    search_task_configs = {}
                    for task_idx in range(len(partition[i])):
                        search_task_configs[task_idx] = {'cin_read_mode': 2, 'cout_write_mode': 1}
                    if i == 0:
                        for task_idx in range(len(partition[i])):
                            # Load from DRAM
                            search_task_configs[task_idx]['cin_read_mode'] = 0
                    if i == n_arrays - 1:
                        for task_idx in range(len(partition[i])):
                            # Write to DRAM
                            search_task_configs[task_idx]['cout_write_mode'] = 0

                    if i > 0 and len(history[i - 1]) > 0:
                        prev_array = {"record": history[i - 1][0], "workloads": partition[i - 1]}
                    else:
                        prev_array = None
                    # Parallel version
                    #print(f"{i}_{design_idx}")
                    job_list.append(
                        {
                            "job_hash": f"{i}_{design_idx}",
                            "func": explorer_tmp.search_non_fusion_single_acc_customized1,
                            "args": [design_idx, search_task_configs, -1, self.sub_task_silent, partition[i], None, True]
                            #"args": [design_idx, search_task_configs, -1, self.sub_task_silent, partition[i], prev_array, True]
                        }
                    )
                    # Sequential version
                    #search_record = explorer_tmp.search_non_fusion_single_acc_customized1(\
                    #    design_idx=design_idx, silent=self.sub_task_silent, \
                    #    workload_idx=partition[i], early_stop=early_stop, \
                    #    search_task_configs=search_task_configs, prev_array=prev_array, one_gen=True)
                    #if search_record.valid:
                    #    early_stop = search_record.latency
                    #    history[i].append(search_record)
                    #    self.search_cache[i].append(search_record)

            pool = utils.MyExecutor(max(int(self.n_worker/8), 8))
            results = pool.exec(job_list)
            for i in range(n_arrays):
                if i in skip_idx:
                    continue
                for design_idx in self.params["design_idx_list"]:
                    search_record = results[f"{i}_{design_idx}"]
                    if search_record.valid:
                        history[i].append(search_record)
                        self.search_cache[i].append(search_record)
                def take_latency(record):
                    return record.latency
                history[i].sort(key=take_latency)
                history[i] = history[i][:min(len(history[i]), history_thres)]

            #end = time.time()
            #print("eval time: ", end - start)
            #start  = time.time()

            # Find the array combination that satisfies the memory usage
            choices_tmp = [list(range(len(h))) for h in history]
            choices = list(itertools.product(*choices_tmp))
            #print("total choices: ", len(choices))
            max_bram_tmp = 0
            min_bram_tmp = float("inf")
            best_throughput_tmp = 0
            for choice in choices:
                records_tmp = []
                for i in range(n_arrays):
                    records_tmp.append(history[i][choice[i]])
                latency, throughput, _ = self.est_latency(partition, records_tmp)
                memory, meta = self.est_mem(partition, records_tmp, verbose=0)
                #print("array_bufs: ", meta["array_bufs"])
                #print("stream_bufs: ", meta["stream_bufs"])
                #if memory > max_bram_tmp:
                #    max_bram_tmp = memory["BRAM18K"]
                #if memory < min_bram_tmp:
                #    min_bram_tmp = memory["BRAM18K"]
                #if memory < self.cst.hw_cst["BRAM18K"]:
                if not self.overuse_resource(partition, records_tmp):
                    if throughput > best_throughput_tmp:
                        records = records_tmp
                        best_throughput_tmp = throughput

            #end = time.time()
            #print(f"select time: {end - start} (avg: {(end - start) / len(choices)})")
            #start  = time.time()

            # Search for several designs with fewer resource for tuning
            if records and fine_tune == 1:
                for i in range(n_arrays):
                    if i not in resource_alloc["fast_idx"]:
                        continue
                    if int(resource_alloc["BRAM18K"][i]) in self.search_cache_cst[i]:
                        continue
                    unit_dec_bram = 16 # Start with 16
                    dec_bram = unit_dec_bram
                    slow_idx_list = resource_alloc["slow_idx"]
                    fast_idx_list = resource_alloc["fast_idx"]
                    ub_latency = (records[slow_idx_list[0]].latency - records[i].latency) / (len(fast_idx_list) + 1) + records[i].latency
                    #print("cache ub latency: ", ub_latency)
                    n_attempt = 2
                    while n_attempt > 0:
                        explorer_tmp = copy.deepcopy(self.params["explorer"])
                        explorer_tmp.cst.hw_cst["DSP"] = resource_alloc["DSP"][i]
                        explorer_tmp.cst.hw_cst["BRAM18K"] = records[i].cst["BRAM18K"] - dec_bram
                        for design_idx in self.params["design_idx_list"]:
                            design = explorer_tmp.designs[design_idx]
                            if design.name == records[i].design:
                                cur_design_idx = design_idx
                        search_record = None
                        for r_c in self.search_cache[i]:
                            if r_c.cst["BRAM18K"] == explorer_tmp.cst.hw_cst["BRAM18K"] and \
                               r_c.cst["DSP"] == explorer_tmp.cst.hw_cst["DSP"] and \
                               r_c.design == explorer_tmp.designs[cur_design_idx].name:
                                search_record = r_c
                                break
                        if not search_record:
                            search_task_configs = {}
                            for task_idx in range(len(partition[i])):
                                search_task_configs[task_idx] = {'cin_read_mode': 2, 'cout_write_mode': 1}
                            if i == 0:
                                for task_idx in range(len(partition[i])):
                                    # Load from DRAM
                                    search_task_configs[task_idx]['cin_read_mode'] = 0
                            if i == n_arrays - 1:
                                for task_idx in range(len(partition[i])):
                                    # Write to DRAM
                                    search_task_configs[task_idx]['cout_write_mode'] = 0
                            if i > 0:
                                prev_array = {"record": records[i - 1], "workloads": partition[i - 1]}
                            else:
                                prev_array = None
                            search_record = explorer_tmp.search_non_fusion_single_acc_customized1(\
                                design_idx=cur_design_idx, silent=self.sub_task_silent, workload_idx=partition[i], \
                                #search_task_configs=search_task_configs, prev_array=prev_array, one_gen=True)
                                search_task_configs=search_task_configs, prev_array=None, one_gen=True)
                            if search_record.valid:
                                self.search_cache[i].append(search_record)
                        if search_record.valid:
                            #print("cache searching: ", search_record.cst["BRAM18K"], search_record.latency)
                            if n_attempt == 2 and search_record.latency > ub_latency:
                                unit_dec_bram = 4
                                dec_bram = unit_dec_bram
                            else:
                                dec_bram = records[i].cst["BRAM18K"]- search_record.cst["BRAM18K"] + unit_dec_bram
                        else:
                            break
                        n_attempt -= 1
                    self.search_cache_cst[i].append(int(resource_alloc["BRAM18K"][i]))

            #end = time.time()
            #print("cache time: ", end - start)

            is_first = False
            if fine_tune:
                skip_search = 1
                if len(records) == 0:
                    if not adjust_func(partition, resource_alloc, legal_records, 1):
                        break
                else:
                    if best_throughput_tmp > best_throughput:
                        legal_records = copy.deepcopy(records)
                        best_throughput = best_throughput_tmp

                    latency_list = [r.latency for r in records]
                    old_slow_idx = resource_alloc["slow_idx"][0]
                    old_slow_record_latency = resource_alloc["array_latency"][old_slow_idx]
                    slow, fast = self.update_bottleneck_idx(records)
                    resource_alloc["slow_idx"] = slow
                    resource_alloc["fast_idx"] = fast
                    resource_alloc["array_latency"] = [record.latency for record in records]

                    ## For internal testing
                    #print("****************** Tuning ******************")
                    #latency, throughput, meta = self.est_latency(partition, records, verbose=1)
                    #print("total_latency: ", meta["total_latency"])
                    #print("latency: ", latency)
                    #print("throughput: ", throughput)
                    #print("round_info: ")
                    #pprint.pprint(meta["round_info"])
                    #memory, meta = self.est_mem(partition, records, verbose=1)
                    #print("memory: ", memory)
                    #print("array_bufs: ", meta["array_bufs"])
                    #print("stream_bufs: ", meta["stream_bufs"])
                    #latency_list = [r.latency for r in records]
                    #dsp_list = [r.cst["DSP"] for r in records]
                    #dsf_eff_list = [r.dsp_eff for r in records]
                    #bram_list = [r.cst["BRAM18K"] for r in records]
                    #kernel_list = [r.design for r in records]
                    #print("latency list: ", latency_list)
                    #print("bram list: ", bram_list)
                    #print("dsp list: ", dsp_list)
                    #print("dsp eff list: ", dsf_eff_list)
                    #print("kernel list: ", kernel_list)
                    #print("****************** Tuning ******************")

                    if resource_alloc["slow_idx"][0] == old_slow_idx:
                        # If the performance is not improved upon the last time, break as well
                        if records[i].latency <= old_slow_record_latency:
                            break
                        if not adjust_func(partition, resource_alloc, records, 0):
                            break
                    else:
                        break
            else:
                if len(records) == 0:
                    resource_alloc["BRAM18K"] = [n / 2 for n in resource_alloc["BRAM18K"]]
                    #resource_alloc["DSP"] = [n / 2 for n in resource_alloc["DSP"]]
                else:
                    legal_records = records
                    break

        return legal_records

    def search_design(self, partition_idx):
        """Tune the multi-array design for one partition candidate.

        This is the objective function handed to the Bayesian optimizer in
        `search()`. The result of each partition index is memoized in
        `self.bay_search_log`, so probing the same index again is free.

        Args:
            partition_idx: Index into `self.params['partition_candidates']`.
                It is truncated with `int()` before use.

        Returns:
            The best reward (throughput) found for this partition, or 0 when
            no legal configuration could be found.
        """
        partition_idx = int(partition_idx)
        if partition_idx in self.bay_search_log:
            return self.bay_search_log[partition_idx]
        self.log(f"Partition {partition_idx}: {self.params['partition_candidates'][partition_idx]['partition']}, #Array: {len(self.params['partition_candidates'][partition_idx]['partition'])}")
        rewards_window = []
        self.counter.init_counter('local_time')
        local_best_reward = 0
        # Build the partition
        partition = self.params['partition_candidates'][partition_idx]['partition']
        n_arrays = len(partition)
        # Store all the search records for each array
        for i in range(n_arrays):
            self.search_cache[i] = []
        # Store the resource constraint used for each search to avoid redundant search
        for i in range(n_arrays):
            self.search_cache_cst[i] = []

        # Initialize resource allocation
        resource_alloc = self.resource_alloc(partition)

        # Find a legal config
        records = self.find_legal_config(partition, resource_alloc, skip_search=0)
        if records:
            self.local_epoch = 0
            self.last_update_epoch = 0
            last_slow_idx = -1
            while True:
                latency, used_constraints, throughput, meta = self.evaluate(partition, records, verbose=1)
                dsp_eff = self.est_dsp_eff(throughput, used_constraints)
                reward = throughput
                search_record = utils.SearchRecord().extract_from_tuner_multi_acc(records, reward, latency, used_constraints, throughput, dsp_eff, partition=partition)
                # Update global reward
                if reward > self.best_reward:
                    self.best_reward = reward
                    self.best_search_record = search_record
                    self.log(f'Global Epoch {self.epoch} - #Array {n_arrays}: new global best reward: {self.best_reward} (latency: {latency:.0f}, throughput: {throughput}, DSP eff: {dsp_eff:.2f}, BRAM: {used_constraints["BRAM18K"]:.2f}, DSP: {used_constraints["DSP"]:.2f}, URAM: {used_constraints["URAM"]:.2f}, BW: {search_record.bw:.2f})')
                self.best_rewards.append(self.best_reward)
                self.counter.update_counter('time')
                self.best_rewards_time.append(self.counter.get_counter('time'))
                # Update local reward
                if reward > local_best_reward:
                    local_best_reward = reward
                    # Report the local best here; the global best may be larger
                    # when an earlier partition already performed better.
                    self.log(f'Local Epoch {self.local_epoch} - #Array {n_arrays}: new local best reward: {local_best_reward} (latency: {latency:.0f}, throughput: {throughput}, DSP eff: {dsp_eff:.2f}, BRAM: {used_constraints["BRAM18K"]:.2f}, DSP: {used_constraints["DSP"]:.2f}, URAM: {used_constraints["URAM"]:.2f}, BW: {search_record.bw:.2f})')
                    self.last_update_epoch = self.local_epoch
                rewards_window.append(reward)

                # Stop once the last three rewards have flattened out.
                if len(rewards_window) > self.params["max_trial"]:
                    stdev_percent = np.std(rewards_window[-3:]) / np.mean(rewards_window[-3:])
                    if stdev_percent < self.params["reward_stdev_thres"]:
                        self.log(f'Minimal improvement after {self.params["max_trial"]} rounds, terminated')
                        break
                if self.local_epoch - self.last_update_epoch > self.params["max_trial"]:
                    self.log(f'No improvement after {self.params["max_trial"]} rounds, terminated')
                    break
                # If the tuning time is too long, kill it
                self.counter.update_counter('local_time')
                if self.counter.get_counter("local_time") > self.max_time:
                    self.log('Time out, terminated')
                    break

                # Fine-tuning
                if self.is_finetune_required(records, dsp_eff):
                    # Find fastest/slowest design index
                    slow, fast = self.update_bottleneck_idx(records)
                    # Update resource alloc to reflect the current usage
                    for i in range(len(records)):
                        resource_alloc['DSP'][i] = np.ceil(records[i].cst['DSP'])
                        resource_alloc['BRAM18K'][i] = np.ceil(records[i].cst['BRAM18K'])
                    # Adjust resource alloc
                    resource_alloc["init"] = {"DSP": copy.deepcopy(resource_alloc['DSP']),
                                              "BRAM18K": copy.deepcopy(resource_alloc['BRAM18K']),
                                              "DSP_total": used_constraints['DSP'],
                                              "BRAM18K_total": used_constraints['BRAM18K'],
                                              "URAM_total": used_constraints['URAM'],
                                              }
                    resource_alloc["array_latency"] = [record.latency for record in records]
                    resource_alloc["state"] = 0
                    # Same bottleneck array as in the previous round: switch state.
                    if slow[0] == last_slow_idx:
                        resource_alloc["state"] = 1
                    resource_alloc["slow_idx"] = slow
                    resource_alloc["fast_idx"] = fast
                    resource_alloc["step"] = [[0, 1], [0.025]] # step for resource adjustment
                    resource_alloc["n_adjust"] = [0, 0] # number of attempts at each state
                    resource_alloc["decrease"] = [-1, -1] # indicate if the allocation of bram decreases in the previous round
                    resource_alloc["history"] = [0, 0] # bram allocation in the last round
                    last_slow_idx = slow[0]
                    if not self.resource_alloc_adjust(partition, resource_alloc, records, 0):
                        self.log('No valid resource allocation found, terminated')
                        break
                    records = self.find_legal_config(partition, resource_alloc, old_records=records, adjust_func=self.resource_alloc_adjust, fine_tune=1, skip_search=0)
                    if not records:
                        self.log('No valid records found, terminated')
                        break
                else:
                    self.log('Fine-tuning not required, terminated')
                    break

                self.epoch += 1
                self.local_epoch += 1

        self.bay_search_log[partition_idx] = local_best_reward
        return local_best_reward

    def search(self):
        """Run the Bayesian search over the partition candidates.

        Each candidate index is scored by `self.search_design`. The indices
        listed in `self.params['probe_points']` are queued first, then the
        optimizer runs 10 further iterations with no random initial points.

        Raises:
            RuntimeError: If the search task holds fewer than two layers.
        """
        self.n_layers = len(self.search_task.tasks)
        if self.n_layers < 2:
            raise RuntimeError("Multi-acc exploration requires at least two conv layers.")
        self.counter.init_counter('time')

        # The upper bound is inclusive, hence the "- 1".
        last_candidate = len(self.params["partition_candidates"]) - 1
        optimizer = BayesianOptimization(
            f=self.search_design,
            pbounds={'partition_idx': (0, last_candidate)},
            random_state=1
        )
        # Lazy probes are evaluated when maximize() is called.
        for probe_idx in self.params['probe_points']:
            optimizer.probe(params=[probe_idx], lazy=True)
        optimizer.maximize(init_points=0, n_iter=10)


================================================
FILE: autosa_scripts/odyssey/unit_test.py
================================================

import copy
from search_task import SingleTask
from design import Design
import json
from tuners import Constraint

class Workload(object):
    """Thin holder for a workload parameter object."""

    def __init__(self, params):
        # Stored as-is; no copy is made.
        self.params = params

    def __repr__(self):
        """Return the formatted representation of the stored parameters."""
        return "{}".format(self.params)

class SearchTask(object):
    """Minimal search task that only wraps a workload."""

    def __init__(self, workload):
        # Stored as-is; no copy is made.
        self.workload = workload

    def __repr__(self):
        """Return the string form of the wrapped workload."""
        return "{!s}".format(self.workload)

def est_mm_performance():
    """Evaluate one hand-picked tiling of a 1024x1024x1024 GEMM and print the result.

    Loads the hardware constraints from "cst/hw_cst.json" and the design
    description of `kernel_name` from `design_dir`, builds a `SingleTask`, and
    prints the inverse of the returned reward, the resource estimate and the
    metadata. Paths are relative to the current working directory.
    """
    params = {
        "i": 1024, "i_t1": 129, "i_t2": 3,
        "j": 1024, "j_t1": 130, "j_t2": 13,
        "k": 1024, "k_t1": 64, "k_t2": 4,
        "p9": 16, "p10": 16, "p11": 4, "p12": 4 # A, B, None, C
    }

    # Alternative tilings kept for reference; uncomment one to evaluate it.
    # NOTE(review): the labels presumably mean compute-bound ("comp"),
    # communication-bound ("comm") and balanced ("comm-comp") - confirm.

    # comp
    #params = {
    #    "i": 1024, "i_t1": 520, "i_t2": 26,
    #    "j": 1024, "j_t1": 520, "j_t2": 26,
    #    "k": 1024, "k_t1": 320, "k_t2": 4,
    #    "p9": 16, "p10": 16, "p11": 4, "p12": 4 # A, B, None, C
    #}

    # comm
    #params = {
    #    "i": 1024, "i_t1": 1024, "i_t2": 128,
    #    "j": 1024, "j_t1": 1024, "j_t2": 128,
    #    "k": 1024, "k_t1": 320, "k_t2": 4,
    #    "p9": 16, "p10": 16, "p11": 4, "p12": 4 # A, B, None, C
    #}

    # comm-comp
    #params = {
    #    "i": 1024, "i_t1": 1024, "i_t2": 64,
    #    "j": 1024, "j_t1": 1024, "j_t2": 64,
    #    "k": 1024, "k_t1": 320, "k_t2": 4,
    #    "p9": 16, "p10": 16, "p11": 4, "p12": 4 # A, B, None, C
    #}

    workload = {
        "name": "gemm",
        "tags": ["gemm"],
        "params": {
            "i": 1024, "j": 1024, "k": 1024
        }
    }

    cst = Constraint("cst/hw_cst.json")

    # Single source of truth for the design directory used by both paths below.
    design_dir = "designs"
    kernel_name = "kernel3_2"
    with open(f"{design_dir}/{kernel_name}.json", "r") as json_f:
        desp = json.load(json_f)
    design = Design(kernel_name)
    design.register(desp, f"{design_dir}/register/{kernel_name}.py")

    search_task = SingleTask(design, workload, cst)
    reward, resource, meta = search_task.evaluate(params)
    # NOTE(review): presumably reward is the inverse latency, so this prints
    # the latency - confirm against SingleTask.evaluate.
    print(1 / reward)
    print(resource)
    print(meta)

# Run the GEMM performance estimate when this file is executed as a script.
if __name__ == "__main__":
    est_mm_performance()


================================================
FILE: autosa_scripts/odyssey/utils.py
================================================
import time
import functools
import math
import logging
import itertools
from datetime import datetime
from subprocess import Popen, PIPE
import json
import pprint
import queue
import multiprocessing as mp
from pathos.pools import ProcessPool, ParallelPool
import copy

def factorization(x):
    """Return the prime factors of x in non-decreasing order, with multiplicity.

    Uses trial division with floor division only, so integer inputs stay
    exact for arbitrarily large values.

    Args:
        x: Positive integer to factorize.

    Returns:
        List of int prime factors; empty for x == 1.

    Raises:
        RuntimeError: If x is 0.
        ValueError: If x is negative.
    """
    if x == 0:
        raise RuntimeError("Factorization of 0")
    if x < 0:
        raise ValueError(f"Cannot factorize negative number {x}")

    prime_factors = []
    while x % 2 == 0:
        prime_factors.append(2)
        x //= 2

    # Only odd candidates remain. Stop as soon as candidate^2 exceeds what is
    # left of x; whatever remains is then either 1 or a single prime.
    candidate = 3
    while candidate * candidate <= x:
        while x % candidate == 0:
            prime_factors.append(candidate)
            x //= candidate
        candidate += 2

    if x > 2:
        prime_factors.append(int(x))

    return prime_factors

def get_divisors(x, filter=None):
    """Return the divisors of the integer x in ascending order.

    Args:
        x: Integer whose divisors are enumerated.
        filter: Optional predicate; a divisor d is dropped when filter(d)
            is truthy.

    Returns:
        List of the divisors that were not filtered out.
    """
    divisors = []
    large_divisors = []
    for i in range(1, int(math.sqrt(x) + 1)):
        if x % i != 0:
            continue
        if not filter or not filter(i):
            divisors.append(int(i))
        if i * i != x:
            # Floor division keeps the cofactor exact; x / i would go through
            # a float and lose precision for large x.
            cofactor = int(x // i)
            if not filter or not filter(cofactor):
                large_divisors.append(cofactor)
    # Cofactors were collected in descending order.
    divisors.extend(reversed(large_divisors))

    return divisors

def compute_tasks_latency(search_tasks, init_tasks):
    """Sum up the best known latency for the given search tasks.

    For each search task, the lowest latency among the single-task entries of
    `init_tasks` with a matching hash prefix (the text before the first 'd')
    is taken. A fused entry (more than one task solution) then replaces its
    member tasks whenever it is faster than the sum of their latencies.

    Returns:
        The total latency, or None if some search task has no matching
        single-task entry.
    """
    best_latency = {}
    for task in search_tasks:
        task_str = str(task)
        wanted_prefix = task_str[:task_str.find('d')]
        matches = []
        for candidate in init_tasks:
            if len(candidate.task_sols) != 1:
                continue
            sol = candidate.task_sols[0]
            sol_hash = sol['hash']
            if sol_hash[:sol_hash.find('d')] == wanted_prefix:
                matches.append(sol['latency'])
        if not matches:
            return None
        best_latency[task.workload["name"]] = min(matches)

    # Let fused entries take over from their members when that is faster.
    for candidate in init_tasks:
        if len(candidate.task_sols) <= 1:
            continue
        members = candidate.task_names
        if all(name in best_latency for name in members):
            unfused_total = sum(best_latency[name] for name in members)
        else:
            # Some member was already absorbed by another fused entry.
            unfused_total = 0
        if candidate.latency < unfused_total:
            best_latency[''.join(candidate.task_names)] = candidate.latency
            for name in candidate.task_names:
                del best_latency[name]

    return sum(best_latency.values())

class PerfCounter(object):
    """Named stopwatches that accumulate elapsed time from time.perf_counter()."""

    def __init__(self, logger=None):
        self.logger = logger
        # name -> {'start': last timestamp, 'elapsed': accumulated seconds}
        self.counters = {}

    def _entry(self, name):
        """Return the state of counter `name`, or raise if it was never initialized."""
        if name not in self.counters:
            raise RuntimeError(f"Counter {name} is not defined")
        return self.counters[name]

    def _require_logger(self):
        """Raise if no logger was supplied at construction time."""
        if not self.logger:
            raise RuntimeError("Logger is not defined")

    def init_counter(self, name):
        """Create (or restart) counter `name` with zero elapsed time."""
        self.counters[name] = {'start': time.perf_counter(), 'elapsed': 0}

    def update_counter(self, name):
        """Add the time since the last init/update to counter `name`."""
        entry = self._entry(name)
        now = time.perf_counter()
        entry['elapsed'] += (now - entry['start'])
        entry['start'] = now

    def get_counter(self, name):
        """Return the elapsed time accumulated so far in counter `name`."""
        return self._entry(name)['elapsed']

    def print_counter(self, name):
        """Log the elapsed time of counter `name`."""
        entry = self._entry(name)
        self._require_logger()
        elapsed = entry["elapsed"]
        self.logger.info(f'[Event: {name}] Total elapsed time: {elapsed:.4f} s')

    def print_counters(self):
        """Log the elapsed time of every counter."""
        self._require_logger()
        for name, entry in self.counters.items():
            elapsed = entry["elapsed"]
            self.logger.info(f'[Event: {name}] Total elapsed time: {elapsed:.4f} s')

def init_logger(outdir):
    """Return the 'AutoSA-Tuner' logger, freshly configured.

    Existing handlers on the logger are closed and removed. Two INFO-level
    handlers are then attached: a stream handler and a file handler that
    appends to `<outdir>/tuning.log`.
    """
    logger = logging.getLogger('AutoSA-Tuner')
    # Drop handlers left over from an earlier call.
    for stale in list(logger.handlers):
        stale.close()
        logger.removeHandler(stale)

    formatter = logging.Formatter(
                '[%(name)s %(asctime)s] %(levelname)s: %(message)s',
                '%Y-%m-%d %H:%M:%S')
    logger.setLevel(logging.INFO)

    # Build both handlers before attaching either one.
    new_handlers = (
        logging.StreamHandler(),
        logging.FileHandler(f'{outdir}/tuning.log', 'a'),
    )
    for handler in new_handlers:
        handler.setLevel(level=logging.INFO)
    for handler in new_handlers:
        handler.setFormatter(formatter)
    for handler in new_handlers:
        logger.addHandler(handler)

    return logger

class SearchRecord(object):
    """ Data struct for storing the searching results
    """
    def __init__(self, max=1):
        self.cst = None
        self.max = max
        if self.max == 1:
            self.reward = 0
        else:
            self.reward = float("inf")
        self.reward_meta = None
        self.latency = 0
        self.throughput = 0
        self.energy = 0
        self.dsp_eff = 0
        self.design = None
        self.ops = 0
        self.task_names = []
        self.metric = None
        self.fuse = -1
        self.split_pos = -1
        self.partition = None
        self.n_array = -1
        self.bw = 0
        self.ctc = 0
        self.exec_model = []
        self.converge_time = 0
        # Design frequency
        self.fre = 300 
        self.off_chip_trans = 0
        self.dw = 4 # Float
        self.valid = 0        
        
        # Fixed array architecture solution
        self.arch_sol = None
        # Mapped tasks solutions
        self.task_sols = []
        # Sub task records
        self.records = None
        self.history = []

    def reset(self):
        self.cst = None        
        if self.max == 1:
            self.reward = 0
        else:
            self.reward = float("inf")
        self.reward_meta = None
        self.latency = 0
        self.throughput = 0
        self.energy = 0
        self.dsp_eff = 0
        self.design = None
        self.ops = 0
        self.task_names = []
        self.metric = None
        self.fuse = -1
        self.split_pos = -1
        self.partition = None
        self.n_array = -1
        self.bw = 0
        self.ctc = 0
        self.exec_model = []
        self.converge_time = 0
        # Design frequency
        self.fre = 300 
        self.off_chip_trans = 0
        self.valid = 0

        self.arch_sol = None
        self.task_sols = []
        self.records = None        
        self.history = []

        return self

    def update(self, new_record, save=0):  
        """ Update the old records if new record is better.
        If "save" is set to 1, store the current record to history.
        """
        if new_record.valid == 0:
            return False

        if self.max != new_record.max:
            raise RuntimeError("Inconsistent search record configuration")
        status = False
        if self.max == 1:
            if new_record.reward > self.reward:				
                status = True
        else:
            if new_record.reward < self.reward:
                status = True
        if status:
            self.cst = copy.deepcopy(new_record.cst)
            self.reward = new_record.reward
            self.reward_meta = copy.deepcopy(new_record.reward_meta)
            self.latency = new_record.latency
            self.throughput = new_record.throughput
            self.energy = new_record.energy
            self.dsp_eff = new_record.dsp_eff
            self.design = new_record.design            
            self.ops = new_record.ops
            self.task_names = new_record.task_names
            self.fuse = new_record.fuse
            self.split_pos = new_record.split_pos
            self.partition = new_record.partition
            self.n_array = new_record.n_array
            self.bw = new_record.bw
            self.ctc = new_record.ctc
            self.exec_model = new_record.exec_model
            self.metric = new_record.metric
            self.converge_time = new_record.converge_time
            self.off_chip_trans = new_record.off_chip_trans
            self.valid = new_record.valid            

            self.arch_sol = new_record.arch_sol
            self.task_sols = new_record.task_sols
            self.records = new_record.records
        
        if save == 1:
            self.history.append(new_record)

        return status

    def dup(self):
        """ Duplicate the current record.
        """
        new_record = SearchRecord()
        new_record.cst = copy.deepcopy(self.cst)
        new_record.max = self.max
        new_record.reward = self.reward
        new_record.reward_meta = self.reward_meta
        new_record.latency = self.latency
        new_record.throughput = self.throughput
        new_record.energy = self.energy
        new_record.dsp_eff = self.dsp_eff
        new_record.design = self.design
        new_record.ops = self.ops
        new_record.task_names = copy.deepcopy(self.task_names)
        new_record.metric = self.metric
        new_record.fuse = self.fuse
        new_record.split_pos = self.split_pos
        new_record.partition = self.partition
        new_record.n_array = self.n_array
        new_record.bw = self.bw
        new_record.ctc = self.ctc
        new_record.exec_model = copy.deepcopy(self.exec_model)
        new_record.converge_time = self.converge_time
        new_record.off_chip_trans = self.off_chip_trans
        new_record.valid = self.valid
        new_record.arch_sol = copy.deepcopy(self.arch_sol)
        new_record.task_sols = copy.deepcopy(self.task_sols)
        if self.records:
            new_record.records = []
            for record in self.records:
                new_record.records.append(record.dup())
        
        return new_record

    def extract_from_tuner_single_acc(self, tuner):
        """ Extract the sinlge accelerator search results from the tuner.
        """
        if tuner.best_sol:
            self.cst = tuner.best_sol_cst
            self.reward = tuner.best_reward
            self.reward_meta = tuner.best_reward_meta
            self.ops = tuner.search_task.compute_ops()
            if tuner.search_obj == "latency":
                self.latency = 1 / self.reward
                self.throughput = self.ops / self.latency
                # Compute the updated DSP efficiency
                # Note: Only applicable for FP32
                self.dsp_eff = tuner.search_task.compute_dsp_eff(self.latency, self.cst["DSP"])
            elif tuner.search_obj in ["off_chip_comm", "dsp_num"]:
                self.latency = self.reward_meta["latency"]["latency"]
                self.throughput = self.ops / self.latency
                self.dsp_eff = tuner.search_task.compute_dsp_eff(self.latency, self.cst["DSP"])
            elif tuner.search_obj == "energy":
                self.energy = 1 / self.reward
                self.latency = self.reward_meta["latency"]["latency"]
                self.throughput = self.ops / self.latency
                self.dsp_eff = tuner.search_task.compute_dsp_eff(self.latency, self.cst["DSP"])
            else:
                raise RuntimeError("Unsupported search objective: ", tuner.search_obj)
            self.design = tuner.search_task.design.name            
            self.task_names = [tuner.search_task.workload["name"]]
            #self.fuse = tuner.search_task.fuse
            self.split_pos = -1
            self.metric = tuner.search_obj
            self.bw = tuner.search_task.compute_bw(tuner.best_sol)
            self.ctc = tuner.search_task.compute_ctc(tuner.best_sol)
            self.exec_model.append(tuner.search_task.workload["name"])
            self.converge_time = tuner.converge_time
            self.off_chip_trans = tuner.search_task.est_off_chip_trans(tuner.best_sol)

            # Solutions
            self.arch_sol = tuner.search_task.arch_sol
            self.task_sols = [{
                "name": tuner.search_task.workload["name"],
                "hash": str(tuner.search_task),
                "ops": tuner.search_task.compute_ops(),
                "sol": tuner.best_sol,
                "latency": self.latency,
                "CTC": self.ctc,
                "DSP_eff": self.dsp_eff,
                "reward_meta": tuner.best_reward_meta,
                "BW": self.bw
            }]            
            self.records = None

            self.valid = 1

        return self

    def extract_from_tuner_multi_acc(self, records, reward, latency, cst, throughput, dsp_eff, split_pos=-1, partition=None, n_array=-1, meta=None):
        """ Fill this record from the outcome of a multi-accelerator search.

        The scalar results (reward, latency, cst, throughput, dsp_eff,
        split_pos, partition, n_array) are stored as passed in. The metric is
        read from the first entry of `records`, the task names of every entry
        are appended to `self.task_names`, and a deep copy of `records` is kept
        in `self.records`. The record is marked valid only if every entry of
        `records` is valid.

        Bandwidth is computed in one of two ways:
          - `meta` is set (Arch3 / multi2): the bandwidth of each entry in
            meta['round_info'] is evaluated and the largest one is kept.
          - otherwise: the off-chip traffic of all records is summed and
            divided by the time that corresponds to a latency of 1/throughput.

        Returns self.
        """
        # A single invalid sub-record invalidates the aggregate.
        invalid_records = [rec for rec in records if rec.valid == 0]
        self.valid = 0 if invalid_records else 1

        self.cst = cst
        self.latency = latency
        self.reward = reward
        self.dsp_eff = dsp_eff
        self.throughput = throughput
        self.split_pos = split_pos
        self.partition = partition
        self.n_array = n_array
        self.metric = records[0].metric
        for rec in records:
            self.task_names += copy.deepcopy(rec.task_names)

        if meta:
            # Arch3: take the peak bandwidth over all execution rounds.
            rounds = meta['round_info']
            peak_bw = 0
            for idx in range(len(rounds)):
                round_trans = rounds[idx]['total_off_chip_trans']
                round_latency = rounds[idx]['latency']
                peak_bw = max(peak_bw, round_trans * self.dw / (round_latency / (self.fre * 1e6)) / 1e9)
            self.bw = peak_bw
        else:
            # Use 1/throughput as the maximal latency and move the accumulated
            # off-chip traffic of all arrays within that window.
            max_latency = 1 / throughput
            total_off_chip_trans = sum(rec.off_chip_trans for rec in records)
            self.bw = total_off_chip_trans * self.dw / (max_latency / (self.fre * 1e6)) / 1e9 # GB/s

        self.records = copy.deepcopy(records)

        return self

    def __repr__(self):
        return self.to_str()

    def to_str(self):
        """ Build the multi-line text report of this record.

        An invalid record yields just "invalid record". A valid record lists
        its metrics first, then the optional fields that differ from their
        "unset" values (fuse, split position, partition, #array, exec model),
        the task names, the architecture/task solutions when present, and
        finally the nested sub-records and history records, each rendered
        with its own to_str().
        """
        if not self.valid:
            return "\ninvalid record" + "\n"

        # Fragments are collected in order and joined once at the end.
        parts = [
            f"\nreward: {self.reward}",
            f"\ncst: {pprint.pformat(self.cst, indent=2)}",
            f"\nlatency: {self.latency}",
            f"\nthroughput: {self.throughput}",
            f"\nenergy(mJ/normalized): {self.energy:.6f}",
            f"\nDSP efficiency: {self.dsp_eff:.2f}",
            f"\nBW(GB/s): {self.bw:.2f}",
            f"\nops: {self.ops:.2f}",
            f"\nCTC(FLOP/byte): {self.ctc:.2f}",
            f"\ndesign: {self.design}",
            f"\nconverge time: {self.converge_time}",
            f"\noff-chip communication (Bytes): {self.off_chip_trans * self.dw}",
        ]

        # Optional fields: -1 (or an empty value) means "not set".
        if self.fuse != -1:
            parts.append(f"\nfuse: {self.fuse}")
        if self.split_pos != -1:
            parts.append(f"\nsplit position: {self.split_pos}")
        if self.partition:
            parts.append(f"\npartition: {self.partition}")
        if self.n_array != -1:
            parts.append(f"\n#array: {self.n_array}")
        if len(self.exec_model) > 0:
            parts.append(f"\nexec model: {self.exec_model}")

        parts.append(f"\ntask names: {self.task_names}")

        if self.arch_sol:
            parts.append(f"\narch sol: {pprint.pformat(self.arch_sol, indent=2)}")
        if self.task_sols:
            parts.append(f"\ntask sols: \n{pprint.pformat(self.task_sols, indent=2)}")

        # Nested sub-records, each wrapped in <recordN><begin>/<end> markers.
        if self.records:
            parts.append("\nrecords: ")
            for idx, sub_record in enumerate(self.records):
                parts.append(f"\n<record{idx}><begin>")
                parts.append(f"{sub_record.to_str()}")
                parts.append(f"<record{idx}><end>")

        # The history is printed only when it holds more than one entry.
        if len(self.history) > 1:
            parts.append("\nhistory records: ")
            for idx, past_record in enumerate(self.history):
                parts.append(f"\n<record{idx}><begin>")
                parts.append(f"{past_record.to_str()}")
                parts.append(f"<record{idx}><end>")

        parts.append("\n")
        return "".join(parts)

    def append(self, record):
        """ Append another record to the current record.

        All the records should share the same architecture. The metrics and
        task solutions of `record` are accumulated into the current record.

        Always use the return value:
          - If the current record holds no task solutions yet, nothing is
            accumulated and an independent deep copy of `record` is returned;
            the current record itself is left untouched (apart from being
            marked invalid when `record` is invalid).
          - Otherwise the current record is updated in place and returned.

        Only the "latency" metric under max mode (self.max == 1) is supported.
        Latencies are summed, the DSP efficiency becomes the latency-weighted
        average, the reward becomes 1/latency, and the bandwidth is the larger
        of the two. When the accumulated latency is zero, reward and
        throughput are left unchanged instead of dividing by zero.

        Raises:
            RuntimeError: if not in max mode, or the metric is not "latency".
        """
        if record.valid == 0:
            self.valid = 0

        if len(self.task_sols) == 0:
            # Nothing accumulated so far: the result is simply a copy of the
            # incoming record.
            return copy.deepcopy(record)

        if self.max != 1:
            raise RuntimeError("Appending records is only suppported under the max mode.")
        if self.metric != "latency":
            raise RuntimeError(f"Unsupported metric: {self.metric}.")

        if record.latency != 0:
            # Latency-weighted average of the two DSP efficiencies.
            self.dsp_eff = (self.dsp_eff * self.latency + record.dsp_eff * record.latency) / (self.latency + record.latency)
        self.latency += record.latency
        if self.latency != 0:
            self.reward = 1 / self.latency

        self.ops += record.ops
        # Same zero-latency guard as for the reward above.
        if self.latency != 0:
            self.throughput = self.ops / self.latency
        self.off_chip_trans += record.off_chip_trans
        self.bw = max(self.bw, record.bw)
        self.task_names += copy.deepcopy(record.task_names)
        self.exec_model += copy.deepcopy(record.exec_model)

        # Solutions
        self.task_sols += copy.deepcopy(record.task_sols)

        return self

    def merge(self, record1, record2):
        """ Merge two records into the current record.

        All the records should share the same architecture. If either input is
        invalid, the current record is marked invalid and returned without any
        further update.

        Otherwise the current record receives:
          - cst: the element-wise maximum of the two constraint dicts (built
            on a private copy, so neither input record is modified);
          - latency: the sum; reward: 1/latency; dsp_eff: the latency-weighted
            average ("latency" is the only supported metric);
          - ops / off_chip_trans: the sums; bw: the larger of the two;
          - design and arch_sol: taken from record1;
          - task_names: extended with the names of record1, then record2;
          - exec_model: the concatenation of both models, or the two-element
            list [model1, model2] when either record is fused (fuse == 1);
          - records: duplicates (via dup()) of the two inputs.

        Raises:
            RuntimeError: if the metric of record1 is not "latency".

        Returns self.
        """
        if record1.valid == 0 or record2.valid == 0:
            self.valid = 0
            return self

        self.valid = 1
        # Update the metadata.
        # Copy first: assigning record1.cst directly would let the maximum
        # below overwrite record1's own constraints.
        self.cst = copy.deepcopy(record1.cst)
        for item in self.cst:
            if record2.cst[item] > self.cst[item]:
                self.cst[item] = record2.cst[item]
        self.metric = record1.metric
        if self.metric == "latency":
            self.latency = record1.latency + record2.latency
            self.reward = 1 / self.latency
            # Update the DSP efficiency (latency-weighted average)
            self.dsp_eff = (record1.dsp_eff * record1.latency + record2.dsp_eff * record2.latency) / (record1.latency + record2.latency)
        else:
            raise RuntimeError(f"Unsupported metric: {self.metric}")
        self.ops = record1.ops + record2.ops
        self.off_chip_trans = record1.off_chip_trans + record2.off_chip_trans
        self.bw = max(record1.bw, record2.bw)
        self.design = record1.design
        for t_name in record1.task_names:
            self.task_names.append(t_name)
        for t_name in record2.task_names:
            self.task_names.append(t_name)

        self.exec_model = copy.deepcopy(record1.exec_model)
        if record1.fuse == 1 or record2.fuse == 1:
            # Keep the two models as separate groups when a fused record is involved.
            self.exec_model = [self.exec_model, record2.exec_model]
        else:
            self.exec_model += record2.exec_model
        self.arch_sol = record1.arch_sol

        # Solutions
        self.records = [record1.dup(), record2.dup()]

        return self

class NoDaemonProcess(mp.Process):
	""" Process whose `daemon` attribute always reads False.

	Writes to `daemon` are silently ignored.
	NOTE(review): presumably this lets worker processes start child processes
	of their own, which daemonic processes may not do - confirm with callers.
	"""

	def _get_daemon(self):
		# Always report the process as non-daemonic.
		return False

	def _set_daemon(self, value):
		# Intentionally discard whatever the caller tries to set.
		return None

	daemon = property(fget=_get_daemon, fset=_set_daemon)

class MyExecutor(object):
	""" Run a batch of jobs either inline or on a pool of worker processes.

	A job is a dict with the keys 'job_hash' (hashable identifier), 'func'
	(callable) and 'args' (positional arguments for 'func').

	With n_thread > 1 the worker processes are created and started in the
	constructor and exit once they receive the stop markers queued by exec().
	Such an executor can therefore serve a single exec() call; create a new
	instance for every batch.
	"""
	def __init__(self, n_thread):
		self.n_thread = n_thread
		self.timeout = 1800 # 30 minutes
		self.task_queue = mp.Queue()
		self.ret_queue = mp.Queue()
		self.proc_list = []
		self.ret = {}
		if n_thread > 1:
			# Results are passed back from the workers through a managed dict.
			manager = mp.Manager()
			self.return_dict = manager.dict()
			for i in range(self.n_thread):
				p = NoDaemonProcess(target=self.runner, args=(self.task_queue, self.return_dict))
				self.proc_list.append(p)
			for i in range(self.n_thread):
				self.proc_list[i].start()

	def runner(self, q, return_dict):
		""" Worker loop: execute tasks taken from `q` until a None marker arrives.

		Each task is a (hash, func, args) tuple; the value returned by
		func(*args) is stored in `return_dict` under the task hash.
		"""
		while True:
			task = q.get()
			if task is None:
				break
			task_hash = task[0]
			task_func = task[1]
			task_args = task[2]
			ret = task_func(*task_args)
			return_dict[task_hash] = ret

	def prune_jobs(self, jobs):
		""" Prune jobs with the same hash.

		The first job of every distinct 'job_hash' is kept and the original
		order is preserved.
		"""
		job_list = []
		# A set gives constant-time membership tests; job hashes are used as
		# dict keys in exec() and hence are hashable.
		seen = set()

		for job in jobs:
			job_hash = job['job_hash']
			if job_hash in seen:
				continue
			seen.add(job_hash)
			job_list.append(job)

		return job_list

	def exec(self, job_list):
		""" Submit the jobs to the executor.

		job_list is a list of job dicts (keys 'job_hash', 'func', 'args').
		Jobs with a duplicated hash are executed only once.

		Return a dict that maps every job hash to the result of its job.
		In multi-process mode, a job that produced no result (e.g. because the
		workers were terminated after `self.timeout` seconds) is mapped to a
		reset SearchRecord.
		"""
		# Prune away redundant jobs
		job_list = self.prune_jobs(job_list)

		results = {}
		if self.n_thread > 1:
			for job in job_list:
				self.task_queue.put((job['job_hash'], job['func'], job['args']))
			# One stop marker per worker.
			for i in range(self.n_thread):
				self.task_queue.put(None)
			start = time.time()
			while time.time() - start <= self.timeout:
				if not any(p.is_alive() for p in self.proc_list):
					break
				time.sleep(.1)
			else:
				# Reached only when the loop ends without `break`:
				# timeout, kill all the processes
				for p in self.proc_list:
					p.terminate()
			for p in self.proc_list:
				p.join()

			for job in job_list:
				if job['job_hash'] in self.return_dict:
					results[job['job_hash']] = self.return_dict[job['job_hash']]
				else:
					results[job['job_hash']] = SearchRecord().reset()
		else:
			# Single-process mode: run the jobs inline, in order.
			for job in job_list:
				job_args = job['args']
				results[job['job_hash']] = job['func'](*job_args)

		return results

================================================
FILE: autosa_scripts/odyssey/workload/conv.json
================================================
{
  "workloads": [
    {
      "name": "conv1-1",
      "tags": ["conv"],
      "params": {
        "i": 1,
        "o": 6,
        "r": 5,
        "c": 5,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/mm.json
================================================
{
  "workloads": [
    {
      "name": "gemm",
      "tags": ["gemm"],
      "params": {
        "i": 1024,
        "j": 1024,
        "k": 1024
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/mm64.json
================================================
{
  "workloads": [
    {
      "name": "gemm",
      "tags": ["gemm"],
      "params": {
        "i": 64,
        "j": 64,
        "k": 64
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2.json
================================================
{
  "workloads": [
    {
      "name": "conv1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 3,
        "o": 32,
        "r": 112,
        "c": 112,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 32,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 16,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 16,
        "o": 96,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 24,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 24,
        "o": 96,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 24,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 24,
        "o": 144,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 144,
        "o": 32,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 144,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 144,
        "o": 32,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 144,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 144,
        "o": 32,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 32,
        "o": 192,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 192,
        "o": 64,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 192,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 192,
        "o": 64,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 192,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 192,
        "o": 64,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 192,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_3-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 192,
        "o": 64,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 384,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 384,
        "o": 96,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 384,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 384,
        "o": 96,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 384,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 384,
        "o": 96,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 96,
        "o": 576,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 576,
        "o": 160,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 160,
        "o": 576,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 576,
        "o": 160,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 160,
        "o": 576,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 576,
        "o": 160,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv8_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 160,
        "o": 960,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv8_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 960,
        "o": 320,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv9",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 320,
        "o": 1280,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_1.json
================================================
{
    "workloads": [
        {
            "name": "conv1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 3,
                "o": 32,
                "r": 112,
                "c": 112,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_10.json
================================================
{
    "workloads": [
        {
            "name": "conv4_1-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 32,
                "o": 144,
                "r": 28,
                "c": 28,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_11.json
================================================
{
    "workloads": [
        {
            "name": "conv4_3-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 144,
                "o": 32,
                "r": 28,
                "c": 28,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_12.json
================================================
{
    "workloads": [
        {
            "name": "conv4_1-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 32,
                "o": 144,
                "r": 28,
                "c": 28,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_13.json
================================================
{
    "workloads": [
        {
            "name": "conv4_3-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 144,
                "o": 32,
                "r": 28,
                "c": 28,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_14.json
================================================
{
    "workloads": [
        {
            "name": "conv5_1-0",
            "tags": [
                "conv",
                "maxpool_2"
            ],
            "params": {
                "i": 32,
                "o": 192,
                "r": 28,
                "c": 28,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_15.json
================================================
{
    "workloads": [
        {
            "name": "conv5_3-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 192,
                "o": 64,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_16.json
================================================
{
    "workloads": [
        {
            "name": "conv5_1-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 64,
                "o": 192,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_17.json
================================================
{
    "workloads": [
        {
            "name": "conv5_3-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 192,
                "o": 64,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_18.json
================================================
{
    "workloads": [
        {
            "name": "conv5_1-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 64,
                "o": 192,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_19.json
================================================
{
    "workloads": [
        {
            "name": "conv5_3-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 192,
                "o": 64,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_2.json
================================================
{
    "workloads": [
        {
            "name": "conv2_1-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 32,
                "o": 32,
                "r": 112,
                "c": 112,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_20.json
================================================
{
    "workloads": [
        {
            "name": "conv5_1-3",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 64,
                "o": 192,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_21.json
================================================
{
    "workloads": [
        {
            "name": "conv5_3-3",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 192,
                "o": 64,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_22.json
================================================
{
    "workloads": [
        {
            "name": "conv6_1-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 64,
                "o": 384,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_23.json
================================================
{
    "workloads": [
        {
            "name": "conv6_3-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 384,
                "o": 96,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_24.json
================================================
{
    "workloads": [
        {
            "name": "conv6_1-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 96,
                "o": 384,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_25.json
================================================
{
    "workloads": [
        {
            "name": "conv6_3-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 384,
                "o": 96,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_26.json
================================================
{
    "workloads": [
        {
            "name": "conv6_1-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 96,
                "o": 384,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_27.json
================================================
{
    "workloads": [
        {
            "name": "conv6_3-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 384,
                "o": 96,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_28.json
================================================
{
    "workloads": [
        {
            "name": "conv7_1-0",
            "tags": [
                "conv",
                "maxpool_2"
            ],
            "params": {
                "i": 96,
                "o": 576,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_29.json
================================================
{
    "workloads": [
        {
            "name": "conv7_3-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 576,
                "o": 160,
                "r": 7,
                "c": 7,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_3.json
================================================
{
    "workloads": [
        {
            "name": "conv2_3-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 32,
                "o": 16,
                "r": 112,
                "c": 112,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_30.json
================================================
{
    "workloads": [
        {
            "name": "conv7_1-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 160,
                "o": 576,
                "r": 7,
                "c": 7,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_31.json
================================================
{
    "workloads": [
        {
            "name": "conv7_3-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 576,
                "o": 160,
                "r": 7,
                "c": 7,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_32.json
================================================
{
    "workloads": [
        {
            "name": "conv7_1-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 160,
                "o": 576,
                "r": 7,
                "c": 7,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_33.json
================================================
{
    "workloads": [
        {
            "name": "conv7_3-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 576,
                "o": 160,
                "r": 7,
                "c": 7,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_34.json
================================================
{
    "workloads": [
        {
            "name": "conv8_1-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 160,
                "o": 960,
                "r": 7,
                "c": 7,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_35.json
================================================
{
    "workloads": [
        {
            "name": "conv8_3-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 960,
                "o": 320,
                "r": 7,
                "c": 7,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_36.json
================================================
{
    "workloads": [
        {
            "name": "conv9",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 320,
                "o": 1280,
                "r": 7,
                "c": 7,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_4.json
================================================
{
    "workloads": [
        {
            "name": "conv3_1-0",
            "tags": [
                "conv",
                "maxpool_2"
            ],
            "params": {
                "i": 16,
                "o": 96,
                "r": 112,
                "c": 112,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_47.json
================================================
{
  "workloads": [
    {
      "name": "conv3_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 24,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 24,
        "o": 96,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 24,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 24,
        "o": 144,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_5.json
================================================
{
    "workloads": [
        {
            "name": "conv3_3-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 96,
                "o": 24,
                "r": 56,
                "c": 56,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_6.json
================================================
{
    "workloads": [
        {
            "name": "conv3_1-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 24,
                "o": 96,
                "r": 56,
                "c": 56,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_7.json
================================================
{
    "workloads": [
        {
            "name": "conv3_3-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 96,
                "o": 24,
                "r": 56,
                "c": 56,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_8.json
================================================
{
    "workloads": [
        {
            "name": "conv4_1-0",
            "tags": [
                "conv",
                "maxpool_2"
            ],
            "params": {
                "i": 24,
                "o": 144,
                "r": 56,
                "c": 56,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_9.json
================================================
{
    "workloads": [
        {
            "name": "conv4_3-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 144,
                "o": 32,
                "r": 28,
                "c": 28,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_complete.json
================================================
{
  "workloads": [
    {
      "name": "conv1",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 3,
        "o": 32,
        "r": 224,
        "c": 224,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 32,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 16,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 16,
        "o": 96,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 16,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 16,
        "o": 96,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 24,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 24,
        "o": 144,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 144,
        "o": 24,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 24,
        "o": 144,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 144,
        "o": 24,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 24,
        "o": 144,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 144,
        "o": 32,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 32,
        "o": 192,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 192,
        "o": 32,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 192,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 192,
        "o": 32,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 192,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 192,
        "o": 32,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 192,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_3-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 192,
        "o": 64,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 384,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 384,
        "o": 64,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 384,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 384,
        "o": 64,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 384,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 384,
        "o": 96,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 96,
        "o": 576,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 576,
        "o": 96,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 576,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 576,
        "o": 96,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 576,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 576,
        "o": 160,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv8_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 160,
        "o": 960,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv8_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 960,
        "o": 320,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv9",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 320,
        "o": 1280,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv10",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1280,
        "o": 1000,
        "r": 1,
        "c": 1,
        "p": 1,
        "q": 1
      }
    }
  ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_conv3_1_0.json
================================================
{
  "workloads": [    
    {
      "name": "conv3_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 16,
        "o": 96,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    }    
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_first.json
================================================
{
  "workloads": [
    {
      "name": "conv3_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 16,
        "o": 96,
        "r": 112,
        "c": 112,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 24,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 16,
        "o": 96,
        "r": 112,
        "c": 112,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 24,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_first1.json
================================================
{
  "workloads": [
    {
      "name": "conv3_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 16,
        "o": 96,
        "r": 112,
        "c": 112,
        "p": 3,
        "q": 3
      }
    },    
    {
      "name": "conv3_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 16,
        "o": 96,
        "r": 112,
        "c": 112,
        "p": 3,
        "q": 3
      }
    }    
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_first2.json
================================================
{
  "workloads": [    
    {
      "name": "conv3_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 24,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },    
    {
      "name": "conv3_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 24,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_half.json
================================================
{
  "workloads": [
    {
      "name": "conv4_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 24,
        "o": 144,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 144,
        "o": 32,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 24,
        "o": 144,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 144,
        "o": 32,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 24,
        "o": 144,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 144,
        "o": 32,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    }  
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_img2col.json
================================================
{
  "workloads": [
    {
      "name": "conv1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 32,
        "j": 50176,
        "k": 27
      }
    },
    {
      "name": "conv2_1-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 32,
        "j": 12544,
        "k": 288
      }
    },
    {
      "name": "conv2_3-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 16,
        "j": 12544,
        "k": 288
      }
    },
    {
      "name": "conv3_1-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 96,
        "j": 12544,
        "k": 144
      }
    },
    {
      "name": "conv3_3-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 24,
        "j": 3136,
        "k": 864
      }
    },
    {
      "name": "conv3_1-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 96,
        "j": 12544,
        "k": 144
      }
    },
    {
      "name": "conv3_3-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 24,
        "j": 3136,
        "k": 864
      }
    },
    {
      "name": "conv4_1-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 144,
        "j": 3136,
        "k": 216
      }
    },
    {
      "name": "conv4_3-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 32,
        "j": 784,
        "k": 1296
      }
    },
    {
      "name": "conv4_1-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 144,
        "j": 3136,
        "k": 216
      }
    },
    {
      "name": "conv4_3-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 32,
        "j": 784,
        "k": 1296
      }
    },
    {
      "name": "conv4_1-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 144,
        "j": 3136,
        "k": 216
      }
    },
    {
      "name": "conv4_3-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 32,
        "j": 784,
        "k": 1296
      }
    },
    {
      "name": "conv5_1-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 192,
        "j": 784,
        "k": 288
      }
    },
    {
      "name": "conv5_3-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 64,
        "j": 196,
        "k": 1728
      }
    },
    {
      "name": "conv5_1-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 192,
        "j": 784,
        "k": 288
      }
    },
    {
      "name": "conv5_3-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 64,
        "j": 196,
        "k": 1728
      }
    },
    {
      "name": "conv5_1-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 192,
        "j": 784,
        "k": 288
      }
    },
    {
      "name": "conv5_3-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 64,
        "j": 196,
        "k": 1728
      }
    },
    {
      "name": "conv5_1-3",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 192,
        "j": 784,
        "k": 288
      }
    },
    {
      "name": "conv5_3-3",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 64,
        "j": 196,
        "k": 1728
      }
    },
    {
      "name": "conv6_1-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 384,
        "j": 196,
        "k": 576
      }
    },
    {
      "name": "conv6_3-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 96,
        "j": 196,
        "k": 3456
      }
    },
    {
      "name": "conv6_1-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 384,
        "j": 196,
        "k": 576
      }
    },
    {
      "name": "conv6_3-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 96,
        "j": 196,
        "k": 3456
      }
    },
    {
      "name": "conv6_1-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 384,
        "j": 196,
        "k": 576
      }
    },
    {
      "name": "conv6_3-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 96,
        "j": 196,
        "k": 3456
      }
    },
    {
      "name": "conv7_1-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 576,
        "j": 196,
        "k": 864
      }
    },
    {
      "name": "conv7_3-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 160,
        "j": 49,
        "k": 5184
      }
    },
    {
      "name": "conv7_1-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 576,
        "j": 196,
        "k": 864
      }
    },
    {
      "name": "conv7_3-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 160,
        "j": 49,
        "k": 5184
      }
    },
    {
      "name": "conv7_1-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 576,
        "j": 196,
        "k": 864
      }
    },
    {
      "name": "conv7_3-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 160,
        "j": 49,
        "k": 5184
      }
    },
    {
      "name": "conv8_1-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 960,
        "j": 49,
        "k": 1440
      }
    },
    {
      "name": "conv8_3-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 320,
        "j": 49,
        "k": 8640
      }
    },
    {
      "name": "conv9",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 1280,
        "j": 49,
        "k": 2880
      }
    }
  ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_no_first.json
================================================
{
  "workloads": [
    {
      "name": "conv2_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 32,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 16,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 16,
        "o": 96,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 24,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 24,
        "o": 96,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 24,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 24,
        "o": 144,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 144,
        "o": 32,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 144,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 144,
        "o": 32,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 144,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 144,
        "o": 32,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 32,
        "o": 192,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 192,
        "o": 64,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 192,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 192,
        "o": 64,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 192,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 192,
        "o": 64,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 192,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_3-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 192,
        "o": 64,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 384,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 384,
        "o": 96,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 384,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 384,
        "o": 96,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 384,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 384,
        "o": 96,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 96,
        "o": 576,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 576,
        "o": 160,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 160,
        "o": 576,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 576,
        "o": 160,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 160,
        "o": 576,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 576,
        "o": 160,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv8_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 160,
        "o": 960,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv8_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 960,
        "o": 320,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv9",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 320,
        "o": 1280,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_original.json
================================================
{
  "workloads": [
    {
      "name": "conv1",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 3,
        "o": 32,
        "r": 224,
        "c": 224,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 32,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 16,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 16,
        "o": 96,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 24,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 24,
        "o": 96,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 24,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 24,
        "o": 144,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 144,
        "o": 32,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 144,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 144,
        "o": 32,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 32,
        "o": 144,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 144,
        "o": 32,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 32,
        "o": 192,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 192,
        "o": 64,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 192,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 192,
        "o": 64,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 192,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 192,
        "o": 64,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 192,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_3-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 192,
        "o": 64,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 384,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 384,
        "o": 96,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 384,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 384,
        "o": 96,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 96,
        "o": 384,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv6_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 384,
        "o": 96,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_1-0",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 96,
        "o": 576,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 576,
        "o": 160,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 160,
        "o": 576,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 576,
        "o": 160,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 160,
        "o": 576,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv7_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 576,
        "o": 160,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv8_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 160,
        "o": 960,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv8_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 960,
        "o": 320,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv9",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 320,
        "o": 1280,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    }
  ]
}

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_test.json
================================================
{
    "workloads": [      
      {
        "name": "conv2_1-0",
        "tags": [
          "conv"
        ],
        "params": {
          "i": 32,
          "o": 32,
          "r": 112,
          "c": 112,
          "p": 1,
          "q": 1
        }
      },
      {
        "name": "conv2_3-0",
        "tags": [
          "conv"
        ],
        "params": {
          "i": 32,
          "o": 16,
          "r": 112,
          "c": 112,
          "p": 1,
          "q": 1
        }
      },
      {
        "name": "conv3_1-0",
        "tags": [
          "conv",
          "maxpool_2"
        ],
        "params": {
          "i": 16,
          "o": 96,
          "r": 112,
          "c": 112,
          "p": 1,
          "q": 1
        }
      },
      {
        "name": "conv3_3-0",
        "tags": [
          "conv"
        ],
        "params": {
          "i": 96,
          "o": 24,
          "r": 56,
          "c": 56,
          "p": 1,
          "q": 1
        }
      },
      {
        "name": "conv3_1-1",
        "tags": [
          "conv"
        ],
        "params": {
          "i": 24,
          "o": 96,
          "r": 56,
          "c": 56,
          "p": 1,
          "q": 1
        }
      },
      {
        "name": "conv3_3-1",
        "tags": [
          "conv"
        ],
        "params": {
          "i": 96,
          "o": 24,
          "r": 56,
          "c": 56,
          "p": 1,
          "q": 1
        }
      },
      {
        "name": "conv4_1-0",
        "tags": [
          "conv",
          "maxpool_2"
        ],
        "params": {
          "i": 24,
          "o": 144,
          "r": 56,
          "c": 56,
          "p": 1,
          "q": 1
        }
      },
      {
        "name": "conv4_3-0",
        "tags": [
          "conv"
        ],
        "params": {
          "i": 144,
          "o": 32,
          "r": 28,
          "c": 28,
          "p": 1,
          "q": 1
        }
      }
    ]
  }
  

================================================
FILE: autosa_scripts/odyssey/workload/mobilenetv2_test_single.json
================================================
{
    "workloads": [          
      {
        "name": "conv3_1-0",
        "tags": [
          "conv",
          "maxpool_2"
        ],
        "params": {
          "i": 16,
          "o": 96,
          "r": 112,
          "c": 112,
          "p": 1,
          "q": 1
        }
      }
    ]
  }
  

================================================
FILE: autosa_scripts/odyssey/workload/resnet152.json
================================================
{
  "workloads": [
    {
      "name": "conv1",
      "tags": [
        "conv",
        "maxpool_4"
      ],
      "params": {
        "i": 3,
        "o": 64,
        "r": 224,
        "c": 224,
        "p": 7,
        "q": 7
      }
    },
    {
      "name": "conv2_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 256,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 256,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2_3-2",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 64,
        "o": 256,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-5",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-5",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-5",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-6",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-6",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-6",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-7",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-7",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-7",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-5",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-5",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-5",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-6",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-6",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-6",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-7",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-7",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-7",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-8",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-8",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-8",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-9",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-9",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-9",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-10",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-10",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-10",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-11",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-11",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-11",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-12",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-12",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-12",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-13",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-13",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-13",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-14",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-14",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-14",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-15",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-15",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-15",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-16",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-16",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-16",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-17",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-17",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-17",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-18",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-18",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-18",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-19",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-19",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-19",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-20",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-20",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-20",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-21",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-21",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-21",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-22",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-22",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-22",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-23",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-23",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-23",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-24",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-24",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-24",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-25",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-25",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-25",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-26",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-26",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-26",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-27",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-27",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-27",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-28",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-28",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-28",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-29",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-29",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-29",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-30",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-30",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-30",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-31",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-31",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-31",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-32",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-32",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-32",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-33",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-33",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-33",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-34",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-34",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-34",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-35",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-35",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-35",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 2048,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 2048,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    }
  ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50.json
================================================
{
  "workloads": [
    {
      "name": "conv1",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 3,
        "o": 64,
        "r": 112,
        "c": 112,
        "p": 7,
        "q": 7
      }
    },
    {
      "name": "conv2_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 256,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 256,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2_3-2",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 64,
        "o": 256,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-3",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-5",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-5",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-5",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 2048,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 2048,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/resnet50_1.json
================================================
{
    "workloads": [
        {
            "name": "conv1",
            "tags": [
                "conv",
                "maxpool_2"
            ],
            "params": {
                "i": 3,
                "o": 64,
                "r": 112,
                "c": 112,
                "p": 7,
                "q": 7
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_10.json
================================================
{
    "workloads": [
        {
            "name": "conv2_3-2",
            "tags": [
                "conv",
                "maxpool_2"
            ],
            "params": {
                "i": 64,
                "o": 256,
                "r": 56,
                "c": 56,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_11.json
================================================
{
    "workloads": [
        {
            "name": "conv3_1-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 256,
                "o": 128,
                "r": 28,
                "c": 28,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_12.json
================================================
{
    "workloads": [
        {
            "name": "conv3_2-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 128,
                "o": 128,
                "r": 28,
                "c": 28,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_13.json
================================================
{
    "workloads": [
        {
            "name": "conv3_3-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 128,
                "o": 512,
                "r": 28,
                "c": 28,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_14.json
================================================
{
    "workloads": [
        {
            "name": "conv3_1-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 512,
                "o": 128,
                "r": 28,
                "c": 28,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_15.json
================================================
{
    "workloads": [
        {
            "name": "conv3_2-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 128,
                "o": 128,
                "r": 28,
                "c": 28,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_16.json
================================================
{
    "workloads": [
        {
            "name": "conv3_3-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 128,
                "o": 512,
                "r": 28,
                "c": 28,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_17.json
================================================
{
    "workloads": [
        {
            "name": "conv3_1-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 512,
                "o": 128,
                "r": 28,
                "c": 28,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_18.json
================================================
{
    "workloads": [
        {
            "name": "conv3_2-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 128,
                "o": 128,
                "r": 28,
                "c": 28,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_19.json
================================================
{
    "workloads": [
        {
            "name": "conv3_3-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 128,
                "o": 512,
                "r": 28,
                "c": 28,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_2.json
================================================
{
    "workloads": [
        {
            "name": "conv2_1-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 64,
                "o": 64,
                "r": 56,
                "c": 56,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_20.json
================================================
{
    "workloads": [
        {
            "name": "conv3_1-3",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 512,
                "o": 128,
                "r": 28,
                "c": 28,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_21.json
================================================
{
    "workloads": [
        {
            "name": "conv3_2-3",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 128,
                "o": 128,
                "r": 28,
                "c": 28,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_22.json
================================================
{
    "workloads": [
        {
            "name": "conv3_3-3",
            "tags": [
                "conv",
                "maxpool_2"
            ],
            "params": {
                "i": 128,
                "o": 512,
                "r": 28,
                "c": 28,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_23.json
================================================
{
    "workloads": [
        {
            "name": "conv4_1-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 512,
                "o": 256,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_24.json
================================================
{
    "workloads": [
        {
            "name": "conv4_2-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 256,
                "o": 256,
                "r": 14,
                "c": 14,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_25.json
================================================
{
    "workloads": [
        {
            "name": "conv4_3-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 256,
                "o": 1024,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_26.json
================================================
{
    "workloads": [
        {
            "name": "conv4_1-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 1024,
                "o": 256,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_27.json
================================================
{
    "workloads": [
        {
            "name": "conv4_2-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 256,
                "o": 256,
                "r": 14,
                "c": 14,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_28.json
================================================
{
    "workloads": [
        {
            "name": "conv4_3-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 256,
                "o": 1024,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_29.json
================================================
{
    "workloads": [
        {
            "name": "conv4_1-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 1024,
                "o": 256,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_3.json
================================================
{
    "workloads": [
        {
            "name": "conv2_2-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 64,
                "o": 64,
                "r": 56,
                "c": 56,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_30.json
================================================
{
    "workloads": [
        {
            "name": "conv4_2-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 256,
                "o": 256,
                "r": 14,
                "c": 14,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_31.json
================================================
{
    "workloads": [
        {
            "name": "conv4_3-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 256,
                "o": 1024,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_32.json
================================================
{
    "workloads": [
        {
            "name": "conv4_1-3",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 1024,
                "o": 256,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_33.json
================================================
{
    "workloads": [
        {
            "name": "conv4_2-3",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 256,
                "o": 256,
                "r": 14,
                "c": 14,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_34.json
================================================
{
    "workloads": [
        {
            "name": "conv4_3-3",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 256,
                "o": 1024,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_35.json
================================================
{
    "workloads": [
        {
            "name": "conv4_1-4",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 1024,
                "o": 256,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_36.json
================================================
{
    "workloads": [
        {
            "name": "conv4_2-4",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 256,
                "o": 256,
                "r": 14,
                "c": 14,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_37.json
================================================
{
    "workloads": [
        {
            "name": "conv4_3-4",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 256,
                "o": 1024,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_38.json
================================================
{
    "workloads": [
        {
            "name": "conv4_1-5",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 1024,
                "o": 256,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_39.json
================================================
{
    "workloads": [
        {
            "name": "conv4_2-5",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 256,
                "o": 256,
                "r": 14,
                "c": 14,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_4.json
================================================
{
    "workloads": [
        {
            "name": "conv2_3-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 64,
                "o": 256,
                "r": 56,
                "c": 56,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_40.json
================================================
{
    "workloads": [
        {
            "name": "conv4_3-5",
            "tags": [
                "conv",
                "maxpool_2"
            ],
            "params": {
                "i": 256,
                "o": 1024,
                "r": 14,
                "c": 14,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_41.json
================================================
{
    "workloads": [
        {
            "name": "conv5_1-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 1024,
                "o": 512,
                "r": 7,
                "c": 7,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_42.json
================================================
{
    "workloads": [
        {
            "name": "conv5_2-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 512,
                "o": 512,
                "r": 7,
                "c": 7,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_43.json
================================================
{
    "workloads": [
        {
            "name": "conv5_3-0",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 512,
                "o": 2048,
                "r": 7,
                "c": 7,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_44.json
================================================
{
    "workloads": [
        {
            "name": "conv5_1-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 2048,
                "o": 512,
                "r": 7,
                "c": 7,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_45.json
================================================
{
    "workloads": [
        {
            "name": "conv5_2-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 512,
                "o": 512,
                "r": 7,
                "c": 7,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_46.json
================================================
{
    "workloads": [
        {
            "name": "conv5_3-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 512,
                "o": 2048,
                "r": 7,
                "c": 7,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_47.json
================================================
{
    "workloads": [
        {
            "name": "conv5_1-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 2048,
                "o": 512,
                "r": 7,
                "c": 7,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_48.json
================================================
{
    "workloads": [
        {
            "name": "conv5_2-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 512,
                "o": 512,
                "r": 7,
                "c": 7,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_49.json
================================================
{
    "workloads": [
        {
            "name": "conv5_3-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 512,
                "o": 2048,
                "r": 7,
                "c": 7,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_5.json
================================================
{
    "workloads": [
        {
            "name": "conv2_1-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 256,
                "o": 64,
                "r": 56,
                "c": 56,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_6.json
================================================
{
    "workloads": [
        {
            "name": "conv2_2-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 64,
                "o": 64,
                "r": 56,
                "c": 56,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_7.json
================================================
{
    "workloads": [
        {
            "name": "conv2_3-1",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 64,
                "o": 256,
                "r": 56,
                "c": 56,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_8.json
================================================
{
    "workloads": [
        {
            "name": "conv2_1-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 256,
                "o": 64,
                "r": 56,
                "c": 56,
                "p": 1,
                "q": 1
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_9.json
================================================
{
    "workloads": [
        {
            "name": "conv2_2-2",
            "tags": [
                "conv"
            ],
            "params": {
                "i": 64,
                "o": 64,
                "r": 56,
                "c": 56,
                "p": 3,
                "q": 3
            }
        }
    ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_batch4.json
================================================
{
  "workloads": [
    {
      "name": "conv1",
      "tags": [
        "conv",
        "maxpool_4"
      ],
      "params": {
        "i": 3,
        "o": 64,
        "r": 448,
        "c": 448,
        "p": 7,
        "q": 7
      }
    },
    {
      "name": "conv2_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 112,
        "c": 112,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 256,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 112,
        "c": 112,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 256,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 112,
        "c": 112,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2_3-2",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 64,
        "o": 256,
        "r": 112,
        "c": 112,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 128,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 128,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 128,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 128,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-3",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 256,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 256,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 256,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 256,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 256,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-5",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 256,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-5",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-5",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 512,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 512,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 512,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/resnet50_conv5_1.json
================================================
{
  "workloads": [
    {
      "name": "conv5_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 2048,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    }
  ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_img2col.json
================================================
{
  "workloads": [
    {
      "name": "conv1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 64,
        "j": 50176,
        "k": 147
      }
    },
    {
      "name": "conv2_1-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 64,
        "j": 3136,
        "k": 64
      }
    },
    {
      "name": "conv2_2-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 64,
        "j": 3136,
        "k": 576
      }
    },
    {
      "name": "conv2_3-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 3136,
        "k": 64
      }
    },
    {
      "name": "conv2_1-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 64,
        "j": 3136,
        "k": 64
      }
    },
    {
      "name": "conv2_2-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 64,
        "j": 3136,
        "k": 576
      }
    },
    {
      "name": "conv2_3-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 3136,
        "k": 64
      }
    },
    {
      "name": "conv2_1-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 64,
        "j": 3136,
        "k": 64
      }
    },
    {
      "name": "conv2_2-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 64,
        "j": 3136,
        "k": 576
      }
    },
    {
      "name": "conv2_3-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 3136,
        "k": 64
      }
    },
    {
      "name": "conv3_1-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 128,
        "j": 784,
        "k": 256
      }
    },
    {
      "name": "conv3_2-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 128,
        "j": 784,
        "k": 1152
      }
    },
    {
      "name": "conv3_3-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 512,
        "j": 784,
        "k": 128
      }
    },
    {
      "name": "conv3_1-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 128,
        "j": 784,
        "k": 256
      }
    },
    {
      "name": "conv3_2-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 128,
        "j": 784,
        "k": 1152
      }
    },
    {
      "name": "conv3_3-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 512,
        "j": 784,
        "k": 128
      }
    },
    {
      "name": "conv3_1-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 128,
        "j": 784,
        "k": 256
      }
    },
    {
      "name": "conv3_2-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 128,
        "j": 784,
        "k": 1152
      }
    },
    {
      "name": "conv3_3-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 512,
        "j": 784,
        "k": 128
      }
    },
    {
      "name": "conv3_1-3",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 128,
        "j": 784,
        "k": 256
      }
    },
    {
      "name": "conv3_2-3",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 128,
        "j": 784,
        "k": 1152
      }
    },
    {
      "name": "conv3_3-3",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 512,
        "j": 784,
        "k": 128
      }
    },
    {
      "name": "conv4_1-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 196,
        "k": 512
      }
    },
    {
      "name": "conv4_2-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 196,
        "k": 2304
      }
    },
    {
      "name": "conv4_3-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 1024,
        "j": 196,
        "k": 256
      }
    },
    {
      "name": "conv4_1-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 196,
        "k": 512
      }
    },
    {
      "name": "conv4_2-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 196,
        "k": 2304
      }
    },
    {
      "name": "conv4_3-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 1024,
        "j": 196,
        "k": 256
      }
    },
    {
      "name": "conv4_1-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 196,
        "k": 512
      }
    },
    {
      "name": "conv4_2-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 196,
        "k": 2304
      }
    },
    {
      "name": "conv4_3-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 1024,
        "j": 196,
        "k": 256
      }
    },
    {
      "name": "conv4_1-3",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 196,
        "k": 512
      }
    },
    {
      "name": "conv4_2-3",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 196,
        "k": 2304
      }
    },
    {
      "name": "conv4_3-3",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 1024,
        "j": 196,
        "k": 256
      }
    },
    {
      "name": "conv4_1-4",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 196,
        "k": 512
      }
    },
    {
      "name": "conv4_2-4",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 196,
        "k": 2304
      }
    },
    {
      "name": "conv4_3-4",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 1024,
        "j": 196,
        "k": 256
      }
    },
    {
      "name": "conv4_1-5",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 196,
        "k": 512
      }
    },
    {
      "name": "conv4_2-5",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 196,
        "k": 2304
      }
    },
    {
      "name": "conv4_3-5",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 1024,
        "j": 196,
        "k": 256
      }
    },
    {
      "name": "conv5_1-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 512,
        "j": 49,
        "k": 1024
      }
    },
    {
      "name": "conv5_2-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 512,
        "j": 49,
        "k": 4608
      }
    },
    {
      "name": "conv5_3-0",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 2048,
        "j": 49,
        "k": 512
      }
    },
    {
      "name": "conv5_1-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 512,
        "j": 49,
        "k": 1024
      }
    },
    {
      "name": "conv5_2-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 512,
        "j": 49,
        "k": 4608
      }
    },
    {
      "name": "conv5_3-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 2048,
        "j": 49,
        "k": 512
      }
    },
    {
      "name": "conv5_1-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 512,
        "j": 49,
        "k": 1024
      }
    },
    {
      "name": "conv5_2-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 512,
        "j": 49,
        "k": 4608
      }
    },
    {
      "name": "conv5_3-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 2048,
        "j": 49,
        "k": 512
      }
    }
  ]
}

================================================
FILE: autosa_scripts/odyssey/workload/resnet50_last.json
================================================
{
  "workloads": [
    {
      "name": "conv5_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/resnet50_last2.json
================================================
{
  "workloads": [
    {
      "name": "conv4_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-5",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-5",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-5",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/resnet50_original.json
================================================
{
  "workloads": [
    {
      "name": "conv1",
      "tags": [
        "conv",
        "maxpool_4"
      ],
      "params": {
        "i": 3,
        "o": 64,
        "r": 224,
        "c": 224,
        "p": 7,
        "q": 7
      }
    },
    {
      "name": "conv2_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 256,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 256,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv2_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 64,
        "o": 64,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2_3-2",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 64,
        "o": 256,
        "r": 56,
        "c": 56,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_1-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv3_2-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 128,
        "o": 128,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3_3-3",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 128,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-3",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-4",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_1-5",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv4_2-5",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 256,
        "o": 256,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4_3-5",
      "tags": [
        "conv",
        "maxpool_2"
      ],
      "params": {
        "i": 256,
        "o": 1024,
        "r": 14,
        "c": 14,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 1024,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-0",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 2048,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-1",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_1-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 2048,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    },
    {
      "name": "conv5_2-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 512,
        "r": 7,
        "c": 7,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5_3-2",
      "tags": [
        "conv"
      ],
      "params": {
        "i": 512,
        "o": 2048,
        "r": 7,
        "c": 7,
        "p": 1,
        "q": 1
      }
    }
  ]
}

================================================
FILE: autosa_scripts/odyssey/workload/vgg16-2-img2col.json
================================================
{
  "workloads": [
    {
      "name": "conv1-1",
      "tags": ["gemm", "img2col"],
      "params": {
        "p0": 64,
        "p1": 50176,
        "p2": 27
      }
    },
    {
      "name": "conv1-2",
      "tags": ["gemm", "img2col"],
      "params": {
        "p0": 64,
        "p1": 50176,
        "p2": 576
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16-3.json
================================================
{
  "workloads": [
    {
      "name": "conv1-1",
      "tags": ["conv"],
      "params": {
        "i": 3,
        "o": 64,
        "r": 224,
        "c": 224,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv1-2",
      "tags": ["conv", "maxpool_2"],
      "params": {
        "i": 64,
        "o": 64,
        "r": 224,
        "c": 224,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2-1",
      "tags": ["conv"],
      "params": {
        "i": 64,
        "o": 128,
        "r": 112,
        "c": 112,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16-4.json
================================================
{
  "workloads": [
    {
      "name": "conv1-1",
      "tags": ["conv"],
      "params": {
        "i": 3,
        "o": 64,
        "r": 224,
        "c": 224,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv1-2",
      "tags": ["conv", "maxpool_2"],
      "params": {
        "i": 64,
        "o": 64,
        "r": 224,
        "c": 224,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2-1",
      "tags": ["conv"],
      "params": {
        "i": 64,
        "o": 128,
        "r": 112,
        "c": 112,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2-2",
      "tags": ["conv", "maxpool_2"],
      "params": {
        "i": 128,
        "o": 128,
        "r": 112,
        "c": 112,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16.json
================================================
{
  "workloads": [
    {
      "name": "conv1-1",
      "tags": ["conv"],
      "params": {
        "i": 3,
        "o": 64,
        "r": 224,
        "c": 224,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv1-2",
      "tags": ["conv", "maxpool_2"],
      "params": {
        "i": 64,
        "o": 64,
        "r": 224,
        "c": 224,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2-1",
      "tags": ["conv"],
      "params": {
        "i": 64,
        "o": 128,
        "r": 112,
        "c": 112,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv2-2",
      "tags": ["conv", "maxpool_2"],
      "params": {
        "i": 128,
        "o": 128,
        "r": 112,
        "c": 112,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3-1",
      "tags": ["conv"],
      "params": {
        "i": 128,
        "o": 256,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3-2",
      "tags": ["conv"],
      "params": {
        "i": 256,
        "o": 256,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv3-3",
      "tags": ["conv", "maxpool_2"],
      "params": {
        "i": 256,
        "o": 256,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4-1",
      "tags": ["conv"],
      "params": {
        "i": 256,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4-2",
      "tags": ["conv"],
      "params": {
        "i": 512,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv4-3",
      "tags": ["conv", "maxpool_2"],
      "params": {
        "i": 512,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5-1",
      "tags": ["conv"],
      "params": {
        "i": 512,
        "o": 512,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5-2",
      "tags": ["conv"],
      "params": {
        "i": 512,
        "o": 512,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    },
    {
      "name": "conv5-3",
      "tags": ["conv", "maxpool_2"],
      "params": {
        "i": 512,
        "o": 512,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16_1.json
================================================
{
  "workloads": [
    {
      "name": "conv1-1",
      "tags": ["conv"],
      "params": {
        "i": 3,
        "o": 64,
        "r": 224,
        "c": 224,
        "p": 3,
        "q": 3
      }
    }  
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16_10.json
================================================
{
  "workloads": [
    {
      "name": "conv4-3",
      "tags": ["conv", "maxpool_2"],
      "params": {
        "i": 512,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16_11.json
================================================
{
  "workloads": [
    {
      "name": "conv5-1",
      "tags": ["conv"],
      "params": {
        "i": 512,
        "o": 512,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16_12.json
================================================
{
  "workloads": [
    {
      "name": "conv5-2",
      "tags": ["conv"],
      "params": {
        "i": 512,
        "o": 512,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16_13.json
================================================
{
  "workloads": [
    {
      "name": "conv5-3",
      "tags": ["conv", "maxpool_2"],
      "params": {
        "i": 512,
        "o": 512,
        "r": 14,
        "c": 14,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16_2.json
================================================
{
  "workloads": [
    {
      "name": "conv1-2",
      "tags": ["conv", "maxpool_2"],
      "params": {
        "i": 64,
        "o": 64,
        "r": 224,
        "c": 224,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16_3.json
================================================
{
  "workloads": [
    {
      "name": "conv2-1",
      "tags": ["conv"],
      "params": {
        "i": 64,
        "o": 128,
        "r": 112,
        "c": 112,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16_4.json
================================================
{
  "workloads": [
    {
      "name": "conv2-2",
      "tags": ["conv", "maxpool_2"],
      "params": {
        "i": 128,
        "o": 128,
        "r": 112,
        "c": 112,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16_5.json
================================================
{
  "workloads": [
    {
      "name": "conv3-1",
      "tags": ["conv"],
      "params": {
        "i": 128,
        "o": 256,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16_6.json
================================================
{
  "workloads": [
    {
      "name": "conv3-2",
      "tags": ["conv"],
      "params": {
        "i": 256,
        "o": 256,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16_7.json
================================================
{
  "workloads": [
    {
      "name": "conv3-3",
      "tags": ["conv", "maxpool_2"],
      "params": {
        "i": 256,
        "o": 256,
        "r": 56,
        "c": 56,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16_8.json
================================================
{
  "workloads": [
    {
      "name": "conv4-1",
      "tags": ["conv"],
      "params": {
        "i": 256,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16_9.json
================================================
{
  "workloads": [
    {
      "name": "conv4-2",
      "tags": ["conv"],
      "params": {
        "i": 512,
        "o": 512,
        "r": 28,
        "c": 28,
        "p": 3,
        "q": 3
      }
    }
  ]
}


================================================
FILE: autosa_scripts/odyssey/workload/vgg16_img2col.json
================================================
{
  "workloads": [
    {
      "name": "conv1-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 64,
        "j": 50176,
        "k": 27
      }
    },
    {
      "name": "conv1-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 64,
        "j": 50176,
        "k": 576
      }
    },
    {
      "name": "conv2-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 128,
        "j": 12544,
        "k": 576
      }
    },
    {
      "name": "conv2-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 128,
        "j": 12544,
        "k": 1152
      }
    },
    {
      "name": "conv3-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 3136,
        "k": 1152
      }
    },
    {
      "name": "conv3-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 3136,
        "k": 2304
      }
    },
    {
      "name": "conv3-3",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 256,
        "j": 3136,
        "k": 2304
      }
    },
    {
      "name": "conv4-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 512,
        "j": 784,
        "k": 2304
      }
    },
    {
      "name": "conv4-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 512,
        "j": 784,
        "k": 4608
      }
    },
    {
      "name": "conv4-3",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 512,
        "j": 784,
        "k": 4608
      }
    },
    {
      "name": "conv5-1",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 512,
        "j": 196,
        "k": 4608
      }
    },
    {
      "name": "conv5-2",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 512,
        "j": 196,
        "k": 4608
      }
    },
    {
      "name": "conv5-3",
      "tags": [
        "gemm"
      ],
      "params": {
        "i": 512,
        "j": 196,
        "k": 4608
      }
    }
  ]
}

================================================
FILE: autosa_scripts/optimizer.py
================================================
#!/usr/bin/env python3

import sys
import argparse
import re
import os
import json
import subprocess
import itertools
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import joblib
import xml.etree.ElementTree as ET
import time
import multiprocessing
import random
from statistics import mean
import copy
import logging
import functools
import shutil
import datetime
from pathlib import Path

import optimizer_prune as opt_prune
import resource_model as res_model
import latency_model as lat_model

def timer(func):
    """ Decorator that reports how long each call to the wrapped function takes.

    The wrapped function is invoked with the caller's arguments unchanged. Once
    it returns, a single timestamped INFO line containing the function name and
    the elapsed time (measured with time.perf_counter) is printed, and the
    function's return value is handed back to the caller. Nothing is printed if
    the wrapped function raises.
    """
    @functools.wraps(func)
    def _timed_call(*call_args, **call_kwargs):
        began = time.perf_counter()
        result = func(*call_args, **call_kwargs)
        elapsed = time.perf_counter() - began
        stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(
            f'[AutoSA-Optimizer {stamp}] INFO: Finished function: {func.__name__} in {elapsed:.4f} secs')
        return result
    return _timed_call

def generate_loop_candidates(loops, config, stage):
    """ Generate candidate loops

    This function samples each loop dimension given the sample numbers set in
    the config, then builds a Cartesian product of all sampled loops to generate
    all possible loop combinations to search.

    Due to the current implementation limitation, we have the following limitation
    on the loop candidates:
    - Array partitioning: the loop candidates should be left-exclusive and right-inclusive.
      This prevents generating single PEs along certain dimension which causes
      codegen breakdown.
    - Latency hiding: the loop candidates should be left-inclusive and right-exclusive.
      Similarly, making it right-exclusive to avoid possible single PE case.
    - SIMD, L2 array partitioning: both left- and right-inclusive
    Note: for both latency hiding and SIMD, if we choose tiling factor as 1, the
    corresponding stage will be skipped in AutoSA.

    Only factors that evenly divide the loop bound are ever produced.
    If the sample mode is set in exhaustive, we will search all divisible factors of
    the loop bound.
    If the sample mode is set in log, we will generate samples of exponentials of 2.
    If the sample mode is set in linear, we will generate at most 'n' evenly
    strided samples of the divisible factors.
    If the sample mode is set in random, we will generate at most 'n' random
    samples of the divisible factors.

    Parameters
    ----------
    loops: list
        A list of loop upperbounds
    config: dict
        Global configuration. The entries
        config['setting'][config['mode']]['sample'][stage] ('mode', 'n' and
        'loop_limit') are read; a 'loop_limit' of -1 means no limit.
    stage: str
        Optimization stage name

    Returns
    -------
    list
        A list of candidates; each candidate is a list holding one tiling
        factor per entry of 'loops', in the same order.

    Raises
    ------
    NameError
        If the stage is unknown, or if the configured sample mode is unknown
        (the latter is only detected when 'loops' is non-empty).
    """
    if stage not in [
        'space_time',
        'array_part',
        'array_part_L2',
        'latency_hiding',
        'SIMD_vectorization']:
        raise NameError(f'Stage {stage} is not defined.')

    sample_cfg = config['setting'][config['mode']]['sample'][stage]
    sample_mode = sample_cfg['mode']
    sample_n = sample_cfg['n']
    sample_loop_limit = sample_cfg['loop_limit']

    # Array partitioning drops the smallest candidate, latency hiding drops the
    # largest one; every other stage keeps both ends of the range.
    l_inclusive = stage != 'array_part'
    r_inclusive = stage != 'latency_hiding'

    # Sample each loop dim
    sample_list = []
    for loop in loops:
        # Largest value considered for this loop (-1 disables the limit).
        bound = loop if sample_loop_limit == -1 else min(loop, sample_loop_limit)
        if sample_mode == 'log':
            # In log mode lb/ub are exponents of 2 rather than the factors.
            ub = int(np.floor(np.log2(bound)))
            lb = 0
        else:
            ub = bound
            lb = 1
        if not r_inclusive:
            ub = ub - 1
        if not l_inclusive:
            lb = lb + 1

        if sample_mode == 'exhaustive':
            samples = [s for s in range(lb, ub + 1) if loop % s == 0]
        elif sample_mode == 'log':
            # Use plain Python ints (1 << s) instead of numpy scalars so the
            # candidates cannot overflow and stay JSON-serialisable.
            samples = [1 << s for s in range(lb, ub + 1)
                       if loop % (1 << s) == 0]
        elif sample_mode == 'linear':
            samples = [s for s in range(lb, ub + 1) if loop % s == 0]
            # Uniformly sample at most 'n' factors. The stride is rounded up:
            # rounding down would keep more than 'n' samples whenever the
            # number of factors is not a multiple of 'n'.
            if len(samples) <= sample_n:
                stride = 1
            else:
                stride = -(-len(samples) // sample_n)
            samples = [samples[i] for i in range(0, len(samples), stride)]
        elif sample_mode == 'random':
            samples = [s for s in range(lb, ub + 1) if loop % s == 0]
            # Randomly sample 'n' factors
            if sample_n < len(samples):
                samples = random.sample(samples, sample_n)
        else:
            raise NameError(f'Sample mode {sample_mode} is not defined.')
        sample_list.append(samples)

    # Generate Cartesian product
    sample_loops = [list(tup) for tup in itertools.product(*sample_list)]

    return sample_loops

def multi_process(loops, func, config):
    """ Perform multi-processing for function "func".

    The candidates are split into contiguous chunks. Each chunk is handed to
    "func" as func(chunk, config_copy, work_dir_i, 1), where work_dir_i is
    config['work_dir'] with its last character replaced by the chunk index.
    The design counters returned by the workers are summed into
    config['monitor']['n_designs'] and, in search mode, their search results
    are merged into config['search_results'].

    Parameters
    ----------
    loops:
        A list of loop candidates.
    func:
        The function to be executed by each process. It has to return a dict
        holding 'monitor' (and 'search_results' in search mode).
    config: dict
        Global configuration. Updated in place.
    """
    if len(loops) == 0:
        # Nothing to distribute. A chunk size of zero would also make the
        # range() call below fail.
        return

    num_proc = min(multiprocessing.cpu_count(),
                   config['setting'][config['mode']]['multiprocess']['n_job'])
    # Split the loops into chunks (slicing clamps the last chunk automatically)
    chunk_size = int(np.ceil(float(len(loops)) / num_proc))
    loop_chunks = [loops[i: i + chunk_size]
                   for i in range(0, len(loops), chunk_size)]

    # Allocate new work spaces for each forked process.
    # Worker 0 re-uses the directory derived from the current work directory.
    base_dir = config['work_dir'][:-1]
    for i in range(1, num_proc):
        prj_dir = base_dir + str(i)
        if os.path.exists(prj_dir):
            continue
        os.mkdir(prj_dir)
        for sub_dir in ('output', 'output/latency_est',
                        'output/resource_est', 'output/src'):
            os.mkdir(f'{prj_dir}/{sub_dir}')
        execute_sys_cmd(
            f'cp {config["work_dir"]}/autosa_config.json {prj_dir}/', config)

    config['logger'].info(f'Forking {num_proc} processes...')
    # These entries are muted while the config is copied to the workers and
    # are put back afterwards, even if a worker fails.
    saved = {key: config[key] for key in ('verbose', 'stdout', 'logger')}
    n_designs = config['monitor']['n_designs']
    try:
        config['verbose'] = 0
        config['stdout'] = subprocess.DEVNULL
        config['logger'] = None
        # Each worker starts counting from zero; the counts are summed below.
        config['monitor']['n_designs'] = 0

        # Execute the function. The context manager releases the worker
        # processes as soon as the results are collected (or on error).
        with multiprocessing.Pool(processes=num_proc) as pool:
            results = pool.starmap(
                func,
                [(chunk, copy.deepcopy(config), base_dir + str(i), 1)
                 for i, chunk in enumerate(loop_chunks)])

        # Aggregate the monitor information
        for result in results:
            n_designs += result['monitor']['n_designs']

        if config['mode'] == 'search':
            # Aggregate the results
            config['search_results'] = merge_search_results(
                [result['search_results'] for result in results],
                config['setting']['search']['metric'],
                config['setting']['search']['log']['n_record'],
                config['hw_info'])
    finally:
        config['monitor']['n_designs'] = n_designs
        config.update(saved)

    return

def cmp_designs(design1, design2, metric):
    """ Return the better of two designs under the given metric.

    A design whose 'found' entry equals False always loses. design1 is
    checked first, so design2 is returned when neither has been found.
    On a tie the second design is returned. For a metric other than
    'latency' or 'power' nothing is returned (None).

    Parameters
    ----------
    design1: dict
        Design 1.
    design2: dict
        Design 2.
    metric: str
        Metric to evaluate the design ('latency' or 'power').
    """
    # Explicit comparison against False is kept on purpose so that exactly
    # the same values as before count as "not found".
    for candidate, other in ((design1, design2), (design2, design1)):
        if candidate['found'] == False:
            return other

    if metric not in ('latency', 'power'):
        return None

    # TODO: if the latency equals, we could compare to get the design with
    # lower resource usage.
    return design1 if design1[metric] < design2[metric] else design2

def generate_sa_sizes_cmd(sa_sizes):
    """ Build the "--sa-sizes" command line argument.

    The entries of sa_sizes are joined with ';' and wrapped as
    --sa-sizes="{...}".

    Parameters
    ----------
    sa_sizes: list
        A list of strings, one per optimization stage.
    """
    joined_sizes = ';'.join(sa_sizes)
    return '--sa-sizes="{' + joined_sizes + '}"'


@timer
def train_resource_models_xilinx(config):
    """ Train the resource model for Xilinx program.

    Walks <tmp_dir>/optimizer/synth/<job>/<kernel>/<design> and groups the
    "output" directory of every design by kernel name. For each kernel the
    design information is extracted, converted to a dataframe and handed to
    res_model.train. The results go to
    <tmp_dir>/optimizer/training/resource_models/<kernel>; that directory is
    removed and re-created on every call.

    Parameters
    ----------
    config: dict
        Global configuration. config['work_dir'] is set to the synth directory.
    """
    tmp_dir = config['tmp_dir']
    synth_dir = f'{tmp_dir}/optimizer/synth'
    config['work_dir'] = synth_dir

    # kernel name -> list of design output directories
    training_samples = {}
    for job in os.listdir(synth_dir):
        job_dir = f'{synth_dir}/{job}'
        for kernel in os.listdir(job_dir):
            kernel_dir = f'{job_dir}/{kernel}'
            design_names = os.listdir(kernel_dir)
            sample_dirs = training_samples.setdefault(kernel, [])
            sample_dirs.extend(
                f'{kernel_dir}/{design}/output' for design in design_names)

    # Start from an empty model directory
    models_dir = f'{tmp_dir}/optimizer/training/resource_models'
    if os.path.exists(models_dir):
        shutil.rmtree(models_dir)
    os.mkdir(models_dir)

    # Train the resource model for each kernel
    for kernel, sample_dirs in training_samples.items():
        kernel_models_dir = f'{models_dir}/{kernel}'
        os.mkdir(kernel_models_dir)
        # Collect the design infos
        design_infos = []
        for sample_dir in sample_dirs:
            design_infos.append(res_model.extract_design_info(sample_dir, 1))
            config['logger'].info(sample_dir)
        # Convert the design infos to a dataframe
        modules, fifos, df = res_model.convert_design_infos_to_df(design_infos)
        # Train the models
        config['logger'].info(f'Train the resource models for {kernel}...')
        res_model.train(df, modules, fifos, design_infos,
                        kernel_models_dir, config['logger'])

@timer
def train_latency_models_xilinx(config):
    """ Train the latency model.

    Currently a no-op: nothing is trained and nothing is returned.

    Note: We will assume all loops with II = 1 and depth = 1.
    """
    return None

def execute_autosa_cmd(config):
    """ Compose the AutoSA command and run it.

    The entries of config['cmds'] are joined with spaces and run through the
    shell. If a time-out start stamp is set (anything but -1) and more than
    config['setting']['search']['time_out'] minutes have passed since then,
    nothing is executed.

    Parameters
    ----------
    config: dict
        Global configuration.

    Returns
    -------
    ret: int
        The command return code, or -1 if the time-out has been exceeded.
    """
    time_out_start = config['monitor']['time_out_start']
    if time_out_start != -1:
        elapsed_minutes = float(time.time() - time_out_start) / 60
        if elapsed_minutes > config['setting']['search']['time_out']:
            return -1

    command = ' '.join(config['cmds'])
    config['logger'].debug(f'Execute CMD: {command}')
    process = subprocess.Popen(command, shell=True, stdout=config['stdout'])
    return process.wait()

def execute_sys_cmd(cmd, config):
    """ Execute a system command through the shell and wait for it.

    Parameters
    ----------
    cmd: str
        Command to execute.
    config: dict
        Global configuration. Provides the logger and the stdout target.

    Returns
    -------
    ret: int
        The command return code.
    """
    config['logger'].debug(f'Execute CMD: {cmd}')
    process = subprocess.Popen(cmd, shell=True, stdout=config['stdout'])
    return process.wait()

def generate_autosa_cmd_str(cmds):
    """ Generate the cmd to print.

    Parameters
    ----------
    cmds: list
        The command fragments (strings).

    Returns
    -------
    cmd_str: str
        All fragments separated by a single space.
    """
    return ' '.join(cmds)

def save_design_files(config):
    """ Save the current design.

    Reads the kernel id from output/resource_est/design_info.json, creates
    <work_dir>/kernel<id>/design<k> (k is the number of entries already in the
    kernel folder), stores the AutoSA command in "design.info" and copies the
    whole output folder next to it.

    Parameters
    ----------
    config: dict
        Global configuration.
    """
    work_dir = config['work_dir']

    # Load the kernel id
    with open(f'{work_dir}/output/resource_est/design_info.json', 'r') as f:
        kernel_id = json.load(f)['kernel_id']

    kernel_dir = f'{work_dir}/kernel{kernel_id}'
    if not os.path.exists(kernel_dir):
        os.mkdir(kernel_dir)

    # The next design id is the number of designs stored so far
    design_id = len(os.listdir(kernel_dir))
    design_path = f'{kernel_dir}/design{design_id}'
    os.mkdir(design_path)

    # Save the cmd
    with open(design_path + '/design.info', 'w') as f:
        f.write(generate_autosa_cmd_str(config['cmds']))

    # Copy the files
    execute_sys_cmd(f'cp -r {work_dir}/output {design_path}/', config)

def clear_design_files(config):
    """ Clean up the design folder files.

    Removes the files inside output/latency_est, output/resource_est and
    output/src of the current work directory (the folders themselves stay).
    """
    for sub_dir in ('latency_est', 'resource_est', 'src'):
        execute_sys_cmd(f'rm {config["work_dir"]}/output/{sub_dir}/*', config)

def explore_design(config):
    """ Explore the final design.

    In the training mode, the current design is saved so that it can later be
    synthesized for training the resource/latency models.
    In the search mode, the latency and resource usage of the current design
    are predicted. Designs that are slower than the best one found so far, or
    that fail the resource check, are dropped. Otherwise the search results in
    the config are updated.
    In every case the generated design files are removed before returning.

    Parameters
    ----------
    config: dict
        Global configuration.
    """
    tmp_dir = config['tmp_dir']
    # Update the monitor
    config['monitor']['n_designs'] += 1

    mode = config['mode']
    if mode == 'training':
        save_design_files(config)
        clear_design_files(config)
        return
    if mode != 'search':
        clear_design_files(config)
        return

    cur_design = {
        'latency': -1,
        'resource': {},
        'power': -1,
        'cmd': generate_autosa_cmd_str(config['cmds'])
    }
    # The same dict is filled in below, so the monitor sees the final values.
    config['monitor']['last_design'] = cur_design
    design_dir = f'{config["work_dir"]}/output'
    search_setting = config['setting']['search']
    metric = search_setting['metric']

    if metric == 'latency':
        # Predict the latency
        latency_info = lat_model.extract_latency_info(design_dir)
        best = config['search_results']['opt']
        latency = lat_model.predict_design_latency(
            latency_info, search_setting['cycle_period'], best['latency'])
        if best['found'] and latency > best['latency']:
            # Slower than the best design so far, drop it
            clear_design_files(config)
            return
        cur_design['latency'] = int(latency)
    elif metric == 'power':
        # Predict the power
        clear_design_files(config)
        raise NotImplementedError('DSE for power is not supported.')

    # Predict the resource usage
    design_info = res_model.extract_design_info(design_dir, 0)
    modules, fifos, df = res_model.convert_design_infos_to_df([design_info])
    kernel_id = design_info['kernel_id']
    # Resource model path
    res_model_path = f'{tmp_dir}/optimizer/training/resource_models/kernel{kernel_id}'
    res = res_model.predict_design_resource_usage(
        df, modules, fifos, design_info,
        res_model_path,
        search_setting['resource_target'])
    cur_design['resource'] = res

    resource_ok = res_model.resource_valid(
        res, config['hw_info'],
        search_setting['pruning']['resource']['range'],
        search_setting['resource_target'])
    if not resource_ok:
        clear_design_files(config)
        return

    # Compare and update the search results
    config['search_results'] = update_search_results(
        config['search_results'], cur_design,
        search_setting['log']['n_record'],
        'latency', config['hw_info'])

    # For certain time interval, print out the best design found so far
    update_interval = search_setting['update_time_interval']
    if update_interval != -1:
        monitor = config['monitor']
        if 'update_last_time' not in monitor:
            monitor['update_last_time'] = time.time()
        else:
            elapsed_time = time.time() - monitor['update_last_time']
            if float(elapsed_time) / 60 > update_interval:
                # print the best results so far
                config['logger'].info(print_best_design(
                    config['search_results']['opt'], config['hw_info']))
                monitor['update_last_time'] = time.time()

    clear_design_files(config)
    return

def simd_loop_filter(loops, tuning):
    """ Filter out the SIMD candidate loops based on the tuning information.

    Among the legal loops, the one with the highest score is selected (the
    first one wins a tie; a score has to be larger than -1 to be selected).
    The returned list has the same length as "loops" and holds 1 everywhere
    except at the selected position, which keeps its original upper bound.
    If no loop is selected, every entry is 1.
    AutoSA will not tile loops with the tiling factor as one for latency
    hiding or SIMD vectorization.

    Parameters
    ----------
    loops: list
        upper bounds of all candidate SIMD loops
    tuning: dict
        tuning information for the SIMD stage

    Returns
    -------
    filter_loops: list
        The filtered upper bounds.
    """
    scores = tuning['simd']['scores']
    legal = tuning['simd']['legal']

    # Find the candidate loop with the highest score
    best_idx = -1
    best_score = -1
    for idx, is_legal in enumerate(legal):
        if is_legal == 0:
            continue
        if scores[idx] > best_score:
            best_score = scores[idx]
            best_idx = idx

    filtered = [1] * len(loops)
    if best_idx >= 0:
        filtered[best_idx] = loops[best_idx]
    return filtered


def explore_simd_vectorization(config):
    """ Explore the stage of SIMD vectorization.

    If the stage is not in manual mode, or AutoSA reported no SIMD
    opportunity in the tuning info, the design is evaluated directly.

    Otherwise, if pruning is enabled, designs not satisfying the PE
    requirements are dropped first (SIMD_vectorization_PE_pruning). When the
    tuning info carries no 'sa_dims' yet, that check is postponed until after
    the next AutoSA run.
    Next, only the upper bound of the legal loop with the maximal score is
    kept and all the rest is set to 1 (simd_loop_filter).
    Finally the candidate tiling factors are generated, the program is
    compiled for each of them and the resulting design is evaluated.

    If no candidate is left, SIMD is temporarily disabled in the AutoSA
    config file. The original setting is written back on every exit path, so
    that a failed or pruned run does not leave SIMD disabled for the
    candidates explored afterwards.

    Parameters
    ----------
    config: dict
        Global configuration.
    """
    pruning_en = config['setting'][config['mode']]['pruning']['SIMD_vectorization']['enable']
    if config['autosa_config']['simd']['mode'] != 'manual':
        explore_design(config)
        return

    with open(f'{config["work_dir"]}/output/tuning.json') as f:
        tuning = json.load(f)
    if 'simd' not in tuning:
        # No SIMD opportunities found, we will skip this stage
        explore_design(config)
        return

    PE_pruning_postpone = 0
    if pruning_en:
        # Perform early pruning based on the PE numbers
        config['tuning'] = tuning
        if 'sa_dims' in config['tuning']['simd']:
            if opt_prune.SIMD_vectorization_PE_pruning(config):
                return
        else:
            PE_pruning_postpone = 1

    # Filter the SIMD loops
    loops = simd_loop_filter(tuning['simd']['tilable_loops'], tuning)
    loops_pool = generate_loop_candidates(
        loops, config, "SIMD_vectorization")

    if len(loops_pool) == 0:
        simd_en = config['autosa_config']['simd']['enable']
        sa_sizes = config['sa_sizes'].copy()
        try:
            config['autosa_config']['simd']['enable'] = 0
            with open(f'{config["work_dir"]}/autosa_config.json', 'w') as f:
                json.dump(config['autosa_config'], f, indent=4)

            ret = execute_autosa_cmd(config)
            if ret != 0:
                config['logger'].error(f'CMD failed with error code {ret}')
                return
            if PE_pruning_postpone:
                with open(f'{config["work_dir"]}/output/tuning.json') as f:
                    tuning = json.load(f)
                config['tuning'] = tuning
                if opt_prune.SIMD_vectorization_PE_pruning(config, 1):
                    return
            explore_design(config)
        finally:
            # Revert the changes, both in memory and on disk
            config['autosa_config']['simd']['enable'] = simd_en
            config['sa_sizes'] = sa_sizes
            with open(f'{config["work_dir"]}/autosa_config.json', 'w') as f:
                json.dump(config['autosa_config'], f, indent=4)
        return

    latency_pruning = config['mode'] == 'search' \
        and config['setting']['search']['metric'] == 'latency' \
        and pruning_en
    if latency_pruning:
        loops_pool = opt_prune.reorder_simd_loops(loops_pool)

    for loop in loops_pool:
        sa_sizes = config['sa_sizes'].copy()
        config['sa_sizes'].append(
            f'kernel[]->simd{str(loop).replace(" ", "")}')
        config['cmds'][3] = generate_sa_sizes_cmd(config['sa_sizes'])

        ret = execute_autosa_cmd(config)
        if ret != 0:
            config['logger'].error(f'CMD failed with error code {ret}')
            config['sa_sizes'] = sa_sizes
            continue
        if PE_pruning_postpone:
            with open(f'{config["work_dir"]}/output/tuning.json') as f:
                tuning = json.load(f)
            config['tuning'] = tuning
            if opt_prune.SIMD_vectorization_PE_pruning(config, 1):
                config['sa_sizes'] = sa_sizes
                continue

        explore_design(config)
        config['sa_sizes'] = sa_sizes

        if latency_pruning:
            # Stop exploring the remaining candidates once the pruning
            # heuristic says so
            if opt_prune.SIMD_vectorization_latency_pruning(config):
                return

    return


def explore_latency_hiding(config):
    """ Explore the stage of latency hiding.

    If the stage is not in manual mode, the exploration moves directly to the
    SIMD vectorization stage.

    In manual mode the tuning info is loaded. If it has no 'latency' entry
    (the stage was skipped by AutoSA), latency hiding is temporarily disabled
    in the AutoSA config file, AutoSA is re-run and the next stage is
    explored. The original setting is written back on every exit path, so a
    failed run does not leave the stage disabled for later candidates.

    Otherwise the candidate tiling factors are generated (and pruned, if
    enabled). AutoSA is run for each candidate before moving to the next
    stage. With no candidate left the exploration stops here.

    Parameters
    ----------
    config: dict
        Global configuration.
    """
    if config['autosa_config']['latency']['mode'] != 'manual':
        explore_simd_vectorization(config)
        return

    # Fetch the tuning info
    with open(f'{config["work_dir"]}/output/tuning.json') as f:
        tuning = json.load(f)

    if 'latency' not in tuning:
        # This stage is skipped by AutoSA, we will also skip it
        latency_hiding_en = config['autosa_config']['latency']['enable']
        sa_sizes = config['sa_sizes'].copy()
        try:
            config['autosa_config']['latency']['enable'] = 0
            with open(f'{config["work_dir"]}/autosa_config.json', 'w') as f:
                json.dump(config['autosa_config'], f, indent=4)
            ret = execute_autosa_cmd(config)
            if ret != 0:
                config['logger'].error(f'CMD failed with error code {ret}')
                return
            explore_simd_vectorization(config)
        finally:
            # Revert the changes, both in memory and on disk
            config['autosa_config']['latency']['enable'] = latency_hiding_en
            config['sa_sizes'] = sa_sizes
            with open(f'{config["work_dir"]}/autosa_config.json', 'w') as f:
                json.dump(config['autosa_config'], f, indent=4)
        return

    loops = tuning['latency']['tilable_loops']
    loops_pool = generate_loop_candidates(loops, config, "latency_hiding")
    if config['setting'][config['mode']]['pruning']['latency_hiding']['enable']:
        config['tuning'] = tuning
        loops_pool = opt_prune.latency_hiding_loops_pruning(
            loops_pool, config)

    # Latency hiding is a must. With an empty pool the loop below does
    # nothing and the exploration stops here.
    for loop in loops_pool:
        sa_sizes = config['sa_sizes'].copy()
        config['sa_sizes'].append(
            f'kernel[]->latency{str(loop).replace(" ", "")}')
        config['cmds'][3] = generate_sa_sizes_cmd(config['sa_sizes'])
        ret = execute_autosa_cmd(config)
        if ret != 0:
            config['logger'].error(f'CMD failed with error code {ret}')
            config['sa_sizes'] = sa_sizes
            continue
        explore_simd_vectorization(config)
        config['sa_sizes'] = sa_sizes

    return


def explore_array_part_L2(config):
    """ Explore the stage of second-level array partitioning.

    If the stage is not in manual mode, the exploration moves directly to the
    latency hiding stage.

    In manual mode the candidate tiling factors are generated from the tuning
    info (and pruned, if enabled). AutoSA is run for each candidate before
    moving to the next stage.
    If no candidate is left, the stage is temporarily disabled in the AutoSA
    config file, AutoSA is re-run and the next stage is explored. The original
    setting is written back on every exit path, so a failed run does not leave
    the stage disabled for later candidates.

    Parameters
    ----------
    config: dict
        Global configuration.
    """
    if config['autosa_config']['array_part_L2']['mode'] != 'manual':
        explore_latency_hiding(config)
        return

    # Fetch the tuning info
    with open(f'{config["work_dir"]}/output/tuning.json') as f:
        tuning = json.load(f)
    loops = tuning['array_part_L2']['tilable_loops']
    # Generate the tiling factors to proceed
    loops_pool = generate_loop_candidates(loops, config, 'array_part_L2')
    if config['setting'][config['mode']]['pruning']['array_part_L2']['enable']:
        config['tuning'] = tuning
        loops_pool = opt_prune.array_part_L2_loops_pruning(
            loops_pool, config)

    if len(loops_pool) == 0:
        # No available tiling options, we will disable this step and skip it.
        array_part_L2_en = config['autosa_config']['array_part_L2']['enable']
        sa_sizes = config['sa_sizes'].copy()
        try:
            config['autosa_config']['array_part_L2']['enable'] = 0
            with open(f'{config["work_dir"]}/autosa_config.json', 'w') as f:
                json.dump(config['autosa_config'], f, indent=4)

            ret = execute_autosa_cmd(config)
            if ret != 0:
                config['logger'].error(f'CMD failed with error code {ret}')
                return
            explore_latency_hiding(config)
        finally:
            # Revert the changes, both in memory and on disk
            config['autosa_config']['array_part_L2']['enable'] = array_part_L2_en
            config['sa_sizes'] = sa_sizes
            with open(f'{config["work_dir"]}/autosa_config.json', 'w') as f:
                json.dump(config['autosa_config'], f, indent=4)
        return

    for loop in loops_pool:
        sa_sizes = config['sa_sizes'].copy()
        config['sa_sizes'].append(
            f'kernel[]->array_part_L2{str(loop).replace(" ", "")}')
        config['cmds'][3] = generate_sa_sizes_cmd(config['sa_sizes'])
        ret = execute_autosa_cmd(config)
        if ret != 0:
            config['logger'].error(f'CMD failed with error code {ret}')
            config['sa_sizes'] = sa_sizes
            continue
        explore_latency_hiding(config)
        config['sa_sizes'] = sa_sizes


def explore_array_part_single_job(loops, config, work_dir, is_multi_process=0):
    """ Explore the stage of array partitioning with single process.

    The AutoSA config/output arguments and config['work_dir'] are pointed to
    "work_dir" and the logger is (re-)attached. Each candidate is then
    compiled and handed to the next stage: array_part_L2 if
    config['two_level_buffer'] is set, latency hiding otherwise.
    Only candidates whose AutoSA run succeeded count towards the progress.

    Parameters
    ----------
    loops:
        Candidate loops.
    config:
        Global configuration.
    work_dir: str
        The current work directory.
    is_multi_process: int
        Is multi process launched. If set, the logger is removed from the
        config before it is returned.

    Returns
    -------
    config:
        The updated configuration.
    """
    # Modify the commands
    config['cmds'][1] = f'--config={work_dir}/autosa_config.json'
    config['cmds'][2] = f'--output-dir={work_dir}/output'
    config['work_dir'] = work_dir
    config['logger'] = logging.getLogger('AutoSA-Optimizer')

    # Progress meter
    total_tasks = len(loops)
    finished_tasks = 0
    for loop in loops:
        saved_sizes = config['sa_sizes'].copy()
        config['sa_sizes'].append(
            f'kernel[]->array_part{str(loop).replace(" ", "")}')
        config['cmds'][3] = generate_sa_sizes_cmd(config['sa_sizes'])

        ret = execute_autosa_cmd(config)
        if ret != 0:
            config['logger'].error(f'CMD failed with error code {ret}')
            config['sa_sizes'] = saved_sizes
            continue

        # Pick the next stage
        next_stage = explore_array_part_L2 if config['two_level_buffer'] \
            else explore_latency_hiding
        next_stage(config)

        config['sa_sizes'] = saved_sizes
        finished_tasks += 1
        config['logger'].info(f'Progress(PID: {os.getpid()}): [{finished_tasks}/{total_tasks}]')

    if is_multi_process:
        config['logger'] = None
    return config


def explore_array_part(config):
    """ Explore the stage of array partitioning.

    If this stage is set in Manual mode, this function will load the tuning
    info which contains all the tilable loops.
    This function will then generate all possible loop tiling combination.
    If stage pruning is enabled, these loop candidates will be pruned
    (see opt_prune.array_part_loops_pruning).
    Next, this function will iterate through these combinations and proceed to
    the next stage. If multi-processing is enabled and there is more than one
    candidate, the candidates are distributed over several processes.

    If no candidate is left, array partitioning and the second-level array
    partitioning are temporarily disabled in the AutoSA config file, AutoSA is
    re-run and the latency hiding stage is explored. The original settings are
    written back on every exit path.

    If this stage is not in Manual mode, we will skip it and jump to:
    - array_part_L2 if it is enabled in the AutoSA config
    - latency_hiding otherwise

    Parameters
    ----------
    config: dict
        Global configuration.
    """
    if config['autosa_config']['array_part']['mode'] != 'manual':
        if config['autosa_config']['array_part_L2']['enable']:
            explore_array_part_L2(config)
        else:
            explore_latency_hiding(config)
        return

    # Fetch the tuning info
    with open(f'{config["work_dir"]}/output/tuning.json') as f:
        tuning = json.load(f)
    loops = tuning['array_part']['tilable_loops']
    # Generate the tiling factors to proceed
    loops_pool = generate_loop_candidates(loops, config, 'array_part')
    if config['setting'][config['mode']]['pruning']['array_part']['enable']:
        # Apply pruning on the candidate loops
        loops_pool = opt_prune.array_part_loops_pruning(loops_pool, config)

    if len(loops_pool) == 0:
        # No available tiling options, we will disable this step and skip it.
        # At the same time, two-level-buffer is also disabled
        array_part_en = config['autosa_config']['array_part']['enable']
        array_part_L2_en = config['autosa_config']['array_part_L2']['enable']
        sa_sizes = config['sa_sizes'].copy()
        try:
            config['autosa_config']['array_part']['enable'] = 0
            config['autosa_config']['array_part_L2']['enable'] = 0
            # The work directory has to be interpolated here; without the
            # braces the file would go to a literal 'config["work_dir"]' path.
            with open(f'{config["work_dir"]}/autosa_config.json', 'w') as f:
                json.dump(config['autosa_config'], f, indent=4)

            ret = execute_autosa_cmd(config)
            if ret != 0:
                config['logger'].error(f'CMD failed with error code {ret}')
                return
            explore_latency_hiding(config)
        finally:
            # Revert the changes, both in memory and on disk
            config['autosa_config']['array_part']['enable'] = array_part_en
            config['autosa_config']['array_part_L2']['enable'] = array_part_L2_en
            config['sa_sizes'] = sa_sizes
            with open(f'{config["work_dir"]}/autosa_config.json', 'w') as f:
                json.dump(config['autosa_config'], f, indent=4)
        return

    n_job = config['setting'][config['mode']]['multiprocess']['n_job']
    if n_job > 1 and len(loops_pool) > 1:
        multi_process(
            loops_pool,
            explore_array_part_single_job,
            config)
    else:
        explore_array_part_single_job(
            loops_pool, config, config['work_dir'])


def explore_space_time(config):
    """ Explore the stage of space-time transformation.

    If this stage is set in Manual mode, we will load the tuning info
    and iterate through all possible kernels to proceed. If the tuning info
    has no 'space_time' entry (users have assigned the space-time options),
    the stage is skipped.
    Otherwise, AutoSA automatically selects one kernel to proceed.
    The next stage is always array partitioning.

    Parameters
    ----------
    config: dict
        Global configuration.
    """
    if config['autosa_config']['space_time']['mode'] != 'manual':
        explore_array_part(config)
        return

    # The program will terminate after the space-time transformation
    # Fetch the tuning info
    with open(f'{config["work_dir"]}/output/tuning.json') as f:
        tuning = json.load(f)
    if 'space_time' not in tuning:
        # Users have assigned the space-time options, we will skip this stage
        explore_array_part(config)
        return

    # Iterate through different kernels
    for kernel_id in range(tuning['space_time']['n_kernel']):
        config['logger'].info(f'Search kernel {kernel_id}...')
        saved_sizes = config['sa_sizes'].copy()
        config['sa_sizes'].append(f'kernel[]->space_time[{kernel_id}]')
        config['cmds'][3] = generate_sa_sizes_cmd(config['sa_sizes'])
        ret = execute_autosa_cmd(config)
        if ret == 0:
            explore_array_part(config)
        else:
            config['logger'].error(f'CMD failed with error code {ret}')
        # Drop the size added for this kernel before trying the next one
        config['sa_sizes'] = saved_sizes


@timer
def explore_design_space(config):
    """ Explore the design space through multiple stages

    We will expand the design space through multiple stages:
    space-time transformation ->
    array partitioning ->
    latency hiding ->
    SIMD vectorization

    At each stage, we will generate a new cmd and execute it to obtain the tuning
    information for the next stage.
    The cmd list:
    - config['cmds'][0]: the original user command
    - config['cmds'][1]: the AutoSA config file
    - config['cmds'][2]: the AutoSA output directory
    - config['cmds'][3]: the AutoSA sizes

    If the initial command fails, config['sa_sizes'] is reset to an empty
    list and no stage is explored.

    Parameters
    ----------
    config: dict
        Global configuration.
    """
    # Execute the cmd
    config['cmds'][3] = generate_sa_sizes_cmd(config['sa_sizes'])
    ret = execute_autosa_cmd(config)
    if ret == 0:
        # Enter the first stage: space-time transformation
        explore_space_time(config)
        return

    config['logger'].error(f'CMD failed with error code {ret}')
    config['sa_sizes'] = []

def synth_train_samples_single_job(config, job_id):
    """ Run HLS synthesis for every design assigned to one worker job.

    Walks `<work_dir>/job<job_id>/<kernel>/<design>/output`, copies the
    synthesis TCL script into each project and runs `vivado_hls` from inside
    the project directory.

    Parameters
    ----------
    config: dict
        Global configuration. config['logger'] is replaced with the
        'AutoSA-Optimizer' logger, since the caller clears it before
        dispatching the job.
    job_id: int
        Index of the job folder to process.
    """
    config['logger'] = logging.getLogger('AutoSA-Optimizer')
    autosa_prj_path = os.environ['AUTOSA_ROOT']
    work_dir = f'{config["work_dir"]}/job{job_id}'
    for kernel in os.listdir(work_dir):
        path = f'{work_dir}/{kernel}'
        for design in os.listdir(path):
            prj_path = f'{path}/{design}/output'
            # Copy the HLS TCL script to the project
            execute_sys_cmd(
                f'cp {autosa_prj_path}/autosa_scripts/hls_scripts/hls_script_synth.tcl {prj_path}/hls_script.tcl',
                config)
            # Execute the TCL from inside the project directory. Always switch
            # back, even on failure: the remaining paths may be relative to the
            # original working directory.
            cwd = os.getcwd()
            os.chdir(prj_path)
            try:
                execute_sys_cmd('vivado_hls -f hls_script.tcl', config)
            finally:
                os.chdir(cwd)

@timer
def generate_train_samples(config):
    """ Generate the training samples.

    Recreates `<tmp_dir>/optimizer/training/job0`, prepares the AutoSA output
    folders under config['work_dir'], dumps the AutoSA configuration and then
    repeats the design space exploration until the number of examined designs
    reaches config['setting']['synth']['sample']['n'].

    Parameters
    ----------
    config: dict
        Global configuration.
    """
    # Start from a clean training directory.
    training_dir = f'{config["tmp_dir"]}/optimizer/training'
    if os.path.exists(training_dir):
        shutil.rmtree(training_dir)
    for new_dir in (training_dir, f'{training_dir}/job0'):
        os.mkdir(new_dir)

    # Folders that receive the AutoSA output.
    output_dir = f'{config["work_dir"]}/output'
    for suffix in ('', '/src', '/latency_est', '/resource_est'):
        Path(f'{output_dir}{suffix}').mkdir(exist_ok=True)
    with open(f'{config["work_dir"]}/autosa_config.json', 'w') as cfg_file:
        json.dump(config['autosa_config'], cfg_file, indent=4)

    # Keep exploring until enough designs have been collected.
    while True:
        if config['monitor']['n_designs'] >= config['setting']['synth']['sample']['n']:
            break
        explore_design_space(config)
    config['logger'].info(f'{config["monitor"]["n_designs"]} designs are generated.')

@timer
def synth_train_samples(config):
    """ Synthesize the training samples.

    Collects the designs generated under `<tmp_dir>/optimizer/training/job*`,
    samples at most config['setting']['synth']['sample']['n'] designs per
    kernel, copies them into `<tmp_dir>/optimizer/synth/job*` and launches one
    HLS synthesis job per process.

    Parameters
    ----------
    config: dict
        Global configuration. config['work_dir'] is left pointing at the
        synthesis folder when the function returns.
    """
    tmp_dir = config['tmp_dir']
    config['work_dir'] = f'{tmp_dir}/optimizer/training'
    # Collect all designs, grouped by kernel folder name
    design_paths = {}
    for n in range(config['setting']['training']['multiprocess']['n_job']):
        f_path = f'{config["work_dir"]}/job{n}'
        for f in os.listdir(f_path):
            if 'kernel' not in f:
                continue
            d_path = f'{f_path}/{f}'
            design_paths.setdefault(f, []).extend(
                f'{d_path}/{d}' for d in os.listdir(d_path))

    # Random sample a few designs for each kernel and build the synthesis folder
    config['work_dir'] = f'{tmp_dir}/optimizer/synth'
    if os.path.exists(config['work_dir']):
        shutil.rmtree(config['work_dir'])
    os.mkdir(config['work_dir'])
    num_proc = min(multiprocessing.cpu_count(),
                   config['setting']['synth']['multiprocess']['n_job'])
    for i in range(num_proc):
        os.mkdir(f'{config["work_dir"]}/job{i}')

    n_sample = config['setting']['synth']['sample']['n']
    tasks = []
    for kernel, designs in design_paths.items():
        if n_sample < len(designs):
            designs = random.sample(designs, n_sample)
        tasks.extend((kernel, design) for design in designs)

    if not tasks:
        # Nothing to synthesize. Bail out early: a chunk size of zero would
        # make the range() below raise ValueError.
        config['logger'].warning('No training designs found, skip HLS synthesis.')
        return

    # Uniformly distribute the tasks to each processor (ceiling division)
    chunk_size = (len(tasks) + num_proc - 1) // num_proc
    task_chunks = [tasks[i: i + chunk_size]
                   for i in range(0, len(tasks), chunk_size)]
    for job_id, task_chunk in enumerate(task_chunks):
        for kernel, design_path in task_chunk:
            design = design_path.rsplit('/', 1)[-1]
            kernel_dir = f'{config["work_dir"]}/job{job_id}/{kernel}'
            if not os.path.exists(kernel_dir):
                os.mkdir(kernel_dir)
            # copy the design files
            execute_sys_cmd(
                f'cp -r {design_path} {kernel_dir}/{design}', config)

    # Execute the HLS synthesis
    config['logger'].info(f'Launch HLS synthesis with {num_proc} processes...')
    logger = config['logger']
    # The logger is detached while config is shipped to the workers
    # (presumably because it could not be pickled; each worker re-attaches it).
    config['logger'] = None
    try:
        # The context manager releases the worker processes once starmap is done.
        with multiprocessing.Pool(processes=num_proc) as pool:
            pool.starmap(
                synth_train_samples_single_job,
                [(config, i) for i in range(num_proc)])
    finally:
        config['logger'] = logger


def train_xilinx(config):
    """ Run the training phase on Xilinx platforms.

    Sets the mode to 'training' and runs three steps in order, logging a
    banner before each: generate the sample designs, synthesize them, and
    train the resource models.

    Parameters
    ----------
    config: dict
        Global configuration.
    """
    config['mode'] = 'training'

    steps = (
        ('Generate training samples...', generate_train_samples),
        ('Synthesize training samples...', synth_train_samples),
        ('Train resource models...', train_resource_models_xilinx),
    )
    for banner, step in steps:
        config['logger'].info(banner)
        step(config)

    # TODO: training of the latency models is not hooked up yet.

def get_default_pruning_policy(mode):
    """ Return the default search pruning policy.

    Not implemented yet: `mode` is ignored and None is returned.
    """
    # TODO: build the default pruning policy for the given mode.
    return None

def get_sample_policy(mode, n_random=2):
    """ Build the sampling policy used for every tuning stage.

    Parameters
    ----------
    mode: str
        Sampling mode, either 'random' or 'exhaustive'.
    n_random: int
        Value stored as "n" for each stage in 'random' mode (the higher,
        the more samples are generated). Ignored in 'exhaustive' mode,
        where "n" is -1.

    Returns
    -------
    ret: dict
        One entry per stage, each with the keys "mode", "n" and "loop_limit".

    Raises
    ------
    RuntimeError
        If `mode` is neither 'random' nor 'exhaustive'.
    """
    if mode == 'random':
        stage_mode, n_sample = "random", n_random
    elif mode == 'exhaustive':
        stage_mode, n_sample = "exhaustive", -1
    else:
        raise RuntimeError(f'Unknown sampling mode: {mode}')

    # Stage name -> loop limit; identical for both sampling modes.
    loop_limits = (
        ("array_part", -1),
        ("array_part_L2", -1),
        ("latency_hiding", 64),
        ("SIMD_vectorization", 8),
    )
    return {
        stage: {"mode": stage_mode, "n": n_sample, "loop_limit": limit}
        for stage, limit in loop_limits
    }

def print_best_design(opt_design, hw_info=None):
    """ Pretty print the best design.

    Parameters
    ----------
    opt_design: dict
        Optimal design. Reads 'latency', 'power' and 'resource'; only the
        resource types present in opt_design['resource'] are printed.
    hw_info: dict, optional
        Available amount of each resource type on the device. When given,
        the utilization ratio is appended to each resource line. The ratio
        is omitted for a resource whose capacity is missing or zero.

    Returns
    -------
    ret: str
        Printed design in a string.
    """
    ret = (
        f"\n======== Best design ========\n"
        f"Latency(Cycle): {int(opt_design['latency'])}\n"
        f"Power(W): {opt_design['power']}\n"
        f"Resource:\n"
    )

    resource = opt_design['resource']
    for name in ('FF', 'LUT', 'BRAM18K', 'URAM', 'DSP'):
        if name not in resource:
            continue
        ret += f"\t{name}: {int(resource[name])}"
        # A capacity of 0 (resource absent on the device) or a missing entry
        # has no meaningful ratio; skip it rather than dividing by zero.
        capacity = hw_info.get(name) if hw_info else None
        if capacity:
            ratio = float(resource[name]) / capacity
            ret += f" ({ratio:.2f})"
        ret += "\n"
    ret += "============================="

    return ret

def save_search_log(records, log):
    """ Write the DSE design records to a log file as indented JSON.

    Parameters
    ----------
    records: list
        A list of best designs found in the tuning process.
    log: str
        Path to the log file; an existing file is overwritten.
    """
    with open(log, 'w') as log_file:
        log_file.write(json.dumps(records, indent=4))

def search_xilinx(config):
    """ Perform the search phase on Xilinx platform.

    Recreates `<tmp_dir>/optimizer/search/job0`, prepares the AutoSA output
    folders, optionally warms up with a number of random explorations, runs
    the exploration in the configured mode ('exhaustive', 'random' or
    'customized'), then logs the best design and saves the records to
    `<tmp_dir>/optimizer/search/DSE.log`.

    Parameters
    ----------
    config: dict
        Global configuration.
    """
    # Prepare the directory and files
    search_root = f'{config["tmp_dir"]}/optimizer/search'
    if os.path.exists(search_root):
        shutil.rmtree(search_root)
    os.mkdir(search_root)
    os.mkdir(f'{search_root}/job0')
    # Initialize file directory
    for sub_dir in ('output', 'output/src', 'output/latency_est', 'output/resource_est'):
        Path(f'{config["work_dir"]}/{sub_dir}').mkdir(exist_ok=True)
    with open(f'{config["work_dir"]}/autosa_config.json', 'w') as cfg_file:
        json.dump(config['autosa_config'], cfg_file, indent=4)

    config['mode'] = 'search'
    config['search_results'] = init_search_results()
    # Settings of the search phase (same dict object as stored in config).
    setting = config['setting'][config['mode']]

    random_start = setting['pruning']['random_start']
    if random_start['enable']:
        # Warm up with random sampling, then restore the user's policy.
        config['search_results'] = init_search_results()
        user_policy = copy.deepcopy(setting['sample'])
        setting['sample'] = get_sample_policy('random', random_start['n_random'])
        n_trial = 0
        while n_trial < random_start['n_trial']:
            config['logger'].info(f'Run random search to warm up... [{n_trial + 1}/{random_start["n_trial"]}]')
            explore_design_space(config)
            config['logger'].info(print_best_design(config['search_results']['opt'], config['hw_info']))
            n_trial += 1
        setting['sample'] = user_policy

    config['logger'].info('Start searching...')
    # Set up the time-out counter
    if config['setting']['search']['time_out'] != -1:
        config['monitor']['time_out_start'] = time.time()

    search_mode = setting['mode']
    if search_mode == 'exhaustive':
        config['logger'].info('Search mode: Exhaustive')
        setting['sample'] = get_sample_policy(search_mode)
        explore_design_space(config)
    elif search_mode == 'random':
        config['logger'].info('Search mode: Random')
        setting['sample'] = get_sample_policy(search_mode, setting['n_random'])
        explore_design_space(config)
    elif search_mode == 'customized':
        # Use the sampling policy from the settings as-is.
        config['logger'].info('Search mode: Customized')
        explore_design_space(config)

    # Print out the best design
    config['logger'].info(print_best_design(config['search_results']['opt'], config['hw_info']))
    # Store the tuning log
    log_path = f'{config["tmp_dir"]}/optimizer/search/DSE.log'
    config['logger'].info(f'Saving the DSE results to: {log_path}')
    save_search_log(config['search_results']['records'], log_path)


def init_logger(training, search, verbose, tmp_dir):
    """ Init AutoSA logger.

    Attaches a stream handler to the 'AutoSA-Optimizer' logger and, when a
    phase is enabled, a file handler writing to
    `<tmp_dir>/optimizer/training.log` (training) or
    `<tmp_dir>/optimizer/search.log` (search only). When neither phase is
    enabled only the stream handler is attached.

    Parameters
    ----------
    training: boolean
        Enable training phase.
    search: boolean
        Enable search phase.
    verbose: int
        Logger verbose level.
        0: Print minimal information from Optimizer.
        1: Print all information from Optimizer.
        2: Print information from Optimizer and AutoSA.
    tmp_dir: str
        Path to the temp files.

    Returns
    -------
    logger:
        AutoSA logger.
    """
    logger = logging.getLogger('AutoSA-Optimizer')
    formatter = logging.Formatter(
        '[%(name)s %(asctime)s] %(levelname)s: %(message)s',
        '%Y-%m-%d %H:%M:%S')
    logger.setLevel(logging.INFO)

    handlers = [logging.StreamHandler()]
    # The log file is optional: without a phase there is no file to write to.
    if training:
        handlers.append(logging.FileHandler(
            f'{tmp_dir}/optimizer/training.log', 'w'))
    elif search:
        handlers.append(logging.FileHandler(
            f'{tmp_dir}/optimizer/search.log', 'w'))

    if verbose > 1:
        level = logging.DEBUG
    elif verbose == 1:
        level = logging.INFO
    else:
        level = logging.WARNING

    for handler in handlers:
        handler.setLevel(level=level)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger


def init_monitor():
    """ Create a fresh monitor for the DSE.

    Returns
    -------
    monitor: dict
        "n_designs": number of designs that are examined (starts at 0)
        "time_out_start": the starting time for time-out counter (-1 = unset)
    """
    return dict(n_designs=0, time_out_start=-1)

def init_search_results():
    """ Create an empty search-results container for the DSE.

    The container has two parts: 'opt', the best design found so far
    (initially marked as not found, with every metric set to -1), and
    'records', the list of top designs found during the search.

    Returns
    -------
    ret: dict
        A new dict with the keys 'opt' and 'records'.
    """
    unset_resource = dict.fromkeys(('FF', 'LUT', 'BRAM18K', 'URAM', 'DSP'), -1)
    best = {
        'found': False,
        'latency': -1,
        'resource': unset_resource,
        'power': -1,
        'cmd': None,
    }
    return {'opt': best, 'records': []}

def update_search_results(results, cur_design, n_record, metric, hw_info):
    """ Fold one evaluated design into the search results.

    The current design replaces the best design when no design has been found
    yet, when its latency is lower, or when the latency is equal and its
    resource utilization score is lower. Each replacement is also pushed to
    the front of the records, which are capped at `n_record` entries.

    Parameters
    ----------
    results: dict
        A dict containing the current search results; updated in place.
    cur_design: dict
        The current design to be compared ('latency', 'resource', 'cmd').
    n_record: int
        The number of records to be logged in the search results.
    metric: str
        Evaluation metric. Only 'latency' is supported.
    hw_info: dict
        A dictionary containing the hardware information.

    Returns
    -------
    results: dict
        The same `results` object.

    Raises
    ------
    NotImplementedError
        If `metric` is not 'latency'.
    """
    if metric != 'latency':
        raise NotImplementedError('Update search results for power is not supported.')

    best = results['opt']
    if not best['found']:
        # The very first design is the best one by definition.
        best['found'] = True
        is_better = True
    elif cur_design['latency'] < best['latency']:
        is_better = True
    elif cur_design['latency'] == best['latency']:
        # Tie on latency: break it with the resource usage score.
        cur_res_score = res_model.compute_res_util_score(cur_design['resource'], hw_info)
        opt_res_score = res_model.compute_res_util_score(best['resource'], hw_info)
        is_better = cur_res_score < opt_res_score
    else:
        is_better = False

    if is_better:
        for field in ('latency', 'resource', 'cmd'):
            best[field] = cur_design[field]
        # Newest best design goes first; keep only the top n_record entries.
        results['records'].insert(0, best.copy())
        results['records'] = results['records'][:n_record]

    return results

def merge_search_results(results, metric, n_record, hw_info):
    """ Merge search results from DSE.

    Each result's records are already sorted. The merge starts from a copy of
    the first non-empty result; the records of every following result are
    inserted by latency, with ties broken by the resource utilization score.
    Records with the same 'cmd', or with equal latency and equal resource
    score, are treated as duplicates and dropped. The input results are not
    modified.

    Parameters
    ----------
    results: list
        A list of results to merge.
    metric: str
        The DSE evaluation metric. Only 'latency' is supported.
    n_record: int
        Number of top records to keep.
    hw_info: dict
        Hardware information.

    Returns
    -------
    ret: dict
        A dict containing the merged search results. When every input is
        empty, a freshly initialized (empty) result is returned.

    Raises
    ------
    NotImplementedError
        If `metric` is not 'latency'.
    """
    if metric != 'latency':
        raise NotImplementedError(f'Merge results for metric {metric} is not supported.')

    merged = None
    for result in results:
        if len(result['records']) == 0:
            continue

        if merged is None:
            # Seed from a copy so that the caller's result is left untouched.
            merged = dict(result)
            merged['records'] = list(result['records'])
            continue

        for record in result['records']:
            for cmp_id, cmp_record in enumerate(merged['records']):
                # Check if it is a duplicate record
                if record['cmd'] == cmp_record['cmd']:
                    break

                if record['latency'] < cmp_record['latency']:
                    merged['records'].insert(cmp_id, record)
                    break
                if record['latency'] == cmp_record['latency']:
                    cur_res_score = res_model.compute_res_util_score(record['resource'], hw_info)
                    cmp_res_score = res_model.compute_res_util_score(cmp_record['resource'], hw_info)
                    if cur_res_score < cmp_res_score:
                        merged['records'].insert(cmp_id, record)
                        break
                    if cur_res_score == cmp_res_score:
                        # Duplicated
                        break
            else:
                # Slower than everything kept so far.
                merged['records'].append(record)

        merged['opt'] = merged['records'][0]
        merged['records'] = merged['records'][:n_record]

    if merged is None:
        return init_search_results()

    # Also enforce the cap when only a single result contributed.
    merged['records'] = merged['records'][:n_record]
    return merged

def init_config(setting, verbose, hw_info, cmd, training, search, tmp_dir):
    """ Init AutoSA Optimizer global configuration.

    Init the global configuration used in Optimizer.

    Parameters
    ----------
    setting: dict
        AutoSA Optimizer setting.
    verbose: int
        Print verbose level.
    hw_info: str
        Path to the JSON file with the hardware resource information.
    cmd: str
        The AutoSA command line. The kernel source file is taken to be its
        second whitespace-separated token.
    training: boolean
        Enable training phase; selects the training work directory.
    search: boolean
        Enable search phase (not read here).
    tmp_dir: str
        Path to the temporary files.

    Note
    ----
    Configuration is a dictionary containing the following info:
      setting: dict
        AutoSA Optimizer setting.
      verbose: int
        Print verbose level.
      tmp_dir: str
        Path to the temporary files.
      stdout:
        Stdout pipe (None when verbose == 2, subprocess.DEVNULL otherwise).
      work_dir: str
        The default working directory.
      hw_info: dict
        The hardware configuration.
      cmds: list
        A list of AutoSA commands.
          [0]: The user input command (with any --sa-sizes option removed).
          [1]: AutoSA configuration file.
          [2]: AutoSA output directory.
          [3]: AutoSA sizes.
      sa_sizes: list
        A list of AutoSA tiling factors.
      two_level_buffer: int
        1 if 'two-level-buffer' appears in the command, 0 otherwise.
      hbm: int
        1 if 'hbm' appears in the command, 0 otherwise.
      kernel_file_path: str
        Directory of the input kernel file ('.' when the kernel file is
        given without a directory part).
      simd_info: dict
        Kernel SIMD information, loaded from simd_info.json in that directory.

    The 'logger', 'monitor' and 'tuning' entries are not set here.

    Returns
    -------
    config: dict
        Initialized global configuration.
    """
    config = {}
    config['setting'] = setting
    config['verbose'] = verbose
    config['tmp_dir'] = tmp_dir
    if verbose == 2:
        # Print AutoSA info
        config['stdout'] = None
    else:
        config['stdout'] = subprocess.DEVNULL
    if training:
        config['work_dir'] = f'{tmp_dir}/optimizer/training/job0'
    else:
        config['work_dir'] = f'{tmp_dir}/optimizer/search/job0'
    with open(hw_info) as f:
        config['hw_info'] = json.load(f)
    config['cmds'] = [
        cmd,
        f'--autosa-config={config["work_dir"]}/autosa_config.json',
        f'--autosa-output-dir={config["work_dir"]}/output',
        '',
    ]
    config['sa_sizes'] = []
    # Look up if sa_sizes are pre-set in the cmd
    if config['cmds'][0].find('sa-sizes') != -1:
        m = re.search(r'--sa-sizes="{(.+?)}"', config['cmds'][0])
        if m:
            config['sa_sizes'].extend(m.group(1).split(';'))
            # delete the sa_sizes from the cmd
            config['cmds'][0] = re.sub(r'--sa-sizes=".+?"', '', config['cmds'][0])
    config['two_level_buffer'] = 1 if 'two-level-buffer' in cmd else 0
    config['hbm'] = 1 if 'hbm' in cmd else 0
    # Load SIMD info file, which sits next to the kernel source. A bare file
    # name has no directory part; fall back to the current directory instead
    # of treating the file name itself as a directory.
    kernel_file_path = os.path.dirname(cmd.split()[1]) or '.'
    config['kernel_file_path'] = kernel_file_path
    config['simd_info'] = None
    with open(os.path.join(kernel_file_path, 'simd_info.json'), 'r') as f:
        config['simd_info'] = json.load(f)

    return config


def xilinx_run(cmd, hw_info, setting, training, search, verbose, tmp_dir):
    """ Design space exploration on Xilinx platform.

    The following four stages are explored in the DSE:
    - space-time transformation
    - array partitioning
    - latency hiding
    - simd vectorization

    The DSE includes two phases: training phase and search phase.
    In the training phase, for each systolic array candidate, we generate
    a set of tuning parameters for the later three stages. This step
    creates a suite of designs. We will use training samples to train the
    regression models for the latency and resource usage of the design.

    After the training stage is done, we enter the search phase.
    In this phase, for each systolic array, we will explore all different
    tiling factors in the later three stages. After the tuning parameters
    of each stage are determined, we estimate the latency and resource usage
    of the design using the pre-trained regression model.
    Finally, the design with the least latency and under the resource
    constraints is selected.

    Folder structure:
    autosa.tmp
    - optimizer
      - [training.log | search.log]
      - training
        - job0
          - autosa_config.json
          - output
            - src
            - latency_est
            - resource_est
          - design0
          - design1
      - search
        - job0
        - job1

    Parameters
    ----------
    cmd: str
        Command line to run AutoSA.
    hw_info: str
        Path to FPGA platform hardware resource information file.
    setting: dict
        Optimizer settings.
    training: boolean
        Enable training phase.
    search: boolean
        Enable search phase.
    verbose: int
        Print verbose information.
    tmp_dir: str
        Path to the folder that stores the temp files.
    """
    # Create the optimizer folder, including tmp_dir itself if it is missing.
    os.makedirs(f'{tmp_dir}/optimizer', exist_ok=True)

    # Init logger and optimizer config
    logger = init_logger(training, search, verbose, tmp_dir)
    config = init_config(setting, verbose, hw_info, cmd, training, search, tmp_dir)
    config['logger'] = logger
    # Init monitor
    config['monitor'] = init_monitor()

    # Init the AutoSA configuration: every stage is driven manually.
    config['autosa_config'] = {
        "space_time": {"mode": "manual"},
        "array_part": {"enable": 1, "mode": "manual"},
        "array_part_L2": {"enable": config['two_level_buffer'], "mode": "manual"},
        "latency": {"enable": 1, "mode": "manual"},
        "simd": {"enable": 1, "mode": "manual"},
        "hbm": {"enable": config['hbm'], "mode": "manual"},
    }

    # Training phase
    if training:
        config['logger'].info('Run training phase...')
        train_xilinx(config)

    # Search phase
    if search:
        config['logger'].info('Run search phase...')
        search_xilinx(config)

if __name__ == "__main__":
    # Command line entry point of the AutoSA Optimizer.
    parser = argparse.ArgumentParser(description='==== AutoSA Optimizer ====')
    parser.add_argument('-c', '--cmd', metavar='CMD', required=True,
                        help='AutoSA command line')
    parser.add_argument('-i', '--info', metavar='INFO', required=True,
                        help='hardware resource information')
    parser.add_argument('-s', '--setting', metavar='SETTING', required=False,
                        default='autosa_config/optimizer_settings.json',
                        help='optimizer settings')
    parser.add_argument('-p', '--platform', metavar='PLATFORM', required=True,
                        help='hardware platform: intel/xilinx')
    parser.add_argument('--training', action='store_true',
                        help='run training phase')
    parser.add_argument('--search', action='store_true',
                        help='run search phase')
    parser.add_argument('--verbose', type=int, required=False, default=1,
                        help='provide verbose information [0-2]')
    parser.add_argument('--tmp-dir', required=False, default='./autosa.tmp',
                        help='temporary file directory')

    args = parser.parse_args()

    # Parse the settings into a dict
    with open(args.setting) as f:
        setting = json.load(f)

    # Dispatch on the platform; any other value is silently ignored.
    if args.platform == 'intel':
        print("Intel platform is not supported yet!")  # TODO
    elif args.platform == 'xilinx':
        xilinx_run(args.cmd, args.info, setting, args.training, args.search,
                   args.verbose, args.tmp_dir)


================================================
FILE: autosa_scripts/optimizer_prune.py
================================================
#!/usr/bin/env python3

def array_part_loops_pruning(loops, config):
    """ Apply pruning on array partitioning candidate loops.

    Heuristic applied at present:
    - For each candidate, multiply all of its factors that are greater than
      one; the candidate is dropped when that product is below the lower
      bound config['setting'][mode]['pruning']['array_part']['PE_num'][0].
      A lower bound of -1 disables the pruning.
    - TODO: Prune based on off-chip traffic

    Parameters
    ----------
    loops: list
        A list of candidate loops
    config:
        Global configuration

    Returns
    -------
    pruned_loops: list
        The candidates that survive the pruning, in their original order.
    """
    lower_bound = config['setting'][config['mode']]['pruning']['array_part']['PE_num'][0]
    if lower_bound == -1:
        # Pruning disabled: keep everything.
        return [cand for cand in loops]

    kept = []
    for cand in loops:
        volume = 1
        for factor in cand:
            if factor > 1:
                volume *= factor
        if volume >= lower_bound:
            kept.append(cand)

    return kept


def array_part_L2_loops_pruning(loops, config):
    """ Apply pruning on L2 array partitioning candidate loops.

    Heuristic applied at present:
    - L2 array partitioning is only explored on the leading parallel loops to
      save off-chip communication. Loops are examined from outer to inner;
      from the first non-parallel (non-coincident) loop onwards, a candidate
      is kept only if its tiling factors equal the upper bounds listed in
      tuning['array_part_L2']['tilable_loops'].

    Parameters
    ----------
    loops: list
        A list of candidate loops
    config:
        Global configuration

    Returns
    -------
    pruned_loops: list
        The candidates that survive the pruning, in their original order.
    """
    l2_info = config['tuning']['array_part_L2']
    coincident = l2_info['coincident']
    # Number of leading coincident loops (all of them if none is falsy).
    n_parallel = next(
        (idx for idx, flag in enumerate(coincident) if not flag),
        len(coincident))
    max_tail = l2_info['tilable_loops'][n_parallel:]

    return [cand for cand in loops if cand[n_parallel:] == max_tail]


def latency_hiding_loops_pruning(loops, config):
    """ Apply pruning on latency hiding candidate loops.

    Heuristic applied at present:
    - The latency hiding register size of a candidate is the product of its
      factors. The candidate is dropped when that size falls outside the
      [lower, upper] range given by
      config['setting'][mode]['pruning']['latency_hiding']['reg_size'].
      A bound of -1 disables the corresponding check.

    Parameters
    ----------
    loops: list
        A list of candidate loops
    config:
        Global configuration

    Returns
    -------
    pruned_loops: list
        The candidates that survive the pruning, in their original order.
    """
    bounds = config['setting'][config['mode']]['pruning']['latency_hiding']['reg_size']
    lower, upper = bounds[0], bounds[1]

    def within_bounds(cand):
        # Register size = product of all tiling factors of the candidate.
        reg_size = 1
        for factor in cand:
            reg_size *= factor
        if lower != -1 and reg_size < lower:
            return False
        if upper != -1 and reg_size > upper:
            return False
        return True

    return [cand for cand in loops if within_bounds(cand)]


def SIMD_vectorization_PE_pruning(config, postpone=0):
    """ Apply pruning based on the PE structures at the SIMD vectorization stage.

    At present, we apply the following heuristics:
    - We restrain the PE number (product of the array dimensions) within the
      range config['setting'][mode]['pruning']['SIMD_vectorization']['PE_num'];
      a bound of -1 disables that side of the check.
    - For arrays with more than one dimension, we restrain the ratio between
      the two largest dimensions to at most
      config['setting'][mode]['pruning']['SIMD_vectorization']['PE_ratio'];
      a value of -1 disables the check.

    The array dimensions stored in the tuning information are not modified.

    Parameters
    ----------
    config: dict
        Global configuration
    postpone: int
        If the pruning is postponed after the SIMD optimization. The array
        dimensions are read from tuning['simd']['sa_dims'] when 0 and from
        tuning['sa_dims'] otherwise.

    Returns
    -------
    ret: boolean
        If this configuration is to be pruned.
    """
    tuning = config['tuning']
    pruning = config['setting'][config['mode']]['pruning']['SIMD_vectorization']
    PE_num_lb = pruning['PE_num'][0]
    PE_num_ub = pruning['PE_num'][1]
    if postpone == 0:
        sa_dims = tuning['simd']['sa_dims']
    else:
        sa_dims = tuning['sa_dims']

    # Work on a private integer copy: sorting the shared list in place would
    # reorder the array dimensions for every later reader of the tuning info.
    dims = [int(dim) for dim in sa_dims]

    n_pe = 1
    for dim in dims:
        n_pe *= dim
    if PE_num_lb != -1 and n_pe < PE_num_lb:
        return True
    if PE_num_ub != -1 and n_pe > PE_num_ub:
        return True

    if len(dims) > 1:
        PE_ratio = pruning['PE_ratio']
        if PE_ratio != -1:
            dims.sort(reverse=True)
            if dims[0] / dims[1] > PE_ratio:
                return True

    return False


def reorder_simd_loops(loops):
    """ Sort the candidate SIMD loops by their SIMD factor, largest first.

    Each candidate is expected to look like [1, 1, X]. The sort key position
    is the index of the first non-one entry of the first candidate that has
    one. The list is sorted in place and the same list object is returned.
    When every entry of every candidate equals one (or the list is empty),
    the list is returned untouched.

    Parameters
    ----------
    loops: list
        A list containing all candidate SIMD loops to be evaluated.

    Returns
    -------
    loops: list
        The input list, reordered in descending order of the key entry.
    """
    key_pos = next(
        (idx
         for candidate in loops
         for idx, factor in enumerate(candidate)
         if factor != 1),
        -1)

    if key_pos != -1:
        loops.sort(key=lambda candidate: candidate[key_pos], reverse=True)
    return loops


def SIMD_vectorization_latency_pruning(config):
    """ Perform latency-based pruning at the SIMD vectorization stage.

    The SIMD candidate loops have already been reordered in descending order.
    Two situations end the exploration of the remaining (smaller) factors:

    - The last evaluated design is slower than the best design found so far
      (its recorded latency is -1). Candidates with a smaller SIMD factor
      cannot beat the best design either.
    - The last evaluated design is resource-legal. It already achieves the
      least latency of the current group, and smaller SIMD factors cannot be
      faster.

    A design with a smaller SIMD factor may achieve the same latency with
    less resource usage (for a communication-bound design). Such cases are
    ignored at present.

    Parameters
    ----------
    config: dict
        Global configuration; config['monitor']['last_design'] holds the
        'latency' and 'resource' entries of the last evaluated design.

    Returns
    -------
    ret: boolean
        True if the exploration of the remaining candidates should stop.
    """
    last = config['monitor']['last_design']
    # Keep exploring only when the design has a valid latency and its
    # 'resource' entry is truthy.
    # NOTE(review): the stop condition treats a falsy 'resource' entry as
    # "resource-legal" -- confirm against the code that fills this field.
    if last['latency'] != -1 and last['resource']:
        return False
    return True


================================================
FILE: autosa_scripts/pe_group.py
================================================
#!/usr/bin/env python3

import sympy
import sys
import argparse
import re
import numpy as np

def locate_data_trans_block(line_id, lines):
    """ Locate the brace-delimited block that surrounds a line.

    The search walks backwards from the line before `line_id` to the nearest
    line containing '{', and forwards from the line after `line_id` to the
    nearest line containing '}'. Braces are not matched against each other;
    the first one found in each direction wins.

    Parameters
    ----------
    line_id: int
        Index of the line inside the block.
    lines: list
        The code lines to search.

    Returns
    -------
    block_start, block_end: int
        Indices of the line holding the opening brace and of the line holding
        the closing brace.

    Raises
    ------
    ValueError
        If no opening brace precedes the line or no closing brace follows it.
    """
    block_start = None
    for prev_line_id in range(line_id - 1, -1, -1):
        if '{' in lines[prev_line_id]:
            block_start = prev_line_id
            break

    block_end = None
    for nxt_line_id in range(line_id + 1, len(lines)):
        if '}' in lines[nxt_line_id]:
            block_end = nxt_line_id
            break

    # Fail with a meaningful message instead of an unbound local variable.
    if block_start is None or block_end is None:
        raise ValueError(
            f'No enclosing brace block found around line {line_id}')

    return block_start, block_end

def modify_index(lines, var_map, PE_dims):
    """ Add PE space subscripts to local variables and rewrite SIMD references.

    For every line and every variable name in `var_map`, each occurrence of
    the name is extended with one `[s<k>]` subscript per entry of `PE_dims`.
    If the variable is flagged with `simd == 1` and still appears in the
    line, the last subscript `[idx]` of its first reference is rewritten to
    `.data[idx]`, and every textual copy of that reference in the line is
    replaced.

    Parameters
    ----------
    lines: list
        Code lines to rewrite.
    var_map: dict
        Maps a variable name to a dict holding a 'simd' flag.
    PE_dims: list
        One entry per PE array dimension; only its length is used here.

    Returns
    -------
    new_lines: list
        The rewritten lines, in the original order.
    """
    #print(var_map)

    new_lines = []
    for line in lines:
        for var in var_map:
            # Build "<var>[s0][s1]..." with one subscript per PE dimension.
            new_var = var
            for dim_idx in range(len(PE_dims)):
                new_var += f'[s{dim_idx}]'
            # The variable name is used verbatim as the regex pattern.
            line = re.sub(rf'{var}', f'{new_var}', line)
            if line.find(var) != -1 and var_map[var]['simd'] == 1:
                # TODO: Consider the index only appears once
                # The reference runs from the first occurrence of the name up
                # to the character before the next space or ')'.
                pos = line.find(var)
                end_pos = pos
                for p in range(pos, len(line)):
                    if line[p] == ' ' or line[p] == ')':
                        end_pos = p - 1
                        break
                #print(pos)
                #print(end_pos)
                ref = line[pos : end_pos + 1]
                #print(ref)
                # Split the subscript part of the reference into "[...]" items.
                # NOTE(review): if the text after the name does not end with
                # ']' (e.g. no space or ')' follows the reference, or it ends
                # with ';'), index.find(']') returns -1 and this loop does not
                # appear to terminate -- confirm the expected input shape.
                index = ref[ref.find('['):]
                indices = []
                while len(index) > 0:
                    start_pos = index.find('[')
                    end_pos = index.find(']')
                    indices.append(index[start_pos:end_pos+1])
                    index = index[end_pos + 1:]
                #print(index)
                #print(indices)
                # Turn the last subscript "[idx]" into ".data[idx]" and keep
                # the other subscripts unchanged.
                last_index = indices[-1][1:-1]
                new_ref = ref[:ref.find('[')]
                for index in indices[:-1]:
                    new_ref += index
                new_ref += '.data['
                new_ref += last_index
                new_ref += ']'
                #print(ref)
                #print(new_ref)
                line = line.replace(ref, new_ref)

        new_lines.append(line)

    return new_lines

def insert_data_trans(lines, data_trans_info, PE_dims):
    """ Prepend the data transfer statements of every IO group to a loop body.

    For each group the transfer direction is the difference between its
    'PE_index_end' and 'PE_index_start'. Two directions are supported:

    - [0, 1]: the channel is read once per index of the first dimension into
      column 0, then the value is forwarded along the second dimension.
    - [1, 0]: the channel is read once per index of the second dimension into
      row 0, then the value is forwarded along the first dimension.

    The forwarding statements wrap the source value in two `__fpga_reg` calls.
    Every generated statement is a separate list element holding exactly one
    line.

    Parameters
    ----------
    lines: list
        The loop body lines.
    data_trans_info: dict
        Maps a group name to a dict with 'PE_index_start' and 'PE_index_end'.
    PE_dims: list
        Extents of the (2D) PE array.

    Returns
    -------
    lines: list
        The generated transfer lines followed by the original body. Groups
        processed later end up before groups processed earlier.

    Raises
    ------
    NotImplementedError
        If the transfer direction of a group is neither [0, 1] nor [1, 0].
    """
    for group_name in data_trans_info:
        info = data_trans_info[group_name]
        direction = [end - start for start, end in
                     zip(info['PE_index_start'],
                         info['PE_index_end'][:len(info['PE_index_start'])])]
        if direction == [0, 1]:
            new_lines = [
                '#pragma unroll\n',
                f'for (int s0 = 0; s0 < {PE_dims[0]}; s0++) {{\n',
                f'  local_{group_name}[s0][0][0] = read_channel_intel(fifo_{group_name}_PE[s0][0]);\n',
                '}\n',
                '#pragma unroll\n',
                f'for (int s0 = 0; s0 < {PE_dims[0]}; s0++) {{\n',
                '  #pragma unroll\n',
                f'  for (int s1 = 1; s1 < {PE_dims[1]}; s1++) {{\n',
                f'    local_{group_name}[s0][s1][0] = __fpga_reg(__fpga_reg(local_{group_name}[s0][0][0]));\n',
                '  }\n',
                '}\n',
            ]
        elif direction == [1, 0]:
            new_lines = [
                '#pragma unroll\n',
                f'for (int s1 = 0; s1 < {PE_dims[1]}; s1++) {{\n',
                f'  local_{group_name}[0][s1][0] = read_channel_intel(fifo_{group_name}_PE[0][s1]);\n',
                '}\n',
                '#pragma unroll\n',
                f'for (int s0 = 1; s0 < {PE_dims[0]}; s0++) {{\n',
                '  #pragma unroll\n',
                f'  for (int s1 = 0; s1 < {PE_dims[1]}; s1++) {{\n',
                f'    local_{group_name}[s0][s1][0] = __fpga_reg(__fpga_reg(local_{group_name}[0][s1][0]));\n',
                '  }\n',
                '}\n',
            ]
        else:
            raise NotImplementedError('Unsupport Direction')
        lines = new_lines + lines

    return lines

def modify_channels(lines, data_trans_info, PE_dims):
    """ Rewrite the channel declarations and channel calls of a kernel file.

    The function works in four steps:
    1. Drop the '/* PE fifo */' declaration lines that name `fifo_<group>_PE`
       for a group in `data_trans_info`, and the '/* PE fifo */' lines that
       contain both '_PE_' and 'drain' (the drain group names are collected).
    2. Insert array-style channel declarations right after the
       '/* Channel Declaration */' marker line.
    3. Rewrite the first argument of `read_channel_intel` /
       `write_channel_intel` calls from the flat name `fifo_<group>_PE_i_j`
       to the array form `fifo_<group>_PE[i][j]`. For drain groups, calls
       found below a line containing 'void PE' use `[s0][s1]...` instead.
    4. Remove every module whose line three positions after its opening
       '/* Module Definition */' marker contains 'dummy'.

    Parameters
    ----------
    lines: list
        All lines of the kernel file.
    data_trans_info: dict
        Maps a group name to a dict with 'PE_index_start', 'PE_index_end'
        and 'data_type'.
    PE_dims: list
        Extents of the (2D) PE array.

    Returns
    -------
    new_lines: list
        The rewritten lines.

    Raises
    ------
    NotImplementedError
        If the transfer direction of a group is neither [0, 1] nor [1, 0].
    """
    # In the channel declaration, delete all the fifo_{group}_PE
    new_lines = []
    drain_groups = []
    for line in lines:
        find = False
        for group in data_trans_info:
            if line.find('/* PE fifo */') != -1 and line.find(f'fifo_{group}_PE') != -1:
                find = True
        # Drain PE fifos are dropped as well; remember their group names so
        # that they can be re-declared as arrays below.
        if line.find('/* PE fifo */') != -1 and line.find(f'_PE_') != -1 and line.find('drain') != -1:
            m = re.search(r'fifo_(.+?)_PE', line)
            drain_group = m.group(1)
            if drain_group not in drain_groups:
                drain_groups.append(drain_group)
            find = True
        if not find:
            new_lines.append(line)    

    # Locate the marker after which the new declarations are inserted.
    # NOTE(review): the position is searched in `lines` but used as an index
    # into the filtered `new_lines`; the two agree only if no line before the
    # marker was dropped above -- confirm.
    for line_id in range(len(lines)):
        line = lines[line_id]
        if line.find('/* Channel Declaration */') != -1:
            channel_decl_start = line_id
            break
    for group in data_trans_info:
        info = data_trans_info[group]
        # Transfer direction: end PE index minus start PE index.
        dir = [info['PE_index_end'][dim] - info['PE_index_start'][dim] for dim in range(len(info['PE_index_start']))]
        if dir == [0, 1]:
            line = f'/* PE fifo */ channel {info["data_type"]} fifo_{group}_PE[{PE_dims[0]}][1] __attribute__((depth(2)));\n'
        elif dir == [1, 0]:
            line = f'/* PE fifo */ channel {info["data_type"]} fifo_{group}_PE[1][{PE_dims[1]}] __attribute__((depth(2)));\n'
        else:
            raise NotImplementedError('Unsupport Direction')
        new_lines.insert(channel_decl_start + 1, line)
    # Drain channels get one entry per PE; the element type is fixed to float.
    for group in drain_groups:
        line = f'/* PE fifo */ channel float fifo_{group}_PE[{PE_dims[0]}][{PE_dims[1]}] __attribute__((depth(2)));\n'
        new_lines.insert(channel_decl_start + 1, line)

    # Replace all channel calls
    for group in data_trans_info:
        fifo_prefix = f'fifo_{group}_PE_'
        for line_id in range(len(new_lines)):
            line = new_lines[line_id]
            if line.find(fifo_prefix) != -1:
                modify = False
                # The fifo name is the first argument inside the first pair of
                # parentheses on the line.
                if line.find('write_channel_intel') != -1:
                    m = re.search(r'\((.+?)\)', line)
                    fifo_name = m.group(1).split(',')[0]                    
                    modify = True
                elif line.find('read_channel_intel') != -1:
                    m = re.search(r'\((.+?)\)', line)
                    fifo_name = m.group(1)
                    modify = True
                if modify:                    
                    #print(fifo_name)
                    # "fifo_<group>_PE_i_j" -> "fifo_<group>_PE[i][j]"
                    index = fifo_name[len(fifo_prefix):].split('_')
                    new_fifo_name = fifo_prefix[:-1]
                    for ind in index:
                        new_fifo_name += f'[{ind}]'
                    #print(new_fifo_name)
                    line = line.replace(fifo_name, new_fifo_name)
                    new_lines[line_id] = line

    #print(lines)
    #print(drain_groups)
    for group in drain_groups:
        fifo_prefix = f'fifo_{group}_PE_'
        for line_id in range(len(new_lines)):
            line = new_lines[line_id]
            if line.find(fifo_prefix) != -1:
                modify = False
                if line.find('write_channel_intel') != -1:
                    m = re.search(r'\((.+?)\)', line)
                    fifo_name = m.group(1).split(',')[0]                    
                    modify = True         
                elif line.find('read_channel_intel') != -1:
                    m = re.search(r'\((.+?)\)', line)
                    fifo_name = m.group(1)
                    modify = True                           
                if modify:       
                    # Check if inside a PE definition
                    # Walk backwards until either a '/* Module' marker or a
                    # 'void PE' function header is reached.
                    inside_PE = False
                    prev_line_id = line_id - 1                                        
                    while prev_line_id >= 0:                        
                        prev_line = new_lines[prev_line_id]                                                             
                        if prev_line.find('/* Module') != -1:
                            break
                        if prev_line.find('void PE') != -1:
                            inside_PE = True
                            break      
                        prev_line_id -= 1                                                      
                    #print(inside_PE)                        
                    #print(fifo_prefix)
                    #print(fifo_name)
                    index = fifo_name[len(fifo_prefix):].split('_')
                    new_fifo_name = fifo_prefix[:-1]
                    if inside_PE:
                        # Inside the PE the channel is indexed by the space
                        # loop iterators.
                        for i in range(len(PE_dims)):
                            new_fifo_name += f'[s{i}]'
                    else:
                        # Elsewhere the constant indices of the flat name are
                        # kept.
                        for ind in index:                        
                            new_fifo_name += f'[{ind}]'
                    #print(new_fifo_name)
                    line = line.replace(fifo_name, new_fifo_name)
                    new_lines[line_id] = line

    # Delete all dummy functions
    # '/* Module Definition */' markers come in pairs: the first one opens a
    # module, the second one closes it.
    module_start = False
    delete_module = False
    delete_pos = []
    for line_id in range(len(new_lines)):
        line = new_lines[line_id]
        if line.find('/* Module Definition */') != -1:
            module_start = not module_start
            if module_start:
                module_start_pos = line_id
                delete_module = False
            if not module_start:
                module_end_pos = line_id
                if delete_module:
                    delete_pos.append([module_start_pos, module_end_pos])
            if module_start:
                # The line three positions below the opening marker is checked
                # for 'dummy' (presumably the function header -- confirm).
                nxt_line = new_lines[line_id + 3]            
                if nxt_line.find('dummy') != -1:
                    delete_module = True
    # Remove the recorded ranges; `offset` compensates for lines already cut.
    offset = 0
    for p in delete_pos:
        new_lines = new_lines[:p[0] - offset] + new_lines[p[1] + 1 - offset:]
        offset += (p[1] - p[0] + 1)                

    return new_lines

def modify_body(lines, PE_dims, var_map):
    """
    This function modifies the PE body.
    For the user statement, it is wrapped with unrolled space loops
    For the data transfer statements, they are replaced with two loop blocks,
    one for initializing the boundary, the other for reusing the data.

    A loop body starts at a line containing 'hls_pipeline' and ends on the
    line before the closing brace of the enclosing block. Each body is
    rewritten independently and spliced back into `lines`.

    Parameters
    ----------
    lines: list
        Lines of the PE function body.
    PE_dims: list
        Extents of the PE array; one space loop is generated per entry.
    var_map: dict
        Maps a local variable name to a dict holding a 'simd' flag.

    Returns
    -------
    lines: list
        The body lines with every pipelined loop body replaced.
    """    
    loop_bodies = []
    # Locate the user statements
    for line_id in range(len(lines)):
        line = lines[line_id]
        if line.find('hls_pipeline') != -1:
            # extract the loop body
            body_start = line_id
            # Number of closing minus opening braces seen so far, starting at
            # -1 for the block that encloses the pragma; it reaches 0 on the
            # line that closes that block.
            r_minus_l = -1
            nxt_line_id = line_id + 1            
            while nxt_line_id < len(lines):
                nxt_line = lines[nxt_line_id]
                if nxt_line.find('}') != -1:
                    r_minus_l += 1
                if nxt_line.find('{') != -1:
                    r_minus_l -= 1
                if r_minus_l == 0:
                    body_end = nxt_line_id - 1
                    break
                nxt_line_id += 1
            loop_body = lines[body_start : body_end + 1]
            #print(loop_body)
            loop_bodies.append({'pos': [body_start, body_end], 'lines': loop_body})
    
    # Modify the loop bodies
    #for body in loop_bodies:
    # Accumulated change in length caused by bodies already replaced; the
    # recorded positions refer to the original `lines`.
    body_offset = 0
    for idx in range(len(loop_bodies)):
        body = loop_bodies[idx]
        body_lines = body['lines']        
        group_names = []
        has_data_trans = True
        # Collect the transfer information before the blocks are removed.
        data_trans_info = extract_data_trans_info(body_lines, PE_dims)
        # Remove the in transfer
        # One block is removed per pass until no read call is left.
        while has_data_trans:
            has_data_trans = False
            for line_id in range(len(body_lines)):
                line = body_lines[line_id]
                if line.find('read_channel_intel') != -1:
                    has_data_trans = True
                    # Locate the read block and the write block
                    block_start, block_end = locate_data_trans_block(line_id, body_lines)
                    m = re.search(r'\((.+?)\)', line)    
                    fifo_name = m.group(1)
                    group_name = fifo_name.split('_')[1]
                    group_names.append(group_name)
                    break
            if has_data_trans:
                body_lines = body_lines[:block_start] + body_lines[block_end + 1:]
        # Remove the out transfer
        # Only writes of groups whose read was removed above are dropped.
        # The scan does not stop at the first match, so each pass removes the
        # last matching block.
        has_data_trans = True
        while has_data_trans:
            has_data_trans = False
            for line_id in range(len(body_lines)):
                line = body_lines[line_id]
                if line.find('write_channel_intel') != -1:
                    m = re.search(r'\((.+?)\)', line)
                    fifo_name = m.group(1).split(',')[0]
                    group_name = fifo_name.split('_')[1]
                    if group_name in group_names:
                        has_data_trans = True
                        block_start, block_end = locate_data_trans_block(line_id, body_lines)
            if has_data_trans:
                body_lines = body_lines[:block_start] + body_lines[block_end + 1:]
        #print(body_lines)
        # Wrap the body with space loops
        # The loop headers are inserted at the front of the body (each header
        # is a single list element holding the pragma and the `for` line) and
        # the matching closing braces are appended at the end.
        for dim_idx in range(len(PE_dims)):
            dim = PE_dims[dim_idx]            
            line = f'#pragma unroll\nfor (int s{dim_idx} = 0; s{dim_idx} < {dim}; s{dim_idx}++) {{\n'
            body_lines.insert(dim_idx, line)                        
        for dim in PE_dims:
            body_lines.append('}\n')

        # Modify the index
        body_lines = modify_index(body_lines, var_map, PE_dims)
        #print(body_lines)

        # Insert the data transfer stmts
        body_lines = insert_data_trans(body_lines, data_trans_info, PE_dims)
        #loop_bodies[idx]['lines'] = body_lines

        # Replace the loop bodies
        body_pos = body['pos']        
        lines = lines[: body_offset + body_pos[0]] \
                + body_lines \
                + lines[body_offset + body_pos[1] + 1 :]   
        body_offset += len(body_lines) - (body_pos[1] - body_pos[0] + 1)

    return lines

def extract_data_trans_info(lines, PE_dims):
    """ Extract the data transfer information of every IO group.

    Each line containing `read_channel_intel` creates (or overwrites) the
    entry of its group. The surrounding brace block is located, the data type
    is the first token of the second block line, and the fifo name is the
    parenthesised argument found on the third block line. The trailing
    `len(PE_dims)` underscore-separated fields of the fifo name give the start
    PE index, the second field gives the group name.

    Each line containing `write_channel_intel` whose group already has an
    entry adds the end PE index, taken from the first argument of the call on
    the fourth line of its block.

    Parameters
    ----------
    lines: list
        Code lines to scan.
    PE_dims: list
        One entry per PE array dimension; only its length is used here.

    Returns
    -------
    data_trans_info: dict
        Maps a group name to a dict with the keys 'in_block_lines',
        'in_block_pos', 'PE_index_start', 'data_type' and, once a matching
        write was seen, 'PE_index_end', 'out_block_lines', 'out_block_pos'.
    """
    n_dims = len(PE_dims)
    call_args = r'\((.+?)\)'
    info = {}

    for pos, text in enumerate(lines):
        if 'read_channel_intel' in text:
            first, last = locate_data_trans_block(pos, lines)
            block = lines[first : last + 1]
            # Second block line: "<type> <name>;" declaration.
            elem_type = block[1].strip().split(' ')[0]
            # Third block line: the read call with the fifo name.
            fields = re.search(call_args, block[2]).group(1).split('_')
            start_index = [int(tok) for tok in fields[-n_dims:]]
            info[fields[1]] = {
                'in_block_lines': block,
                'in_block_pos': [first, last],
                'PE_index_start': start_index,
                'data_type': elem_type,
            }

        if 'write_channel_intel' in text:
            target = re.search(call_args, text).group(1).split(',')[0]
            if target.split('_')[1] in info:
                first, last = locate_data_trans_block(pos, lines)
                block = lines[first : last + 1]
                # Fourth block line: the write call with the fifo name.
                target = re.search(call_args, block[3]).group(1).split(',')[0]
                fields = target.split('_')
                end_index = [int(tok) for tok in fields[-n_dims:]]
                entry = info[fields[1]]
                entry['PE_index_end'] = end_index
                entry['out_block_lines'] = block
                entry['out_block_pos'] = [first, last]

    return info

def compose_PE(data_trans_info, PE_dims, PE_defs):
    """ Build the monolithic PE function from the first PE definition.

    The first entry of `PE_defs` is split at its 'Variable Declaration'
    markers into the variable declarations and the body. Every local array is
    re-declared with the PE array extents as leading dimensions, in the order
    of `PE_dims`, so that it can be indexed as `[s0][s1]...` by the space
    loops. If the data type recorded for the group in `data_trans_info`
    differs from the declared type, the recorded type is used and the last
    dimension is dropped (SIMD > 1). The body is rewritten by `modify_body`.

    Parameters
    ----------
    data_trans_info: dict
        Maps a group name to a dict holding at least 'data_type'.
    PE_dims: list
        Extents of the PE array.
    PE_defs: list
        PE module definitions; each is a dict whose 'def' entry holds the
        lines of the module. Only the first one is used.

    Returns
    -------
    lines: list
        The lines of the new PE module, including both
        '/* Module Definition */' markers.
    """
    PE_def = PE_defs[0]
    # Extract the variable declaration and main body
    PE_lines = PE_def['def']
    var_start = False
    var_end = False
    var_lines = []
    body_lines = []
    for line in PE_lines:
        if line.find('Variable Declaration') != -1:
            # The markers come in pairs: the first opens the declaration
            # section, the second closes it and starts the body.
            var_start = not var_start
            if not var_start:
                var_end = True
            continue
        if var_start:
            var_lines.append(line)
        if var_end:
            body_lines.append(line)
    var_lines = var_lines[1:] # Remove the module id
    body_lines = body_lines[:-2] # Remove the function end bracket

    lines = []
    lines.append('/* Module Definition */\n')
    lines.append('__attribute__((max_global_work_dim(0)))\n')
    lines.append('__attribute__((autorun))\n')
    lines.append('kernel void PE()\n')
    lines.append('{\n')

    var_map = {}
    # Print the variable definitions
    lines.append('  /* Variable Declaration */\n')
    # Leading dimensions, one per PE array dimension and in the same order as
    # the [s0][s1]... subscripts used to index the arrays. Prepending them one
    # by one would reverse the order and declare a transposed array.
    PE_indices = [f'[{dim}]' for dim in PE_dims]
    for var_line in var_lines:
        simd = 0
        m = re.search(r'local_(.+?)\[', var_line)
        group_name = m.group(1)
        data_type = var_line.strip().split(' ')[0]
        # Split the "[a][b]..." part of the declaration into its items.
        index = var_line[var_line.find('['):var_line.find(';')]
        indices = []
        while len(index) > 0:
            start_pos = index.find('[')
            end_pos = index.find(']')
            indices.append(index[start_pos:end_pos+1])
            index = index[end_pos + 1:]
        indices = PE_indices + indices
        if group_name in data_trans_info:
            if data_trans_info[group_name]['data_type'] != data_type:
                # SIMD > 1: the array is stored with the transfer data type
                # and loses its last dimension.
                simd = 1
                data_type = data_trans_info[group_name]['data_type']
                indices = indices[:-1]
        new_index = ''.join(indices)
        new_var_line = f'  {data_type} local_{group_name}{new_index};'
        var_map[f'local_{group_name}'] = {'simd': simd}
        lines.append(new_var_line + '\n')

    lines.append('  /* Variable Declaration */\n')

    # Print the body
    new_body_lines = modify_body(body_lines, PE_dims, var_map)
    for line in new_body_lines:
        lines.append(line)

    lines.append('}\n')
    lines.append('/* Module Definition */\n')

    return lines

def run(input_f, output_f):
    """ Group PEs into a Monolithic Function

    This function is only used for the following case:
    - Intel OpenCL
    - The systolic array should be an output-stationary rectangular array
    We will first collect the array dims and the data transfer direction for each IO group.
    Next we will generate a new monolithic function of PE:
    - Variable declaration: 
      - Remove module ids
      - Extend all the local arrays with array dimensions. 
        - If the array is an external array, we will repack the array with the SIMD factor
    - For each statement, add the space loops with unroll pragma      

    Parameters
    ----------
    input_f: str
        Path of the input kernel file.
    output_f: str
        Path of the file that receives the rewritten kernel.
    """
    with open(input_f) as f:
        lines = f.readlines()

    # Collect the array dims
    # 'Module Definition' markers come in pairs: the first one opens a module,
    # the second one closes it.
    PE_defs = []
    module_start = False
    is_PE = False    
    PE_indices = []
    for line_id in range(len(lines)):
        line = lines[line_id]
        if line.find('Module Definition') != -1:
            module_start = not module_start
            if module_start:
                module_start_pos = line_id
                is_PE = False
            else:
                module_end_pos = line_id
                if is_PE:
                    PE_defs.append({'def': lines[module_start_pos : module_end_pos + 1], \
                                    'pos': [module_start_pos, module_end_pos]})
            if module_start:
                #print(line_id)
                # Look ahead inside the module for a 'kernel void PE' header;
                # the fields after the first '_' of the function name suffix
                # are recorded as the PE index.
                nxt_line_id = line_id + 1
                while nxt_line_id < len(lines):                    
                    nxt_line = lines[nxt_line_id]
                    if nxt_line.find('kernel void PE') != -1:
                        is_PE = True
                        m = re.search(r'void PE(.+?)\(', nxt_line)
                        #print(nxt_line)
                        if m:
                            PE_index = m.group(1).split('_')[1:]
                            PE_indices.append(PE_index)
                        if is_PE:
                            break
                    if nxt_line.find('Module Definition') != -1:
                        break
                    nxt_line_id += 1

    #print(PE_indices)
    # The extent of each dimension is the largest PE index plus one.
    PE_dims = [int(d) for d in PE_indices[0]]
    for ind in PE_indices:
        for dim in range(len(PE_dims)):
            PE_dims[dim] = max(PE_dims[dim], int(ind[dim]) + 1)
    #print(PE_dims)
    
    PE_lines = PE_defs[0]['def']
    # Parse the data transfer information
    data_trans_info = extract_data_trans_info(PE_lines, PE_dims)    

    # Compose the new PE function
    PE_lines = compose_PE(data_trans_info, PE_dims, PE_defs)

    # Remove the original PE modules; `line_offset` compensates for the lines
    # already removed, since the recorded positions refer to the input file.
    line_offset = 0
    for PE_def in PE_defs:
        lines = lines[:PE_def['pos'][0] - line_offset] + lines[PE_def['pos'][1] + 1 - line_offset:]
        line_offset += (PE_def['pos'][1] - PE_def['pos'][0] + 1)

    # The monolithic PE module is appended at the end of the file.
    lines = lines + PE_lines

    # Modify the channels
    lines = modify_channels(lines, data_trans_info, PE_dims)

    with open(output_f, 'w') as f:
        for line in lines:
            f.write(line)
    #    f.writelines(PE_lines)

if __name__ == "__main__":
    # Command-line entry point: both the input and the output path are
    # mandatory.
    cli = argparse.ArgumentParser(description='Group PEs into a Monolithic Function')
    for flag, text in (('-i', 'input kernel function'),
                       ('-o', 'output kernel function')):
        cli.add_argument(flag, required=True, help=text)

    options = cli.parse_args()
    run(options.i, options.o)

================================================
FILE: autosa_scripts/ppcg_changes/isl/ast_type.h
================================================
#ifndef ISL_AST_TYPE_H
#define ISL_AST_TYPE_H

#include <isl/list.h>

#if defined(__cplusplus)
extern "C" {
#endif

/* AutoSA Extended */
/* Loop categories added by the AutoSA extension.
 * The values are consecutive, starting at 0 for the default type;
 * -1 is reserved for errors.
 */
enum autosa_loop_type {
	autosa_loop_error = -1,
	autosa_loop_default = 0,
	autosa_loop_time = 1,
	autosa_loop_space = 2,
	autosa_loop_latency = 3,
	autosa_loop_simd = 4,
	autosa_loop_array_part = 5
};
/* AutoSA Extended */

/* The AST expression type is only forward-declared here;
 * its definition is not part of this header.
 */
struct __isl_export isl_ast_expr;
typedef struct isl_ast_expr isl_ast_expr;

/* The AST node type is only forward-declared here;
 * its definition is not part of this header.
 */
struct __isl_export isl_ast_node;
typedef struct isl_ast_node isl_ast_node;

/* Operation kinds of an AST operation expression.
 * The values are consecutive, starting at 0; -1 is reserved for errors.
 * The isl_ast_op_* macros refer to these enumerators.
 */
enum isl_ast_expr_op_type {
	isl_ast_expr_op_error = -1,
	isl_ast_expr_op_and = 0,
	isl_ast_expr_op_and_then = 1,
	isl_ast_expr_op_or = 2,
	isl_ast_expr_op_or_else = 3,
	isl_ast_expr_op_max = 4,
	isl_ast_expr_op_min = 5,
	isl_ast_expr_op_minus = 6,
	isl_ast_expr_op_add = 7,
	isl_ast_expr_op_sub = 8,
	isl_ast_expr_op_mul = 9,
	isl_ast_expr_op_div = 10,
	isl_ast_expr_op_fdiv_q = 11,	/* Round towards -infty */
	isl_ast_expr_op_pdiv_q = 12,	/* Dividend is non-negative */
	isl_ast_expr_op_pdiv_r = 13,	/* Dividend is non-negative */
	isl_ast_expr_op_zdiv_r = 14,	/* Result only compared against zero */
	isl_ast_expr_op_cond = 15,
	isl_ast_expr_op_select = 16,
	isl_ast_expr_op_eq = 17,
	isl_ast_expr_op_le = 18,
	isl_ast_expr_op_lt = 19,
	isl_ast_expr_op_ge = 20,
	isl_ast_expr_op_gt = 21,
	isl_ast_expr_op_call = 22,
	isl_ast_expr_op_access = 23,
	isl_ast_expr_op_member = 24,
	isl_ast_expr_op_address_of = 25
};

/* Aliases that map the isl_ast_op_* names onto the isl_ast_expr_op_type
 * enumeration and its enumerators (presumably kept so that code written
 * against the shorter names still compiles -- confirm).
 */
#define isl_ast_op_type		isl_ast_expr_op_type
#define isl_ast_op_error	isl_ast_expr_op_error
#define isl_ast_op_and		isl_ast_expr_op_and
#define isl_ast_op_and_then	isl_ast_expr_op_and_then
#define isl_ast_op_or		isl_ast_expr_op_or
#define isl_ast_op_or_else	isl_ast_expr_op_or_else
#define isl_ast_op_max		isl_ast_expr_op_max
#define isl_ast_op_min		isl_ast_expr_op_min
#define isl_ast_op_minus	isl_ast_expr_op_minus
#define isl_ast_op_add		isl_ast_expr_op_add
#define isl_ast_op_sub		isl_ast_expr_op_sub
#define isl_ast_op_mul		isl_ast_expr_op_mul
#define isl_ast_op_div		isl_ast_expr_op_div
#define isl_ast_op_fdiv_q	isl_ast_expr_op_fdiv_q
#define isl_ast_op_pdiv_q	isl_ast_expr_op_pdiv_q
#define isl_ast_op_pdiv_r	isl_ast_expr_op_pdiv_r
#define isl_ast_op_zdiv_r	isl_ast_expr_op_zdiv_r
#define isl_ast_op_cond		isl_ast_expr_op_cond
#define isl_ast_op_select	isl_ast_expr_op_select
#define isl_ast_op_eq		isl_ast_expr_op_eq
#define isl_ast_op_le		isl_ast_expr_op_le
#define isl_ast_op_lt		isl_ast_expr_op_lt
#define isl_ast_op_ge		isl_ast_expr_op_ge
#define isl_ast_op_gt		isl_ast_expr_op_gt
#define isl_ast_op_call		isl_ast_expr_op_call
#define isl_ast_op_access	isl_ast_expr_op_access
#define isl_ast_op_member	isl_ast_expr_op_member
#define isl_ast_op_address_of	isl_ast_expr_op_address_of

/* Kinds of AST expressions.
 * The values are consecutive, starting at 0; -1 is reserved for errors.
 */
enum isl_ast_expr_type {
	isl_ast_expr_error = -1,
	isl_ast_expr_op = 0,
	isl_ast_expr_id = 1,
	isl_ast_expr_int = 2
};

/* Kinds of AST nodes.
 * Note that the valid kinds start at 1, not 0; -1 is reserved for errors.
 */
enum isl_ast_node_type {
	isl_ast_node_error = -1,
	isl_ast_node_for = 1,
	isl_ast_node_if = 2,
	isl_ast_node_block = 3,
	isl_ast_node_mark = 4,
	isl_ast_node_user = 5
};

/* Loop generation types.
 * The values are consecutive, starting at 0 for the default type;
 * -1 is reserved for errors.
 */
enum isl_ast_loop_type {
	isl_ast_loop_error = -1,
	isl_ast_loop_default = 0,
	isl_ast_loop_atomic = 1,
	isl_ast_loop_unroll = 2,
	isl_ast_loop_separate = 3
};

/* The print options type is only forward-declared here;
 * its definition is not part of this header.
 */
struct isl_ast_print_options;
typedef struct isl_ast_print_options isl_ast_print_options;

/* List declarations for ast_expr and ast_node.
 * The macros are not defined in this header; presumably they come
 * from <isl/list.h>, which is included above -- confirm.
 */
ISL_DECLARE_LIST(ast_expr)
ISL_DECLARE_EXPORTED_LIST(ast_node)

#if defined(__cplusplus)
}
#endif

#endif


================================================
FILE: autosa_scripts/ppcg_changes/isl/files.txt
================================================
include/isl/schedule_node.h
include/isl/ast_type.h
include/isl/schedule.h
isl_schedule_tree.c
isl_schedule_tree.h
isl_schedule_node.c
isl_schedule_band.c
isl_schedule_band.h
isl_schedule.c


================================================
FILE: autosa_scripts/ppcg_changes/isl/isl_patch.sh
================================================
#!/bin/sh
# Copy the modified isl files over the isl sources.
# All paths are relative, so the script has to be run from the
# directory that holds the modified files.

isl_root=../../../src/isl

# Public headers.
for header in ast_type.h schedule_node.h schedule.h vec.h; do
	cp "$header" "$isl_root/include/isl/"
done

# Library sources and private headers.
for source in isl_schedule_tree.c isl_schedule_tree.h isl_schedule_node.c \
	isl_schedule_band.c isl_schedule_band.h isl_schedule.c; do
	cp "$source" "$isl_root/"
done


================================================
FILE: autosa_scripts/ppcg_changes/isl/isl_schedule.c
================================================
/*
 * Copyright 2011      INRIA Saclay
 * Copyright 2012-2014 Ecole Normale Superieure
 * Copyright 2016      Sven Verdoolaege
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
 * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
 * 91893 Orsay, France
 * and Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
 */

#include <isl/ctx.h>
#include <isl/val.h>
#include <isl_aff_private.h>
#include <isl/map.h>
#include <isl/set.h>
#include <isl/schedule.h>
#include <isl/schedule_node.h>
#include <isl_sort.h>
#include <isl/printer.h>
#include <isl_schedule_private.h>
#include <isl_schedule_tree.h>
#include <isl_schedule_node_private.h>

/* Wrap the schedule tree "tree" in a newly allocated isl_schedule.
 *
 * Only trees whose root is a domain or an extension node are accepted;
 * any other root type is reported as an isl_error_unsupported error.
 *
 * Besides the root, the schedule gets its own leaf tree, created
 * through isl_schedule_tree_leaf, which can be used to represent
 * the leaves of the schedule.  This leaf is released again
 * by isl_schedule_free together with the root.
 *
 * "tree" is consumed in all cases.  NULL is returned on error.
 */
__isl_give isl_schedule *isl_schedule_from_schedule_tree(isl_ctx *ctx,
	__isl_take isl_schedule_tree *tree)
{
	isl_schedule *res;
	enum isl_schedule_node_type root_type;
	int valid_root;

	if (!tree)
		return NULL;

	root_type = isl_schedule_tree_get_type(tree);
	valid_root = root_type == isl_schedule_node_domain ||
		     root_type == isl_schedule_node_extension;
	if (!valid_root)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_unsupported,
			"root of schedule tree should be a domain or extension",
			goto error);

	res = isl_calloc_type(ctx, isl_schedule);
	if (!res)
		goto error;

	res->ref = 1;
	res->root = tree;
	res->leaf = isl_schedule_tree_leaf(ctx);
	if (res->leaf)
		return res;

	/* The root is owned by "res" at this point and is freed with it. */
	return isl_schedule_free(res);
error:
	isl_schedule_tree_free(tree);
	return NULL;
}

/* Construct a schedule consisting of a single domain node
 * with "domain" as its domain.
 *
 * The context is looked up before "domain" is handed over
 * to the tree constructor.
 */
__isl_give isl_schedule *isl_schedule_from_domain(
	__isl_take isl_union_set *domain)
{
	isl_ctx *ctx = isl_union_set_get_ctx(domain);

	return isl_schedule_from_schedule_tree(ctx,
				isl_schedule_tree_from_domain(domain));
}

/* Construct a schedule consisting of a single domain node
 * whose domain is the empty union set in "space".
 */
__isl_give isl_schedule *isl_schedule_empty(__isl_take isl_space *space)
{
	isl_union_set *no_elements;

	no_elements = isl_union_set_empty(space);
	return isl_schedule_from_domain(no_elements);
}

/* Take an additional reference to "sched" by incrementing
 * its reference count.  A NULL input is passed through.
 */
__isl_give isl_schedule *isl_schedule_copy(__isl_keep isl_schedule *sched)
{
	if (sched)
		sched->ref++;

	return sched;
}

/* Return a schedule with the same root as "schedule" that is
 * referenced only once.
 *
 * If the caller holds the only reference, "schedule" itself is returned.
 * Otherwise, the caller's reference is dropped and a new schedule
 * is built around a new reference to the root.
 */
__isl_give isl_schedule *isl_schedule_cow(__isl_take isl_schedule *schedule)
{
	isl_schedule_tree *root;
	isl_ctx *ctx;

	if (!schedule)
		return NULL;

	if (schedule->ref != 1) {
		ctx = isl_schedule_get_ctx(schedule);
		schedule->ref--;
		root = isl_schedule_tree_copy(schedule->root);
		return isl_schedule_from_schedule_tree(ctx, root);
	}

	return schedule;
}

/* Drop a reference to "sched" and return NULL.
 *
 * When the last reference is dropped, the root tree, the leaf tree
 * and the schedule object itself are released.
 */
__isl_null isl_schedule *isl_schedule_free(__isl_take isl_schedule *sched)
{
	if (!sched)
		return NULL;

	sched->ref--;
	if (sched->ref <= 0) {
		isl_schedule_tree_free(sched->root);
		isl_schedule_tree_free(sched->leaf);
		free(sched);
	}

	return NULL;
}

/* Make "tree" the root of "schedule".
 *
 * If "tree" already is the root, only the extra reference to "tree"
 * is released.  Otherwise, the schedule is made singly referenced
 * before its old root is replaced.
 * Both arguments are consumed; NULL is returned on error.
 */
__isl_give isl_schedule *isl_schedule_set_root(
	__isl_take isl_schedule *schedule, __isl_take isl_schedule_tree *tree)
{
	if (!schedule || !tree)
		goto error;

	if (schedule->root != tree) {
		schedule = isl_schedule_cow(schedule);
		if (!schedule)
			goto error;
		isl_schedule_tree_free(schedule->root);
		schedule->root = tree;
		return schedule;
	}

	isl_schedule_tree_free(tree);
	return schedule;
error:
	isl_schedule_free(schedule);
	isl_schedule_tree_free(tree);
	return NULL;
}

/* Return the isl_ctx of "schedule", obtained from its leaf tree,
 * or NULL if "schedule" is NULL.
 */
isl_ctx *isl_schedule_get_ctx(__isl_keep isl_schedule *schedule)
{
	if (!schedule)
		return NULL;

	return isl_schedule_tree_get_ctx(schedule->leaf);
}

/* Return the leaf tree stored in "schedule" without taking
 * a new reference, or NULL if "schedule" is NULL.
 */
__isl_keep isl_schedule_tree *isl_schedule_peek_leaf(
	__isl_keep isl_schedule *schedule)
{
	if (!schedule)
		return NULL;

	return schedule->leaf;
}

/* Check whether "schedule1" and "schedule2" are obviously equal.
 *
 * Identical pointers are trivially equal; otherwise the decision
 * is left to a plain comparison of the two root trees.
 * isl_bool_error is returned if either input is NULL.
 */
isl_bool isl_schedule_plain_is_equal(__isl_keep isl_schedule *schedule1,
	__isl_keep isl_schedule *schedule2)
{
	if (!schedule1 || !schedule2)
		return isl_bool_error;

	if (schedule1 != schedule2)
		return isl_schedule_tree_plain_is_equal(schedule1->root,
							schedule2->root);

	return isl_bool_true;
}

/* Return the space of the domain of the root node of "schedule",
 * i.e., the (parameter) space of the schedule.
 *
 * The root is required to be a domain node; any other root type
 * results in an isl_error_internal error and a NULL return value.
 */
__isl_give isl_space *isl_schedule_get_space(
	__isl_keep isl_schedule *schedule)
{
	isl_union_set *root_domain;
	isl_space *params;

	if (!schedule)
		return NULL;
	if (isl_schedule_tree_get_type(schedule->root) !=
	    isl_schedule_node_domain)
		isl_die(isl_schedule_get_ctx(schedule), isl_error_internal,
			"root node not a domain node", return NULL);

	root_domain = isl_schedule_tree_domain_get_domain(schedule->root);
	params = isl_union_set_get_space(root_domain);
	isl_union_set_free(root_domain);

	return params;
}

/* Return a schedule node that points to the root of "schedule".
 *
 * The node holds its own references to the schedule and to the root
 * tree and starts out with an empty list of ancestors.
 */
__isl_give isl_schedule_node *isl_schedule_get_root(
	__isl_keep isl_schedule *schedule)
{
	isl_ctx *ctx;
	isl_schedule *owner;
	isl_schedule_tree *root;
	isl_schedule_tree_list *no_ancestors;

	if (!schedule)
		return NULL;

	ctx = isl_schedule_get_ctx(schedule);
	root = isl_schedule_tree_copy(schedule->root);
	owner = isl_schedule_copy(schedule);
	no_ancestors = isl_schedule_tree_list_alloc(ctx, 0);

	return isl_schedule_node_alloc(owner, root, no_ancestors, NULL);
}

/* Return the domain stored in the root node of "schedule".
 *
 * The type of the root is not checked here; the request is forwarded
 * directly to isl_schedule_tree_domain_get_domain.
 */
__isl_give isl_union_set *isl_schedule_get_domain(
	__isl_keep isl_schedule *schedule)
{
	isl_schedule_tree *root;

	if (!schedule)
		return NULL;

	root = schedule->root;
	return isl_schedule_tree_domain_get_domain(root);
}

/* Visit all nodes of "sched" in depth first preorder, calling "fn"
 * with "user" on each of them.
 *
 * The traversal is aborted if "fn" returns -1 on some node.
 * If "fn" returns 0 on a node, then the subtree rooted at that node
 * is skipped.
 *
 * The traversal itself is performed by
 * isl_schedule_node_foreach_descendant_top_down on the root node.
 * Return 0 on success and -1 on failure.
 */
isl_stat isl_schedule_foreach_schedule_node_top_down(
	__isl_keep isl_schedule *sched,
	isl_bool (*fn)(__isl_keep isl_schedule_node *node, void *user),
	void *user)
{
	isl_stat status;
	isl_schedule_node *root;

	if (!sched)
		return isl_stat_error;

	root = isl_schedule_get_root(sched);
	status = isl_schedule_node_foreach_descendant_top_down(root,
								fn, user);
	isl_schedule_node_free(root);

	return status;
}

/* Visit the nodes of "schedule" in depth first postorder, allowing
 * the callback "fn" to modify each visited node.
 *
 * The traversal continues from the node returned by "fn", so it is
 * up to the user to make sure this does not result in an infinite loop.
 * The safest choice is to always return a pointer to the same position
 * (same ancestors and child positions) as the input node.
 *
 * The input schedule is released once its root node has been extracted;
 * the result is the schedule of the node returned by the traversal.
 */
__isl_give isl_schedule *isl_schedule_map_schedule_node_bottom_up(
	__isl_take isl_schedule *schedule,
	__isl_give isl_schedule_node *(*fn)(
		__isl_take isl_schedule_node *node, void *user), void *user)
{
	isl_schedule *res;
	isl_schedule_node *pos;

	pos = isl_schedule_get_root(schedule);
	isl_schedule_free(schedule);

	pos = isl_schedule_node_map_descendant_bottom_up(pos, fn, user);
	res = isl_schedule_node_get_schedule(pos);
	isl_schedule_node_free(pos);

	return res;
}

/* isl_schedule_map_schedule_node_bottom_up callback that applies
 * isl_schedule_node_reset_user to "node".
 * The "user" argument is not used.
 */
static __isl_give isl_schedule_node *reset_user(
	__isl_take isl_schedule_node *node, void *user)
{
	(void) user;

	node = isl_schedule_node_reset_user(node);
	return node;
}

/* Reset the user pointer on all identifiers of parameters and tuples
 * in "schedule" by applying reset_user to every node, bottom up.
 */
__isl_give isl_schedule *isl_schedule_reset_user(
	__isl_take isl_schedule *schedule)
{
	schedule = isl_schedule_map_schedule_node_bottom_up(schedule,
							&reset_user, NULL);
	return schedule;
}

/* isl_schedule_map_schedule_node_bottom_up callback that aligns
 * the parameters of "node" to those of the isl_space passed in "user".
 * The space is only borrowed; a copy is handed to the node operation.
 */
static __isl_give isl_schedule_node *align_params(
	__isl_take isl_schedule_node *node, void *user)
{
	isl_space *target;

	target = isl_space_copy((isl_space *) user);
	return isl_schedule_node_align_params(node, target);
}

/* Align the parameters of every node in "schedule" to those of "space".
 *
 * "space" is lent to the per-node callback during the traversal
 * and released afterwards.
 */
__isl_give isl_schedule *isl_schedule_align_params(
	__isl_take isl_schedule *schedule, __isl_take isl_space *space)
{
	isl_schedule *aligned;

	aligned = isl_schedule_map_schedule_node_bottom_up(schedule,
						    &align_params, space);
	isl_space_free(space);

	return aligned;
}

/* isl_schedule_map_schedule_node_bottom_up callback that computes
 * the pullback of "node" by the isl_union_pw_multi_aff passed in "user".
 * The function is only borrowed; a copy is handed to the node operation.
 */
static __isl_give isl_schedule_node *pullback_upma(
	__isl_take isl_schedule_node *node, void *user)
{
	isl_union_pw_multi_aff *fn_copy;

	fn_copy = isl_union_pw_multi_aff_copy((isl_union_pw_multi_aff *) user);
	return isl_schedule_node_pullback_union_pw_multi_aff(node, fn_copy);
}

/* Compute the pullback of "schedule" by the function represented
 * by "upma", i.e., plug in "upma" in the iteration domains of "schedule".
 *
 * The schedule tree is not allowed to contain any expansion nodes.
 *
 * "upma" is lent to the per-node callback during the traversal
 * and released afterwards.
 */
__isl_give isl_schedule *isl_schedule_pullback_union_pw_multi_aff(
	__isl_take isl_schedule *schedule,
	__isl_take isl_union_pw_multi_aff *upma)
{
	isl_schedule *res;

	res = isl_schedule_map_schedule_node_bottom_up(schedule,
						&pullback_upma, upma);
	isl_union_pw_multi_aff_free(upma);

	return res;
}

/* Expand "schedule" by extending all leaves with an expansion node
 * that has the tree of "expansion" as subtree.
 * The expansion of the expansion node is determined by "contraction"
 * and the domain of "expansion": the domain of "expansion"
 * is contracted according to "contraction".
 *
 * First collect the domain of "expansion" and the tree below its root,
 * then release "expansion" and let isl_schedule_node_expand
 * do the actual work on the root node of "schedule".
 */
__isl_give isl_schedule *isl_schedule_expand(__isl_take isl_schedule *schedule,
	__isl_take isl_union_pw_multi_aff *contraction,
	__isl_take isl_schedule *expansion)
{
	isl_schedule *res;
	isl_schedule_node *pos;
	isl_schedule_tree *subtree;
	isl_union_set *exp_domain;

	/* Extract what is needed from "expansion". */
	exp_domain = isl_schedule_get_domain(expansion);
	pos = isl_schedule_get_root(expansion);
	pos = isl_schedule_node_child(pos, 0);
	subtree = isl_schedule_node_get_tree(pos);
	isl_schedule_node_free(pos);
	isl_schedule_free(expansion);

	/* Apply the expansion starting from the root of "schedule". */
	pos = isl_schedule_get_root(schedule);
	isl_schedule_free(schedule);
	pos = isl_schedule_node_expand(pos, contraction, exp_domain, subtree);
	res = isl_schedule_node_get_schedule(pos);
	isl_schedule_node_free(pos);

	return res;
}

/* Intersect the domain of "schedule" with "domain".
 *
 * The root of "schedule" has to be a domain node; otherwise
 * an isl_error_invalid error is raised.
 * Both arguments are consumed; NULL is returned on error.
 */
__isl_give isl_schedule *isl_schedule_intersect_domain(
	__isl_take isl_schedule *schedule, __isl_take isl_union_set *domain)
{
	isl_schedule *res;
	isl_schedule_node *root;

	if (!schedule || !domain)
		goto error;
	if (isl_schedule_tree_get_type(schedule->root) !=
	    isl_schedule_node_domain)
		isl_die(isl_schedule_get_ctx(schedule), isl_error_invalid,
			"root node must be a domain node", goto error);

	root = isl_schedule_get_root(schedule);
	isl_schedule_free(schedule);
	root = isl_schedule_node_domain_intersect_domain(root, domain);
	res = isl_schedule_node_get_schedule(root);
	isl_schedule_node_free(root);

	return res;
error:
	isl_schedule_free(schedule);
	isl_union_set_free(domain);
	return NULL;
}

/* Replace the domain of "schedule" by the gist of the original domain
 * with respect to the parameter domain "context".
 *
 * The root of "schedule" has to be a domain node; otherwise
 * an isl_error_invalid error is raised.
 * Both arguments are consumed; NULL is returned on error.
 */
__isl_give isl_schedule *isl_schedule_gist_domain_params(
	__isl_take isl_schedule *schedule, __isl_take isl_set *context)
{
	isl_schedule *res;
	isl_schedule_node *root;

	if (!schedule || !context)
		goto error;
	if (isl_schedule_tree_get_type(schedule->root) !=
	    isl_schedule_node_domain)
		isl_die(isl_schedule_get_ctx(schedule), isl_error_invalid,
			"root node must be a domain node", goto error);

	root = isl_schedule_get_root(schedule);
	isl_schedule_free(schedule);
	root = isl_schedule_node_domain_gist_params(root, context);
	res = isl_schedule_node_get_schedule(root);
	isl_schedule_node_free(root);

	return res;
error:
	isl_schedule_free(schedule);
	isl_set_free(context);
	return NULL;
}

/* Return an isl_union_map representation of "sched".
 *
 * The result is the subtree schedule of the child of the root domain
 * node, which means that the domain of the returned map is not
 * intersected with the domain constraints of the root.
 * An isl_error_internal error is raised if the root is not a domain node.
 */
__isl_give isl_union_map *isl_schedule_get_map(__isl_keep isl_schedule *sched)
{
	isl_union_map *res;
	isl_schedule_node *pos;

	if (!sched)
		return NULL;
	if (isl_schedule_tree_get_type(sched->root) !=
	    isl_schedule_node_domain)
		isl_die(isl_schedule_get_ctx(sched), isl_error_internal,
			"root node not a domain node", return NULL);

	pos = isl_schedule_get_root(sched);
	pos = isl_schedule_node_child(pos, 0);
	res = isl_schedule_node_get_subtree_schedule_union_map(pos);
	isl_schedule_node_free(pos);

	return res;
}

/* Insert a band node with partial schedule "partial" between
 * the domain root node of "schedule" and its single child and
 * return the updated schedule.
 *
 * The insertion is refused with an isl_error_invalid error if the
 * subtree below the root is anchored, i.e., if any of its nodes
 * depend on the set of outer band nodes.
 * A root that is not a domain node results in an isl_error_internal
 * error.  Both arguments are consumed in all cases.
 */
__isl_give isl_schedule *isl_schedule_insert_partial_schedule(
	__isl_take isl_schedule *schedule,
	__isl_take isl_multi_union_pw_aff *partial)
{
	isl_schedule *res;
	isl_schedule_node *pos;
	int anchored;

	pos = isl_schedule_get_root(schedule);
	isl_schedule_free(schedule);
	if (!pos)
		goto error;
	if (isl_schedule_node_get_type(pos) != isl_schedule_node_domain)
		isl_die(isl_schedule_node_get_ctx(pos), isl_error_internal,
			"root node not a domain node", goto error);

	pos = isl_schedule_node_child(pos, 0);
	anchored = isl_schedule_node_is_subtree_anchored(pos);
	if (anchored < 0)
		goto error;
	if (anchored != 0)
		isl_die(isl_schedule_node_get_ctx(pos), isl_error_invalid,
			"cannot insert band node in anchored subtree",
			goto error);

	pos = isl_schedule_node_insert_partial_schedule(pos, partial);
	res = isl_schedule_node_get_schedule(pos);
	isl_schedule_node_free(pos);

	return res;
error:
	isl_schedule_node_free(pos);
	isl_multi_union_pw_aff_free(partial);
	return NULL;
}

/* Insert a context node with constraints "context" between
 * the root node of "schedule" and its first child and
 * return the updated schedule.
 * The type of the root is not checked by this function.
 */
__isl_give isl_schedule *isl_schedule_insert_context(
	__isl_take isl_schedule *schedule, __isl_take isl_set *context)
{
	isl_schedule *res;
	isl_schedule_node *pos;

	pos = isl_schedule_get_root(schedule);
	isl_schedule_free(schedule);

	pos = isl_schedule_node_child(pos, 0);
	pos = isl_schedule_node_insert_context(pos, context);

	res = isl_schedule_node_get_schedule(pos);
	isl_schedule_node_free(pos);

	return res;
}

/* Insert a guard node with constraints "guard" between
 * the root node of "schedule" and its first child and
 * return the updated schedule.
 * The type of the root is not checked by this function.
 */
__isl_give isl_schedule *isl_schedule_insert_guard(
	__isl_take isl_schedule *schedule, __isl_take isl_set *guard)
{
	isl_schedule *res;
	isl_schedule_node *pos;

	pos = isl_schedule_get_root(schedule);
	isl_schedule_free(schedule);

	pos = isl_schedule_node_child(pos, 0);
	pos = isl_schedule_node_insert_guard(pos, guard);

	res = isl_schedule_node_get_schedule(pos);
	isl_schedule_node_free(pos);

	return res;
}

/* Construct a tree with a filter node for "filter" on top of
 * the (single) child of "tree".
 *
 * Three cases are distinguished:
 * - "tree" has no children: the result is a lone filter node;
 * - the child is of type "type": the filter is inserted in each
 *   of the children of that child instead;
 * - otherwise: the filter is inserted on top of the child.
 */
static __isl_give isl_schedule_tree *insert_filter_in_child_of_type(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *filter,
	enum isl_schedule_node_type type)
{
	if (!isl_schedule_tree_has_children(tree)) {
		isl_schedule_tree_free(tree);
		return isl_schedule_tree_from_filter(filter);
	}

	tree = isl_schedule_tree_child(tree, 0);
	if (isl_schedule_tree_get_type(tree) != type)
		return isl_schedule_tree_insert_filter(tree, filter);

	return isl_schedule_tree_children_insert_filter(tree, filter);
}

/* Construct a schedule that combines the schedules "schedule1" and "schedule2"
 * with a top-level node (underneath the domain node) of type "type",
 * either isl_schedule_node_sequence or isl_schedule_node_set.
 * The domains of the two schedules are assumed to be disjoint.
 *
 * The new schedule has as domain the union of the domains of the two
 * schedules.  The child of the domain node is a node of type "type"
 * with two filters corresponding to the domains of the input schedules.
 * If one (or both) of the top-level nodes of the two schedules is itself
 * of type "type", then the filter is pushed into the children of that
 * node and the sequence or set is flattened.
 *
 * Both input schedules are consumed.  NULL is returned directly
 * if either input is NULL or has a root that is not a domain node.
 */
__isl_give isl_schedule *isl_schedule_pair(enum isl_schedule_node_type type,
	__isl_take isl_schedule *schedule1, __isl_take isl_schedule *schedule2)
{
	int disjoint;
	isl_ctx *ctx;
	enum isl_schedule_node_type root_type;
	isl_schedule_tree *tree1, *tree2;
	isl_union_set *filter1, *filter2, *domain;

	if (!schedule1 || !schedule2)
		goto error;

	/* Both roots need to be domain nodes. */
	root_type = isl_schedule_tree_get_type(schedule1->root);
	if (root_type != isl_schedule_node_domain)
		isl_die(isl_schedule_get_ctx(schedule1), isl_error_internal,
			"root node not a domain node", goto error);
	root_type = isl_schedule_tree_get_type(schedule2->root);
	if (root_type != isl_schedule_node_domain)
		isl_die(isl_schedule_get_ctx(schedule1), isl_error_internal,
			"root node not a domain node", goto error);

	/* Take separate references to the roots and their domains so that
	 * the input schedules can be released right away.
	 */
	ctx = isl_schedule_get_ctx(schedule1);
	tree1 = isl_schedule_tree_copy(schedule1->root);
	filter1 = isl_schedule_tree_domain_get_domain(tree1);
	tree2 = isl_schedule_tree_copy(schedule2->root);
	filter2 = isl_schedule_tree_domain_get_domain(tree2);

	isl_schedule_free(schedule1);
	isl_schedule_free(schedule2);

	/* If the disjointness test fails or the domains overlap,
	 * "filter1" is replaced by the result of isl_union_set_free and
	 * the function carries on without an early exit.
	 * NOTE(review): presumably that result is NULL and the operations
	 * below propagate it while releasing their other arguments,
	 * so that NULL is eventually returned -- confirm.
	 */
	disjoint = isl_union_set_is_disjoint(filter1, filter2);
	if (disjoint < 0)
		filter1 = isl_union_set_free(filter1);
	if (!disjoint)
		isl_die(ctx, isl_error_invalid,
			"schedule domains not disjoint",
			filter1 = isl_union_set_free(filter1));

	/* The new domain is the union of the two domains; each filter
	 * is simplified with respect to this union.
	 */
	domain = isl_union_set_union(isl_union_set_copy(filter1),
				    isl_union_set_copy(filter2));
	filter1 = isl_union_set_gist(filter1, isl_union_set_copy(domain));
	filter2 = isl_union_set_gist(filter2, isl_union_set_copy(domain));

	tree1 = insert_filter_in_child_of_type(tree1, filter1, type);
	tree2 = insert_filter_in_child_of_type(tree2, filter2, type);

	/* Combine the two filtered trees and put the domain node on top. */
	tree1 = isl_schedule_tree_from_pair(type, tree1, tree2);
	tree1 = isl_schedule_tree_insert_domain(tree1, domain);

	return isl_schedule_from_schedule_tree(ctx, tree1);
error:
	isl_schedule_free(schedule1);
	isl_schedule_free(schedule2);
	return NULL;
}

/* Combine "schedule1" and "schedule2" into a single schedule
 * by means of a sequence node.
 * The domains of the input schedules are assumed to be disjoint.
 */
__isl_give isl_schedule *isl_schedule_sequence(
	__isl_take isl_schedule *schedule1, __isl_take isl_schedule *schedule2)
{
	enum isl_schedule_node_type type = isl_schedule_node_sequence;

	return isl_schedule_pair(type, schedule1, schedule2);
}

/* Combine "schedule1" and "schedule2" into a single schedule
 * by means of a set node.
 * The domains of the input schedules are assumed to be disjoint.
 */
__isl_give isl_schedule *isl_schedule_set(
	__isl_take isl_schedule *schedule1, __isl_take isl_schedule *schedule2)
{
	enum isl_schedule_node_type type = isl_schedule_node_set;

	return isl_schedule_pair(type, schedule1, schedule2);
}

/* Print "schedule" to "p" by printing its root tree.
 * If "schedule" is NULL, the printer is freed instead.
 */
__isl_give isl_printer *isl_printer_print_schedule(__isl_take isl_printer *p,
	__isl_keep isl_schedule *schedule)
{
	if (schedule)
		return isl_printer_print_schedule_tree(p, schedule->root);

	return isl_printer_free(p);
}

/* AutoSA Extended */
/* Return a new schedule that duplicates "sched".
 *
 * The root tree is duplicated through isl_schedule_tree_dup and
 * wrapped in a new isl_schedule, which gets its own leaf.
 * NULL is returned if "sched" is NULL or if the construction fails.
 */
__isl_give isl_schedule *isl_schedule_dup(__isl_keep isl_schedule *sched)
{
	isl_ctx *ctx;
	isl_schedule_tree *root_dup;

	if (!sched)
		return NULL;

	root_dup = isl_schedule_tree_dup(sched->root);
	ctx = isl_schedule_get_ctx(sched);

	return isl_schedule_from_schedule_tree(ctx, root_dup);
}
/* AutoSA Extended */

#undef BASE
#define BASE schedule
#include <print_templ_yaml.c>


================================================
FILE: autosa_scripts/ppcg_changes/isl/isl_schedule_band.c
================================================
/*
 * Copyright 2013-2014 Ecole Normale Superieure
 * Copyright 2014      INRIA Rocquencourt
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege,
 * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
 * and Inria Paris - Rocquencourt, Domaine de Voluceau - Rocquencourt,
 * B.P. 105 - 78153 Le Chesnay, France
 */

#include <string.h>
#include <isl/val.h>
#include <isl/space.h>
#include <isl/map.h>
#include <isl/schedule_node.h>
#include <isl_schedule_band.h>
#include <isl_schedule_private.h>

/* Return the isl_ctx of "band", obtained from its partial schedule,
 * or NULL if "band" is NULL.
 */
isl_ctx *isl_schedule_band_get_ctx(__isl_keep isl_schedule_band *band)
{
	if (!band)
		return NULL;

	return isl_multi_union_pw_aff_get_ctx(band->mupa);
}

/* Allocate a new isl_schedule_band holding a single reference.
 * None of the other fields are set up by this function.
 */
static __isl_give isl_schedule_band *isl_schedule_band_alloc(isl_ctx *ctx)
{
	isl_schedule_band *fresh = isl_calloc_type(ctx, isl_schedule_band);

	if (fresh)
		fresh->ref = 1;

	return fresh;
}

/* Return a new isl_schedule_band with partial schedule "mupa".
 * First replace "mupa" by its greatest integer part to ensure
 * that the schedule is always integral.
 * The band is not marked permutable, the dimensions are not
 * marked coincident and the AST build options are empty.
 * Since there are no build options, the node is not anchored.
 *
 * The AutoSA per-member arrays "space_time" and "pe_opt" are
 * zero-initialized, every "sched_pos" entry is set to -1 and
 * the first "n" entries of "iter" are set to NULL.
 * NOTE(review): "iter" is never allocated in this file, so it is
 * presumably a fixed-size array member of isl_schedule_band;
 * its capacity is not checked against the band size here -- confirm
 * against isl_schedule_band.h.
 *
 * "mupa" is consumed in all cases.  NULL is returned on error,
 * including a failure to allocate any of the per-member arrays.
 */
__isl_give isl_schedule_band *isl_schedule_band_from_multi_union_pw_aff(
	__isl_take isl_multi_union_pw_aff *mupa)
{
	int i;
	isl_size dim;
	isl_ctx *ctx;
	isl_schedule_band *band;
	isl_space *space;

	mupa = isl_multi_union_pw_aff_floor(mupa);
	dim = isl_multi_union_pw_aff_size(mupa);
	if (dim < 0)
		goto error;
	ctx = isl_multi_union_pw_aff_get_ctx(mupa);
	band = isl_schedule_band_alloc(ctx);
	if (!band)
		goto error;

	/* Hand over "mupa" first so that every later error path can
	 * simply free the band without leaking the partial schedule.
	 */
	band->mupa = mupa;
	band->n = dim;
	band->coincident = isl_calloc_array(ctx, int, band->n);
	/* AutoSA Extended */
	band->space_time = isl_calloc_array(ctx, enum autosa_loop_type,
					    band->n);
	band->pe_opt = isl_calloc_array(ctx, enum autosa_loop_type, band->n);
	band->sched_pos = isl_calloc_array(ctx, int, band->n);
	/* The arrays are written to (here and by later users), so a failed
	 * allocation has to be caught before the first access.
	 * A NULL result is only an error for a non-empty band.
	 */
	if (band->n && (!band->coincident || !band->space_time ||
			!band->pe_opt || !band->sched_pos))
		return isl_schedule_band_free(band);
	for (i = 0; i < band->n; ++i) {
		band->sched_pos[i] = -1;
		band->iter[i] = NULL;
	}
	/* AutoSA Extended */
	space = isl_space_params_alloc(ctx, 0);
	band->ast_build_options = isl_union_set_empty(space);
	band->anchored = 0;

	if (!band->ast_build_options)
		return isl_schedule_band_free(band);

	return band;
error:
	isl_multi_union_pw_aff_free(mupa);
	return NULL;
}

/* Create a duplicate of the given isl_schedule_band.
 *
 * The duplicate holds a single reference and gets its own copies
 * of the per-member arrays that are present in "band"
 * ("coincident", "loop_type", "isolate_loop_type" and the AutoSA
 * arrays "space_time", "pe_opt" and "sched_pos").
 * The partial schedule and the AST build options are obtained
 * through their respective copy functions and the first "n" entries
 * of "iter" are copied element by element.
 * NOTE(review): the "anchored" field is not copied by this function,
 * so the duplicate keeps whatever value isl_schedule_band_alloc
 * leaves in it -- confirm that this is intended.
 *
 * Return NULL on error, including a failure to allocate
 * any of the arrays.
 */
__isl_give isl_schedule_band *isl_schedule_band_dup(
	__isl_keep isl_schedule_band *band)
{
	int i;
	isl_ctx *ctx;
	isl_schedule_band *dup;

	if (!band)
		return NULL;

	ctx = isl_schedule_band_get_ctx(band);
	dup = isl_schedule_band_alloc(ctx);
	if (!dup)
		return NULL;

	dup->n = band->n;
	dup->coincident = isl_alloc_array(ctx, int, band->n);
	if (band->n && !dup->coincident)
		return isl_schedule_band_free(dup);

	for (i = 0; i < band->n; ++i)
		dup->coincident[i] = band->coincident[i];
	dup->permutable = band->permutable;

	/* AutoSA Extended */
	/* Each optional array is checked right after its allocation,
	 * before anything is written to it.  A NULL result is only
	 * an error for a non-empty band.
	 */
	if (band->space_time) {
		dup->space_time = isl_alloc_array(ctx,
					    enum autosa_loop_type, band->n);
		if (band->n && !dup->space_time)
			return isl_schedule_band_free(dup);
		for (i = 0; i < band->n; ++i)
			dup->space_time[i] = band->space_time[i];
	}
	if (band->pe_opt) {
		dup->pe_opt = isl_alloc_array(ctx,
					    enum autosa_loop_type, band->n);
		if (band->n && !dup->pe_opt)
			return isl_schedule_band_free(dup);
		for (i = 0; i < band->n; ++i)
			dup->pe_opt[i] = band->pe_opt[i];
	}
	if (band->sched_pos) {
		dup->sched_pos = isl_alloc_array(ctx, int, band->n);
		if (band->n && !dup->sched_pos)
			return isl_schedule_band_free(dup);
		for (i = 0; i < band->n; ++i)
			dup->sched_pos[i] = band->sched_pos[i];
	}
	/* NOTE(review): no storage is allocated for "iter" anywhere in
	 * this file, so it is presumably a fixed-size array member and
	 * this test is always true -- confirm in isl_schedule_band.h.
	 */
	if (band->iter) {
		for (i = 0; i < band->n; ++i)
			dup->iter[i] = band->iter[i];
	}
	/* AutoSA Extended */

	dup->mupa = isl_multi_union_pw_aff_copy(band->mupa);
	dup->ast_build_options = isl_union_set_copy(band->ast_build_options);
	if (!dup->mupa || !dup->ast_build_options)
		return isl_schedule_band_free(dup);

	if (band->loop_type) {
		dup->loop_type = isl_alloc_array(ctx,
					    enum isl_ast_loop_type, band->n);
		if (band->n && !dup->loop_type)
			return isl_schedule_band_free(dup);
		for (i = 0; i < band->n; ++i)
			dup->loop_type[i] = band->loop_type[i];
	}
	if (band->isolate_loop_type) {
		dup->isolate_loop_type = isl_alloc_array(ctx,
					    enum isl_ast_loop_type, band->n);
		if (band->n && !dup->isolate_loop_type)
			return isl_schedule_band_free(dup);
		for (i = 0; i < band->n; ++i)
			dup->isolate_loop_type[i] = band->isolate_loop_type[i];
	}

	return dup;
}

/* Return a band that is equal to "band" and is referenced only once.
 *
 * If the caller holds the only reference, "band" itself is returned.
 * Otherwise, the caller's reference is dropped and a duplicate
 * is returned instead.
 */
__isl_give isl_schedule_band *isl_schedule_band_cow(
	__isl_take isl_schedule_band *band)
{
	if (!band)
		return NULL;

	if (band->ref != 1) {
		band->ref--;
		return isl_schedule_band_dup(band);
	}

	return band;
}

/* Take an additional reference to "band" by incrementing
 * its reference count.  A NULL input is passed through.
 */
__isl_give isl_schedule_band *isl_schedule_band_copy(
	__isl_keep isl_schedule_band *band)
{
	if (band)
		band->ref++;

	return band;
}

/* Drop a reference to "band" and return NULL.
 *
 * When the last reference is dropped, the partial schedule, the AST
 * build options, the per-member arrays (including the AutoSA
 * "space_time", "pe_opt" and "sched_pos" arrays) and the band
 * itself are released.  The "iter" field is not released here.
 */
__isl_null isl_schedule_band *isl_schedule_band_free(
	__isl_take isl_schedule_band *band)
{
	if (!band)
		return NULL;

	band->ref--;
	if (band->ref <= 0) {
		isl_multi_union_pw_aff_free(band->mupa);
		isl_union_set_free(band->ast_build_options);
		free(band->loop_type);
		free(band->isolate_loop_type);
		free(band->coincident);
		/* AutoSA Extended */
		free(band->space_time);
		free(band->pe_opt);
		free(band->sched_pos);
		/* AutoSA Extended */
		free(band);
	}

	return NULL;
}

/* Check whether "band1" and "band2" are obviously equal.
 *
 * The comparison covers the number of members, the coincidence
 * markers, the permutable flag, the partial schedules, the (optional)
 * loop AST generation types for both the regular and the isolated
 * part, and finally the AST build options.
 * The AutoSA fields ("space_time", "pe_opt", "sched_pos", "iter")
 * do not take part in the comparison.
 */
isl_bool isl_schedule_band_plain_is_equal(__isl_keep isl_schedule_band *band1,
	__isl_keep isl_schedule_band *band2)
{
	int pos;
	isl_bool same_mupa;

	if (!band1 || !band2)
		return isl_bool_error;
	if (band1 == band2)
		return isl_bool_true;

	if (band1->n != band2->n)
		return isl_bool_false;
	for (pos = 0; pos < band1->n; ++pos) {
		if (band1->coincident[pos] != band2->coincident[pos])
			return isl_bool_false;
	}
	if (band1->permutable != band2->permutable)
		return isl_bool_false;

	same_mupa = isl_multi_union_pw_aff_plain_is_equal(band1->mupa,
							  band2->mupa);
	if (same_mupa <= 0)
		return same_mupa;

	/* Either both bands have explicit loop types or neither has. */
	if ((band1->loop_type == NULL) != (band2->loop_type == NULL))
		return isl_bool_false;
	if (band1->loop_type) {
		for (pos = 0; pos < band1->n; ++pos) {
			if (band1->loop_type[pos] != band2->loop_type[pos])
				return isl_bool_false;
		}
	}

	if ((band1->isolate_loop_type == NULL) !=
	    (band2->isolate_loop_type == NULL))
		return isl_bool_false;
	if (band1->isolate_loop_type) {
		for (pos = 0; pos < band1->n; ++pos) {
			if (band1->isolate_loop_type[pos] !=
			    band2->isolate_loop_type[pos])
				return isl_bool_false;
		}
	}

	return isl_union_set_is_equal(band1->ast_build_options,
					band2->ast_build_options);
}

/* Return the number of scheduling dimensions (members) of "band",
 * or isl_size_error if "band" is NULL.
 */
isl_size isl_schedule_band_n_member(__isl_keep isl_schedule_band *band)
{
	if (!band)
		return isl_size_error;

	return band->n;
}

/* Is the band member at position "pos" marked coincident, i.e.,
 * coincident within the band and with respect to the coincidence
 * constraints?
 *
 * A position outside of the band results in an isl_error_invalid error.
 */
isl_bool isl_schedule_band_member_get_coincident(
	__isl_keep isl_schedule_band *band, int pos)
{
	int out_of_range;

	if (!band)
		return isl_bool_error;

	out_of_range = pos < 0 || pos >= band->n;
	if (out_of_range)
		isl_die(isl_schedule_band_get_ctx(band), isl_error_invalid,
			"invalid member position", return isl_bool_error);

	return isl_bool_ok(band->coincident[pos]);
}

/* Mark the band member at position "pos" as being coincident or not,
 * according to "coincident".
 *
 * Nothing is changed if the member already has the requested marking.
 * Otherwise, the band is made singly referenced before it is modified.
 * A position outside of the band results in an isl_error_invalid error
 * and the band being freed.
 */
__isl_give isl_schedule_band *isl_schedule_band_member_set_coincident(
	__isl_take isl_schedule_band *band, int pos, int coincident)
{
	isl_bool current;

	if (!band)
		return NULL;

	current = isl_schedule_band_member_get_coincident(band, pos);
	if (current == coincident)
		return band;

	band = isl_schedule_band_cow(band);
	if (!band)
		return NULL;

	if (pos < 0 || pos >= band->n)
		isl_die(isl_schedule_band_get_ctx(band), isl_error_invalid,
			"invalid member position",
			return isl_schedule_band_free(band));

	band->coincident[pos] = coincident;

	return band;
}

/* Is "band" marked permutable?
 * isl_bool_error is returned if "band" is NULL.
 */
isl_bool isl_schedule_band_get_permutable(__isl_keep isl_schedule_band *band)
{
	return band ? isl_bool_ok(band->permutable) : isl_bool_error;
}

/* Mark "band" permutable or not, according to "permutable".
 *
 * The band is only made singly referenced (and possibly duplicated)
 * if the stored value actually needs to change.
 */
__isl_give isl_schedule_band *isl_schedule_band_set_permutable(
	__isl_take isl_schedule_band *band, int permutable)
{
	if (!band)
		return NULL;

	if (band->permutable != permutable) {
		band = isl_schedule_band_cow(band);
		if (!band)
			return NULL;
		band->permutable = permutable;
	}

	return band;
}

/* Is "band" anchored?  That is, does it reference
 * the outer band nodes?
 * Return the stored "anchored" flag, or -1 if "band" is NULL.
 */
int isl_schedule_band_is_anchored(__isl_keep isl_schedule_band *band)
{
	if (!band)
		return -1;

	return band->anchored;
}

/* Return the schedule space of "band", i.e., the space
 * of its partial schedule.
 */
__isl_give isl_space *isl_schedule_band_get_space(
	__isl_keep isl_schedule_band *band)
{
	isl_multi_union_pw_aff *partial;

	if (!band)
		return NULL;

	partial = band->mupa;
	return isl_multi_union_pw_aff_get_space(partial);
}

/* Intersect the domain of the partial schedule of "band" with "domain".
 *
 * The band is made singly referenced before it is modified.
 * Both arguments are consumed; NULL is returned on error.
 */
__isl_give isl_schedule_band *isl_schedule_band_intersect_domain(
	__isl_take isl_schedule_band *band, __isl_take isl_union_set *domain)
{
	band = isl_schedule_band_cow(band);
	if (!band || !domain)
		goto error;

	band->mupa = isl_multi_union_pw_aff_intersect_domain(band->mupa,
								domain);
	if (band->mupa)
		return band;

	return isl_schedule_band_free(band);
error:
	isl_schedule_band_free(band);
	isl_union_set_free(domain);
	return NULL;
}

/* Return a copy of the partial schedule of "band",
 * i.e., the schedule of the band in isolation.
 */
__isl_give isl_multi_union_pw_aff *isl_schedule_band_get_partial_schedule(
	__isl_keep isl_schedule_band *band)
{
	if (!band)
		return NULL;

	return isl_multi_union_pw_aff_copy(band->mupa);
}

/* Replace the partial schedule of "band" by "schedule".
 *
 * The band is made singly referenced before the old partial schedule
 * is released.  Both arguments are consumed; NULL is returned on error.
 */
__isl_give isl_schedule_band *isl_schedule_band_set_partial_schedule(
	__isl_take isl_schedule_band *band,
	__isl_take isl_multi_union_pw_aff *schedule)
{
	band = isl_schedule_band_cow(band);
	if (band && schedule) {
		isl_multi_union_pw_aff_free(band->mupa);
		band->mupa = schedule;
		return band;
	}

	isl_schedule_band_free(band);
	isl_multi_union_pw_aff_free(schedule);
	return NULL;
}

/* Return the loop AST generation type of the member of "band"
 * at position "pos".
 *
 * As long as no loop type has been set on any member of the band,
 * every member has type isl_ast_loop_default.
 * A position outside of the band results in an isl_error_invalid error.
 */
enum isl_ast_loop_type isl_schedule_band_member_get_ast_loop_type(
	__isl_keep isl_schedule_band *band, int pos)
{
	if (!band)
		return isl_ast_loop_error;

	if (pos < 0 || pos >= band->n)
		isl_die(isl_schedule_band_get_ctx(band), isl_error_invalid,
			"invalid member position", return isl_ast_loop_error);

	return band->loop_type ? band->loop_type[pos] : isl_ast_loop_default;
}

/* Set the loop AST generation type of the member of "band"
 * at position "pos" to "type".
 *
 * Nothing is changed if the member already has this type.
 * Otherwise, the band is made singly referenced and the array of
 * loop types is allocated (zero-initialized) on first use.
 * A position outside of the band results in an isl_error_invalid error
 * and the band being freed.
 */
__isl_give isl_schedule_band *isl_schedule_band_member_set_ast_loop_type(
	__isl_take isl_schedule_band *band, int pos,
	enum isl_ast_loop_type type)
{
	enum isl_ast_loop_type current;

	if (!band)
		return NULL;

	current = isl_schedule_band_member_get_ast_loop_type(band, pos);
	if (current == type)
		return band;

	if (pos < 0 || pos >= band->n)
		isl_die(isl_schedule_band_get_ctx(band), isl_error_invalid,
			"invalid member position",
			return isl_schedule_band_free(band));

	band = isl_schedule_band_cow(band);
	if (!band)
		return NULL;

	if (!band->loop_type) {
		band->loop_type = isl_calloc_array(
					    isl_schedule_band_get_ctx(band),
					    enum isl_ast_loop_type, band->n);
		if (band->n && !band->loop_type)
			return isl_schedule_band_free(band);
	}

	band->loop_type[pos] = type;

	return band;
}

/* Return the loop AST generation type of the member of "band"
 * at position "pos" for the part that has been isolated
 * by the isolate option.
 *
 * As long as no such type has been set on any member of the band,
 * every member has type isl_ast_loop_default.
 * A position outside of the band results in an isl_error_invalid error.
 */
enum isl_ast_loop_type isl_schedule_band_member_get_isolate_ast_loop_type(
	__isl_keep isl_schedule_band *band, int pos)
{
	if (!band)
		return isl_ast_loop_error;

	if (pos < 0 || pos >= band->n)
		isl_die(isl_schedule_band_get_ctx(band), isl_error_invalid,
			"invalid member position", return isl_ast_loop_error);

	return band->isolate_loop_type ? band->isolate_loop_type[pos] :
					 isl_ast_loop_default;
}

/* Record "type" as the loop AST generation type of member "pos"
 * of "band" for the part that has been isolated by the isolate option.
 *
 * Nothing is changed if the member already has the requested type.
 * The array of per-member types is created (zero-initialized)
 * the first time a type is stored.
 * On error, "band" is freed and NULL is returned.
 */
__isl_give isl_schedule_band *
isl_schedule_band_member_set_isolate_ast_loop_type(
	__isl_take isl_schedule_band *band, int pos,
	enum isl_ast_loop_type type)
{
	enum isl_ast_loop_type current;

	if (!band)
		return NULL;

	current = isl_schedule_band_member_get_isolate_ast_loop_type(band, pos);
	if (current == type)
		return band;

	if (!(0 <= pos && pos < band->n))
		isl_die(isl_schedule_band_get_ctx(band), isl_error_invalid,
			"invalid member position",
			return isl_schedule_band_free(band));

	band = isl_schedule_band_cow(band);
	if (!band)
		return isl_schedule_band_free(band);

	if (band->isolate_loop_type == NULL) {
		isl_ctx *ctx = isl_schedule_band_get_ctx(band);

		band->isolate_loop_type = isl_calloc_array(ctx,
					    enum isl_ast_loop_type, band->n);
		if (band->isolate_loop_type == NULL && band->n != 0)
			return isl_schedule_band_free(band);
	}

	band->isolate_loop_type[pos] = type;
	return band;
}

/* Tuple names used to encode loop AST generation types in
 * AST build options, indexed by enum isl_ast_loop_type.
 * Only the atomic, unroll and separate types have an entry;
 * the code in this file only indexes the table with types
 * in the range isl_ast_loop_atomic .. isl_ast_loop_separate or
 * with types that were checked to differ from isl_ast_loop_default.
 */
static const char *option_str[] = {
	[isl_ast_loop_atomic] = "atomic",
	[isl_ast_loop_unroll] = "unroll",
	[isl_ast_loop_separate] = "separate"
};

/* Turn the parameter space "space" into the set space
 *
 *	{ type[x] }
 *
 * or, if "isolate" is set, into the wrapped space
 *
 *	{ [isolate[] -> type[x]] }
 *
 * where "type" stands for the name of the loop AST generation type "type".
 * Sets in these spaces encode loop AST generation options of that type.
 */
static __isl_give isl_space *loop_type_space(__isl_take isl_space *space,
	enum isl_ast_loop_type type, int isolate)
{
	const char *name = option_str[type];

	space = isl_space_set_from_params(space);
	space = isl_space_add_dims(space, isl_dim_set, 1);
	space = isl_space_set_tuple_name(space, isl_dim_set, name);
	if (isolate) {
		space = isl_space_from_range(space);
		space = isl_space_set_tuple_name(space, isl_dim_in, "isolate");
		space = isl_space_wrap(space);
	}

	return space;
}

/* Extend "options" with encodings of the "n" loop AST generation
 * types in "type".  If "isolate" is set, the encodings refer to
 * the isolated part.
 *
 * The array is scanned for maximal runs of consecutive identical types "t".
 * Each run [first, last] of a type different from the default
 * contributes an option
 *
 *	{ t[x] : first <= x <= last }
 *
 * or
 *
 *	{ [isolate[] -> t[x]] : first <= x <= last }
 *
 * If "type" is NULL, then "options" is returned unchanged.
 */
static __isl_give isl_union_set *add_loop_types(
	__isl_take isl_union_set *options, int n, enum isl_ast_loop_type *type,
	int isolate)
{
	int first, last;

	if (!type)
		return options;
	if (!options)
		return NULL;

	for (first = 0; first < n; first = last + 1) {
		isl_space *space;
		isl_set *option;

		last = first;
		if (type[first] == isl_ast_loop_default)
			continue;

		/* Extend the run as long as the next member has the same type. */
		while (last + 1 < n && type[last + 1] == type[first])
			++last;

		space = isl_union_set_get_space(options);
		space = loop_type_space(space, type[first], isolate);
		option = isl_set_universe(space);
		option = isl_set_lower_bound_si(option, isl_dim_set, 0, first);
		option = isl_set_upper_bound_si(option, isl_dim_set, 0, last);
		options = isl_union_set_add_set(options, option);
	}

	return options;
}

/* Reconstruct the complete set of AST build options of "band".
 *
 * The result consists of a copy of the stored options extended with
 * encodings of the per-member loop types, both for the regular part and
 * for the isolated part.
 */
__isl_give isl_union_set *isl_schedule_band_get_ast_build_options(
	__isl_keep isl_schedule_band *band)
{
	isl_union_set *res;

	if (!band)
		return NULL;

	res = isl_union_set_copy(band->ast_build_options);
	res = add_loop_types(res, band->n, band->loop_type, 0);
	return add_loop_types(res, band->n, band->isolate_loop_type, 1);
}

/* Internal data structure for not().
 *
 * "is" is the predicate on sets whose result is negated by not().
 */
struct isl_not_data {
	isl_bool (*is)(__isl_keep isl_set *set);
};

/* Evaluate the predicate stored in "user" (a struct isl_not_data)
 * on "set" and return the negation of its result.
 */
static isl_bool not(__isl_keep isl_set *set, void *user)
{
	struct isl_not_data *data = user;
	isl_bool satisfied;

	satisfied = data->is(set);

	return isl_bool_not(satisfied);
}

/* Is there at least one set in "uset" for which "is" holds?
 *
 * This is computed as the negation of
 * "every set in "uset" fails to satisfy "is"".
 */
static isl_bool has_any(__isl_keep isl_union_set *uset,
	isl_bool (*is)(__isl_keep isl_set *set))
{
	struct isl_not_data data;
	isl_bool none;

	data.is = is;
	none = isl_union_set_every_set(uset, &not, &data);

	return isl_bool_not(none);
}

/* Is "set" a wrapped relation with tuple name "isolate", i.e.,
 * does it live in a space of the form
 *
 *	isolate[[...] -> [...]]
 *
 * ?
 */
static isl_bool is_isolate(__isl_keep isl_set *set)
{
	const char *name;

	if (!isl_set_has_tuple_name(set))
		return isl_bool_false;

	name = isl_set_get_tuple_name(set);
	if (!isl_set_is_wrapping(set))
		return isl_bool_false;
	if (strcmp(name, "isolate") != 0)
		return isl_bool_false;

	return isl_bool_true;
}

/* Does "options" include an option of the form
 *
 *	isolate[[...] -> [...]]
 *
 * ?
 */
static isl_bool has_isolate_option(__isl_keep isl_union_set *options)
{
	isl_bool found;

	found = has_any(options, &is_isolate);

	return found;
}

/* Does "set" encode a loop AST generation option?
 * That is, is it a one-dimensional set whose tuple name is
 * "atomic", "unroll" or "separate"?
 */
static isl_bool is_loop_type_option(__isl_keep isl_set *set)
{
	isl_size dim;
	const char *name;
	enum isl_ast_loop_type type;

	dim = isl_set_dim(set, isl_dim_set);
	if (dim < 0)
		return isl_bool_error;
	if (dim != 1 || !isl_set_has_tuple_name(set))
		return isl_bool_false;

	name = isl_set_get_tuple_name(set);
	for (type = isl_ast_loop_atomic;
	    type <= isl_ast_loop_separate; ++type)
		if (!strcmp(name, option_str[type]))
			return isl_bool_true;

	return isl_bool_false;
}

/* Does "set" encode a loop AST generation option for the isolated part?
 * That is, is it a wrapped relation of the form
 *
 *	{ [isolate[] -> t[x]] }
 *
 * with t equal to "atomic", "unroll" or "separate"?
 *
 * The set is unwrapped into a temporary map to inspect the tuple names
 * of its domain and range.
 */
static isl_bool is_isolate_loop_type_option(__isl_keep isl_set *set)
{
	isl_bool found = isl_bool_false;
	isl_map *map;

	if (!isl_set_is_wrapping(set))
		return isl_bool_false;

	map = isl_set_unwrap(isl_set_copy(set));
	if (isl_map_has_tuple_name(map, isl_dim_in) &&
	    isl_map_has_tuple_name(map, isl_dim_out)) {
		const char *name;

		name = isl_map_get_tuple_name(map, isl_dim_in);
		if (!strcmp(name, "isolate")) {
			enum isl_ast_loop_type type;

			name = isl_map_get_tuple_name(map, isl_dim_out);
			for (type = isl_ast_loop_atomic;
			    type <= isl_ast_loop_separate; ++type) {
				if (!strcmp(name, option_str[type])) {
					found = isl_bool_true;
					break;
				}
			}
		}
	}
	isl_map_free(map);

	return found;
}

/* Is any of the sets in "options" a loop AST generation option
 * for the isolated part?
 */
static isl_bool has_isolate_loop_type_options(__isl_keep isl_union_set *options)
{
	isl_bool found;

	found = has_any(options, &is_isolate_loop_type_option);

	return found;
}

/* Is any of the sets in "options" a loop AST generation option?
 */
static isl_bool has_loop_type_options(__isl_keep isl_union_set *options)
{
	isl_bool found;

	found = has_any(options, &is_loop_type_option);

	return found;
}

/* Determine the loop AST generation type that "options" specifies
 * for the band member at position "pos".
 * If "isolate" is set, then the options for the isolated part
 * are considered instead.
 *
 * Each of the types "atomic", "unroll" and "separate" is checked in turn
 * by testing whether the corresponding option set contains "pos".
 * If no type applies, the default type is returned.
 * If more than one type applies, an error is reported.
 */
static enum isl_ast_loop_type extract_loop_type(
	__isl_keep isl_union_set *options, int pos, int isolate)
{
	isl_ctx *ctx;
	enum isl_ast_loop_type type;
	enum isl_ast_loop_type res = isl_ast_loop_default;

	ctx = isl_union_set_get_ctx(options);
	for (type = isl_ast_loop_atomic;
	    type <= isl_ast_loop_separate; ++type) {
		isl_space *space;
		isl_set *option;
		int empty;

		space = loop_type_space(isl_union_set_get_space(options),
					type, isolate);
		option = isl_union_set_extract_set(options, space);
		option = isl_set_fix_si(option, isl_dim_set, 0, pos);
		empty = isl_set_is_empty(option);
		isl_set_free(option);

		if (empty < 0)
			return isl_ast_loop_error;
		if (!empty) {
			if (res != isl_ast_loop_default)
				isl_die(ctx, isl_error_invalid,
					"conflicting loop type options",
					return isl_ast_loop_error);
			res = type;
		}
	}

	return res;
}

/* Fill band->loop_type with the loop AST generation types that
 * "options" specifies for the members of "band".
 * The array is allocated first if it does not exist yet.
 * Return 0 on success and -1 on error.
 */
static int extract_loop_types(__isl_keep isl_schedule_band *band,
	__isl_keep isl_union_set *options)
{
	int i;
	enum isl_ast_loop_type type;

	if (band->loop_type == NULL) {
		isl_ctx *ctx = isl_schedule_band_get_ctx(band);

		band->loop_type = isl_alloc_array(ctx,
					    enum isl_ast_loop_type, band->n);
		if (band->loop_type == NULL && band->n != 0)
			return -1;
	}

	for (i = 0; i < band->n; ++i) {
		type = extract_loop_type(options, i, 0);
		band->loop_type[i] = type;
		if (type == isl_ast_loop_error)
			return -1;
	}

	return 0;
}

/* Fill band->isolate_loop_type with the loop AST generation types that
 * "options" specifies for the isolated part of the members of "band".
 * The array is allocated first if it does not exist yet.
 * Return 0 on success and -1 on error.
 */
static int extract_isolate_loop_types(__isl_keep isl_schedule_band *band,
	__isl_keep isl_union_set *options)
{
	int i;
	enum isl_ast_loop_type type;

	if (band->isolate_loop_type == NULL) {
		isl_ctx *ctx = isl_schedule_band_get_ctx(band);

		band->isolate_loop_type = isl_alloc_array(ctx,
					    enum isl_ast_loop_type, band->n);
		if (band->isolate_loop_type == NULL && band->n != 0)
			return -1;
	}

	for (i = 0; i < band->n; ++i) {
		type = extract_loop_type(options, i, 1);
		band->isolate_loop_type[i] = type;
		if (type == isl_ast_loop_error)
			return -1;
	}

	return 0;
}

/* Build the union of the universes of all spaces that encode
 * loop AST generation types, starting from the parameter space "space".
 * If "isolate" is set, the spaces for the isolated part are used.
 * That is, the result is
 *
 *	{ atomic[x]; separate[x]; unroll[x] }
 *
 * or
 *
 *	{ [isolate[] -> atomic[x]]; [isolate[] -> separate[x]];
 *	  [isolate[] -> unroll[x]] }
 */
static __isl_give isl_union_set *loop_types(__isl_take isl_space *space,
	int isolate)
{
	enum isl_ast_loop_type type;
	isl_union_set *res;

	res = isl_union_set_empty(space);
	for (type = isl_ast_loop_atomic;
	    type <= isl_ast_loop_separate; ++type) {
		isl_space *type_space;

		type_space = loop_type_space(isl_union_set_get_space(res),
						type, isolate);
		res = isl_union_set_add_set(res, isl_set_universe(type_space));
	}

	return res;
}

/* Strip from "options" everything that lives in a space
 * encoding a loop AST generation type.
 */
static __isl_give isl_union_set *clear_loop_types(
	__isl_take isl_union_set *options)
{
	isl_space *space;

	space = isl_union_set_get_space(options);

	return isl_union_set_subtract(options, loop_types(space, 0));
}

/* Strip from "options" everything that lives in a space
 * encoding a loop AST generation type for the isolated part.
 */
static __isl_give isl_union_set *clear_isolate_loop_types(
	__isl_take isl_union_set *options)
{
	isl_space *space;

	space = isl_union_set_get_space(options);

	return isl_union_set_subtract(options, loop_types(space, 1));
}

/* Install "options" as the AST build options of "band".
 *
 * Loop AST generation type options, if present, are moved out of
 * "options" into band->loop_type; if there are none, band->loop_type
 * is released so that the default applies to all members.
 * The loop AST generation type options for the isolated part are
 * handled in the same way through band->isolate_loop_type.
 * Whatever remains of "options" ends up in band->ast_build_options.
 *
 * The band is marked anchored if the options include an isolate option,
 * since the domain of the wrapped map references the outer band node
 * schedules.
 */
__isl_give isl_schedule_band *isl_schedule_band_set_ast_build_options(
	__isl_take isl_schedule_band *band, __isl_take isl_union_set *options)
{
	isl_bool isolate, loop_type, isolate_loop_type;

	band = isl_schedule_band_cow(band);
	if (!band || !options)
		goto error;

	/* Classify the options before any of them are removed. */
	isolate = has_isolate_option(options);
	if (isolate < 0)
		goto error;
	loop_type = has_loop_type_options(options);
	if (loop_type < 0)
		goto error;
	isolate_loop_type = has_isolate_loop_type_options(options);
	if (isolate_loop_type < 0)
		goto error;

	if (loop_type) {
		if (extract_loop_types(band, options) < 0)
			goto error;
		options = clear_loop_types(options);
		if (!options)
			goto error;
	} else {
		free(band->loop_type);
		band->loop_type = NULL;
	}

	if (isolate_loop_type) {
		if (extract_isolate_loop_types(band, options) < 0)
			goto error;
		options = clear_isolate_loop_types(options);
		if (!options)
			goto error;
	} else {
		free(band->isolate_loop_type);
		band->isolate_loop_type = NULL;
	}

	isl_union_set_free(band->ast_build_options);
	band->ast_build_options = options;
	band->anchored = isolate;

	return band;
error:
	isl_schedule_band_free(band);
	isl_union_set_free(options);
	return NULL;
}

/* Return the "isolate" option associated to "band", assuming
 * it appears at schedule depth "depth".
 *
 * The isolate option is of the form
 *
 *	isolate[[flattened outer bands] -> band]
 *
 * so the space to look up is constructed from the space of the band
 * with "depth" input dimensions added for the outer bands.
 */
__isl_give isl_set *isl_schedule_band_get_ast_isolate_option(
	__isl_keep isl_schedule_band *band, int depth)
{
	isl_space *isolate_space;

	if (!band)
		return NULL;

	isolate_space = isl_space_from_range(isl_schedule_band_get_space(band));
	isolate_space = isl_space_add_dims(isolate_space, isl_dim_in, depth);
	isolate_space = isl_space_wrap(isolate_space);
	isolate_space = isl_space_set_tuple_name(isolate_space,
						isl_dim_set, "isolate");

	return isl_union_set_extract_set(band->ast_build_options,
					isolate_space);
}

/* Substitute "add" for "drop" in the stored AST build options of "band".
 * That is, first subtract "drop" from the options and then
 * take the union with "add".
 *
 * All arguments are consumed.  NULL is returned on error.
 */
__isl_give isl_schedule_band *isl_schedule_band_replace_ast_build_option(
	__isl_take isl_schedule_band *band, __isl_take isl_set *drop,
	__isl_take isl_set *add)
{
	band = isl_schedule_band_cow(band);
	if (!band) {
		isl_schedule_band_free(band);
		isl_set_free(drop);
		isl_set_free(add);
		return NULL;
	}

	band->ast_build_options = isl_union_set_subtract(
		band->ast_build_options, isl_union_set_from_set(drop));
	band->ast_build_options = isl_union_set_union(
		band->ast_build_options, isl_union_set_from_set(add));

	if (band->ast_build_options)
		return band;

	return isl_schedule_band_free(band);
}

/* Scale the partial schedule of "band" by the factors in "mv" and
 * take the greatest integer part of the result so that
 * the schedule remains integral.
 *
 * Both arguments are consumed.  NULL is returned on error.
 */
__isl_give isl_schedule_band *isl_schedule_band_scale(
	__isl_take isl_schedule_band *band, __isl_take isl_multi_val *mv)
{
	isl_multi_union_pw_aff *scaled;

	band = isl_schedule_band_cow(band);
	if (!band || !mv) {
		isl_schedule_band_free(band);
		isl_multi_val_free(mv);
		return NULL;
	}

	scaled = isl_multi_union_pw_aff_scale_multi_val(band->mupa, mv);
	band->mupa = isl_multi_union_pw_aff_floor(scaled);
	if (band->mupa)
		return band;

	return isl_schedule_band_free(band);
}

/* Scale down the partial schedule of "band" by the factors in "mv" and
 * take the greatest integer part of the result so that
 * the schedule remains integral.
 *
 * Both arguments are consumed.  NULL is returned on error.
 */
__isl_give isl_schedule_band *isl_schedule_band_scale_down(
	__isl_take isl_schedule_band *band, __isl_take isl_multi_val *mv)
{
	isl_multi_union_pw_aff *scaled;

	band = isl_schedule_band_cow(band);
	if (!band || !mv) {
		isl_schedule_band_free(band);
		isl_multi_val_free(mv);
		return NULL;
	}

	scaled = isl_multi_union_pw_aff_scale_down_multi_val(band->mupa, mv);
	band->mupa = isl_multi_union_pw_aff_floor(scaled);
	if (band->mupa)
		return band;

	return isl_schedule_band_free(band);
}

/* Replace the partial schedule of "band" by its remainder
 * modulo the factors in "mv".
 *
 * Both arguments are consumed.  NULL is returned on error.
 */
__isl_give isl_schedule_band *isl_schedule_band_mod(
	__isl_take isl_schedule_band *band, __isl_take isl_multi_val *mv)
{
	band = isl_schedule_band_cow(band);
	if (!band || !mv) {
		isl_schedule_band_free(band);
		isl_multi_val_free(mv);
		return NULL;
	}

	band->mupa = isl_multi_union_pw_aff_mod_multi_val(band->mupa, mv);
	if (band->mupa)
		return band;

	return isl_schedule_band_free(band);
}

/* Add "shift" to the partial schedule of "band".
 *
 * Before performing the addition, check that the domain of "shift"
 * includes the domain of the partial schedule, such that the domain
 * of the partial schedule is not affected by the shift.
 *
 * Both arguments are consumed.  NULL is returned on error.
 */
__isl_give isl_schedule_band *isl_schedule_band_shift(
	__isl_take isl_schedule_band *band,
	__isl_take isl_multi_union_pw_aff *shift)
{
	isl_union_set *band_dom, *shift_dom;
	isl_bool covered;

	band = isl_schedule_band_cow(band);
	if (!band || !shift)
		goto error;

	band_dom = isl_multi_union_pw_aff_domain(
				isl_multi_union_pw_aff_copy(band->mupa));
	shift_dom = isl_multi_union_pw_aff_domain(
				isl_multi_union_pw_aff_copy(shift));
	covered = isl_union_set_is_subset(band_dom, shift_dom);
	isl_union_set_free(band_dom);
	isl_union_set_free(shift_dom);

	if (covered < 0)
		goto error;
	if (!covered)
		isl_die(isl_schedule_band_get_ctx(band), isl_error_invalid,
			"domain of shift needs to include domain of "
			"partial schedule", goto error);

	band->mupa = isl_multi_union_pw_aff_add(band->mupa, shift);
	if (band->mupa)
		return band;

	return isl_schedule_band_free(band);
error:
	isl_schedule_band_free(band);
	isl_multi_union_pw_aff_free(shift);
	return NULL;
}

/* Derive the schedule of the tile loops from the band schedule "sched"
 * and the tile sizes "sizes".
 *
 * Each schedule dimension "i" with tile size "s" is replaced by
 * "floor(i/s)" or, if the scale tile loops option is set,
 * by "s * floor(i/s)".
 *
 * Both arguments are consumed.
 */
static isl_multi_union_pw_aff *isl_multi_union_pw_aff_tile(
	__isl_take isl_multi_union_pw_aff *sched,
	__isl_take isl_multi_val *sizes)
{
	isl_ctx *ctx;
	isl_size n;
	int scale;
	int i;

	ctx = isl_multi_val_get_ctx(sizes);
	scale = isl_options_get_tile_scale_tile_loops(ctx);

	n = isl_multi_union_pw_aff_size(sched);
	if (n < 0)
		sched = isl_multi_union_pw_aff_free(sched);

	for (i = 0; i < n; ++i) {
		isl_union_pw_aff *upa;
		isl_val *size;

		upa = isl_multi_union_pw_aff_get_union_pw_aff(sched, i);
		size = isl_multi_val_get_val(sizes, i);

		upa = isl_union_pw_aff_scale_down_val(upa, isl_val_copy(size));
		upa = isl_union_pw_aff_floor(upa);
		/* The last use of "size" consumes it. */
		if (scale)
			upa = isl_union_pw_aff_scale_val(upa, size);
		else
			isl_val_free(size);

		sched = isl_multi_union_pw_aff_set_union_pw_aff(sched, i, upa);
	}

	isl_multi_val_free(sizes);
	return sched;
}

/* Turn "band" into the band of tile loops of a tiling
 * with tile sizes "sizes".
 *
 * Both arguments are consumed.  NULL is returned on error.
 */
__isl_give isl_schedule_band *isl_schedule_band_tile(
	__isl_take isl_schedule_band *band, __isl_take isl_multi_val *sizes)
{
	band = isl_schedule_band_cow(band);
	if (!band || !sizes) {
		isl_schedule_band_free(band);
		isl_multi_val_free(sizes);
		return NULL;
	}

	band->mupa = isl_multi_union_pw_aff_tile(band->mupa, sizes);
	if (band->mupa)
		return band;

	return isl_schedule_band_free(band);
}

/* Turn "band" into the band of point loops of a tiling
 * with tile sizes "sizes", where "tile" is the corresponding
 * band of tile loops.
 *
 * If the shift point loops option is not set, the band is
 * returned unchanged.
 * Otherwise, the point loops are shifted to start at zero, i.e.,
 * each schedule dimension "i" is replaced by "i - s * floor(i/s)".
 * The term "s * floor(i/s)" is taken from the schedule of "tile",
 * which holds either "floor(i/s)" (and then still needs to be scaled
 * by the tile sizes) or "s * floor(i/s)" depending on
 * the scale tile loops option.
 */
__isl_give isl_schedule_band *isl_schedule_band_point(
	__isl_take isl_schedule_band *band, __isl_keep isl_schedule_band *tile,
	__isl_take isl_multi_val *sizes)
{
	isl_ctx *ctx;
	isl_multi_union_pw_aff *offset;

	if (!band || !sizes)
		goto error;

	ctx = isl_schedule_band_get_ctx(band);
	if (!isl_options_get_tile_shift_point_loops(ctx)) {
		isl_multi_val_free(sizes);
		return band;
	}

	band = isl_schedule_band_cow(band);
	if (!band)
		goto error;

	offset = isl_schedule_band_get_partial_schedule(tile);
	if (isl_options_get_tile_scale_tile_loops(ctx))
		isl_multi_val_free(sizes);
	else
		offset = isl_multi_union_pw_aff_scale_multi_val(offset, sizes);

	band->mupa = isl_multi_union_pw_aff_sub(band->mupa, offset);
	if (band->mupa)
		return band;

	return isl_schedule_band_free(band);
error:
	isl_schedule_band_free(band);
	isl_multi_val_free(sizes);
	return NULL;
}

/* Drop the "n" dimensions starting at "pos" from "band".
 *
 * We apply the transformation even if "n" is zero to ensure consistent
 * behavior with respect to changes in the schedule space.
 *
 * The per-member properties (coincident, loop types and the AutoSA
 * extensions) of the members following the dropped range are moved
 * forward by "n" positions.
 *
 * The caller is responsible for updating the isolate option.
 *
 * Return NULL if "band" is NULL.  If the range is out of bounds or
 * the partial schedule cannot be updated, "band" is freed and
 * NULL is returned.
 */
__isl_give isl_schedule_band *isl_schedule_band_drop(
	__isl_take isl_schedule_band *band, int pos, int n)
{
	int i;
	int n_iter;

	/* The range check below reads band->n. */
	if (!band)
		return NULL;

	if (pos < 0 || n < 0 || pos + n > band->n)
		isl_die(isl_schedule_band_get_ctx(band), isl_error_internal,
			"range out of bounds",
			return isl_schedule_band_free(band));

	band = isl_schedule_band_cow(band);
	if (!band)
		return NULL;

	band->mupa = isl_multi_union_pw_aff_drop_dims(band->mupa,
							isl_dim_set, pos, n);
	if (!band->mupa)
		return isl_schedule_band_free(band);

	for (i = pos + n; i < band->n; ++i)
		band->coincident[i - n] = band->coincident[i];
	if (band->loop_type)
		for (i = pos + n; i < band->n; ++i)
			band->loop_type[i - n] = band->loop_type[i];
	if (band->isolate_loop_type)
		for (i = pos + n; i < band->n; ++i)
			band->isolate_loop_type[i - n] =
						    band->isolate_loop_type[i];
	/* AutoSA Extended */
	if (band->space_time)
		for (i = pos + n; i < band->n; ++i)
			band->space_time[i - n] = band->space_time[i];
	if (band->pe_opt)
		for (i = pos + n; i < band->n; ++i)
			band->pe_opt[i - n] = band->pe_opt[i];
	if (band->sched_pos)
		for (i = pos + n; i < band->n; ++i)
			band->sched_pos[i - n] = band->sched_pos[i];
	/* "iter" is an array embedded in the band, so it is always present.
	 * Only move the entries that lie within its fixed capacity.
	 */
	n_iter = sizeof(band->iter) / sizeof(band->iter[0]);
	for (i = pos + n; i < band->n && i < n_iter; ++i)
		band->iter[i - n] = band->iter[i];
	/* AutoSA Extended */

	band->n -= n;

	return band;
}

/* Reset the user pointer on all identifiers of parameters and tuples
 * in the partial schedule and the AST build options of "band".
 *
 * NULL is returned on error.
 */
__isl_give isl_schedule_band *isl_schedule_band_reset_user(
	__isl_take isl_schedule_band *band)
{
	band = isl_schedule_band_cow(band);
	if (!band)
		return NULL;

	band->mupa = isl_multi_union_pw_aff_reset_user(band->mupa);
	band->ast_build_options =
		isl_union_set_reset_user(band->ast_build_options);
	if (band->mupa && band->ast_build_options)
		return band;

	return isl_schedule_band_free(band);
}

/* Align the parameters of the partial schedule and
 * the AST build options of "band" to those of "space".
 *
 * Both arguments are consumed.  NULL is returned on error.
 */
__isl_give isl_schedule_band *isl_schedule_band_align_params(
	__isl_take isl_schedule_band *band, __isl_take isl_space *space)
{
	band = isl_schedule_band_cow(band);
	if (!band || !space) {
		isl_space_free(space);
		isl_schedule_band_free(band);
		return NULL;
	}

	/* "space" is needed twice; the second use consumes it. */
	band->mupa = isl_multi_union_pw_aff_align_params(band->mupa,
						isl_space_copy(space));
	band->ast_build_options =
		isl_union_set_align_params(band->ast_build_options, space);
	if (band->mupa && band->ast_build_options)
		return band;

	return isl_schedule_band_free(band);
}

/* Plug in "upma" in the iteration domains of "band", i.e.,
 * replace the partial schedule of "band" by its pullback
 * by the function represented by "upma".
 *
 * Both arguments are consumed.  NULL is returned on error.
 */
__isl_give isl_schedule_band *isl_schedule_band_pullback_union_pw_multi_aff(
	__isl_take isl_schedule_band *band,
	__isl_take isl_union_pw_multi_aff *upma)
{
	band = isl_schedule_band_cow(band);
	if (!band || !upma) {
		isl_union_pw_multi_aff_free(upma);
		isl_schedule_band_free(band);
		return NULL;
	}

	band->mupa =
		isl_multi_union_pw_aff_pullback_union_pw_multi_aff(band->mupa,
									upma);
	if (band->mupa)
		return band;

	return isl_schedule_band_free(band);
}

/* Simplify "band" with respect to "context" by computing
 * the gist of its partial schedule.
 *
 * A band without members is returned as is.
 * Both arguments are consumed.  NULL is returned on error.
 */
__isl_give isl_schedule_band *isl_schedule_band_gist(
	__isl_take isl_schedule_band *band, __isl_take isl_union_set *context)
{
	if (!band || !context)
		goto error;

	if (band->n != 0) {
		band = isl_schedule_band_cow(band);
		if (!band)
			goto error;
		band->mupa = isl_multi_union_pw_aff_gist(band->mupa, context);
		if (!band->mupa)
			return isl_schedule_band_free(band);
		return band;
	}

	isl_union_set_free(context);
	return band;
error:
	isl_union_set_free(context);
	isl_schedule_band_free(band);
	return NULL;
}

/* AutoSA Extended */
/* Look up the space_time property of member "pos" of "band".
 *
 * autosa_loop_error is returned if "band" is NULL, if "pos" does not
 * refer to a member of the band or if no space_time properties
 * have been stored in the band.
 */
enum autosa_loop_type isl_schedule_band_member_get_space_time(
	__isl_keep isl_schedule_band *band, int pos)
{
	if (!band)
		return autosa_loop_error;

	if (!(0 <= pos && pos < band->n))
		isl_die(isl_schedule_band_get_ctx(band), isl_error_invalid,
			"invalid member position", return autosa_loop_error);

	return band->space_time ? band->space_time[pos] : autosa_loop_error;
}

/* Set the space_time property of member "pos" of "band" to "loop_type".
 *
 * The array of per-member properties is created (zero-initialized)
 * the first time a property is stored.
 * If "pos" does not refer to a member of the band or if the array
 * cannot be allocated, "band" is freed and NULL is returned.
 */
__isl_give isl_schedule_band *isl_schedule_band_member_set_space_time(
	__isl_take isl_schedule_band *band, int pos, enum autosa_loop_type loop_type)
{
	if (!band)
		return NULL;
	band = isl_schedule_band_cow(band);
	if (!band)
		return NULL;

	if (pos < 0 || pos >= band->n)
		isl_die(isl_schedule_band_get_ctx(band), isl_error_invalid,
			"invalid member position",
			return isl_schedule_band_free(band));

	if (!band->space_time) {
		band->space_time = isl_calloc_array(
			isl_schedule_band_get_ctx(band),
			enum autosa_loop_type, band->n);
		/* band->n is positive here, so NULL means failure. */
		if (!band->space_time)
			return isl_schedule_band_free(band);
	}
	band->space_time[pos] = loop_type;

	return band;
}

/* Look up the pe_opt property of member "pos" of "band".
 *
 * autosa_loop_error is returned if "band" is NULL, if "pos" does not
 * refer to a member of the band or if no pe_opt properties
 * have been stored in the band.
 */
enum autosa_loop_type isl_schedule_band_member_get_pe_opt(
	__isl_keep isl_schedule_band *band, int pos)
{
	if (!band)
		return autosa_loop_error;

	if (!(0 <= pos && pos < band->n))
		isl_die(isl_schedule_band_get_ctx(band), isl_error_invalid,
			"invalid member position", return autosa_loop_error);

	return band->pe_opt ? band->pe_opt[pos] : autosa_loop_error;
}

/* Set the pe_opt property of member "pos" of "band" to "loop_type".
 *
 * The array of per-member properties is created (zero-initialized)
 * the first time a property is stored.
 * If "pos" does not refer to a member of the band or if the array
 * cannot be allocated, "band" is freed and NULL is returned.
 */
__isl_give isl_schedule_band *isl_schedule_band_member_set_pe_opt(
	__isl_take isl_schedule_band *band, int pos, enum autosa_loop_type loop_type)
{
	if (!band)
		return NULL;
	band = isl_schedule_band_cow(band);
	if (!band)
		return NULL;

	if (pos < 0 || pos >= band->n)
		isl_die(isl_schedule_band_get_ctx(band), isl_error_invalid,
			"invalid member position",
			return isl_schedule_band_free(band));

	if (!band->pe_opt) {
		band->pe_opt = isl_calloc_array(
			isl_schedule_band_get_ctx(band),
			enum autosa_loop_type, band->n);
		/* band->n is positive here, so NULL means failure. */
		if (!band->pe_opt)
			return isl_schedule_band_free(band);
	}
	band->pe_opt[pos] = loop_type;

	return band;
}

/* Look up the sched_pos property of member "pos" of "band".
 *
 * -1 is returned if "band" is NULL, if "pos" does not refer to a member
 * of the band or if no sched_pos properties have been stored in the band.
 */
int isl_schedule_band_member_get_sched_pos(
	__isl_keep isl_schedule_band *band, int pos)
{
	if (!band)
		return -1;

	if (!(0 <= pos && pos < band->n))
		isl_die(isl_schedule_band_get_ctx(band), isl_error_invalid,
			"invalid member position", return -1);

	return band->sched_pos ? band->sched_pos[pos] : -1;
}

/* Set the sched_pos property of member "pos" of "band" to "sched_pos".
 *
 * The array of per-member properties is created (zero-initialized)
 * the first time a property is stored.
 * If "pos" does not refer to a member of the band or if the array
 * cannot be allocated, "band" is freed and NULL is returned.
 */
__isl_give isl_schedule_band *isl_schedule_band_member_set_sched_pos(
		__isl_take isl_schedule_band *band, int pos, int sched_pos)
{
	if (!band)
		return NULL;
	band = isl_schedule_band_cow(band);
	if (!band)
		return NULL;

	if (pos < 0 || pos >= band->n)
		isl_die(isl_schedule_band_get_ctx(band), isl_error_invalid,
			"invalid member position",
			return isl_schedule_band_free(band));

	if (!band->sched_pos) {
		band->sched_pos = isl_calloc_array(
			isl_schedule_band_get_ctx(band), int, band->n);
		/* band->n is positive here, so NULL means failure. */
		if (!band->sched_pos)
			return isl_schedule_band_free(band);
	}
	band->sched_pos[pos] = sched_pos;

	return band;
}

/* Return the iter property of member "pos" of "band".
 *
 * The iter properties are stored in an array of fixed capacity that is
 * embedded in the band.  Members at positions beyond this capacity
 * cannot have an iter property and NULL is returned for them.
 * NULL is also returned if "band" is NULL or if "pos" does not refer
 * to a member of the band.
 */
void *isl_schedule_band_member_get_iter(
	__isl_keep isl_schedule_band *band, int pos)
{
	int capacity;

	if (!band)
		return NULL;

	if (pos < 0 || pos >= band->n)
		isl_die(isl_schedule_band_get_ctx(band), isl_error_invalid,
			"invalid member position", return NULL);

	/* Never read past the end of the embedded array. */
	capacity = sizeof(band->iter) / sizeof(band->iter[0]);
	if (pos >= capacity)
		return NULL;

	return band->iter[pos];
}

/* Set the iter property of member "pos" of "band" to "iter".
 *
 * The iter properties are stored in an array of fixed capacity that is
 * embedded in the band, so "pos" needs to be smaller than both
 * the number of members and this capacity.
 * Otherwise, "band" is freed and NULL is returned.
 */
__isl_give isl_schedule_band *isl_schedule_band_member_set_iter(
		__isl_take isl_schedule_band *band, int pos, void *iter)
{
	int capacity;

	if (!band)
		return NULL;
	band = isl_schedule_band_cow(band);
	if (!band)
		return NULL;

	if (pos < 0 || pos >= band->n)
		isl_die(isl_schedule_band_get_ctx(band), isl_error_invalid,
			"invalid member position",
			return isl_schedule_band_free(band));

	/* The last valid index is capacity - 1. */
	capacity = sizeof(band->iter) / sizeof(band->iter[0]);
	if (pos >= capacity)
		isl_die(isl_schedule_band_get_ctx(band), isl_error_invalid,
			"maximal band dim 20 surpassed",
			return isl_schedule_band_free(band));
	band->iter[pos] = iter;

	return band;
}
/* AutoSA Extended */

================================================
FILE: autosa_scripts/ppcg_changes/isl/isl_schedule_band.h
================================================
#ifndef ISL_SCHEDULE_BAND_H
#define ISL_SCHEDULE_BAND_H

#include <isl/aff.h>
#include <isl/ast_type.h>
#include <isl/union_map.h>

/* Information about a band within a schedule.
 *
 * n is the number of scheduling dimensions within the band.
 * coincident is an array of length n, indicating whether a scheduling dimension
 *	satisfies the coincidence constraints in the sense that
 *	the corresponding dependence distances are zero.
 * permutable is set if the band is permutable.
 * mupa is the partial schedule corresponding to this band.  The dimension
 *	of mupa is equal to n.
 * loop_type contains the loop AST generation types for the members
 * in the band.  It may be NULL, if all members are
 * of type isl_ast_loop_default.
 * isolate_loop_type contains the loop AST generation types for the members
 * in the band for the isolated part.  It may be NULL, if all members are
 * of type isl_ast_loop_default.
 * ast_build_options are the remaining AST build options associated
 * to the band.
 * anchored is set if the node depends on its position in the schedule tree.
 *	In particular, it is set if the AST build options include
 *	an isolate option.
 *
 * The remaining fields are AutoSA extensions.
 * space_time contains the space_time property of each member in the band.
 *	It may be NULL, if no such property has been set.
 * pe_opt contains the pe_opt property of each member in the band.
 *	It may be NULL, if no such property has been set.
 * sched_pos contains the sched_pos property of each member in the band.
 *	It may be NULL, if no such property has been set.
 * iter contains an opaque pointer for each member in the band.
 *	It is a fixed-size array, so it can only hold entries
 *	for the first 20 members.
 */
struct isl_schedule_band {
	int ref;

	int n;
	int *coincident;
	int permutable;

	isl_multi_union_pw_aff *mupa;

	int anchored;
	isl_union_set *ast_build_options;
	enum isl_ast_loop_type *loop_type;
	enum isl_ast_loop_type *isolate_loop_type;

	/* AutoSA Extended */
	enum autosa_loop_type *space_time;
	enum autosa_loop_type *pe_opt;
	int *sched_pos;
	void *iter[20];
	/* AutoSA Extended */
};
typedef struct isl_schedule_band isl_schedule_band;

/* Construction from a partial schedule, copying and freeing. */
__isl_give isl_schedule_band *isl_schedule_band_from_multi_union_pw_aff(
	__isl_take isl_multi_union_pw_aff *mupa);
__isl_give isl_schedule_band *isl_schedule_band_copy(
	__isl_keep isl_schedule_band *band);
__isl_null isl_schedule_band *isl_schedule_band_free(
	__isl_take isl_schedule_band *band);

isl_ctx *isl_schedule_band_get_ctx(__isl_keep isl_schedule_band *band);

isl_bool isl_schedule_band_plain_is_equal(__isl_keep isl_schedule_band *band1,
	__isl_keep isl_schedule_band *band2);

int isl_schedule_band_is_anchored(__isl_keep isl_schedule_band *band);

/* Partial schedule, per-member loop AST generation types and
 * AST build options.
 */
__isl_give isl_space *isl_schedule_band_get_space(
	__isl_keep isl_schedule_band *band);
__isl_give isl_schedule_band *isl_schedule_band_intersect_domain(
	__isl_take isl_schedule_band *band, __isl_take isl_union_set *domain);
__isl_give isl_multi_union_pw_aff *isl_schedule_band_get_partial_schedule(
	__isl_keep isl_schedule_band *band);
__isl_give isl_schedule_band *isl_schedule_band_set_partial_schedule(
	__isl_take isl_schedule_band *band,
	__isl_take isl_multi_union_pw_aff *schedule);
enum isl_ast_loop_type isl_schedule_band_member_get_ast_loop_type(
	__isl_keep isl_schedule_band *band, int pos);
__isl_give isl_schedule_band *isl_schedule_band_member_set_ast_loop_type(
	__isl_take isl_schedule_band *band, int pos,
	enum isl_ast_loop_type type);
enum isl_ast_loop_type isl_schedule_band_member_get_isolate_ast_loop_type(
	__isl_keep isl_schedule_band *band, int pos);
__isl_give isl_schedule_band *
isl_schedule_band_member_set_isolate_ast_loop_type(
	__isl_take isl_schedule_band *band, int pos,
	enum isl_ast_loop_type type);
__isl_give isl_union_set *isl_schedule_band_get_ast_build_options(
	__isl_keep isl_schedule_band *band);
__isl_give isl_schedule_band *isl_schedule_band_set_ast_build_options(
	__isl_take isl_schedule_band *band, __isl_take isl_union_set *options);
__isl_give isl_set *isl_schedule_band_get_ast_isolate_option(
	__isl_keep isl_schedule_band *band, int depth);
__isl_give isl_schedule_band *isl_schedule_band_replace_ast_build_option(
	__isl_take isl_schedule_band *band, __isl_take isl_set *drop,
	__isl_take isl_set *add);

/* Number of members and the coincident and permutable properties. */
isl_size isl_schedule_band_n_member(__isl_keep isl_schedule_band *band);
isl_bool isl_schedule_band_member_get_coincident(
	__isl_keep isl_schedule_band *band, int pos);
__isl_give isl_schedule_band *isl_schedule_band_member_set_coincident(
	__isl_take isl_schedule_band *band, int pos, int coincident);
isl_bool isl_schedule_band_get_permutable(__isl_keep isl_schedule_band *band);
__isl_give isl_schedule_band *isl_schedule_band_set_permutable(
	__isl_take isl_schedule_band *band, int permutable);

/* Transformations of the partial schedule. */
__isl_give isl_schedule_band *isl_schedule_band_scale(
	__isl_take isl_schedule_band *band, __isl_take isl_multi_val *mv);
__isl_give isl_schedule_band *isl_schedule_band_scale_down(
	__isl_take isl_schedule_band *band, __isl_take isl_multi_val *mv);
__isl_give isl_schedule_band *isl_schedule_band_mod(
	__isl_take isl_schedule_band *band, __isl_take isl_multi_val *mv);
__isl_give isl_schedule_band *isl_schedule_band_tile(
	__isl_take isl_schedule_band *band, __isl_take isl_multi_val *sizes);
__isl_give isl_schedule_band *isl_schedule_band_point(
	__isl_take isl_schedule_band *band, __isl_keep isl_schedule_band *tile,
	__isl_take isl_multi_val *sizes);
__isl_give isl_schedule_band *isl_schedule_band_shift(
	__isl_take isl_schedule_band *band,
	__isl_take isl_multi_union_pw_aff *shift);
__isl_give isl_schedule_band *isl_schedule_band_drop(
	__isl_take isl_schedule_band *band, int pos, int n);
__isl_give isl_schedule_band *isl_schedule_band_gist(
	__isl_take isl_schedule_band *band, __isl_take isl_union_set *context);

__isl_give isl_schedule_band *isl_schedule_band_reset_user(
	__isl_take isl_schedule_band *band);
__isl_give isl_schedule_band *isl_schedule_band_align_params(
	__isl_take isl_schedule_band *band, __isl_take isl_space *space);
__isl_give isl_schedule_band *isl_schedule_band_pullback_union_pw_multi_aff(
	__isl_take isl_schedule_band *band,
	__isl_take isl_union_pw_multi_aff *upma);

/* AutoSA Extended */
/* Accessors for the AutoSA fields of isl_schedule_band
 * ("space_time", "pe_opt", "sched_pos" and "iter").
 * Each getter takes a position "pos" and each setter takes the band,
 * a position and the new value, and gives back a band.
 * NOTE(review): the definitions are not visible here; how an out-of-range
 * "pos" is handled (in particular with respect to the 20-entry "iter"
 * array) needs to be confirmed in the implementation file.
 */
enum autosa_loop_type isl_schedule_band_member_get_space_time(
	__isl_keep isl_schedule_band *band, int pos);
__isl_give isl_schedule_band *isl_schedule_band_member_set_space_time(
	__isl_take isl_schedule_band *band, int pos, enum autosa_loop_type loop_type);
enum autosa_loop_type isl_schedule_band_member_get_pe_opt(
	__isl_keep isl_schedule_band *band, int pos);
__isl_give isl_schedule_band *isl_schedule_band_member_set_pe_opt(
	__isl_take isl_schedule_band *band, int pos, enum autosa_loop_type loop_type);
int isl_schedule_band_member_get_sched_pos(
	__isl_keep isl_schedule_band *band, int pos);
__isl_give isl_schedule_band *isl_schedule_band_member_set_sched_pos(
	__isl_take isl_schedule_band *band, int pos, int sched_pos);
void *isl_schedule_band_member_get_iter(
	__isl_keep isl_schedule_band *band, int pos);
__isl_give isl_schedule_band *isl_schedule_band_member_set_iter(
	__isl_take isl_schedule_band *band, int pos, void *iter);	
/* AutoSA Extended */

#endif


================================================
FILE: autosa_scripts/ppcg_changes/isl/isl_schedule_node.c
================================================
/*
 * Copyright 2013-2014 Ecole Normale Superieure
 * Copyright 2014      INRIA Rocquencourt
 * Copyright 2016      Sven Verdoolaege
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege,
 * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
 * and Inria Paris - Rocquencourt, Domaine de Voluceau - Rocquencourt,
 * B.P. 105 - 78153 Le Chesnay, France
 */

#include <isl/id.h>
#include <isl/val.h>
#include <isl/space.h>
#include <isl/set.h>
#include <isl/ast_type.h>
#include <isl_schedule_band.h>
#include <isl_schedule_private.h>
#include <isl_schedule_node_private.h>

/* Construct a schedule node that refers to "tree" inside "schedule".
 * "ancestors" holds the trees on the path leading to "tree" and
 * "child_pos" holds one child position for each of these ancestors.
 * The positions are copied into the new node.
 * "child_pos" is allowed to be NULL if "ancestors" is empty.
 *
 * On error, the three isl objects passed in are freed and NULL is returned.
 */
__isl_give isl_schedule_node *isl_schedule_node_alloc(
	__isl_take isl_schedule *schedule, __isl_take isl_schedule_tree *tree,
	__isl_take isl_schedule_tree_list *ancestors, int *child_pos)
{
	isl_size n_ancestor;
	isl_schedule_node *res;
	isl_ctx *ctx;
	int pos;

	n_ancestor = isl_schedule_tree_list_n_schedule_tree(ancestors);
	if (n_ancestor < 0 || !tree || !schedule)
		goto error;
	if (!child_pos && n_ancestor > 0)
		goto error;

	ctx = isl_schedule_get_ctx(schedule);
	res = isl_calloc_type(ctx, isl_schedule_node);
	if (!res)
		goto error;

	/* From here on, "res" owns the arguments. */
	res->ref = 1;
	res->schedule = schedule;
	res->tree = tree;
	res->ancestors = ancestors;
	res->child_pos = isl_alloc_array(ctx, int, n_ancestor);
	if (!res->child_pos && n_ancestor)
		return isl_schedule_node_free(res);
	for (pos = n_ancestor; pos-- > 0; )
		res->child_pos[pos] = child_pos[pos];

	return res;
error:
	isl_schedule_free(schedule);
	isl_schedule_tree_free(tree);
	isl_schedule_tree_list_free(ancestors);
	return NULL;
}

/* Build a schedule from "domain" and return a node pointing
 * to the root of that schedule.
 */
__isl_give isl_schedule_node *isl_schedule_node_from_domain(
	__isl_take isl_union_set *domain)
{
	isl_schedule *sched = isl_schedule_from_domain(domain);
	isl_schedule_node *root = isl_schedule_get_root(sched);

	/* "root" keeps its own reference to the schedule. */
	isl_schedule_free(sched);

	return root;
}

/* Build a schedule whose tree is constructed from "extension" and
 * return a node pointing to the root of that schedule.
 * Return NULL if "extension" is NULL.
 */
__isl_give isl_schedule_node *isl_schedule_node_from_extension(
	__isl_take isl_union_map *extension)
{
	isl_ctx *ctx;
	isl_schedule *sched;
	isl_schedule_node *root;

	if (!extension)
		return NULL;

	/* Grab the context before "extension" is consumed. */
	ctx = isl_union_map_get_ctx(extension);
	sched = isl_schedule_from_schedule_tree(ctx,
			isl_schedule_tree_from_extension(extension));
	root = isl_schedule_get_root(sched);
	isl_schedule_free(sched);

	return root;
}

/* Return the isl_ctx of the schedule that "node" points into,
 * or NULL if "node" is NULL.
 */
isl_ctx *isl_schedule_node_get_ctx(__isl_keep isl_schedule_node *node)
{
	if (!node)
		return NULL;
	return isl_schedule_get_ctx(node->schedule);
}

/* Return the leaf of the schedule that "node" points into,
 * without creating a new reference.
 * Return NULL if "node" is NULL.
 */
__isl_keep isl_schedule_tree *isl_schedule_node_peek_leaf(
	__isl_keep isl_schedule_node *node)
{
	if (!node)
		return NULL;
	return isl_schedule_peek_leaf(node->schedule);
}

/* Return a new reference to the leaf of the schedule
 * that "node" points into.
 */
__isl_give isl_schedule_tree *isl_schedule_node_get_leaf(
	__isl_keep isl_schedule_node *node)
{
	isl_schedule_tree *leaf;

	leaf = isl_schedule_node_peek_leaf(node);
	return isl_schedule_tree_copy(leaf);
}

/* Return the type of the tree that "node" points to.
 * A NULL "node" results in isl_schedule_node_error.
 */
enum isl_schedule_node_type isl_schedule_node_get_type(
	__isl_keep isl_schedule_node *node)
{
	if (!node)
		return isl_schedule_node_error;
	return isl_schedule_tree_get_type(node->tree);
}

/* Return the type of the parent of "node".
 * isl_schedule_node_error is returned on error, which includes
 * the case where "node" does not have a parent.
 *
 * The parent is the last element in the list of ancestors.
 */
enum isl_schedule_node_type isl_schedule_node_get_parent_type(
	__isl_keep isl_schedule_node *node)
{
	isl_size n_ancestor;
	isl_bool parented;
	isl_schedule_tree *parent;
	enum isl_schedule_node_type parent_type;

	if (!node)
		return isl_schedule_node_error;

	parented = isl_schedule_node_has_parent(node);
	if (parented < 0)
		return isl_schedule_node_error;
	if (!parented)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"node has no parent", return isl_schedule_node_error);

	n_ancestor = isl_schedule_tree_list_n_schedule_tree(node->ancestors);
	if (n_ancestor < 0)
		return isl_schedule_node_error;

	parent = isl_schedule_tree_list_get_schedule_tree(node->ancestors,
							n_ancestor - 1);
	parent_type = isl_schedule_tree_get_type(parent);
	isl_schedule_tree_free(parent);

	return parent_type;
}

/* Return a new reference to the subtree that "node" points to,
 * or NULL if "node" is NULL.
 */
__isl_give isl_schedule_tree *isl_schedule_node_get_tree(
	__isl_keep isl_schedule_node *node)
{
	return node ? isl_schedule_tree_copy(node->tree) : NULL;
}

/* Return a new reference to the schedule that "node" points into,
 * or NULL if "node" is NULL.
 */
__isl_give isl_schedule *isl_schedule_node_get_schedule(
	__isl_keep isl_schedule_node *node)
{
	return node ? isl_schedule_copy(node->schedule) : NULL;
}

/* Return a fresh copy of "node".
 */
__isl_take isl_schedule_node *isl_schedule_node_dup(
	__isl_keep isl_schedule_node *node)
{
	if (!node)
		return NULL;

	return isl_schedule_node_alloc(isl_schedule_copy(node->schedule),
				isl_schedule_tree_copy(node->tree),
				isl_schedule_tree_list_copy(node->ancestors),
				node->child_pos);
}

/* Return a node equal to "node" that is referenced only once,
 * such that it can be modified in place.
 *
 * If "node" is shared, then the reference held by the caller is dropped
 * and a fresh duplicate is returned instead.
 */
__isl_give isl_schedule_node *isl_schedule_node_cow(
	__isl_take isl_schedule_node *node)
{
	if (!node)
		return NULL;

	if (node->ref != 1) {
		node->ref--;
		return isl_schedule_node_dup(node);
	}

	return node;
}

/* Increment the reference count of "node" and return "node" itself.
 * A NULL "node" is passed through unchanged.
 */
__isl_give isl_schedule_node *isl_schedule_node_copy(
	__isl_keep isl_schedule_node *node)
{
	if (node)
		node->ref++;

	return node;
}

/* Drop a reference to "node" and return NULL.
 * The node and everything it holds are only released
 * when the last reference is dropped.
 */
__isl_null isl_schedule_node *isl_schedule_node_free(
	__isl_take isl_schedule_node *node)
{
	if (!node)
		return NULL;

	node->ref--;
	if (node->ref > 0)
		return NULL;

	isl_schedule_tree_list_free(node->ancestors);
	free(node->child_pos);
	isl_schedule_tree_free(node->tree);
	isl_schedule_free(node->schedule);
	free(node);

	return NULL;
}

/* Do "node1" and "node2" refer to the same position
 * in the same schedule?
 *
 * Two nodes are considered equal if they point into the same
 * schedule object, have the same tree depth and were reached
 * through the same sequence of child positions.
 */
isl_bool isl_schedule_node_is_equal(__isl_keep isl_schedule_node *node1,
	__isl_keep isl_schedule_node *node2)
{
	isl_size depth1, depth2;
	int d;

	if (!node1 || !node2)
		return isl_bool_error;
	if (node1 == node2)
		return isl_bool_true;
	if (node1->schedule != node2->schedule)
		return isl_bool_false;

	depth1 = isl_schedule_node_get_tree_depth(node1);
	depth2 = isl_schedule_node_get_tree_depth(node2);
	if (depth1 < 0 || depth2 < 0)
		return isl_bool_error;
	if (depth1 != depth2)
		return isl_bool_false;

	/* Advance past the common prefix of child positions. */
	d = 0;
	while (d < depth1 && node1->child_pos[d] == node2->child_pos[d])
		++d;

	return isl_bool_ok(d == depth1);
}

/* Return the schedule depth of "node", i.e., the total number
 * of members of all band nodes among the ancestors of "node".
 *
 * Return isl_size_error on error.
 */
isl_size isl_schedule_node_get_schedule_depth(
	__isl_keep isl_schedule_node *node)
{
	isl_size n_ancestor;
	int pos;
	int depth = 0;

	if (!node)
		return isl_size_error;

	n_ancestor = isl_schedule_tree_list_n_schedule_tree(node->ancestors);
	if (n_ancestor < 0)
		return isl_size_error;

	for (pos = n_ancestor - 1; pos >= 0; --pos) {
		isl_schedule_tree *ancestor;
		isl_size n_member = 0;

		ancestor = isl_schedule_tree_list_get_schedule_tree(
						    node->ancestors, pos);
		if (!ancestor)
			return isl_size_error;
		/* Only band nodes contribute schedule dimensions. */
		if (ancestor->type == isl_schedule_node_band)
			n_member = isl_schedule_tree_band_n_member(ancestor);
		isl_schedule_tree_free(ancestor);
		if (n_member < 0)
			return isl_size_error;
		depth += n_member;
	}

	return depth;
}

/* Internal data structure shared by collect_filter_prefix and its helpers,
 * used by isl_schedule_node_get_prefix_schedule_union_pw_multi_aff
 * and the other functions in this file that collect filters and/or
 * prefix schedules from the ancestors of a node.
 *
 * "initialized" is set if the filter field has been initialized.
 * If "universe_domain" is not set, then the collected filter is intersected
 * with the domain of the root domain node.
 * "universe_filter" is set if we are only collecting the universes of filters.
 * "collect_prefix" is set if we are collecting prefixes.
 * "filter" collects all outer filters and is NULL until "initialized" is set.
 * "prefix" collects all outer band partial schedules (if "collect_prefix"
 * is set).  If it is used, then it is initialized by the caller
 * of collect_filter_prefix to a zero-dimensional function.
 */
struct isl_schedule_node_get_filter_prefix_data {
	int initialized;
	int universe_domain;
	int universe_filter;
	int collect_prefix;
	isl_union_set *filter;
	isl_multi_union_pw_aff *prefix;
};

/* Forward declaration: collect_filter_prefix and the expansion/extension
 * helpers defined below call each other.
 */
static isl_stat collect_filter_prefix(__isl_keep isl_schedule_tree_list *list,
	int n, struct isl_schedule_node_get_filter_prefix_data *data);

/* Combine the filter and prefix information in "data" with
 * the information that can be derived from the first "n" elements
 * in "list", given that these lie above the expansion tree root "tree".
 *
 * The outer information is gathered in a separate data structure
 * that lives in the contracted space and whose filter starts out
 * as the universe of the domain of the expansion.
 * The outer prefix is then pulled back over the contraction, while
 * the outer filter is either pulled back over the contraction
 * (if only universes are wanted) or mapped through the expansion,
 * before being combined with what "data" already holds.
 *
 * A failure of the outer traversal results in a NULL data->filter.
 */
static isl_stat collect_filter_prefix_expansion(
	__isl_take isl_schedule_tree *tree,
	__isl_keep isl_schedule_tree_list *list, int n,
	struct isl_schedule_node_get_filter_prefix_data *data)
{
	struct isl_schedule_node_get_filter_prefix_data outer;
	isl_union_pw_multi_aff *contraction;
	isl_union_map *expansion;
	isl_union_set *collected;

	contraction = isl_schedule_tree_expansion_get_contraction(tree);
	expansion = isl_schedule_tree_expansion_get_expansion(tree);

	outer.initialized = 1;
	outer.universe_domain = data->universe_domain;
	outer.universe_filter = data->universe_filter;
	outer.collect_prefix = data->collect_prefix;
	outer.filter = isl_union_map_domain(
		isl_union_map_universe(isl_union_map_copy(expansion)));
	if (data->collect_prefix) {
		isl_space *space;

		space = isl_union_set_get_space(outer.filter);
		space = isl_space_set_from_params(space);
		outer.prefix = isl_multi_union_pw_aff_zero(space);
	}

	if (collect_filter_prefix(list, n, &outer) < 0)
		outer.filter = isl_union_set_free(outer.filter);

	if (data->collect_prefix) {
		isl_multi_union_pw_aff *outer_prefix;

		outer_prefix =
		    isl_multi_union_pw_aff_pullback_union_pw_multi_aff(
				outer.prefix,
				isl_union_pw_multi_aff_copy(contraction));
		data->prefix = isl_multi_union_pw_aff_flat_range_product(
						outer_prefix, data->prefix);
	}

	collected = outer.filter;
	if (!data->universe_domain)
		collected = isl_union_set_apply(collected,
					isl_union_map_copy(expansion));
	else
		collected = isl_union_set_preimage_union_pw_multi_aff(
				collected,
				isl_union_pw_multi_aff_copy(contraction));

	if (data->initialized)
		collected = isl_union_set_intersect(collected, data->filter);
	data->filter = collected;
	data->initialized = 1;

	isl_union_pw_multi_aff_free(contraction);
	isl_union_map_free(expansion);
	isl_schedule_tree_free(tree);

	return isl_stat_ok;
}

/* Update the filter information in "data" based on the first "n"
 * elements in "list" and the extension tree root "tree", in case
 * data->universe_domain is set and data->collect_prefix is not.
 *
 * We collect the universe domain of the elements in "list" and
 * add it to the universe range of the extension (intersected
 * with the already collected filter, if any).
 *
 * On return, data->filter holds the result and data->initialized is set.
 * The latter matters when this function is reached recursively
 * through another extension ancestor: the calling instance only takes
 * the outer filter into account (and releases it) if the outer data
 * is marked as initialized.
 *
 * Return isl_stat_ok on success and isl_stat_error on error,
 * in which case data->filter is NULL.  In particular, a failure
 * of the outer traversal is propagated, even if it occurred before
 * the outer filter was initialized.
 */
static isl_stat collect_universe_domain_extension(
	__isl_take isl_schedule_tree *tree,
	__isl_keep isl_schedule_tree_list *list, int n,
	struct isl_schedule_node_get_filter_prefix_data *data)
{
	struct isl_schedule_node_get_filter_prefix_data data_outer;
	isl_union_map *extension;
	isl_union_set *filter;
	isl_stat r;

	data_outer.initialized = 0;
	data_outer.universe_domain = 1;
	data_outer.universe_filter = data->universe_filter;
	data_outer.collect_prefix = 0;
	data_outer.filter = NULL;
	data_outer.prefix = NULL;

	r = collect_filter_prefix(list, n, &data_outer);
	if (r < 0)
		data_outer.filter = isl_union_set_free(data_outer.filter);

	extension = isl_schedule_tree_extension_get_extension(tree);
	extension = isl_union_map_universe(extension);
	filter = isl_union_map_range(extension);
	if (r < 0)
		filter = isl_union_set_free(filter);
	else if (data_outer.initialized)
		filter = isl_union_set_union(filter, data_outer.filter);
	/* A NULL "filter" still consumes data->filter here. */
	if (data->initialized)
		filter = isl_union_set_intersect(filter, data->filter);

	data->filter = filter;
	data->initialized = 1;

	isl_schedule_tree_free(tree);

	return filter ? isl_stat_ok : isl_stat_error;
}

/* Try to initialize "data" from the tree node "tree".
 * This function is used as long as "data" has not been initialized.
 *
 * Return isl_stat_ok on success and isl_stat_error on error.
 *
 * Node types that carry no filter information (context, leaf, guard,
 * mark, sequence, set and bands without members) leave "data" untouched.
 * A filter node sets data->filter to its filter (or the universe thereof).
 * A domain node means that the root has been reached without
 * any information having been extracted, so data->filter is set
 * to the domain, or to its universe if data->universe_domain is set.
 * A band node with at least one member sets data->filter to the universe
 * of the domain of its partial schedule and, if data->collect_prefix
 * is set, replaces the zero-dimensional data->prefix by this
 * partial schedule (with the tuple identifier removed).
 * Expansion nodes are expected to have been handled by the caller and
 * extension nodes are not supported here.
 */
static isl_stat collect_filter_prefix_init(__isl_keep isl_schedule_tree *tree,
	struct isl_schedule_node_get_filter_prefix_data *data)
{
	isl_multi_union_pw_aff *partial;
	isl_size n_member;

	switch (isl_schedule_tree_get_type(tree)) {
	case isl_schedule_node_error:
		return isl_stat_error;
	case isl_schedule_node_expansion:
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_internal,
			"should be handled by caller", return isl_stat_error);
	case isl_schedule_node_extension:
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"cannot handle extension nodes", return isl_stat_error);
	case isl_schedule_node_context:
	case isl_schedule_node_leaf:
	case isl_schedule_node_guard:
	case isl_schedule_node_mark:
	case isl_schedule_node_sequence:
	case isl_schedule_node_set:
		return isl_stat_ok;
	case isl_schedule_node_domain:
		data->filter = isl_schedule_tree_domain_get_domain(tree);
		if (data->universe_domain)
			data->filter = isl_union_set_universe(data->filter);
		break;
	case isl_schedule_node_band:
		n_member = isl_schedule_tree_band_n_member(tree);
		if (n_member < 0)
			return isl_stat_error;
		if (n_member == 0)
			return isl_stat_ok;
		partial = isl_schedule_tree_band_get_partial_schedule(tree);
		if (data->collect_prefix) {
			isl_multi_union_pw_aff_free(data->prefix);
			partial = isl_multi_union_pw_aff_reset_tuple_id(
							partial, isl_dim_set);
			data->prefix = isl_multi_union_pw_aff_copy(partial);
		}
		data->filter = isl_union_set_universe(
				isl_multi_union_pw_aff_domain(partial));
		break;
	case isl_schedule_node_filter:
		data->filter = isl_schedule_tree_filter_get_filter(tree);
		if (data->universe_filter)
			data->filter = isl_union_set_universe(data->filter);
		break;
	}

	if (!data->filter)
		return isl_stat_error;
	if (data->collect_prefix && !data->prefix)
		return isl_stat_error;

	data->initialized = 1;

	return isl_stat_ok;
}

/* Refine the already initialized "data" using the tree node "tree".
 *
 * Return isl_stat_ok on success and isl_stat_error on error.
 *
 * A domain node is intersected with data->filter, unless
 * data->universe_domain is set.
 * A filter node (or its universe) is intersected with data->filter.
 * A band node with at least one member is prepended to data->prefix
 * if data->collect_prefix is set.
 * An extension node is only accepted if none of the elements
 * in data->filter are introduced by the extension.
 * Expansion nodes are expected to have been handled by the caller.
 * The remaining node types do not affect "data".
 */
static isl_stat collect_filter_prefix_update(__isl_keep isl_schedule_tree *tree,
	struct isl_schedule_node_get_filter_prefix_data *data)
{
	isl_multi_union_pw_aff *partial;
	isl_union_set *extra;
	isl_union_map *extended;
	isl_bool disjoint;
	isl_size n_member;

	switch (isl_schedule_tree_get_type(tree)) {
	case isl_schedule_node_error:
		return isl_stat_error;
	case isl_schedule_node_expansion:
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_internal,
			"should be handled by caller", return isl_stat_error);
	case isl_schedule_node_extension:
		extended = isl_schedule_tree_extension_get_extension(tree);
		extended = isl_union_map_intersect_range(extended,
					isl_union_set_copy(data->filter));
		disjoint = isl_union_map_is_empty(extended);
		isl_union_map_free(extended);
		if (disjoint < 0)
			return isl_stat_error;
		if (!disjoint)
			isl_die(isl_schedule_tree_get_ctx(tree),
				isl_error_invalid,
				"cannot handle extension nodes",
				return isl_stat_error);
		return isl_stat_ok;
	case isl_schedule_node_context:
	case isl_schedule_node_leaf:
	case isl_schedule_node_guard:
	case isl_schedule_node_mark:
	case isl_schedule_node_sequence:
	case isl_schedule_node_set:
		return isl_stat_ok;
	case isl_schedule_node_domain:
		if (data->universe_domain)
			return isl_stat_ok;
		extra = isl_schedule_tree_domain_get_domain(tree);
		data->filter = isl_union_set_intersect(data->filter, extra);
		return isl_stat_ok;
	case isl_schedule_node_band:
		n_member = isl_schedule_tree_band_n_member(tree);
		if (n_member < 0)
			return isl_stat_error;
		if (n_member == 0 || !data->collect_prefix)
			return isl_stat_ok;
		partial = isl_schedule_tree_band_get_partial_schedule(tree);
		data->prefix = isl_multi_union_pw_aff_flat_range_product(
						partial, data->prefix);
		return data->prefix ? isl_stat_ok : isl_stat_error;
	case isl_schedule_node_filter:
		extra = isl_schedule_tree_filter_get_filter(tree);
		if (data->universe_filter)
			extra = isl_union_set_universe(extra);
		data->filter = isl_union_set_intersect(data->filter, extra);
		return data->filter ? isl_stat_ok : isl_stat_error;
	}

	return isl_stat_ok;
}

/* Gather filter and/or prefix information from the first "n"
 * elements in "list", which represent ancestors of a node,
 * and store the results in "data".
 *
 * Return isl_stat_ok on success and isl_stat_error on error.
 *
 * The list is walked from its last element (the innermost ancestor)
 * to its first element (the outermost ancestor).
 * As long as "data" has not been initialized, each element is passed
 * to collect_filter_prefix_init; afterwards, the elements are passed
 * to collect_filter_prefix_update.
 * The walk is cut short in two cases, in which a helper takes care
 * of the remaining (outer) elements and of combining the results:
 * an expansion node is handed to collect_filter_prefix_expansion and,
 * if only the universe domain is being computed (without prefix),
 * an extension node is handed to collect_universe_domain_extension.
 * In all other cases, extension nodes are only supported if they
 * do not contribute any of the collected domain elements.
 */
static isl_stat collect_filter_prefix(__isl_keep isl_schedule_tree_list *list,
	int n, struct isl_schedule_node_get_filter_prefix_data *data)
{
	int pos;

	if (!list)
		return isl_stat_error;

	pos = n;
	while (pos-- > 0) {
		isl_schedule_tree *ancestor;
		enum isl_schedule_node_type type;
		isl_stat status;

		ancestor = isl_schedule_tree_list_get_schedule_tree(list, pos);
		if (!ancestor)
			return isl_stat_error;
		type = isl_schedule_tree_get_type(ancestor);
		/* The helpers take over ownership of "ancestor". */
		if (type == isl_schedule_node_expansion)
			return collect_filter_prefix_expansion(ancestor, list,
								pos, data);
		if (type == isl_schedule_node_extension &&
		    data->universe_domain && !data->collect_prefix)
			return collect_universe_domain_extension(ancestor,
							list, pos, data);
		if (data->initialized)
			status = collect_filter_prefix_update(ancestor, data);
		else
			status = collect_filter_prefix_init(ancestor, data);
		isl_schedule_tree_free(ancestor);
		if (status < 0)
			return isl_stat_error;
	}

	return isl_stat_ok;
}

/* Return the concatenation of the partial schedules of all outer band
 * nodes of "node" intersected with all outer filters
 * as an isl_multi_union_pw_aff.
 * None of the ancestors of "node" may be an extension node, unless
 * there is also a filter ancestor that filters out all the extended
 * domain elements.
 *
 * If "node" is pointing at the root of the schedule tree, then
 * there are no outer band nodes and a zero-dimensional function
 * is returned.
 *
 * Otherwise, the filters and partial schedules are gathered
 * by collect_filter_prefix, after which the domain of the combined
 * schedule is intersected with the combined filter.
 */
__isl_give isl_multi_union_pw_aff *
isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(
	__isl_keep isl_schedule_node *node)
{
	struct isl_schedule_node_get_filter_prefix_data data;
	isl_space *space;
	isl_size n_ancestor;

	if (!node)
		return NULL;

	space = isl_space_set_from_params(
				isl_schedule_get_space(node->schedule));
	if (node->tree == node->schedule->root)
		return isl_multi_union_pw_aff_zero(space);

	data.prefix = isl_multi_union_pw_aff_zero(space);
	data.filter = NULL;
	data.initialized = 0;
	data.collect_prefix = 1;
	data.universe_domain = 1;
	data.universe_filter = 0;

	n_ancestor = isl_schedule_tree_list_n_schedule_tree(node->ancestors);
	if (n_ancestor < 0 ||
	    collect_filter_prefix(node->ancestors, n_ancestor, &data) < 0)
		data.prefix = isl_multi_union_pw_aff_free(data.prefix);

	return isl_multi_union_pw_aff_intersect_domain(data.prefix,
							data.filter);
}

/* Return the concatenation of the partial schedules of all outer band
 * nodes of "node" intersected with all outer filters
 * as an isl_union_pw_multi_aff.
 * None of the ancestors of "node" may be an extension node, unless
 * there is also a filter ancestor that filters out all the extended
 * domain elements.
 *
 * If "node" is pointing at the root of the schedule tree, then
 * there are no domain elements reaching the current node, so
 * we return an empty result.
 *
 * Otherwise, the filters and partial schedules are gathered
 * by collect_filter_prefix, with the partial schedules represented
 * as an isl_multi_union_pw_aff.
 * A zero-dimensional isl_multi_union_pw_aff does not carry any domain
 * information, so in that case the result is constructed directly
 * as a zero-dimensional function on the collected filter.
 * In the other case, the isl_multi_union_pw_aff is converted to
 * an isl_union_pw_multi_aff, the domain of which is then intersected
 * with the collected filter.
 */
__isl_give isl_union_pw_multi_aff *
isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(
	__isl_keep isl_schedule_node *node)
{
	struct isl_schedule_node_get_filter_prefix_data data;
	isl_union_pw_multi_aff *res;
	isl_space *space;
	isl_size n_ancestor, n_dim;

	if (!node)
		return NULL;

	space = isl_schedule_get_space(node->schedule);
	if (node->tree == node->schedule->root)
		return isl_union_pw_multi_aff_empty(space);

	data.prefix = isl_multi_union_pw_aff_zero(
					isl_space_set_from_params(space));
	data.filter = NULL;
	data.initialized = 0;
	data.collect_prefix = 1;
	data.universe_domain = 1;
	data.universe_filter = 0;

	n_ancestor = isl_schedule_tree_list_n_schedule_tree(node->ancestors);
	if (n_ancestor < 0 ||
	    collect_filter_prefix(node->ancestors, n_ancestor, &data) < 0)
		data.prefix = isl_multi_union_pw_aff_free(data.prefix);

	n_dim = isl_multi_union_pw_aff_dim(data.prefix, isl_dim_set);
	if (n_dim < 0)
		data.prefix = isl_multi_union_pw_aff_free(data.prefix);

	if (!data.prefix || n_dim != 0) {
		res = isl_union_pw_multi_aff_from_multi_union_pw_aff(
								data.prefix);
		return isl_union_pw_multi_aff_intersect_domain(res,
								data.filter);
	}

	isl_multi_union_pw_aff_free(data.prefix);
	return isl_union_pw_multi_aff_from_domain(data.filter);
}

/* Return the concatenation of the partial schedules of all outer band
 * nodes of "node" intersected with all outer filters
 * as an isl_union_map.
 *
 * The result is obtained by converting the output of
 * isl_schedule_node_get_prefix_schedule_union_pw_multi_aff.
 */
__isl_give isl_union_map *isl_schedule_node_get_prefix_schedule_union_map(
	__isl_keep isl_schedule_node *node)
{
	return isl_union_map_from_union_pw_multi_aff(
	    isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(node));
}

/* Return the concatenation of the partial schedules of all outer band
 * nodes of "node" intersected with all outer domain constraints.
 * None of the ancestors of "node" may be an extension node, unless
 * there is also a filter ancestor that filters out all the extended
 * domain elements.
 *
 * If "node" is pointing at the root of the schedule tree, then
 * an empty relation is returned.
 *
 * Essentially, this function intersects the domain of the output
 * of isl_schedule_node_get_prefix_schedule_union_map with the output
 * of isl_schedule_node_get_domain, except that it only traverses
 * the ancestors of "node" once.
 * A zero-dimensional collected prefix carries no domain information,
 * so in that case the result is constructed from the collected
 * filter alone.
 */
__isl_give isl_union_map *isl_schedule_node_get_prefix_schedule_relation(
	__isl_keep isl_schedule_node *node)
{
	struct isl_schedule_node_get_filter_prefix_data data;
	isl_union_map *res;
	isl_space *space;
	isl_size n_ancestor, n_dim;

	if (!node)
		return NULL;

	space = isl_schedule_get_space(node->schedule);
	if (node->tree == node->schedule->root)
		return isl_union_map_empty(space);

	data.prefix = isl_multi_union_pw_aff_zero(
					isl_space_set_from_params(space));
	data.filter = NULL;
	data.initialized = 0;
	data.collect_prefix = 1;
	data.universe_domain = 0;
	data.universe_filter = 0;

	n_ancestor = isl_schedule_tree_list_n_schedule_tree(node->ancestors);
	if (n_ancestor < 0 ||
	    collect_filter_prefix(node->ancestors, n_ancestor, &data) < 0)
		data.prefix = isl_multi_union_pw_aff_free(data.prefix);

	n_dim = isl_multi_union_pw_aff_dim(data.prefix, isl_dim_set);
	if (n_dim < 0)
		data.prefix = isl_multi_union_pw_aff_free(data.prefix);

	if (!data.prefix || n_dim != 0) {
		res = isl_union_map_from_multi_union_pw_aff(data.prefix);
		return isl_union_map_intersect_domain(res, data.filter);
	}

	isl_multi_union_pw_aff_free(data.prefix);
	return isl_union_map_from_domain(data.filter);
}

/* Return the domain elements that reach "node".
 *
 * If "node" is pointing at the root of the schedule tree, then
 * there are no domain elements reaching the current node, so
 * we return an empty result.
 * None of the ancestors of "node" may be an extension node, unless
 * there is also a filter ancestor that filters out all the extended
 * domain elements.
 *
 * Otherwise, collect_filter_prefix gathers all filters reaching
 * the node, intersected with the root domain.
 */
__isl_give isl_union_set *isl_schedule_node_get_domain(
	__isl_keep isl_schedule_node *node)
{
	struct isl_schedule_node_get_filter_prefix_data data;
	isl_size n_ancestor;

	if (!node)
		return NULL;

	if (node->tree == node->schedule->root)
		return isl_union_set_empty(
				isl_schedule_get_space(node->schedule));

	data.filter = NULL;
	data.prefix = NULL;
	data.initialized = 0;
	data.collect_prefix = 0;
	data.universe_domain = 0;
	data.universe_filter = 0;

	n_ancestor = isl_schedule_tree_list_n_schedule_tree(node->ancestors);
	if (n_ancestor >= 0 &&
	    collect_filter_prefix(node->ancestors, n_ancestor, &data) >= 0)
		return data.filter;

	return isl_union_set_free(data.filter);
}

/* Return the union of universe sets of the domain elements that reach "node".
 *
 * If "node" is pointing at the root of the schedule tree, then
 * there are no domain elements reaching the current node, so
 * we return an empty result.
 *
 * Otherwise, collect_filter_prefix gathers the universes
 * of all filters reaching the node.
 */
__isl_give isl_union_set *isl_schedule_node_get_universe_domain(
	__isl_keep isl_schedule_node *node)
{
	struct isl_schedule_node_get_filter_prefix_data data;
	isl_size n_ancestor;

	if (!node)
		return NULL;

	if (node->tree == node->schedule->root)
		return isl_union_set_empty(
				isl_schedule_get_space(node->schedule));

	data.filter = NULL;
	data.prefix = NULL;
	data.initialized = 0;
	data.collect_prefix = 0;
	data.universe_domain = 1;
	data.universe_filter = 1;

	n_ancestor = isl_schedule_tree_list_n_schedule_tree(node->ancestors);
	if (n_ancestor >= 0 &&
	    collect_filter_prefix(node->ancestors, n_ancestor, &data) >= 0)
		return data.filter;

	return isl_union_set_free(data.filter);
}

/* Construct the subtree schedule of "node".
 *
 * isl_schedule_tree_get_subtree_schedule_union_map cannot deal with
 * trees that carry no schedule information, so first descend
 * to the first relevant descendant.  If that turns out to be the leaf,
 * then the result is a map with the universe domain reaching "node"
 * as domain.
 *
 * If the subtree rooted at "node" contains any expansion nodes, then
 * the returned subtree schedule is formulated in terms of the expanded
 * domains.
 * The subtree is not allowed to contain any extension nodes.
 */
__isl_give isl_union_map *isl_schedule_node_get_subtree_schedule_union_map(
	__isl_keep isl_schedule_node *node)
{
	isl_schedule_tree *subtree, *leaf;
	isl_union_map *schedule;

	subtree = isl_schedule_node_get_tree(node);
	leaf = isl_schedule_node_peek_leaf(node);
	subtree = isl_schedule_tree_first_schedule_descendant(subtree, leaf);
	if (!subtree)
		return NULL;

	if (subtree != leaf)
		schedule =
		    isl_schedule_tree_get_subtree_schedule_union_map(subtree);
	else
		schedule = isl_union_map_from_domain(
				isl_schedule_node_get_universe_domain(node));
	isl_schedule_tree_free(subtree);

	return schedule;
}

/* Return the depth of "node" in its schedule tree, i.e.,
 * the number of entries in its list of ancestors.
 * Return isl_size_error if "node" is NULL.
 */
isl_size isl_schedule_node_get_tree_depth(__isl_keep isl_schedule_node *node)
{
	return node ? isl_schedule_tree_list_n_schedule_tree(node->ancestors)
		    : isl_size_error;
}

/* Is "node" positioned somewhere below the root of its schedule tree?
 *
 * A node has a parent if and only if its tree depth is non-zero.
 * An error in determining the depth is propagated as isl_bool_error.
 */
isl_bool isl_schedule_node_has_parent(__isl_keep isl_schedule_node *node)
{
	isl_size n_ancestor = isl_schedule_node_get_tree_depth(node);

	if (n_ancestor < 0)
		return isl_bool_error;
	return isl_bool_ok(n_ancestor > 0);
}

/* Return the index of "node" in the list of children of its parent.
 *
 * The index is the last entry recorded in node->child_pos.
 * It is an error to call this function on a node without a parent.
 */
isl_size isl_schedule_node_get_child_position(
	__isl_keep isl_schedule_node *node)
{
	isl_size depth;
	isl_bool has_parent;

	if (!node)
		return isl_size_error;
	has_parent = isl_schedule_node_has_parent(node);
	if (has_parent < 0)
		return isl_size_error;
	if (has_parent == isl_bool_false)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"node has no parent", return isl_size_error);

	depth = isl_schedule_tree_list_n_schedule_tree(node->ancestors);
	if (depth < 0)
		return isl_size_error;

	return node->child_pos[depth - 1];
}

/* Is "node" preceded by another child of its parent (if any)?
 *
 * A node without a parent has no siblings, so the (false or error)
 * outcome of the parent check is returned directly in that case.
 * Otherwise, "node" has a previous sibling if its own child position
 * is not the first one.
 */
isl_bool isl_schedule_node_has_previous_sibling(
	__isl_keep isl_schedule_node *node)
{
	isl_size depth;
	isl_bool has_parent;

	if (!node)
		return isl_bool_error;
	has_parent = isl_schedule_node_has_parent(node);
	if (has_parent < 0 || has_parent == isl_bool_false)
		return has_parent;

	depth = isl_schedule_tree_list_n_schedule_tree(node->ancestors);
	if (depth < 0)
		return isl_bool_error;

	return isl_bool_ok(node->child_pos[depth - 1] >= 1);
}

/* Is "node" followed by another child of its parent (if any)?
 *
 * A node without a parent has no siblings, so the (false or error)
 * outcome of the parent check is returned directly in that case.
 * Otherwise, the number of children of the parent tree is compared
 * against the position following that of "node".
 */
isl_bool isl_schedule_node_has_next_sibling(__isl_keep isl_schedule_node *node)
{
	isl_size depth, n_sibling;
	isl_bool has_parent;
	isl_schedule_tree *parent;

	if (!node)
		return isl_bool_error;
	has_parent = isl_schedule_node_has_parent(node);
	if (has_parent < 0 || has_parent == isl_bool_false)
		return has_parent;

	depth = isl_schedule_tree_list_n_schedule_tree(node->ancestors);
	if (depth < 0)
		return isl_bool_error;

	/* The last ancestor is the parent; it is only needed for counting. */
	parent = isl_schedule_tree_list_get_schedule_tree(node->ancestors,
							    depth - 1);
	n_sibling = isl_schedule_tree_n_children(parent);
	isl_schedule_tree_free(parent);
	if (n_sibling < 0)
		return isl_bool_error;

	return isl_bool_ok(node->child_pos[depth - 1] + 1 < n_sibling);
}

/* Can the "node" pointer be moved down to a child?
 *
 * Every node except a leaf is considered to have at least one child,
 * even when the corresponding isl_schedule_tree has no children
 * of its own.
 */
isl_bool isl_schedule_node_has_children(__isl_keep isl_schedule_node *node)
{
	if (!node)
		return isl_bool_error;
	if (isl_schedule_tree_is_leaf(node->tree))
		return isl_bool_ok(0);
	return isl_bool_ok(1);
}

/* Return the number of children of "node".
 *
 * Every node except a leaf is considered to have at least one child,
 * even when the corresponding isl_schedule_tree has no children
 * of its own.  In particular,
 * - a leaf has zero children,
 * - a tree with explicit children has that many children, and
 * - any other tree has exactly one child, which is a leaf.
 */
isl_size isl_schedule_node_n_children(__isl_keep isl_schedule_node *node)
{
	isl_size n_child;

	if (!node)
		return isl_size_error;

	if (isl_schedule_tree_is_leaf(node->tree))
		return 0;

	n_child = isl_schedule_tree_n_children(node->tree);
	if (n_child < 0)
		return isl_size_error;

	return n_child > 0 ? n_child : 1;
}

/* Move the "node" pointer "generation" levels up in the schedule tree.
 *
 * Generation 0 is the node itself (returned unchanged) and
 * generation 1 is its parent.  "generation" may not be negative and
 * may not exceed the tree depth of "node".
 *
 * The requested ancestor becomes the current tree and it is removed
 * from the list of ancestors together with everything below it.
 */
__isl_give isl_schedule_node *isl_schedule_node_ancestor(
	__isl_take isl_schedule_node *node, int generation)
{
	isl_size depth;
	int first;
	isl_schedule_tree *target;

	if (!node)
		return NULL;
	if (generation == 0)
		return node;
	depth = isl_schedule_node_get_tree_depth(node);
	if (depth < 0)
		return isl_schedule_node_free(node);
	if (generation < 0 || generation > depth)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"generation out of bounds",
			return isl_schedule_node_free(node));
	node = isl_schedule_node_cow(node);
	if (!node)
		return NULL;

	first = depth - generation;
	target = isl_schedule_tree_list_get_schedule_tree(node->ancestors,
							    first);
	isl_schedule_tree_free(node->tree);
	node->tree = target;
	node->ancestors = isl_schedule_tree_list_drop(node->ancestors,
						      first, generation);
	if (node->tree && node->ancestors)
		return node;

	return isl_schedule_node_free(node);
}

/* Move the "node" pointer to the parent of the node it currently points to.
 *
 * It is an error to call this function on a node without a parent.
 * A failure to determine whether "node" has a parent is handled
 * explicitly instead of being mistaken for "has a parent".
 * On any error, "node" is freed.
 */
__isl_give isl_schedule_node *isl_schedule_node_parent(
	__isl_take isl_schedule_node *node)
{
	isl_bool has_parent;

	if (!node)
		return NULL;
	has_parent = isl_schedule_node_has_parent(node);
	if (has_parent < 0)
		return isl_schedule_node_free(node);
	if (!has_parent)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"node has no parent",
			return isl_schedule_node_free(node));
	return isl_schedule_node_ancestor(node, 1);
}

/* Move the "node" pointer all the way up to the root of its schedule tree,
 * i.e., to the ancestor whose generation equals the tree depth of "node".
 */
__isl_give isl_schedule_node *isl_schedule_node_root(
	__isl_take isl_schedule_node *node)
{
	isl_size depth;

	if (!node)
		return NULL;
	depth = isl_schedule_node_get_tree_depth(node);
	if (depth >= 0)
		return isl_schedule_node_ancestor(node, depth);

	return isl_schedule_node_free(node);
}

/* Move the "node" pointer down to the child at position "pos"
 * of the node it currently points to.
 *
 * The current tree is appended to the list of ancestors and
 * "pos" is recorded as the corresponding child position.
 * If the current tree has explicit children, then the child
 * at position "pos" becomes the new current tree.
 * Otherwise, the new current tree is the leaf of "node".
 * It is an error to call this function on a node without children.
 */
__isl_give isl_schedule_node *isl_schedule_node_child(
	__isl_take isl_schedule_node *node, int pos)
{
	isl_size depth;
	isl_schedule_tree *parent, *child;
	int *positions;

	node = isl_schedule_node_cow(node);
	if (!node)
		return NULL;
	if (!isl_schedule_node_has_children(node))
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"node has no children",
			return isl_schedule_node_free(node));

	depth = isl_schedule_tree_list_n_schedule_tree(node->ancestors);
	if (depth < 0)
		return isl_schedule_node_free(node);

	/* Make room for one more child position and store "pos" in it. */
	positions = isl_realloc_array(isl_schedule_node_get_ctx(node),
					node->child_pos, int, depth + 1);
	if (!positions)
		return isl_schedule_node_free(node);
	positions[depth] = pos;
	node->child_pos = positions;

	parent = node->tree;
	node->ancestors = isl_schedule_tree_list_add(node->ancestors,
					isl_schedule_tree_copy(parent));
	if (isl_schedule_tree_has_children(parent))
		child = isl_schedule_tree_get_child(parent, pos);
	else
		child = isl_schedule_node_get_leaf(node);
	isl_schedule_tree_free(parent);
	node->tree = child;

	if (node->tree && node->ancestors)
		return node;

	return isl_schedule_node_free(node);
}

/* Move the "node" pointer down to the child at position 0
 * of the node it currently points to.
 */
__isl_give isl_schedule_node *isl_schedule_node_first_child(
	__isl_take isl_schedule_node *node)
{
	const int first = 0;

	return isl_schedule_node_child(node, first);
}

/* Move the "node" pointer sideways to the child of its parent
 * that sits at the child position just before the current one.
 *
 * The recorded child position is decremented and the tree
 * at the new position among the children of the parent
 * becomes the current tree.
 * It is an error to call this function on a node without
 * a previous sibling.
 */
__isl_give isl_schedule_node *isl_schedule_node_previous_sibling(
	__isl_take isl_schedule_node *node)
{
	isl_size depth;
	int pos;
	isl_schedule_tree *parent, *sibling;

	node = isl_schedule_node_cow(node);
	if (!node)
		return NULL;
	if (!isl_schedule_node_has_previous_sibling(node))
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"node has no previous sibling",
			return isl_schedule_node_free(node));

	depth = isl_schedule_tree_list_n_schedule_tree(node->ancestors);
	if (depth < 0)
		return isl_schedule_node_free(node);
	parent = isl_schedule_tree_list_get_schedule_tree(node->ancestors,
							    depth - 1);
	if (!parent)
		return isl_schedule_node_free(node);

	pos = --node->child_pos[depth - 1];
	sibling = isl_schedule_tree_list_get_schedule_tree(parent->children,
							     pos);
	isl_schedule_tree_free(parent);
	if (!sibling)
		return isl_schedule_node_free(node);

	isl_schedule_tree_free(node->tree);
	node->tree = sibling;

	return node;
}

/* Move the "node" pointer sideways to the child of its parent
 * that sits at the child position just after the current one.
 *
 * The recorded child position is incremented and the tree
 * at the new position among the children of the parent
 * becomes the current tree.
 * It is an error to call this function on a node without
 * a next sibling.
 */
__isl_give isl_schedule_node *isl_schedule_node_next_sibling(
	__isl_take isl_schedule_node *node)
{
	isl_size depth;
	int pos;
	isl_schedule_tree *parent, *sibling;

	node = isl_schedule_node_cow(node);
	if (!node)
		return NULL;
	if (!isl_schedule_node_has_next_sibling(node))
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"node has no next sibling",
			return isl_schedule_node_free(node));

	depth = isl_schedule_tree_list_n_schedule_tree(node->ancestors);
	if (depth < 0)
		return isl_schedule_node_free(node);
	parent = isl_schedule_tree_list_get_schedule_tree(node->ancestors,
							    depth - 1);
	if (!parent)
		return isl_schedule_node_free(node);

	pos = ++node->child_pos[depth - 1];
	sibling = isl_schedule_tree_list_get_schedule_tree(parent->children,
							     pos);
	isl_schedule_tree_free(parent);
	if (!sibling)
		return isl_schedule_node_free(node);

	isl_schedule_tree_free(node->tree);
	node->tree = sibling;

	return node;
}

/* Return a new node pointer to the child at position "pos" of "node",
 * leaving "node" itself untouched.
 */
__isl_give isl_schedule_node *isl_schedule_node_get_child(
	__isl_keep isl_schedule_node *node, int pos)
{
	isl_schedule_node *child;

	child = isl_schedule_node_copy(node);
	child = isl_schedule_node_child(child, pos);

	return child;
}

/* Traverse the descendants of "node" in depth-first order, including
 * "node" itself.  Call "enter" whenever a node is entered and "leave"
 * whenever a node is left.  The callback "enter" is responsible
 * for moving to the deepest initial subtree of its argument that
 * should be traversed.
 *
 * "user" is passed unchanged to both callbacks.
 * The tree depth of the initial "node" is recorded up front and
 * the traversal ends as soon as the current position is
 * no longer strictly deeper than that.
 * Return the node at which the traversal ended.
 * If the depth of the current node cannot be determined
 * (in particular because a callback returned NULL),
 * then the result of isl_schedule_node_free is returned instead.
 */
static __isl_give isl_schedule_node *traverse(
	__isl_take isl_schedule_node *node,
	__isl_give isl_schedule_node *(*enter)(
		__isl_take isl_schedule_node *node, void *user),
	__isl_give isl_schedule_node *(*leave)(
		__isl_take isl_schedule_node *node, void *user),
	void *user)
{
	isl_size depth;
	isl_size node_depth;

	depth = isl_schedule_node_get_tree_depth(node);
	if (depth < 0)
		return isl_schedule_node_free(node);

	do {
		/* Descend as far as "enter" wants to go and
		 * leave the node that was reached.
		 */
		node = enter(node, user);
		node = leave(node, user);
		/* Climb back up, leaving each ancestor on the way,
		 * until a node with a next sibling is found or
		 * the starting depth is reached.
		 * A failed depth lookup also ends this loop since
		 * a negative "node_depth" is never greater than "depth".
		 */
		while ((node_depth = isl_schedule_node_get_tree_depth(node)) >
				depth &&
				!isl_schedule_node_has_next_sibling(node)) {
			node = isl_schedule_node_parent(node);
			node = leave(node, user);
		}
		if (node_depth < 0)
			return isl_schedule_node_free(node);
		/* Still below the starting node: continue with
		 * the next sibling in the following iteration.
		 */
		if (node_depth > depth)
			node = isl_schedule_node_next_sibling(node);
	} while (node_depth > depth);

	return node;
}

/* Internal data structure for isl_schedule_node_foreach_descendant_top_down.
 *
 * "fn" is the user-specified callback function.  It is called
 * by preorder_enter on each visited node and its result determines
 * whether the traversal is aborted (negative), whether the subtree
 * is skipped (isl_bool_false) or whether the traversal descends further.
 * "user" is the user-specified argument for the callback.
 */
struct isl_schedule_node_preorder_data {
	isl_bool (*fn)(__isl_keep isl_schedule_node *node, void *user);
	void *user;
};

/* Callback for "traverse" that enters a node during a preorder visit and
 * moves to the deepest initial subtree that should be traversed.
 *
 * The user callback is called on the current node.
 * A negative result aborts the traversal: the node is freed.
 * A zero result means the subtree rooted at the current node
 * is skipped, so the current node is returned.
 * Otherwise, move down to the first child and repeat,
 * until a node without children is reached.
 */
static __isl_give isl_schedule_node *preorder_enter(
	__isl_take isl_schedule_node *node, void *user)
{
	struct isl_schedule_node_preorder_data *data = user;

	if (!node)
		return NULL;

	for (;;) {
		isl_bool descend;

		descend = data->fn(node, data->user);
		if (descend < 0)
			return isl_schedule_node_free(node);
		if (descend == isl_bool_false)
			return node;
		if (!isl_schedule_node_has_children(node))
			return node;
		node = isl_schedule_node_first_child(node);
		if (!node)
			return NULL;
	}
}

/* Callback for "traverse" that leaves a node during a preorder visit.
 *
 * The node was already visited when it was entered,
 * so it is handed back unchanged and "user" is not used.
 */
static __isl_give isl_schedule_node *preorder_leave(
	__isl_take isl_schedule_node *node, void *user)
{
	(void) user;

	return node;
}

/* Traverse the descendants of "node" (including the node itself)
 * in depth first preorder.
 *
 * If "fn" returns isl_bool_error on any of the nodes,
 * then the traversal is aborted.
 * If "fn" returns isl_bool_false on any of the nodes, then the subtree rooted
 * at that node is skipped.
 *
 * The traversal operates on a copy of "node", which is released
 * again before returning.  Whether the traversal succeeded is
 * derived from the resulting pointer before that pointer is freed,
 * so that the pointer is never inspected after its release.
 *
 * Return isl_stat_ok on success and isl_stat_error on failure.
 */
isl_stat isl_schedule_node_foreach_descendant_top_down(
	__isl_keep isl_schedule_node *node,
	isl_bool (*fn)(__isl_keep isl_schedule_node *node, void *user),
	void *user)
{
	struct isl_schedule_node_preorder_data data = { fn, user };
	isl_stat status;

	node = isl_schedule_node_copy(node);
	node = traverse(node, &preorder_enter, &preorder_leave, &data);
	status = node ? isl_stat_ok : isl_stat_error;
	isl_schedule_node_free(node);

	return status;
}

/* Internal data structure for isl_schedule_node_every_descendant.
 *
 * "test" is the user-specified callback function.
 * "user" is the user-specified callback function argument.
 *
 * "failed" is initialized to 0 and set to 1 if "test" fails
 * on any node.  It allows isl_schedule_node_every_descendant
 * to tell a failed test apart from a genuine error, since call_every
 * aborts the traversal in both cases.
 *
 * NOTE(review): despite the "isl_union_map" in its name, this structure
 * is only seen being used for schedule nodes here.
 */
struct isl_union_map_every_data {
	isl_bool (*test)(__isl_keep isl_schedule_node *node, void *user);
	void *user;
	int failed;
};

/* isl_schedule_node_foreach_descendant_top_down callback
 * that evaluates data->test on "node".
 *
 * If the test succeeds, the traversal continues into the subtree.
 * If the test fails, data->failed is set and the traversal is aborted
 * by returning isl_bool_error.
 * An error in the test itself also aborts the traversal,
 * but leaves data->failed untouched.
 */
static isl_bool call_every(__isl_keep isl_schedule_node *node, void *user)
{
	struct isl_union_map_every_data *data = user;
	isl_bool ok = data->test(node, data->user);

	if (ok > 0)
		return isl_bool_true;
	if (ok == 0)
		data->failed = 1;

	return isl_bool_error;
}

/* Does "test" succeed on every descendant of "node" (including "node" itself)?
 *
 * The descendants are visited top-down through call_every,
 * which aborts the traversal on the first node where "test" does not
 * succeed.  An aborted traversal means isl_bool_false if "test"
 * returned false on some node and isl_bool_error otherwise.
 */
isl_bool isl_schedule_node_every_descendant(__isl_keep isl_schedule_node *node,
	isl_bool (*test)(__isl_keep isl_schedule_node *node, void *user),
	void *user)
{
	struct isl_union_map_every_data data = { test, user, 0 };

	if (isl_schedule_node_foreach_descendant_top_down(node, &call_every,
							    &data) >= 0)
		return isl_bool_true;

	return data.failed ? isl_bool_false : isl_bool_error;
}

/* Internal data structure for isl_schedule_node_map_descendant_bottom_up.
 *
 * "fn" is the user-specified callback function.  It is called
 * by postorder_leave on each node that is left and the node
 * it returns is the one the traversal continues from.
 * "user" is the user-specified argument for the callback.
 */
struct isl_schedule_node_postorder_data {
	__isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node,
		void *user);
	void *user;
};

/* Callback for "traverse" that enters a node during a postorder visit and
 * moves to the deepest initial subtree that should be traversed.
 *
 * In a postorder visit, nothing needs to happen on the way down,
 * so simply keep moving to the first child until a node
 * without children is reached (or an error occurs).
 * "user" is not used.
 */
static __isl_give isl_schedule_node *postorder_enter(
	__isl_take isl_schedule_node *node, void *user)
{
	for (;;) {
		if (!node || !isl_schedule_node_has_children(node))
			return node;
		node = isl_schedule_node_first_child(node);
	}
}

/* Callback for "traverse" that leaves a node during a postorder visit.
 *
 * In a postorder visit, this is the point where the user callback
 * stored in the isl_schedule_node_postorder_data "user" gets called.
 * The node returned by that callback is passed on to the caller.
 */
static __isl_give isl_schedule_node *postorder_leave(
	__isl_take isl_schedule_node *node, void *user)
{
	struct isl_schedule_node_postorder_data *callback = user;
	isl_schedule_node *visited;

	visited = callback->fn(node, callback->user);

	return visited;
}

/* Traverse the descendants of "node" (including the node itself)
 * in depth first postorder, calling "fn" on each of them and
 * allowing "fn" to modify the visited node.
 * The traversal continues from the node returned by "fn".
 * It is up to the user to make sure that this does not result
 * in an infinite loop.  The safest choice is to always return a pointer
 * to the same position (same ancestors and child positions) as the input node.
 */
__isl_give isl_schedule_node *isl_schedule_node_map_descendant_bottom_up(
	__isl_take isl_schedule_node *node,
	__isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node,
		void *user), void *user)
{
	struct isl_schedule_node_postorder_data data;

	data.fn = fn;
	data.user = user;

	return traverse(node, &postorder_enter, &postorder_leave, &data);
}

/* Call "fn" on each ancestor of "node", starting at the root and
 * ending at (and including) the parent of "node".
 *
 * Each ancestor is obtained by moving a fresh copy of "node" up
 * by the appropriate number of generations; the copy is freed
 * again after the call.
 *
 * If "fn" returns a negative value on any of the nodes,
 * then the traversal is aborted.
 *
 * Return isl_stat_ok on success and isl_stat_error on failure.
 */
isl_stat isl_schedule_node_foreach_ancestor_top_down(
	__isl_keep isl_schedule_node *node,
	isl_stat (*fn)(__isl_keep isl_schedule_node *node, void *user),
	void *user)
{
	isl_size depth;
	int generation;

	depth = isl_schedule_node_get_tree_depth(node);
	if (depth < 0)
		return isl_stat_error;

	/* The root is "depth" generations up, the parent only one. */
	for (generation = depth; generation > 0; --generation) {
		isl_schedule_node *ancestor;
		isl_stat status;

		ancestor = isl_schedule_node_ancestor(
				isl_schedule_node_copy(node), generation);
		status = fn(ancestor, user);
		isl_schedule_node_free(ancestor);
		if (status < 0)
			return isl_stat_error;
	}

	return isl_stat_ok;
}

/* Does the subtree rooted at "node" contain any anchored node,
 * i.e., a node that references the outer band nodes?
 *
 * The question is forwarded to the tree that "node" points to.
 */
isl_bool isl_schedule_node_is_subtree_anchored(
	__isl_keep isl_schedule_node *node)
{
	isl_schedule_tree *tree;

	if (!node)
		return isl_bool_error;

	tree = node->tree;
	return isl_schedule_tree_is_subtree_anchored(tree);
}

/* Return the number of members of the band node "node",
 * as reported by the tree that "node" points to.
 * Return isl_size_error if "node" is NULL.
 */
isl_size isl_schedule_node_band_n_member(__isl_keep isl_schedule_node *node)
{
	isl_schedule_tree *tree;

	if (!node)
		return isl_size_error;

	tree = node->tree;
	return isl_schedule_tree_band_n_member(tree);
}

/* Has the member at position "pos" of the band node "node"
 * been marked coincident?
 *
 * The question is forwarded to the tree that "node" points to.
 */
isl_bool isl_schedule_node_band_member_get_coincident(
	__isl_keep isl_schedule_node *node, int pos)
{
	isl_schedule_tree *tree;

	if (!node)
		return isl_bool_error;

	tree = node->tree;
	return isl_schedule_tree_band_member_get_coincident(tree, pos);
}

/* Mark the band member at position "pos" of the band node "node"
 * as being coincident or not according to "coincident".
 *
 * If the member already has the requested marking, then "node"
 * is returned unchanged.  If the current marking cannot be determined,
 * then "node" is freed and the failure is reported right away.
 * Otherwise, a modified copy of the tree is grafted into "node".
 */
__isl_give isl_schedule_node *isl_schedule_node_band_member_set_coincident(
	__isl_take isl_schedule_node *node, int pos, int coincident)
{
	isl_bool is_coincident;
	isl_schedule_tree *tree;

	if (!node)
		return NULL;
	is_coincident = isl_schedule_node_band_member_get_coincident(node, pos);
	if (is_coincident < 0)
		return isl_schedule_node_free(node);
	if (is_coincident == coincident)
		return node;

	tree = isl_schedule_tree_copy(node->tree);
	tree = isl_schedule_tree_band_member_set_coincident(tree, pos,
							    coincident);
	node = isl_schedule_node_graft_tree(node, tree);

	return node;
}

/* Has the band node "node" been marked permutable?
 *
 * The question is forwarded to the tree that "node" points to.
 */
isl_bool isl_schedule_node_band_get_permutable(
	__isl_keep isl_schedule_node *node)
{
	isl_schedule_tree *tree;

	if (!node)
		return isl_bool_error;

	tree = node->tree;
	return isl_schedule_tree_band_get_permutable(tree);
}

/* Mark the band node "node" permutable or not according to "permutable".
 *
 * If the band already has the requested marking, then "node"
 * is returned unchanged.  If the current marking cannot be determined,
 * then "node" is freed and the failure is reported right away.
 * Otherwise, a modified copy of the tree is grafted into "node".
 */
__isl_give isl_schedule_node *isl_schedule_node_band_set_permutable(
	__isl_take isl_schedule_node *node, int permutable)
{
	isl_bool is_permutable;
	isl_schedule_tree *tree;

	if (!node)
		return NULL;
	is_permutable = isl_schedule_node_band_get_permutable(node);
	if (is_permutable < 0)
		return isl_schedule_node_free(node);
	if (is_permutable == permutable)
		return node;

	tree = isl_schedule_tree_copy(node->tree);
	tree = isl_schedule_tree_band_set_permutable(tree, permutable);
	node = isl_schedule_node_graft_tree(node, tree);

	return node;
}

/* Return the schedule space of the band node "node",
 * as reported by the tree that "node" points to.
 * Return NULL if "node" is NULL.
 */
__isl_give isl_space *isl_schedule_node_band_get_space(
	__isl_keep isl_schedule_node *node)
{
	return node ? isl_schedule_tree_band_get_space(node->tree) : NULL;
}

/* Return the schedule of the band node "node" in isolation,
 * as reported by the tree that "node" points to.
 * Return NULL if "node" is NULL.
 */
__isl_give isl_multi_union_pw_aff *isl_schedule_node_band_get_partial_schedule(
	__isl_keep isl_schedule_node *node)
{
	isl_schedule_tree *tree;

	if (!node)
		return NULL;

	tree = node->tree;
	return isl_schedule_tree_band_get_partial_schedule(tree);
}

/* Return the schedule of the band node "node" in isolation,
 * represented as an isl_union_map.
 *
 * A band with at least one member is handled by extracting
 * its isl_multi_union_pw_aff representation and converting
 * that to an isl_union_map.
 * A band without any members results in a universe map that has
 * the universe of the domain elements reaching the node as domain.
 * It is an error to call this function on anything but a band node.
 */
__isl_give isl_union_map *isl_schedule_node_band_get_partial_schedule_union_map(
	__isl_keep isl_schedule_node *node)
{
	isl_size n_member;

	if (!node)
		return NULL;

	if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"not a band node", return NULL);
	n_member = isl_schedule_node_band_n_member(node);
	if (n_member < 0)
		return NULL;
	if (n_member > 0)
		return isl_union_map_from_multi_union_pw_aff(
			isl_schedule_node_band_get_partial_schedule(node));

	return isl_union_map_from_domain(
			isl_schedule_node_get_universe_domain(node));
}

/* Return the loop AST generation type of the member at position "pos"
 * of the band node "node", as reported by the tree that "node" points to.
 * Return isl_ast_loop_error if "node" is NULL.
 */
enum isl_ast_loop_type isl_schedule_node_band_member_get_ast_loop_type(
	__isl_keep isl_schedule_node *node, int pos)
{
	isl_schedule_tree *tree;

	if (!node)
		return isl_ast_loop_error;

	tree = node->tree;
	return isl_schedule_tree_band_member_get_ast_loop_type(tree, pos);
}

/* Change the loop AST generation type of the member at position "pos"
 * of the band node "node" to "type".
 *
 * The change is applied to a copy of the tree that "node" points to,
 * which is then grafted back into "node".
 */
__isl_give isl_schedule_node *isl_schedule_node_band_member_set_ast_loop_type(
	__isl_take isl_schedule_node *node, int pos,
	enum isl_ast_loop_type type)
{
	isl_schedule_tree *updated;

	if (!node)
		return NULL;

	updated = isl_schedule_tree_band_member_set_ast_loop_type(
			isl_schedule_tree_copy(node->tree), pos, type);

	return isl_schedule_node_graft_tree(node, updated);
}

/* Return the loop AST generation type that applies to the isolated part
 * for the member at position "pos" of the band node "node",
 * as reported by the tree that "node" points to.
 * Return isl_ast_loop_error if "node" is NULL.
 */
enum isl_ast_loop_type isl_schedule_node_band_member_get_isolate_ast_loop_type(
	__isl_keep isl_schedule_node *node, int pos)
{
	isl_schedule_tree *tree;

	if (!node)
		return isl_ast_loop_error;

	tree = node->tree;
	return isl_schedule_tree_band_member_get_isolate_ast_loop_type(tree,
									pos);
}

/* Change the loop AST generation type that applies to the isolated part
 * for the member at position "pos" of the band node "node" to "type".
 *
 * The change is applied to a copy of the tree that "node" points to,
 * which is then grafted back into "node".
 */
__isl_give isl_schedule_node *
isl_schedule_node_band_member_set_isolate_ast_loop_type(
	__isl_take isl_schedule_node *node, int pos,
	enum isl_ast_loop_type type)
{
	isl_schedule_tree *updated;

	if (!node)
		return NULL;

	updated = isl_schedule_tree_band_member_set_isolate_ast_loop_type(
			isl_schedule_tree_copy(node->tree), pos, type);

	return isl_schedule_node_graft_tree(node, updated);
}

/* Return the AST build options associated to the band node "node",
 * as reported by the tree that "node" points to.
 * Return NULL if "node" is NULL.
 */
__isl_give isl_union_set *isl_schedule_node_band_get_ast_build_options(
	__isl_keep isl_schedule_node *node)
{
	isl_schedule_tree *tree;

	if (!node)
		return NULL;

	tree = node->tree;
	return isl_schedule_tree_band_get_ast_build_options(tree);
}

/* Replace the AST build options associated to the band node "node"
 * by "options".
 *
 * The replacement is applied to a copy of the tree that "node"
 * points to, which is then grafted back into "node".
 * If either argument is NULL, then both are freed.
 */
__isl_give isl_schedule_node *isl_schedule_node_band_set_ast_build_options(
	__isl_take isl_schedule_node *node, __isl_take isl_union_set *options)
{
	isl_schedule_tree *updated;

	if (node && options) {
		updated = isl_schedule_tree_copy(node->tree);
		updated = isl_schedule_tree_band_set_ast_build_options(updated,
								options);
		return isl_schedule_node_graft_tree(node, updated);
	}

	isl_schedule_node_free(node);
	isl_union_set_free(options);
	return NULL;
}

/* Return the "isolate" option associated to the band node "node".
 *
 * The schedule depth of "node" is passed along to the tree
 * that "node" points to.
 * Return NULL if this depth cannot be determined.
 */
__isl_give isl_set *isl_schedule_node_band_get_ast_isolate_option(
	__isl_keep isl_schedule_node *node)
{
	isl_size schedule_depth;

	schedule_depth = isl_schedule_node_get_schedule_depth(node);
	if (schedule_depth >= 0)
		return isl_schedule_tree_band_get_ast_isolate_option(
						node->tree, schedule_depth);

	return NULL;
}

/* Make sure that that spaces of "node" and "mv" are the same.
 * Return -1 on error, reporting the error to the user.
 */
static int check_space_multi_val(__isl_keep isl_schedule_node *node,
	__isl_keep isl_multi_val *mv)
{
	isl_space *node_space, *mv_space;
	int equal;

	node_space = isl_schedule_node_band_get_space(node);
	mv_space = isl_multi_val_get_space(mv);
	equal = isl_space_tuple_is_equal(node_space, isl_dim_set,
					mv_space, isl_dim_set);
	isl_space_free(mv_space);
	isl_space_free(node_space);
	if (equal < 0)
		return -1;
	if (!equal)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"spaces don't match", return -1);

	return 0;
}

/* Scale the partial schedule of the band node "node"
 * by the factors in "mv".
 *
 * The space of "mv" needs to match that of the band and
 * the subtree rooted at "node" is not allowed to be anchored.
 * On error, both "node" and "mv" are freed.
 */
__isl_give isl_schedule_node *isl_schedule_node_band_scale(
	__isl_take isl_schedule_node *node, __isl_take isl_multi_val *mv)
{
	isl_bool anchored;
	isl_schedule_tree *scaled;

	if (!node || !mv || check_space_multi_val(node, mv) < 0)
		goto cleanup;
	anchored = isl_schedule_node_is_subtree_anchored(node);
	if (anchored < 0)
		goto cleanup;
	if (anchored)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"cannot scale band node with anchored subtree",
			goto cleanup);

	scaled = isl_schedule_node_get_tree(node);
	scaled = isl_schedule_tree_band_scale(scaled, mv);
	return isl_schedule_node_graft_tree(node, scaled);
cleanup:
	isl_multi_val_free(mv);
	isl_schedule_node_free(node);
	return NULL;
}

/* Scale down the partial schedule of the band node "node"
 * by the factors in "mv".
 *
 * The space of "mv" needs to match that of the band and
 * the subtree rooted at "node" is not allowed to be anchored.
 * On error, both "node" and "mv" are freed.
 */
__isl_give isl_schedule_node *isl_schedule_node_band_scale_down(
	__isl_take isl_schedule_node *node, __isl_take isl_multi_val *mv)
{
	isl_bool anchored;
	isl_schedule_tree *scaled;

	if (!node || !mv || check_space_multi_val(node, mv) < 0)
		goto cleanup;
	anchored = isl_schedule_node_is_subtree_anchored(node);
	if (anchored < 0)
		goto cleanup;
	if (anchored)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"cannot scale down band node with anchored subtree",
			goto cleanup);

	scaled = isl_schedule_node_get_tree(node);
	scaled = isl_schedule_tree_band_scale_down(scaled, mv);
	return isl_schedule_node_graft_tree(node, scaled);
cleanup:
	isl_multi_val_free(mv);
	isl_schedule_node_free(node);
	return NULL;
}

/* Replace the partial schedule of the band node "node"
 * by its remainder modulo the factors in "mv".
 *
 * The space of "mv" needs to match that of the band and
 * the subtree rooted at "node" is not allowed to be anchored.
 * On error, both "node" and "mv" are freed.
 */
__isl_give isl_schedule_node *isl_schedule_node_band_mod(
	__isl_take isl_schedule_node *node, __isl_take isl_multi_val *mv)
{
	isl_bool anchored;
	isl_schedule_tree *reduced;

	if (!node || !mv || check_space_multi_val(node, mv) < 0)
		goto cleanup;
	anchored = isl_schedule_node_is_subtree_anchored(node);
	if (anchored < 0)
		goto cleanup;
	if (anchored)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"cannot perform mod on band node with anchored subtree",
			goto cleanup);

	reduced = isl_schedule_node_get_tree(node);
	reduced = isl_schedule_tree_band_mod(reduced, mv);
	return isl_schedule_node_graft_tree(node, reduced);
cleanup:
	isl_multi_val_free(mv);
	isl_schedule_node_free(node);
	return NULL;
}

/* Make sure that the spaces of "node" and "mupa" are the same.
 *
 * In particular, the set tuple of the schedule space of the band node
 * "node" needs to be equal to the set tuple of the space of "mupa".
 *
 * Return isl_stat_error on error, reporting the error to the user.
 */
static isl_stat check_space_multi_union_pw_aff(
	__isl_keep isl_schedule_node *node,
	__isl_keep isl_multi_union_pw_aff *mupa)
{
	isl_bool match;
	isl_space *band_space = isl_schedule_node_band_get_space(node);
	isl_space *shift_space = isl_multi_union_pw_aff_get_space(mupa);

	match = isl_space_tuple_is_equal(band_space, isl_dim_set,
					shift_space, isl_dim_set);
	isl_space_free(shift_space);
	isl_space_free(band_space);

	if (match < 0)
		return isl_stat_error;
	if (match == 0)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"spaces don't match", return isl_stat_error);

	return isl_stat_ok;
}

/* Shift the partial schedule of the band node "node" by "shift".
 *
 * Both arguments are consumed.  NULL is returned if either argument
 * is NULL, if the spaces of "node" and "shift" do not match or
 * if the subtree rooted at "node" is anchored.
 */
__isl_give isl_schedule_node *isl_schedule_node_band_shift(
	__isl_take isl_schedule_node *node,
	__isl_take isl_multi_union_pw_aff *shift)
{
	isl_schedule_tree *band_tree;
	int is_anchored;

	if (!node || !shift ||
	    check_space_multi_union_pw_aff(node, shift) < 0)
		goto error;

	is_anchored = isl_schedule_node_is_subtree_anchored(node);
	if (is_anchored < 0)
		goto error;
	if (is_anchored)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"cannot shift band node with anchored subtree",
			goto error);

	band_tree = isl_schedule_tree_band_shift(
				isl_schedule_node_get_tree(node), shift);
	return isl_schedule_node_graft_tree(node, band_tree);
error:
	isl_multi_union_pw_aff_free(shift);
	isl_schedule_node_free(node);
	return NULL;
}

/* Tile "node" with tile sizes "sizes".
 *
 * The current node is replaced by two nested nodes corresponding
 * to the tile dimensions and the point dimensions.
 *
 * Return a pointer to the outer (tile) node.
 *
 * If any of the descendants of "node" depend on the set of outer band nodes,
 * then we refuse to tile the node.  This anchoring test is performed
 * before the space of "sizes" is checked.
 *
 * If the scale tile loops option is set, then the tile loops
 * are scaled by the tile sizes.  If the shift point loops option is set,
 * then the point loops are shifted to start at zero.
 * In particular, these options affect the tile and point loop schedules
 * as follows
 *
 *	scale	shift	original	tile		point
 *
 *	0	0	i		floor(i/s)	i
 *	1	0	i		s * floor(i/s)	i
 *	0	1	i		floor(i/s)	i - s * floor(i/s)
 *	1	1	i		s * floor(i/s)	i - s * floor(i/s)
 */
__isl_give isl_schedule_node *isl_schedule_node_band_tile(
	__isl_take isl_schedule_node *node, __isl_take isl_multi_val *sizes)
{
	isl_schedule_tree *tiled;
	int is_anchored;

	if (!node || !sizes)
		goto error;

	is_anchored = isl_schedule_node_is_subtree_anchored(node);
	if (is_anchored < 0)
		goto error;
	if (is_anchored)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"cannot tile band node with anchored subtree",
			goto error);
	if (check_space_multi_val(node, sizes) < 0)
		goto error;

	tiled = isl_schedule_tree_band_tile(
				isl_schedule_node_get_tree(node), sizes);
	return isl_schedule_node_graft_tree(node, tiled);
error:
	isl_multi_val_free(sizes);
	isl_schedule_node_free(node);
	return NULL;
}

/* Move the band node "node" down to all the leaves in the subtree
 * rooted at "node".
 * Return a pointer to the node in the resulting tree that is in the same
 * position as the node pointed to by "node" in the original tree.
 *
 * If the tree of the node has no children, then nothing needs to be done
 * and "node" is returned as is.
 * Otherwise, the child of the node is removed and the result is
 * appended to all the leaves in the subtree rooted at the original child.
 * Since the node is moved to the leaves, it needs to be expanded
 * according to the expansion, if any, defined by that subtree.
 * This is done by pulling the band back over the subtree contraction.
 * In the end, the original node is replaced by the result of
 * attaching copies of the expanded node to the leaves.
 *
 * If "node" is not a band node, or if any of the nodes in the subtree
 * rooted at "node" depend on the set of outer band nodes,
 * then we refuse to sink the band node.
 */
__isl_give isl_schedule_node *isl_schedule_node_band_sink(
	__isl_take isl_schedule_node *node)
{
	isl_schedule_tree *band, *below;
	isl_union_pw_multi_aff *contraction;
	isl_bool is_anchored;
	isl_size n_child;

	if (!node)
		return NULL;

	if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"not a band node", return isl_schedule_node_free(node));

	is_anchored = isl_schedule_node_is_subtree_anchored(node);
	if (is_anchored < 0)
		return isl_schedule_node_free(node);
	if (is_anchored)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"cannot sink band node in anchored subtree",
			return isl_schedule_node_free(node));

	n_child = isl_schedule_tree_n_children(node->tree);
	if (n_child < 0)
		return isl_schedule_node_free(node);
	if (n_child == 0)
		return node;

	contraction = isl_schedule_node_get_subtree_contraction(node);

	/* Detach the band from its child and push it to the leaves. */
	band = isl_schedule_node_get_tree(node);
	below = isl_schedule_tree_get_child(band, 0);
	band = isl_schedule_tree_reset_children(band);
	band = isl_schedule_tree_pullback_union_pw_multi_aff(band, contraction);

	return isl_schedule_node_graft_tree(node,
			isl_schedule_tree_append_to_leaves(below, band));
}

/* Split "node" into two nested band nodes, one with the first "pos"
 * dimensions and one with the remaining dimensions.
 * The schedules of the two band nodes live in anonymous spaces.
 * The loop AST generation type options and the isolate option
 * are split over the two band nodes.
 *
 * The schedule depth of "node" is passed along to the tree-level split.
 * If it cannot be determined, "node" is freed and NULL is returned.
 */
__isl_give isl_schedule_node *isl_schedule_node_band_split(
	__isl_take isl_schedule_node *node, int pos)
{
	isl_schedule_tree *split;
	isl_size sched_depth;

	sched_depth = isl_schedule_node_get_schedule_depth(node);
	if (sched_depth < 0)
		return isl_schedule_node_free(node);

	split = isl_schedule_tree_band_split(isl_schedule_node_get_tree(node),
						pos, sched_depth);
	return isl_schedule_node_graft_tree(node, split);
}

/* Return the context of the context node "node",
 * or NULL if "node" is NULL.
 */
__isl_give isl_set *isl_schedule_node_context_get_context(
	__isl_keep isl_schedule_node *node)
{
	return node ? isl_schedule_tree_context_get_context(node->tree) : NULL;
}

/* Return the domain of the domain node "node",
 * or NULL if "node" is NULL.
 */
__isl_give isl_union_set *isl_schedule_node_domain_get_domain(
	__isl_keep isl_schedule_node *node)
{
	return node ? isl_schedule_tree_domain_get_domain(node->tree) : NULL;
}

/* Return the expansion map of expansion node "node",
 * or NULL if "node" is NULL.
 */
__isl_give isl_union_map *isl_schedule_node_expansion_get_expansion(
	__isl_keep isl_schedule_node *node)
{
	return node ?
		isl_schedule_tree_expansion_get_expansion(node->tree) : NULL;
}

/* Return the contraction of expansion node "node",
 * or NULL if "node" is NULL.
 */
__isl_give isl_union_pw_multi_aff *isl_schedule_node_expansion_get_contraction(
	__isl_keep isl_schedule_node *node)
{
	return node ?
		isl_schedule_tree_expansion_get_contraction(node->tree) : NULL;
}

/* Replace the contraction and the expansion of the expansion node "node"
 * by "contraction" and "expansion".
 *
 * All three arguments are consumed.  If any of them is NULL,
 * then the others are freed and NULL is returned.
 */
__isl_give isl_schedule_node *
isl_schedule_node_expansion_set_contraction_and_expansion(
	__isl_take isl_schedule_node *node,
	__isl_take isl_union_pw_multi_aff *contraction,
	__isl_take isl_union_map *expansion)
{
	isl_schedule_tree *updated;

	if (node && contraction && expansion) {
		updated = isl_schedule_tree_copy(node->tree);
		updated =
		    isl_schedule_tree_expansion_set_contraction_and_expansion(
					updated, contraction, expansion);
		return isl_schedule_node_graft_tree(node, updated);
	}

	isl_schedule_node_free(node);
	isl_union_pw_multi_aff_free(contraction);
	isl_union_map_free(expansion);
	return NULL;
}

/* Return the extension of the extension node "node",
 * or NULL if "node" is NULL.
 */
__isl_give isl_union_map *isl_schedule_node_extension_get_extension(
	__isl_keep isl_schedule_node *node)
{
	return node ?
		isl_schedule_tree_extension_get_extension(node->tree) : NULL;
}

/* Replace the extension of extension node "node" by "extension".
 *
 * Both arguments are consumed.  If either of them is NULL,
 * then the other is freed and NULL is returned.
 */
__isl_give isl_schedule_node *isl_schedule_node_extension_set_extension(
	__isl_take isl_schedule_node *node, __isl_take isl_union_map *extension)
{
	isl_schedule_tree *updated;

	if (node && extension) {
		updated = isl_schedule_tree_copy(node->tree);
		updated = isl_schedule_tree_extension_set_extension(updated,
								extension);
		return isl_schedule_node_graft_tree(node, updated);
	}

	isl_schedule_node_free(node);
	isl_union_map_free(extension);
	return NULL;
}

/* Return the filter of the filter node "node",
 * or NULL if "node" is NULL.
 */
__isl_give isl_union_set *isl_schedule_node_filter_get_filter(
	__isl_keep isl_schedule_node *node)
{
	return node ? isl_schedule_tree_filter_get_filter(node->tree) : NULL;
}

/* Replace the filter of filter node "node" by "filter".
 *
 * Both arguments are consumed.  If either of them is NULL,
 * then the other is freed and NULL is returned.
 */
__isl_give isl_schedule_node *isl_schedule_node_filter_set_filter(
	__isl_take isl_schedule_node *node, __isl_take isl_union_set *filter)
{
	isl_schedule_tree *updated;

	if (node && filter) {
		updated = isl_schedule_tree_copy(node->tree);
		updated = isl_schedule_tree_filter_set_filter(updated, filter);
		return isl_schedule_node_graft_tree(node, updated);
	}

	isl_schedule_node_free(node);
	isl_union_set_free(filter);
	return NULL;
}

/* Intersect the filter of filter node "node" with "filter".
 *
 * If the filter of the node is already a subset of "filter",
 * then leave the node unchanged.
 * Otherwise, the filter of the node is replaced by the intersection.
 * Both arguments are consumed; NULL is returned on error.
 */
__isl_give isl_schedule_node *isl_schedule_node_filter_intersect_filter(
	__isl_take isl_schedule_node *node, __isl_take isl_union_set *filter)
{
	isl_union_set *current = NULL;
	isl_bool contained;

	if (!node || !filter)
		goto error;

	current = isl_schedule_node_filter_get_filter(node);
	contained = isl_union_set_is_subset(current, filter);
	if (contained < 0)
		goto error;
	if (!contained) {
		current = isl_union_set_intersect(current, filter);
		return isl_schedule_node_filter_set_filter(node, current);
	}

	/* Nothing to restrict; only release the temporaries. */
	isl_union_set_free(current);
	isl_union_set_free(filter);
	return node;
error:
	isl_schedule_node_free(node);
	isl_union_set_free(current);
	isl_union_set_free(filter);
	return NULL;
}

/* Return the guard of the guard node "node",
 * or NULL if "node" is NULL.
 */
__isl_give isl_set *isl_schedule_node_guard_get_guard(
	__isl_keep isl_schedule_node *node)
{
	return node ? isl_schedule_tree_guard_get_guard(node->tree) : NULL;
}

/* Return the mark identifier of the mark node "node",
 * or NULL if "node" is NULL.
 */
__isl_give isl_id *isl_schedule_node_mark_get_id(
	__isl_keep isl_schedule_node *node)
{
	return node ? isl_schedule_tree_mark_get_id(node->tree) : NULL;
}

/* Replace the child at position "pos" of the sequence node "node"
 * by the children of the sequence root node of "tree".
 *
 * Both "node" and the root of "tree" are required to be sequence nodes.
 * "node" and "tree" are consumed; NULL is returned on error.
 */
__isl_give isl_schedule_node *isl_schedule_node_sequence_splice(
	__isl_take isl_schedule_node *node, int pos,
	__isl_take isl_schedule_tree *tree)
{
	isl_schedule_tree *spliced;

	if (!node || !tree)
		goto error;
	if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"not a sequence node", goto error);
	if (isl_schedule_tree_get_type(tree) != isl_schedule_node_sequence)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"not a sequence node", goto error);

	spliced = isl_schedule_tree_sequence_splice(
				isl_schedule_node_get_tree(node), pos, tree);
	return isl_schedule_node_graft_tree(node, spliced);
error:
	isl_schedule_node_free(node);
	isl_schedule_tree_free(tree);
	return NULL;
}

/* Given a sequence node "node", with a child at position "pos" that
 * is also a sequence node, attach the children of that node directly
 * as children of "node" at that position, replacing the original child.
 *
 * More precisely, the inner sequence node is the (single) child of
 * the child of "node" at position "pos".
 * The filters of the children of this inner sequence are intersected
 * with the filter of the child at position "pos" before
 * the inner sequence is spliced into "node".
 */
__isl_give isl_schedule_node *isl_schedule_node_sequence_splice_child(
	__isl_take isl_schedule_node *node, int pos)
{
	int i;
	isl_size n_inner;
	isl_union_set *outer_filter;
	isl_schedule_node *inner;
	isl_schedule_tree *inner_tree;

	if (!node)
		return NULL;
	if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"not a sequence node",
			return isl_schedule_node_free(node));

	/* Descend to the inner sequence below the child at "pos". */
	node = isl_schedule_node_child(isl_schedule_node_child(node, pos), 0);
	if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"not a sequence node",
			return isl_schedule_node_free(node));
	n_inner = isl_schedule_node_n_children(node);
	if (n_inner < 0)
		return isl_schedule_node_free(node);

	/* Work on a separate copy of the inner sequence while "node"
	 * moves back up to the child at "pos" to pick up its filter.
	 */
	inner = isl_schedule_node_copy(node);
	node = isl_schedule_node_parent(node);
	outer_filter = isl_schedule_node_filter_get_filter(node);
	for (i = 0; i < n_inner; ++i) {
		inner = isl_schedule_node_child(inner, i);
		inner = isl_schedule_node_filter_intersect_filter(inner,
					isl_union_set_copy(outer_filter));
		inner = isl_schedule_node_parent(inner);
	}
	isl_union_set_free(outer_filter);

	inner_tree = isl_schedule_node_get_tree(inner);
	isl_schedule_node_free(inner);

	node = isl_schedule_node_parent(node);
	return isl_schedule_node_sequence_splice(node, pos, inner_tree);
}

/* Update the ancestors of "node" to point to the tree that "node"
 * now points to.
 * That is, replace the child in the original parent that corresponds
 * to the current tree position by node->tree and continue updating
 * the ancestors in the same way until the root is reached.
 *
 * If "fn" is not NULL, then it is called on each ancestor as we move up
 * the tree so that it can modify the ancestor before it is added
 * to the list of ancestors of the modified node.
 * The additional "pos" argument records the position
 * of the "tree" argument in the original schedule tree.
 * "user" is passed along to "fn" unchanged.
 *
 * If "node" originally points to a leaf of the schedule tree, then make sure
 * that in the end it points to a leaf in the updated schedule tree.
 *
 * "node" is consumed.  On error, both "node" and the internal copy
 * used to track the original position are released and NULL is returned.
 */
static __isl_give isl_schedule_node *update_ancestors(
	__isl_take isl_schedule_node *node,
	__isl_give isl_schedule_tree *(*fn)(__isl_take isl_schedule_tree *tree,
		__isl_keep isl_schedule_node *pos, void *user), void *user)
{
	int i;
	isl_size n;
	int is_leaf;
	isl_schedule_tree *tree;
	isl_schedule_node *pos = NULL;

	/* "pos" follows the original tree; it is only needed for "fn". */
	if (fn)
		pos = isl_schedule_node_copy(node);

	node = isl_schedule_node_cow(node);
	if (!node)
		return isl_schedule_node_free(pos);

	n = isl_schedule_tree_list_n_schedule_tree(node->ancestors);
	if (n < 0) {
		/* "node" is owned by this function and must not be leaked. */
		isl_schedule_node_free(pos);
		return isl_schedule_node_free(node);
	}
	tree = isl_schedule_tree_copy(node->tree);

	/* Walk from the direct parent up to the root. */
	for (i = n - 1; i >= 0; --i) {
		isl_schedule_tree *parent;

		parent = isl_schedule_tree_list_get_schedule_tree(
						    node->ancestors, i);
		parent = isl_schedule_tree_replace_child(parent,
						    node->child_pos[i], tree);
		if (fn) {
			pos = isl_schedule_node_parent(pos);
			parent = fn(parent, pos, user);
		}
		node->ancestors = isl_schedule_tree_list_set_schedule_tree(
			    node->ancestors, i, isl_schedule_tree_copy(parent));

		tree = parent;
	}

	if (fn)
		isl_schedule_node_free(pos);

	/* Check for a leaf before the root of the schedule is replaced. */
	is_leaf = isl_schedule_tree_is_leaf(node->tree);
	node->schedule = isl_schedule_set_root(node->schedule, tree);
	if (is_leaf) {
		isl_schedule_tree_free(node->tree);
		node->tree = isl_schedule_node_get_leaf(node);
	}

	if (!node->schedule || !node->ancestors)
		return isl_schedule_node_free(node);

	return node;
}

/* Replace the subtree that "pos" points to by "tree", updating
 * the ancestors to maintain a consistent state.
 *
 * If "pos" already points to "tree", then "pos" is returned unchanged.
 * Both arguments are consumed; NULL is returned on error.
 */
__isl_give isl_schedule_node *isl_schedule_node_graft_tree(
	__isl_take isl_schedule_node *pos, __isl_take isl_schedule_tree *tree)
{
	int valid = tree && pos;

	if (valid && pos->tree == tree) {
		/* Nothing changes; only drop the extra reference. */
		isl_schedule_tree_free(tree);
		return pos;
	}

	if (valid)
		pos = isl_schedule_node_cow(pos);
	if (!valid || !pos) {
		isl_schedule_node_free(pos);
		isl_schedule_tree_free(tree);
		return NULL;
	}

	isl_schedule_tree_free(pos->tree);
	pos->tree = tree;

	return update_ancestors(pos, NULL, NULL);
}

/* Make sure we can insert a node between "node" and its parent.
 * Return 0 if so and -1 on error, reporting the reason why
 * we cannot insert a node.
 *
 * Insertion is refused if "node" has no parent or
 * if its parent is a set or sequence node.
 */
static int check_insert(__isl_keep isl_schedule_node *node)
{
	int has_parent;

	has_parent = isl_schedule_node_has_parent(node);
	if (has_parent < 0)
		return -1;
	if (!has_parent)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"cannot insert node outside of root", return -1);

	switch (isl_schedule_node_get_parent_type(node)) {
	case isl_schedule_node_error:
		return -1;
	case isl_schedule_node_set:
	case isl_schedule_node_sequence:
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"cannot insert node between set or sequence node "
			"and its filter children", return -1);
	default:
		break;
	}

	return 0;
}

/* Insert a band node with partial schedule "mupa" between "node" and
 * its parent.
 * Return a pointer to the new band node.
 *
 * If no node can be inserted at this position (see check_insert), or
 * if any of the nodes in the subtree rooted at "node" depend on
 * the set of outer band nodes, then we refuse to insert the band node.
 * Both arguments are consumed.
 */
__isl_give isl_schedule_node *isl_schedule_node_insert_partial_schedule(
	__isl_take isl_schedule_node *node,
	__isl_take isl_multi_union_pw_aff *mupa)
{
	int is_anchored;
	isl_schedule_band *new_band;
	isl_schedule_tree *subtree;

	/* A freed (NULL) node makes the anchoring test below fail. */
	if (check_insert(node) < 0)
		node = isl_schedule_node_free(node);
	is_anchored = isl_schedule_node_is_subtree_anchored(node);
	if (is_anchored < 0)
		goto error;
	if (is_anchored)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"cannot insert band node in anchored subtree",
			goto error);

	subtree = isl_schedule_node_get_tree(node);
	new_band = isl_schedule_band_from_multi_union_pw_aff(mupa);
	subtree = isl_schedule_tree_insert_band(subtree, new_band);

	return isl_schedule_node_graft_tree(node, subtree);
error:
	isl_schedule_node_free(node);
	isl_multi_union_pw_aff_free(mupa);
	return NULL;
}

/* Insert a context node with context "context" between "node" and its parent.
 * Return a pointer to the new context node.
 *
 * If check_insert refuses the insertion, then "node" is freed and
 * the remaining calls operate on a NULL node, resulting in NULL.
 */
__isl_give isl_schedule_node *isl_schedule_node_insert_context(
	__isl_take isl_schedule_node *node, __isl_take isl_set *context)
{
	isl_schedule_tree *subtree;

	if (check_insert(node) < 0)
		node = isl_schedule_node_free(node);

	subtree = isl_schedule_tree_insert_context(
				isl_schedule_node_get_tree(node), context);
	return isl_schedule_node_graft_tree(node, subtree);
}

/* Insert an expansion node with the given "contraction" and "expansion"
 * between "node" and its parent.
 * Return a pointer to the new expansion node.
 *
 * Typically the domain and range spaces of the expansion are different.
 * This means that only one of them can refer to the current domain space
 * in a consistent tree.  It is up to the caller to ensure that the tree
 * returns to a consistent state.
 *
 * If check_insert refuses the insertion, then "node" is freed and
 * the remaining calls operate on a NULL node, resulting in NULL.
 */
__isl_give isl_schedule_node *isl_schedule_node_insert_expansion(
	__isl_take isl_schedule_node *node,
	__isl_take isl_union_pw_multi_aff *contraction,
	__isl_take isl_union_map *expansion)
{
	isl_schedule_tree *subtree;

	if (check_insert(node) < 0)
		node = isl_schedule_node_free(node);

	subtree = isl_schedule_tree_insert_expansion(
		    isl_schedule_node_get_tree(node), contraction, expansion);
	return isl_schedule_node_graft_tree(node, subtree);
}

/* Insert an extension node with extension "extension" between "node" and
 * its parent.
 * Return a pointer to the new extension node.
 *
 * Note that, unlike the other insertion functions in this file,
 * this one does not call check_insert on "node".
 */
__isl_give isl_schedule_node *isl_schedule_node_insert_extension(
	__isl_take isl_schedule_node *node,
	__isl_take isl_union_map *extension)
{
	isl_schedule_tree *subtree;

	subtree = isl_schedule_tree_insert_extension(
				isl_schedule_node_get_tree(node), extension);
	return isl_schedule_node_graft_tree(node, subtree);
}

/* Insert a filter node with filter "filter" between "node" and its parent.
 * Return a pointer to the new filter node.
 *
 * If check_insert refuses the insertion, then "node" is freed and
 * the remaining calls operate on a NULL node, resulting in NULL.
 */
__isl_give isl_schedule_node *isl_schedule_node_insert_filter(
	__isl_take isl_schedule_node *node, __isl_take isl_union_set *filter)
{
	isl_schedule_tree *subtree;

	if (check_insert(node) < 0)
		node = isl_schedule_node_free(node);

	subtree = isl_schedule_tree_insert_filter(
				isl_schedule_node_get_tree(node), filter);
	return isl_schedule_node_graft_tree(node, subtree);
}

/* Insert a guard node with guard "guard" between "node" and its parent.
 * Return a pointer to the new guard node.
 *
 * If check_insert refuses the insertion, then "node" is freed and
 * the remaining calls operate on a NULL node, resulting in NULL.
 */
__isl_give isl_schedule_node *isl_schedule_node_insert_guard(
	__isl_take isl_schedule_node *node, __isl_take isl_set *guard)
{
	isl_schedule_tree *subtree;

	if (check_insert(node) < 0)
		node = isl_schedule_node_free(node);

	subtree = isl_schedule_tree_insert_guard(
				isl_schedule_node_get_tree(node), guard);
	return isl_schedule_node_graft_tree(node, subtree);
}

/* Insert a mark node with mark identifier "mark" between "node" and
 * its parent.
 * Return a pointer to the new mark node.
 *
 * If check_insert refuses the insertion, then "node" is freed and
 * the remaining calls operate on a NULL node, resulting in NULL.
 */
__isl_give isl_schedule_node *isl_schedule_node_insert_mark(
	__isl_take isl_schedule_node *node, __isl_take isl_id *mark)
{
	isl_schedule_tree *subtree;

	if (check_insert(node) < 0)
		node = isl_schedule_node_free(node);

	subtree = isl_schedule_tree_insert_mark(
				isl_schedule_node_get_tree(node), mark);
	return isl_schedule_node_graft_tree(node, subtree);
}

/* Attach the current subtree of "node" to a sequence of filter tree nodes
 * with filters described by "filters", attach this sequence
 * of filter tree nodes as children to a new tree of type "type" and
 * replace the original subtree of "node" by this new tree.
 * Each copy of the original subtree is simplified with respect
 * to the corresponding filter.
 *
 * "node" and "filters" are consumed; NULL is returned on error.
 */
static __isl_give isl_schedule_node *isl_schedule_node_insert_children(
	__isl_take isl_schedule_node *node,
	enum isl_schedule_node_type type,
	__isl_take isl_union_set_list *filters)
{
	int i;
	isl_size n_filter;
	isl_schedule_tree *parent;
	isl_schedule_tree_list *children;

	if (check_insert(node) < 0)
		node = isl_schedule_node_free(node);

	n_filter = isl_union_set_list_n_union_set(filters);
	if (!node || n_filter < 0)
		goto error;

	children = isl_schedule_tree_list_alloc(
				isl_schedule_node_get_ctx(node), n_filter);
	for (i = 0; i < n_filter; ++i) {
		isl_schedule_node *gisted;
		isl_schedule_tree *child;
		isl_union_set *filter_i;

		filter_i = isl_union_set_list_get_union_set(filters, i);
		/* Simplify a private copy of the subtree for this filter. */
		gisted = isl_schedule_node_copy(node);
		gisted = isl_schedule_node_gist(gisted,
						isl_union_set_copy(filter_i));
		child = isl_schedule_node_get_tree(gisted);
		isl_schedule_node_free(gisted);
		child = isl_schedule_tree_insert_filter(child, filter_i);
		children = isl_schedule_tree_list_add(children, child);
	}
	parent = isl_schedule_tree_from_children(type, children);
	node = isl_schedule_node_graft_tree(node, parent);

	isl_union_set_list_free(filters);
	return node;
error:
	isl_union_set_list_free(filters);
	isl_schedule_node_free(node);
	return NULL;
}

/* Insert a sequence node with child filters "filters" between "node" and
 * its parent.  That is, the tree that "node" points to is attached
 * to each of the child nodes of the filter nodes.
 * Return a pointer to the new sequence node.
 */
__isl_give isl_schedule_node *isl_schedule_node_insert_sequence(
	__isl_take isl_schedule_node *node,
	__isl_take isl_union_set_list *filters)
{
	enum isl_schedule_node_type type = isl_schedule_node_sequence;

	return isl_schedule_node_insert_children(node, type, filters);
}

/* Insert a set node with child filters "filters" between "node" and
 * its parent.  That is, the tree that "node" points to is attached
 * to each of the child nodes of the filter nodes.
 * Return a pointer to the new set node.
 */
__isl_give isl_schedule_node *isl_schedule_node_insert_set(
	__isl_take isl_schedule_node *node,
	__isl_take isl_union_set_list *filters)
{
	enum isl_schedule_node_type type = isl_schedule_node_set;

	return isl_schedule_node_insert_children(node, type, filters);
}

/* Remove "node" from its schedule tree and return a pointer
 * to the leaf at the same position in the updated schedule tree.
 *
 * It is not allowed to remove the root of a schedule tree or
 * a child of a set or sequence node.
 *
 * "node" is consumed.  NULL is returned if "node" is NULL,
 * if the removal is not allowed or if the position of "node"
 * in its tree cannot be determined.
 */
__isl_give isl_schedule_node *isl_schedule_node_cut(
	__isl_take isl_schedule_node *node)
{
	int has_parent;
	isl_schedule_tree *leaf;
	enum isl_schedule_node_type parent_type;

	if (!node)
		return NULL;

	/* A negative result signals an error, not the absence of a parent,
	 * so it must not be folded into the "cannot cut root" case.
	 */
	has_parent = isl_schedule_node_has_parent(node);
	if (has_parent < 0)
		return isl_schedule_node_free(node);
	if (!has_parent)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"cannot cut root", return isl_schedule_node_free(node));

	parent_type = isl_schedule_node_get_parent_type(node);
	if (parent_type == isl_schedule_node_error)
		return isl_schedule_node_free(node);
	if (parent_type == isl_schedule_node_set ||
	    parent_type == isl_schedule_node_sequence)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"cannot cut child of set or sequence",
			return isl_schedule_node_free(node));

	leaf = isl_schedule_node_get_leaf(node);
	return isl_schedule_node_graft_tree(node, leaf);
}

/* Remove a single node from the schedule tree, attaching the child
 * of "node" directly to its parent.
 * Return a pointer to this former child or to the leaf at the position
 * of the original node if there was no child.
 * It is not allowed to remove the root of a schedule tree,
 * a set or sequence node, a child of a set or sequence node or
 * a band node with an anchored subtree.
 */
__isl_give isl_schedule_node *isl_schedule_node_delete(
	__isl_take isl_schedule_node *node)
{
	isl_size n_child, tree_depth;
	isl_schedule_tree *replacement;
	enum isl_schedule_node_type parent_type;

	tree_depth = isl_schedule_node_get_tree_depth(node);
	n_child = isl_schedule_node_n_children(node);
	if (tree_depth < 0 || n_child < 0)
		return isl_schedule_node_free(node);

	if (tree_depth == 0)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"cannot delete root node",
			return isl_schedule_node_free(node));
	if (n_child != 1)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"can only delete node with a single child",
			return isl_schedule_node_free(node));

	parent_type = isl_schedule_node_get_parent_type(node);
	if (parent_type == isl_schedule_node_sequence ||
	    parent_type == isl_schedule_node_set)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"cannot delete child of set or sequence",
			return isl_schedule_node_free(node));

	if (isl_schedule_node_get_type(node) == isl_schedule_node_band) {
		int is_anchored;

		is_anchored = isl_schedule_node_is_subtree_anchored(node);
		if (is_anchored < 0)
			return isl_schedule_node_free(node);
		if (is_anchored)
			isl_die(isl_schedule_node_get_ctx(node),
				isl_error_invalid,
				"cannot delete band node with anchored subtree",
				return isl_schedule_node_free(node));
	}

	replacement = isl_schedule_node_get_tree(node);
	if (replacement && !isl_schedule_tree_has_children(replacement)) {
		/* No explicit child: the node is replaced by a leaf. */
		isl_schedule_tree_free(replacement);
		replacement = isl_schedule_node_get_leaf(node);
	} else {
		replacement = isl_schedule_tree_child(replacement, 0);
	}

	return isl_schedule_node_graft_tree(node, replacement);
}

/* Internal data structure for the group_ancestor callback.
 *
 * If "finished" is set, then we no longer need to modify
 * any further ancestors.
 *
 * "contraction" and "expansion" represent the expansion
 * that reflects the grouping.
 *
 * "domain" contains the domain elements that reach the position
 * where the grouping is performed.  That is, it is the range
 * of the resulting expansion.
 * "domain_universe" is the universe of "domain".
 * "group" is the set of group elements, i.e., the domain
 * of the resulting expansion.
 * "group_universe" is the universe of "group".
 *
 * "sched" is the schedule for the group elements, in practice
 * an identity mapping on "group_universe".
 * "dim" is the dimension of "sched".
 */
struct isl_schedule_group_data {
	int finished;

	isl_union_map *expansion;
	isl_union_pw_multi_aff *contraction;

	isl_union_set *domain;
	isl_union_set *domain_universe;
	isl_union_set *group;
	isl_union_set *group_universe;

	int dim;
	isl_multi_aff *sched;
};

/* Is "domain" covered by data->domain within data->domain_universe?
 * That is, is the part of "domain" that lies in data->domain_universe
 * a subset of data->domain?
 */
static isl_bool locally_covered_by_domain(__isl_keep isl_union_set *domain,
	struct isl_schedule_group_data *data)
{
	isl_union_set *local;
	isl_bool covered;

	local = isl_union_set_intersect(isl_union_set_copy(domain),
				isl_union_set_copy(data->domain_universe));
	covered = isl_union_set_is_subset(local, data->domain);
	isl_union_set_free(local);

	return covered;
}

/* Update the band tree root "tree" to refer to the group instances
 * in data->group rather than the original domain elements in data->domain.
 * "pos" is the position in the original schedule tree where the modified
 * "tree" will be attached.
 *
 * Add the part of the identity schedule on the group instances data->sched
 * that corresponds to this band node to the band schedule.
 * This part is obtained by dropping the first "depth" output dimensions
 * of data->sched, with "depth" the schedule depth of "pos", and
 * all output dimensions beyond the number of members of the band.
 * If the band schedule has a tuple identifier, then it is copied over
 * to the added part.
 * If the domain elements that reach the node and that are part
 * of data->domain_universe are all elements of data->domain (and therefore
 * replaced by the group instances) then this data->domain_universe
 * is removed from the domain of the band schedule.
 */
static __isl_give isl_schedule_tree *group_band(
	__isl_take isl_schedule_tree *tree, __isl_keep isl_schedule_node *pos,
	struct isl_schedule_group_data *data)
{
	isl_union_set *reaching;
	isl_multi_aff *group_sched;
	isl_multi_union_pw_aff *extra, *band_sched;
	isl_bool covered;
	isl_size sched_depth, n_member;
	isl_bool named;

	reaching = isl_schedule_node_get_domain(pos);
	covered = locally_covered_by_domain(reaching, data);
	if (covered < 0) {
		isl_union_set_free(reaching);
		return isl_schedule_tree_free(tree);
	}
	if (covered) {
		reaching = isl_union_set_universe(reaching);
		reaching = isl_union_set_subtract(reaching,
			    isl_union_set_copy(data->domain_universe));
		tree = isl_schedule_tree_band_intersect_domain(tree, reaching);
	} else {
		isl_union_set_free(reaching);
	}

	sched_depth = isl_schedule_node_get_schedule_depth(pos);
	n_member = isl_schedule_tree_band_n_member(tree);
	if (sched_depth < 0 || n_member < 0)
		return isl_schedule_tree_free(tree);

	/* Keep only the dimensions of data->sched that belong to this band. */
	group_sched = isl_multi_aff_copy(data->sched);
	group_sched = isl_multi_aff_drop_dims(group_sched, isl_dim_out,
					0, sched_depth);
	group_sched = isl_multi_aff_drop_dims(group_sched, isl_dim_out,
			n_member, data->dim - sched_depth - n_member);
	extra = isl_multi_union_pw_aff_from_multi_aff(group_sched);

	band_sched = isl_schedule_tree_band_get_partial_schedule(tree);
	named = isl_multi_union_pw_aff_has_tuple_id(band_sched, isl_dim_set);
	if (named < 0) {
		band_sched = isl_multi_union_pw_aff_free(band_sched);
	} else if (named) {
		isl_id *tuple_id;

		tuple_id = isl_multi_union_pw_aff_get_tuple_id(band_sched,
								isl_dim_set);
		extra = isl_multi_union_pw_aff_set_tuple_id(extra,
							isl_dim_set, tuple_id);
	}
	band_sched = isl_multi_union_pw_aff_union_add(band_sched, extra);

	return isl_schedule_tree_band_set_partial_schedule(tree, band_sched);
}

/* Drop the parameters in "uset" that are not also in "space".
 * "n" is the number of parameters in "space".
 *
 * The parameters of "uset" are first aligned to those of "space" and
 * then all parameters from position "n" onwards are projected out.
 */
static __isl_give isl_union_set *union_set_drop_extra_params(
	__isl_take isl_union_set *uset, __isl_keep isl_space *space, int n)
{
	isl_size n_aligned;

	uset = isl_union_set_align_params(uset, isl_space_copy(space));
	n_aligned = isl_union_set_dim(uset, isl_dim_param);
	if (n_aligned < 0)
		return isl_union_set_free(uset);

	return isl_union_set_project_out(uset, isl_dim_param,
					n, n_aligned - n);
}

/* Update the context tree root "tree" to refer to the group instances
 * in data->group rather than the original domain elements in data->domain.
 * "pos" is the position in the original schedule tree where the modified
 * "tree" will be attached.
 *
 * We do not actually need to update "tree" since a context node only
 * refers to the schedule space.  However, we may need to update "data"
 * to not refer to any parameters introduced by the context node.
 *
 * The parameters that are allowed to remain are those of the universe
 * domain at "pos".  After aligning data->expansion to that space,
 * any additional parameters appear at positions n1 and beyond.
 * If the expansion actually involves any of them, then the grouping
 * is invalid.  Otherwise, the extra parameters are removed from
 * all the fields of "data".
 *
 * Nothing is done if the tree depth of "pos" is 1.
 *
 * Return "tree" (unmodified) on success and NULL on error.
 * Failures while updating the fields of "data" are recorded
 * by setting the affected field to NULL.
 */
static __isl_give isl_schedule_tree *group_context(
	__isl_take isl_schedule_tree *tree, __isl_keep isl_schedule_node *pos,
	struct isl_schedule_group_data *data)
{
	isl_space *space;
	isl_union_set *domain;
	isl_size n1, n2;
	isl_bool involves;
	isl_size depth;

	depth = isl_schedule_node_get_tree_depth(pos);
	if (depth < 0)
		return isl_schedule_tree_free(tree);
	if (depth == 1)
		return tree;

	/* Only the space of the universe domain is needed. */
	domain = isl_schedule_node_get_universe_domain(pos);
	space = isl_union_set_get_space(domain);
	isl_union_set_free(domain);

	/* n1: number of outer parameters; n2: number after alignment.
	 * Note that "space" is consumed by the alignment.
	 */
	n1 = isl_space_dim(space, isl_dim_param);
	data->expansion = isl_union_map_align_params(data->expansion, space);
	n2 = isl_union_map_dim(data->expansion, isl_dim_param);

	if (n1 < 0 || n2 < 0)
		return isl_schedule_tree_free(tree);
	if (n1 == n2)
		return tree;

	involves = isl_union_map_involves_dims(data->expansion,
				isl_dim_param, n1, n2 - n1);
	if (involves < 0)
		return isl_schedule_tree_free(tree);
	if (involves)
		isl_die(isl_schedule_node_get_ctx(pos), isl_error_invalid,
			"grouping can only refer to global parameters",
			return isl_schedule_tree_free(tree));

	/* The reduced space of the expansion serves as the model
	 * for all the other fields of "data".
	 */
	data->expansion = isl_union_map_project_out(data->expansion,
				isl_dim_param, n1, n2 - n1);
	space = isl_union_map_get_space(data->expansion);

	data->contraction = isl_union_pw_multi_aff_align_params(
				data->contraction, isl_space_copy(space));
	n2 = isl_union_pw_multi_aff_dim(data->contraction, isl_dim_param);
	if (n2 < 0)
		data->contraction =
				isl_union_pw_multi_aff_free(data->contraction);
	data->contraction = isl_union_pw_multi_aff_drop_dims(data->contraction,
				isl_dim_param, n1, n2 - n1);

	data->domain = union_set_drop_extra_params(data->domain, space, n1);
	data->domain_universe =
		union_set_drop_extra_params(data->domain_universe, space, n1);
	data->group = union_set_drop_extra_params(data->group, space, n1);
	data->group_universe =
		union_set_drop_extra_params(data->group_universe, space, n1);

	data->sched = isl_multi_aff_align_params(data->sched,
				isl_space_copy(space));
	n2 = isl_multi_aff_dim(data->sched, isl_dim_param);
	if (n2 < 0)
		data->sched = isl_multi_aff_free(data->sched);
	data->sched = isl_multi_aff_drop_dims(data->sched,
				isl_dim_param, n1, n2 - n1);

	isl_space_free(space);

	return tree;
}

/* Make the domain tree root "tree" refer to the group instances
 * in data->group instead of the original domain elements in data->domain.
 * "pos" is the position in the original schedule tree where the modified
 * "tree" will be attached.
 *
 * The grouped domain elements are required to be part of the root domain;
 * anything else is treated as an internal error.
 * The new root domain is obtained by taking out the grouped elements
 * and putting the group instances in their place.
 */
static __isl_give isl_schedule_tree *group_domain(
	__isl_take isl_schedule_tree *tree, __isl_keep isl_schedule_node *pos,
	struct isl_schedule_group_data *data)
{
	isl_union_set *outer;
	isl_union_set *updated;
	isl_bool contained;

	outer = isl_schedule_tree_domain_get_domain(tree);
	contained = isl_union_set_is_subset(data->domain, outer);
	isl_union_set_free(outer);
	if (contained < 0)
		return isl_schedule_tree_free(tree);
	if (!contained)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_internal,
			"grouped domain should be part of outer domain",
			return isl_schedule_tree_free(tree));

	updated = isl_schedule_tree_domain_get_domain(tree);
	updated = isl_union_set_subtract(updated,
				isl_union_set_copy(data->domain));
	updated = isl_union_set_union(updated,
				isl_union_set_copy(data->group));

	return isl_schedule_tree_domain_set_domain(tree, updated);
}

/* Make the expansion tree root "tree" refer to the group instances
 * in data->group instead of the original domain elements in data->domain.
 * "pos" is the position in the original schedule tree where the modified
 * "tree" will be attached.
 *
 * Write G_1 -> D_1 for the expansion of "tree" and G_2 -> D_2 for
 * the expansion that was newly introduced in a descendant of "tree".
 * D_2 is required to be a subset of D_1.
 * The new expansion is G_1 -> D_1 with D_2 removed from its range,
 * combined with the composition G_1 -> D_1 . D_2 -> G_2.
 * Similarly, the new contraction is the old one restricted to
 * the universe of the range of the new expansion, combined with
 * G_2 -> D_2 . D_1 -> G_1, where an attempt is made to remove
 * the domain constraints of this second part.
 */
static __isl_give isl_schedule_tree *group_expansion(
	__isl_take isl_schedule_tree *tree, __isl_keep isl_schedule_node *pos,
	struct isl_schedule_group_data *data)
{
	isl_union_set *outer_range, *new_range, *extra_dom;
	isl_union_map *new_exp, *to_group, *from_group;
	isl_union_pw_multi_aff *new_contr, *extra;
	isl_bool contained;

	outer_range = isl_union_map_range(
			isl_schedule_tree_expansion_get_expansion(tree));
	contained = isl_union_set_is_subset(data->domain, outer_range);
	isl_union_set_free(outer_range);
	if (contained < 0)
		return isl_schedule_tree_free(tree);
	if (!contained)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_internal,
			"grouped domain should be part "
			"of outer expansion domain",
			return isl_schedule_tree_free(tree));

	/* G_1 -> D_1 . D_2 -> G_2 */
	new_exp = isl_schedule_tree_expansion_get_expansion(tree);
	to_group = isl_union_map_from_union_pw_multi_aff(
			isl_union_pw_multi_aff_copy(data->contraction));
	to_group = isl_union_map_apply_range(new_exp, to_group);

	/* Original expansion without D_2, plus the part above. */
	new_exp = isl_schedule_tree_expansion_get_expansion(tree);
	new_exp = isl_union_map_subtract_range(new_exp,
				isl_union_set_copy(data->domain));
	new_exp = isl_union_map_union(new_exp, to_group);

	/* Universe of the range of the updated expansion. */
	new_range = isl_union_map_range(
			isl_union_map_universe(isl_union_map_copy(new_exp)));

	/* G_2 -> D_2 . D_1 -> G_1 */
	from_group = isl_union_map_from_union_pw_multi_aff(
			isl_schedule_tree_expansion_get_contraction(tree));
	from_group = isl_union_map_apply_range(
			isl_union_map_copy(data->expansion), from_group);
	extra = isl_union_pw_multi_aff_from_union_map(from_group);

	new_contr = isl_schedule_tree_expansion_get_contraction(tree);
	new_contr = isl_union_pw_multi_aff_intersect_domain(new_contr,
								new_range);

	/* Simplify the extra part with respect to its own domain. */
	extra_dom = isl_union_pw_multi_aff_domain(
				isl_union_pw_multi_aff_copy(extra));
	extra = isl_union_pw_multi_aff_gist(extra, extra_dom);
	new_contr = isl_union_pw_multi_aff_union_add(new_contr, extra);

	return isl_schedule_tree_expansion_set_contraction_and_expansion(tree,
							new_contr, new_exp);
}

/* Make the tree root "tree" refer to the group instances
 * in data->group instead of the original domain elements in data->domain.
 * "pos" is the position in the original schedule tree where the modified
 * "tree" will be attached.
 *
 * Once a domain or expansion node has been handled (data->finished
 * is set), no further modifications need to be performed.
 *
 * For a filter, data->group_universe is added to the filter.
 * data->domain_universe is also removed from the filter if all the domain
 * elements in this universe that reach the filter node are part of
 * the elements that are being grouped by data->expansion.
 * Band, context, domain and expansion nodes are each handled
 * by a dedicated function.
 * Extension nodes are not supported.
 * All other node types are left untouched.
 */
static __isl_give isl_schedule_tree *group_ancestor(
	__isl_take isl_schedule_tree *tree, __isl_keep isl_schedule_node *pos,
	void *user)
{
	struct isl_schedule_group_data *data = user;
	isl_union_set *reaching, *filter;
	isl_bool covered;

	if (!tree || !pos)
		return isl_schedule_tree_free(tree);
	if (data->finished)
		return tree;

	switch (isl_schedule_tree_get_type(tree)) {
	case isl_schedule_node_error:
		return isl_schedule_tree_free(tree);
	case isl_schedule_node_extension:
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_unsupported,
			"grouping not allowed in extended tree",
			return isl_schedule_tree_free(tree));
	case isl_schedule_node_guard:
	case isl_schedule_node_leaf:
	case isl_schedule_node_mark:
	case isl_schedule_node_sequence:
	case isl_schedule_node_set:
		break;
	case isl_schedule_node_band:
		return group_band(tree, pos, data);
	case isl_schedule_node_context:
		return group_context(tree, pos, data);
	case isl_schedule_node_domain:
		tree = group_domain(tree, pos, data);
		data->finished = 1;
		break;
	case isl_schedule_node_expansion:
		tree = group_expansion(tree, pos, data);
		data->finished = 1;
		break;
	case isl_schedule_node_filter:
		reaching = isl_schedule_node_get_domain(pos);
		covered = locally_covered_by_domain(reaching, data);
		isl_union_set_free(reaching);
		if (covered < 0)
			return isl_schedule_tree_free(tree);
		filter = isl_schedule_tree_filter_get_filter(tree);
		if (covered)
			filter = isl_union_set_subtract(filter,
				    isl_union_set_copy(data->domain_universe));
		filter = isl_union_set_union(filter,
				    isl_union_set_copy(data->group_universe));
		return isl_schedule_tree_filter_set_filter(tree, filter);
	}

	return tree;
}

/* Group the domain elements that reach "node" into instances
 * of a single statement with identifier "group_id".
 * In particular, group the domain elements according to their
 * prefix schedule.
 *
 * That is, introduce an expansion node with as contraction
 * the prefix schedule (with the target space replaced by "group_id")
 * and as expansion the inverse of this contraction (with its range
 * intersected with the domain elements that reach "node").
 * The outer nodes are then modified to refer to the group instances
 * instead of the original domain elements.
 *
 * If the schedule depth at "node" is zero, then all the reaching
 * domain elements are mapped to a single zero-dimensional instance
 * of "group_id".
 *
 * No instance of "group_id" is allowed to reach "node" prior
 * to the grouping.
 * No ancestor of "node" is allowed to be an extension node.
 *
 * Both "node" and "group_id" are consumed by this function.
 *
 * Return a pointer to the original node in the tree, i.e., the child
 * of the newly introduced expansion node, or NULL on error.
 */
__isl_give isl_schedule_node *isl_schedule_node_group(
	__isl_take isl_schedule_node *node, __isl_take isl_id *group_id)
{
	struct isl_schedule_group_data data = { 0 };
	isl_space *space;
	isl_union_set *domain;
	isl_union_pw_multi_aff *contraction;
	isl_union_map *expansion;
	isl_bool disjoint;
	isl_size depth;

	depth = isl_schedule_node_get_schedule_depth(node);
	if (depth < 0 || !group_id)
		goto error;
	if (check_insert(node) < 0)
		goto error;

	/* Keep both the exact reaching domain and its universe in "data". */
	domain = isl_schedule_node_get_domain(node);
	data.domain = isl_union_set_copy(domain);
	data.domain_universe = isl_union_set_copy(domain);
	data.domain_universe = isl_union_set_universe(data.domain_universe);

	data.dim = depth;
	if (data.dim == 0) {
		isl_ctx *ctx;
		isl_set *set;
		isl_union_set *group;
		isl_union_map *univ;

		/* Single zero-dimensional group instance named "group_id".
		 * "expansion" is first constructed as domain -> group,
		 * used to derive the contraction and only then reversed.
		 */
		ctx = isl_schedule_node_get_ctx(node);
		space = isl_space_set_alloc(ctx, 0, 0);
		space = isl_space_set_tuple_id(space, isl_dim_set, group_id);
		set = isl_set_universe(isl_space_copy(space));
		group = isl_union_set_from_set(set);
		expansion = isl_union_map_from_domain_and_range(domain, group);
		univ = isl_union_map_universe(isl_union_map_copy(expansion));
		contraction = isl_union_pw_multi_aff_from_union_map(univ);
		expansion = isl_union_map_reverse(expansion);
	} else {
		isl_multi_union_pw_aff *prefix;
		isl_union_set *univ;

		/* The contraction is the prefix schedule with "group_id"
		 * as target tuple, restricted to the universe of "domain".
		 * The expansion is its inverse, with the range restricted
		 * to "domain" itself.
		 */
		prefix =
		isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node);
		prefix = isl_multi_union_pw_aff_set_tuple_id(prefix,
							isl_dim_set, group_id);
		space = isl_multi_union_pw_aff_get_space(prefix);
		contraction = isl_union_pw_multi_aff_from_multi_union_pw_aff(
							prefix);
		univ = isl_union_set_universe(isl_union_set_copy(domain));
		contraction =
		    isl_union_pw_multi_aff_intersect_domain(contraction, univ);
		expansion = isl_union_map_from_union_pw_multi_aff(
				    isl_union_pw_multi_aff_copy(contraction));
		expansion = isl_union_map_reverse(expansion);
		expansion = isl_union_map_intersect_range(expansion, domain);
	}
	/* data.sched is the identity on the space of the group instances. */
	space = isl_space_map_from_set(space);
	data.sched = isl_multi_aff_identity(space);
	data.group = isl_union_map_domain(isl_union_map_copy(expansion));
	data.group = isl_union_set_coalesce(data.group);
	data.group_universe = isl_union_set_copy(data.group);
	data.group_universe = isl_union_set_universe(data.group_universe);
	data.expansion = isl_union_map_copy(expansion);
	data.contraction = isl_union_pw_multi_aff_copy(contraction);
	node = isl_schedule_node_insert_expansion(node, contraction, expansion);

	/* The result of this check is only acted upon after
	 * the fields of "data" have been released below.
	 */
	disjoint = isl_union_set_is_disjoint(data.domain_universe,
					    data.group_universe);

	node = update_ancestors(node, &group_ancestor, &data);

	isl_union_set_free(data.domain);
	isl_union_set_free(data.domain_universe);
	isl_union_set_free(data.group);
	isl_union_set_free(data.group_universe);
	isl_multi_aff_free(data.sched);
	isl_union_map_free(data.expansion);
	isl_union_pw_multi_aff_free(data.contraction);

	/* Move from the new expansion node to the original node. */
	node = isl_schedule_node_child(node, 0);

	if (!node || disjoint < 0)
		return isl_schedule_node_free(node);
	if (!disjoint)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"group instances already reach node",
			return isl_schedule_node_free(node));

	return node;
error:
	isl_schedule_node_free(node);
	isl_id_free(group_id);
	return NULL;
}

/* Simplify the band node "node" with respect to "context".
 *
 * The gist is computed on the tree of "node" and the result
 * is grafted back into "node".
 */
__isl_give isl_schedule_node *isl_schedule_node_band_gist(
	__isl_take isl_schedule_node *node, __isl_take isl_union_set *context)
{
	isl_schedule_tree *band;

	band = isl_schedule_tree_band_gist(isl_schedule_node_get_tree(node),
					    context);

	return isl_schedule_node_graft_tree(node, band);
}

/* Internal data structure for isl_schedule_node_gist.
 * "n_expansion" is the number of outer expansion nodes
 * with respect to the current position.
 * "filters" contains an element for each outer filter, expansion or
 * extension node with respect to the current position, each representing
 * the intersection of the previous element and the filter on the filter node
 * or the expansion/extension of the previous element.
 * The first element is the original context passed to isl_schedule_node_gist.
 */
struct isl_node_gist_data {
	int n_expansion;
	isl_union_set_list *filters;
};

/* Enter the expansion node "node" during an isl_schedule_node_gist traversal.
 *
 * The last element of data->filters is expanded through the expansion
 * of "node" and the result is appended to data->filters.
 * The contraction of "node" is replaced by its gist with respect to
 * this expanded set, while the expansion of "node" is replaced by
 * its gist with respect to the (unexpanded) previous element.
 * The counter of outer expansions is incremented.
 */
static __isl_give isl_schedule_node *gist_enter_expansion(
	__isl_take isl_schedule_node *node, struct isl_node_gist_data *data)
{
	isl_size n_filter;
	isl_union_set *outer, *expanded;
	isl_union_map *exp;
	isl_union_pw_multi_aff *contr;

	data->n_expansion++;

	n_filter = isl_union_set_list_n_union_set(data->filters);
	if (n_filter < 0)
		return isl_schedule_node_free(node);

	/* Expand the current innermost set of reaching elements. */
	outer = isl_union_set_list_get_union_set(data->filters, n_filter - 1);
	exp = isl_schedule_node_expansion_get_expansion(node);
	expanded = isl_union_set_apply(outer, exp);

	contr = isl_schedule_node_expansion_get_contraction(node);
	contr = isl_union_pw_multi_aff_gist(contr,
					isl_union_set_copy(expanded));

	data->filters = isl_union_set_list_add(data->filters, expanded);

	/* The element at position n_filter - 1 is still the one
	 * from before the expansion.
	 */
	outer = isl_union_set_list_get_union_set(data->filters, n_filter - 1);
	exp = isl_schedule_node_expansion_get_expansion(node);
	exp = isl_union_map_gist_domain(exp, outer);

	return isl_schedule_node_expansion_set_contraction_and_expansion(node,
								contr, exp);
}

/* Leave the expansion node "node" during an isl_schedule_node_gist traversal.
 *
 * The element that gist_enter_expansion appended to data->filters
 * is dropped again and the counter of outer expansions is decremented.
 *
 * The expansion itself was already simplified in gist_enter_expansion.
 * If that simplification turned it into an identity expansion,
 * then the expansion node is deleted here.
 */
static __isl_give isl_schedule_node *gist_leave_expansion(
	__isl_take isl_schedule_node *node, struct isl_node_gist_data *data)
{
	isl_size n_filter;
	isl_bool is_id;
	isl_union_map *exp;

	exp = isl_schedule_node_expansion_get_expansion(node);
	is_id = isl_union_map_is_identity(exp);
	isl_union_map_free(exp);

	if (is_id < 0)
		node = isl_schedule_node_free(node);
	if (is_id > 0)
		node = isl_schedule_node_delete(node);

	n_filter = isl_union_set_list_n_union_set(data->filters);
	if (n_filter < 0)
		return isl_schedule_node_free(node);

	data->filters = isl_union_set_list_drop(data->filters,
						n_filter - 1, 1);
	data->n_expansion--;

	return node;
}

/* Enter the extension node "node" during an isl_schedule_node_gist traversal.
 *
 * Append to data->filters the union of its last element and
 * the range of the extension, i.e., the domain elements
 * that are additionally introduced by the extension node.
 */
static __isl_give isl_schedule_node *gist_enter_extension(
	__isl_take isl_schedule_node *node, struct isl_node_gist_data *data)
{
	isl_size n_filter;
	isl_union_set *reaching, *added;

	n_filter = isl_union_set_list_n_union_set(data->filters);
	if (n_filter < 0)
		return isl_schedule_node_free(node);

	reaching = isl_union_set_list_get_union_set(data->filters,
							n_filter - 1);
	added = isl_union_map_range(
			isl_schedule_node_extension_get_extension(node));
	reaching = isl_union_set_union(reaching, added);
	data->filters = isl_union_set_list_add(data->filters, reaching);

	return node;
}

/* Is there nothing left to gist below the filter node "node"?
 * That is the case if the filter on "node" is a subset of
 * the original context passed to isl_schedule_node_gist
 * (the first element of data->filters).
 * After passing through one or more expansions, the test is skipped
 * (and isl_bool_false is returned) because the current domain elements
 * cannot be compared to those of the original context.
 */
static isl_bool gist_done(__isl_keep isl_schedule_node *node,
	struct isl_node_gist_data *data)
{
	isl_bool contained;
	isl_union_set *node_filter, *context;

	if (data->n_expansion)
		return isl_bool_false;

	node_filter = isl_schedule_node_filter_get_filter(node);
	context = isl_union_set_list_get_union_set(data->filters, 0);
	contained = isl_union_set_is_subset(node_filter, context);
	isl_union_set_free(node_filter);
	isl_union_set_free(context);

	return contained;
}

/* Callback for "traverse" to enter a node and to move
 * to the deepest initial subtree that should be traversed
 * by isl_schedule_node_gist.
 *
 * The "filters" list is extended by one element each time
 * we come across a filter node by the result of intersecting
 * the last element in the list with the filter on the filter node.
 *
 * If the filter on the current filter node is a subset of
 * the original context passed to isl_schedule_node_gist,
 * then there is no need to go into its subtree since it cannot
 * be further simplified by the context.  The "filters" list is
 * still extended for consistency, but the actual value of the
 * added element is immaterial since it will not be used.
 *
 * Otherwise, the filter on the current filter node is replaced by
 * the gist of the original filter with respect to the intersection
 * of the original context with the intermediate filters.
 *
 * If the new element in the "filters" list is empty, then no elements
 * can reach the descendants of the current filter node.  The subtree
 * underneath the filter node is therefore removed.
 *
 * Each expansion node we come across is handled by
 * gist_enter_expansion.
 *
 * Each extension node we come across is handled by
 * gist_enter_extension.
 *
 * "user" points to the struct isl_node_gist_data of the traversal.
 * Return the node at which the descent stopped, or NULL on error.
 */
static __isl_give isl_schedule_node *gist_enter(
	__isl_take isl_schedule_node *node, void *user)
{
	struct isl_node_gist_data *data = user;

	/* Note that each "continue" below jumps to the loop condition,
	 * which moves on to the first child, if there is any.
	 */
	do {
		isl_union_set *filter, *inner;
		isl_bool done, empty;
		isl_size n;

		switch (isl_schedule_node_get_type(node)) {
		case isl_schedule_node_error:
			return isl_schedule_node_free(node);
		case isl_schedule_node_expansion:
			node = gist_enter_expansion(node, data);
			continue;
		case isl_schedule_node_extension:
			node = gist_enter_extension(node, data);
			continue;
		case isl_schedule_node_band:
		case isl_schedule_node_context:
		case isl_schedule_node_domain:
		case isl_schedule_node_guard:
		case isl_schedule_node_leaf:
		case isl_schedule_node_mark:
		case isl_schedule_node_sequence:
		case isl_schedule_node_set:
			continue;
		case isl_schedule_node_filter:
			break;
		}
		/* Only filter nodes get here. */
		done = gist_done(node, data);
		filter = isl_schedule_node_filter_get_filter(node);
		n = isl_union_set_list_n_union_set(data->filters);
		if (n < 0 || done < 0 || done) {
			/* Hand "filter" over to the list in all cases,
			 * including the error cases.
			 */
			data->filters = isl_union_set_list_add(data->filters,
								filter);
			if (n < 0 || done < 0)
				return isl_schedule_node_free(node);
			return node;
		}
		/* Simplify the filter with respect to the innermost set
		 * of reaching elements and record the intersection.
		 */
		inner = isl_union_set_list_get_union_set(data->filters, n - 1);
		filter = isl_union_set_gist(filter, isl_union_set_copy(inner));
		node = isl_schedule_node_filter_set_filter(node,
						isl_union_set_copy(filter));
		filter = isl_union_set_intersect(filter, inner);
		empty = isl_union_set_is_empty(filter);
		data->filters = isl_union_set_list_add(data->filters, filter);
		if (empty < 0)
			return isl_schedule_node_free(node);
		if (!empty)
			continue;
		/* Nothing reaches the subtree: cut it off and stop here. */
		node = isl_schedule_node_child(node, 0);
		node = isl_schedule_node_cut(node);
		node = isl_schedule_node_parent(node);
		return node;
	} while (isl_schedule_node_has_children(node) &&
		(node = isl_schedule_node_first_child(node)) != NULL);

	return node;
}

/* Callback for "traverse" to leave a node for isl_schedule_node_gist.
 *
 * Filter and extension nodes: drop the element of the "filters" list
 * that was added when the node was entered.  No gist needs to be computed
 * at this point since that already happened on entry.
 *
 * Expansion nodes: handled by gist_leave_expansion.
 *
 * Band nodes: compute the gist of the band with respect to
 * the last element of the "filters" list, i.e., the intersection of
 * the original context and the intermediate filters.
 *
 * Sequence and set nodes: remove the filter children that have
 * become empty.  If exactly one child remains, then both the set
 * or sequence node and the remaining child filter are removed.
 * The filter can be removed because the filters on a sequence or set node
 * are supposed to partition the incoming domain instances.
 * In principle, it should be impossible for no children to remain,
 * but should this happen, the entire subtree is replaced
 * by an empty filter.
 *
 * All other node types are left alone.
 */
static __isl_give isl_schedule_node *gist_leave(
	__isl_take isl_schedule_node *node, void *user)
{
	struct isl_node_gist_data *data = user;
	isl_schedule_tree *tree;
	isl_union_set *context;
	isl_size n_filter, n_child;
	int i;

	switch (isl_schedule_node_get_type(node)) {
	case isl_schedule_node_error:
		return isl_schedule_node_free(node);
	case isl_schedule_node_context:
	case isl_schedule_node_domain:
	case isl_schedule_node_guard:
	case isl_schedule_node_leaf:
	case isl_schedule_node_mark:
		break;
	case isl_schedule_node_expansion:
		return gist_leave_expansion(node, data);
	case isl_schedule_node_band:
		n_filter = isl_union_set_list_n_union_set(data->filters);
		if (n_filter < 0)
			return isl_schedule_node_free(node);
		context = isl_union_set_list_get_union_set(data->filters,
							n_filter - 1);
		return isl_schedule_node_band_gist(node, context);
	case isl_schedule_node_extension:
	case isl_schedule_node_filter:
		n_filter = isl_union_set_list_n_union_set(data->filters);
		if (n_filter < 0)
			return isl_schedule_node_free(node);
		data->filters = isl_union_set_list_drop(data->filters,
							n_filter - 1, 1);
		break;
	case isl_schedule_node_set:
	case isl_schedule_node_sequence:
		tree = isl_schedule_node_get_tree(node);
		n_child = isl_schedule_tree_n_children(tree);
		if (n_child < 0)
			tree = isl_schedule_tree_free(tree);
		/* Visit the children from last to first so that
		 * dropping one does not shift those still to be visited.
		 */
		for (i = n_child; i > 0; --i) {
			isl_schedule_tree *child;
			isl_union_set *child_filter;
			isl_bool empty;

			child = isl_schedule_tree_get_child(tree, i - 1);
			child_filter =
				isl_schedule_tree_filter_get_filter(child);
			empty = isl_union_set_is_empty(child_filter);
			isl_schedule_tree_free(child);
			isl_union_set_free(child_filter);
			if (empty < 0)
				tree = isl_schedule_tree_free(tree);
			else if (empty)
				tree = isl_schedule_tree_drop_child(tree,
								    i - 1);
		}
		n_child = isl_schedule_tree_n_children(tree);
		if (n_child < 0)
			tree = isl_schedule_tree_free(tree);
		node = isl_schedule_node_graft_tree(node, tree);
		if (n_child == 0) {
			isl_space *space;

			context = isl_union_set_list_get_union_set(
							data->filters, 0);
			space = isl_union_set_get_space(context);
			isl_union_set_free(context);
			node = isl_schedule_node_cut(node);
			node = isl_schedule_node_insert_filter(node,
						isl_union_set_empty(space));
		} else if (n_child == 1) {
			/* First the set/sequence, then its only filter. */
			node = isl_schedule_node_delete(node);
			node = isl_schedule_node_delete(node);
		}
		break;
	}

	return node;
}

/* Simplify the subtree rooted at "node" with respect to
 * the reaching domain elements in "context".
 * That is, every band and filter node in the subtree is replaced
 * by its gist with respect to "context".  Children of set or sequence
 * nodes that end up with an empty filter are removed completely.
 *
 * During the traversal, the last element of the "filters" list holds
 * the intersection of "context" with all outer filters of the current
 * node within the subtree.  The list starts out with "context" as
 * its only element and grows or shrinks each time a filter node
 * is entered or left.
 */
__isl_give isl_schedule_node *isl_schedule_node_gist(
	__isl_take isl_schedule_node *node, __isl_take isl_union_set *context)
{
	struct isl_node_gist_data gist = { 0 };

	gist.filters = isl_union_set_list_from_union_set(context);
	node = traverse(node, &gist_enter, &gist_leave, &gist);
	isl_union_set_list_free(gist.filters);

	return node;
}

/* Restrict the domain of the domain node "node" to "domain".
 *
 * Nothing changes if the domain of "node" is already contained
 * in "domain".
 *
 * Otherwise, the domain of the domain node is replaced by
 * the intersection and the subtree below "node" is simplified
 * with respect to this intersection.
 */
__isl_give isl_schedule_node *isl_schedule_node_domain_intersect_domain(
	__isl_take isl_schedule_node *node, __isl_take isl_union_set *domain)
{
	isl_schedule_tree *tree;
	isl_union_set *current, *restricted;
	int covered;

	if (!node || !domain)
		goto error;

	current = isl_schedule_tree_domain_get_domain(node->tree);
	covered = isl_union_set_is_subset(current, domain);
	isl_union_set_free(current);
	if (covered < 0)
		goto error;
	if (covered) {
		isl_union_set_free(domain);
		return node;
	}

	tree = isl_schedule_tree_copy(node->tree);
	restricted = isl_schedule_tree_domain_get_domain(tree);
	restricted = isl_union_set_intersect(restricted, domain);
	tree = isl_schedule_tree_domain_set_domain(tree,
					isl_union_set_copy(restricted));
	node = isl_schedule_node_graft_tree(node, tree);

	/* Simplify everything underneath the domain node. */
	node = isl_schedule_node_child(node, 0);
	node = isl_schedule_node_gist(node, restricted);

	return isl_schedule_node_parent(node);
error:
	isl_schedule_node_free(node);
	isl_union_set_free(domain);
	return NULL;
}

/* Simplify the domain of the domain node "node" with respect to
 * the parameter domain "context", i.e., replace the domain
 * by its gist with respect to "context".
 */
__isl_give isl_schedule_node *isl_schedule_node_domain_gist_params(
	__isl_take isl_schedule_node *node, __isl_take isl_set *context)
{
	isl_union_set *simplified;
	isl_schedule_tree *updated;

	if (!node || !context)
		goto error;

	simplified = isl_schedule_tree_domain_get_domain(node->tree);
	simplified = isl_union_set_gist_params(simplified, context);
	updated = isl_schedule_tree_copy(node->tree);
	updated = isl_schedule_tree_domain_set_domain(updated, simplified);

	return isl_schedule_node_graft_tree(node, updated);
error:
	isl_schedule_node_free(node);
	isl_set_free(context);
	return NULL;
}

/* Internal data structure for isl_schedule_node_get_subtree_expansion.
 * "expansions" contains a list of accumulated expansions
 * for each outer expansion, set or sequence node.  The first element
 * in the list is an identity mapping on the reaching domain elements.
 * "res" collects the results, i.e., the union of the accumulated
 * expansions at the leaves that have been visited.
 */
struct isl_subtree_expansion_data {
	isl_union_map_list *expansions;
	isl_union_map *res;
};

/* Callback for "traverse" to enter a node and to move
 * to the deepest initial subtree that should be traversed
 * by isl_schedule_node_get_subtree_expansion.
 *
 * At an expansion node, the last element of data->expansions
 * is replaced by its composition with the expansion of the node.
 *
 * At a filter node that is a child of a set or sequence node,
 * a new element is appended to data->expansions, consisting of
 * the previous last element with its range restricted to
 * the elements selected by the filter.
 * The previous element is left in place so that it can be reused
 * while backtracking.
 *
 * Other nodes do not affect data->expansions.
 */
static __isl_give isl_schedule_node *subtree_expansion_enter(
	__isl_take isl_schedule_node *node, void *user)
{
	struct isl_subtree_expansion_data *data = user;

	do {
		enum isl_schedule_node_type parent_type;
		isl_union_set *selected;
		isl_union_map *acc, *local;
		isl_size n_exp;

		switch (isl_schedule_node_get_type(node)) {
		case isl_schedule_node_error:
			return isl_schedule_node_free(node);
		case isl_schedule_node_band:
		case isl_schedule_node_context:
		case isl_schedule_node_domain:
		case isl_schedule_node_extension:
		case isl_schedule_node_guard:
		case isl_schedule_node_leaf:
		case isl_schedule_node_mark:
		case isl_schedule_node_sequence:
		case isl_schedule_node_set:
			break;
		case isl_schedule_node_expansion:
			n_exp = isl_union_map_list_n_union_map(
							data->expansions);
			if (n_exp < 0)
				data->expansions =
				    isl_union_map_list_free(data->expansions);
			local = isl_schedule_node_expansion_get_expansion(node);
			acc = isl_union_map_list_get_union_map(
					data->expansions, n_exp - 1);
			acc = isl_union_map_apply_range(acc, local);
			data->expansions = isl_union_map_list_set_union_map(
					data->expansions, n_exp - 1, acc);
			break;
		case isl_schedule_node_filter:
			parent_type = isl_schedule_node_get_parent_type(node);
			if (parent_type == isl_schedule_node_set ||
			    parent_type == isl_schedule_node_sequence) {
				selected =
				    isl_schedule_node_filter_get_filter(node);
				n_exp = isl_union_map_list_n_union_map(
							data->expansions);
				if (n_exp < 0)
					data->expansions =
					    isl_union_map_list_free(
							data->expansions);
				acc = isl_union_map_list_get_union_map(
						data->expansions, n_exp - 1);
				acc = isl_union_map_intersect_range(acc,
								selected);
				data->expansions = isl_union_map_list_add(
						data->expansions, acc);
			}
			break;
		}
	} while (isl_schedule_node_has_children(node) &&
		(node = isl_schedule_node_first_child(node)) != NULL);

	return node;
}

/* Callback for "traverse" to leave a node for
 * isl_schedule_node_get_subtree_expansion.
 *
 * When leaving a filter node that is a child of a set or sequence node,
 * the element of data->expansions that was appended
 * in subtree_expansion_enter is dropped again.
 *
 * When leaving a leaf node, the accumulated expansion
 * (the last element of data->expansions) is added to data->res.
 *
 * Nothing happens for any other type of node.
 */
static __isl_give isl_schedule_node *subtree_expansion_leave(
	__isl_take isl_schedule_node *node, void *user)
{
	struct isl_subtree_expansion_data *data = user;
	enum isl_schedule_node_type parent_type;
	isl_union_map *acc;
	isl_size n_exp;

	switch (isl_schedule_node_get_type(node)) {
	case isl_schedule_node_error:
		return isl_schedule_node_free(node);
	case isl_schedule_node_band:
	case isl_schedule_node_context:
	case isl_schedule_node_domain:
	case isl_schedule_node_expansion:
	case isl_schedule_node_extension:
	case isl_schedule_node_guard:
	case isl_schedule_node_mark:
	case isl_schedule_node_sequence:
	case isl_schedule_node_set:
		break;
	case isl_schedule_node_leaf:
		n_exp = isl_union_map_list_n_union_map(data->expansions);
		if (n_exp < 0)
			data->expansions =
				    isl_union_map_list_free(data->expansions);
		acc = isl_union_map_list_get_union_map(data->expansions,
							n_exp - 1);
		data->res = isl_union_map_union(data->res, acc);
		break;
	case isl_schedule_node_filter:
		parent_type = isl_schedule_node_get_parent_type(node);
		if (parent_type == isl_schedule_node_set ||
		    parent_type == isl_schedule_node_sequence) {
			n_exp = isl_union_map_list_n_union_map(
							data->expansions);
			if (n_exp < 0)
				data->expansions =
				    isl_union_map_list_free(data->expansions);
			data->expansions = isl_union_map_list_drop(
					data->expansions, n_exp - 1, 1);
		}
		break;
	}

	return node;
}

/* Return a mapping from the domain elements that reach "node"
 * to the corresponding domain elements in the leaves of the subtree
 * rooted at "node", obtained by composing the intermediate expansions.
 *
 * The computation starts from an identity mapping on the (universe of the)
 * domain elements that reach "node".  While traversing the subtree,
 * this mapping is composed with every expansion on the path
 * from "node" to a leaf.
 * Within the children of a sequence or set node, the
 * accumulated expansion is restricted to the elements selected
 * by the filter child.
 *
 * NULL is returned if "node" is NULL or if the traversal fails.
 */
__isl_give isl_union_map *isl_schedule_node_get_subtree_expansion(
	__isl_keep isl_schedule_node *node)
{
	struct isl_subtree_expansion_data data;
	isl_union_set *reaching;
	isl_union_map *identity;
	isl_schedule_node *pos;

	if (!node)
		return NULL;

	reaching = isl_schedule_node_get_universe_domain(node);
	data.res = isl_union_map_empty(isl_union_set_get_space(reaching));
	identity = isl_union_set_identity(reaching);
	data.expansions = isl_union_map_list_from_union_map(identity);

	/* Traverse a private copy so that "node" itself is not consumed. */
	pos = traverse(isl_schedule_node_copy(node),
			&subtree_expansion_enter,
			&subtree_expansion_leave, &data);
	if (!pos)
		data.res = isl_union_map_free(data.res);
	isl_schedule_node_free(pos);
	isl_union_map_list_free(data.expansions);

	return data.res;
}

/* Internal data structure for isl_schedule_node_get_subtree_contraction.
 * "contractions" contains a list of accumulated contractions
 * for each outer expansion, set or sequence node.  The first element
 * in the list is an identity mapping on the reaching domain elements.
 * While traversing the subtree, the last element of the list is
 * the contraction accumulated along the path to the current node.
 * "res" collects the results.  It starts out empty and is extended
 * with the last element of "contractions" at each leaf.
 */
struct isl_subtree_contraction_data {
	/* stack of accumulated contractions; the last element is current */
	isl_union_pw_multi_aff_list *contractions;
	/* union of the accumulated contractions at the leaves seen so far */
	isl_union_pw_multi_aff *res;
};

/* "enter" callback handed to "traverse" by
 * isl_schedule_node_get_subtree_contraction.
 * Starting from "node", keep descending into the first child
 * until a node without children is reached, updating
 * data->contractions for every node that is visited on the way.
 *
 * For an expansion node, the last element of data->contractions
 * is replaced by its pullback over the contraction of the expansion node.
 *
 * For a filter node that is the child of a set or sequence node,
 * a new element is appended to data->contractions, equal to
 * the previous last element with its domain intersected with the filter.
 * The previous element is kept in the list such that it can be
 * reused while backtracking.
 *
 * If the type of a visited node cannot be determined, then the node
 * is freed and NULL is returned.
 */
static __isl_give isl_schedule_node *subtree_contraction_enter(
	__isl_take isl_schedule_node *node, void *user)
{
	struct isl_subtree_contraction_data *data = user;

	for (;;) {
		enum isl_schedule_node_type type;
		isl_union_set *filter;
		isl_union_pw_multi_aff *inner, *contraction;
		isl_size n;

		type = isl_schedule_node_get_type(node);
		if (type == isl_schedule_node_error)
			return isl_schedule_node_free(node);

		if (type == isl_schedule_node_expansion) {
			n = isl_union_pw_multi_aff_list_n_union_pw_multi_aff(
						data->contractions);
			if (n < 0)
				data->contractions =
				    isl_union_pw_multi_aff_list_free(
							    data->contractions);
			contraction =
			    isl_schedule_node_expansion_get_contraction(node);
			inner =
			    isl_union_pw_multi_aff_list_get_union_pw_multi_aff(
						data->contractions, n - 1);
			inner =
			    isl_union_pw_multi_aff_pullback_union_pw_multi_aff(
						inner, contraction);
			data->contractions =
			    isl_union_pw_multi_aff_list_set_union_pw_multi_aff(
					data->contractions, n - 1, inner);
		} else if (type == isl_schedule_node_filter) {
			type = isl_schedule_node_get_parent_type(node);
			if (type == isl_schedule_node_set ||
			    type == isl_schedule_node_sequence) {
				filter =
				    isl_schedule_node_filter_get_filter(node);
				n =
				 isl_union_pw_multi_aff_list_n_union_pw_multi_aff(
						data->contractions);
				if (n < 0)
					data->contractions =
					    isl_union_pw_multi_aff_list_free(
							    data->contractions);
				inner =
			    isl_union_pw_multi_aff_list_get_union_pw_multi_aff(
						data->contractions, n - 1);
				inner =
				    isl_union_pw_multi_aff_intersect_domain(
								inner, filter);
				data->contractions =
				    isl_union_pw_multi_aff_list_add(
						data->contractions, inner);
			}
		}

		/* Stop at a node without children or on failure to descend. */
		if (!isl_schedule_node_has_children(node))
			break;
		node = isl_schedule_node_first_child(node);
		if (!node)
			break;
	}

	return node;
}

/* "leave" callback handed to "traverse" by
 * isl_schedule_node_get_subtree_contraction.
 *
 * When leaving a filter node whose parent is a set or sequence node,
 * drop the last element of data->contractions again.  This is
 * the element that subtree_contraction_enter appended for this filter.
 *
 * When leaving a leaf node, add the last element of data->contractions,
 * i.e., the accumulated contraction, to data->res.
 *
 * Nodes of any other type are returned unchanged.
 * If the type of "node" cannot be determined, then "node" is freed and
 * NULL is returned.
 */
static __isl_give isl_schedule_node *subtree_contraction_leave(
	__isl_take isl_schedule_node *node, void *user)
{
	struct isl_subtree_contraction_data *data = user;
	enum isl_schedule_node_type type;
	isl_union_pw_multi_aff *inner;
	isl_size n;

	type = isl_schedule_node_get_type(node);
	if (type == isl_schedule_node_error)
		return isl_schedule_node_free(node);

	if (type == isl_schedule_node_leaf) {
		n = isl_union_pw_multi_aff_list_n_union_pw_multi_aff(
						data->contractions);
		if (n < 0)
			data->contractions = isl_union_pw_multi_aff_list_free(
							    data->contractions);
		inner = isl_union_pw_multi_aff_list_get_union_pw_multi_aff(
						data->contractions, n - 1);
		data->res = isl_union_pw_multi_aff_union_add(data->res, inner);
		return node;
	}

	if (type != isl_schedule_node_filter)
		return node;

	/* Only filters directly below a set or sequence pushed an element. */
	type = isl_schedule_node_get_parent_type(node);
	if (type != isl_schedule_node_set &&
	    type != isl_schedule_node_sequence)
		return node;

	n = isl_union_pw_multi_aff_list_n_union_pw_multi_aff(
						data->contractions);
	if (n < 0)
		data->contractions = isl_union_pw_multi_aff_list_free(
							    data->contractions);
	data->contractions =
		isl_union_pw_multi_aff_list_drop(data->contractions, n - 1, 1);

	return node;
}

/* Return a mapping from the domain elements in the leaves of the subtree
 * rooted at "node" to the corresponding domain elements that reach "node",
 * obtained by composing the intermediate contractions.
 *
 * The computation starts from an identity mapping on the (universe of the)
 * domain elements that reach "node".  While traversing the subtree,
 * this mapping is composed with every contraction on the path
 * from "node" to a leaf.
 * Within the children of a sequence or set node, the
 * accumulated contraction is restricted to the elements selected
 * by the filter child.
 *
 * NULL is returned if "node" is NULL or if the traversal fails.
 */
__isl_give isl_union_pw_multi_aff *isl_schedule_node_get_subtree_contraction(
	__isl_keep isl_schedule_node *node)
{
	struct isl_subtree_contraction_data data;
	isl_union_set *reaching;
	isl_union_pw_multi_aff *identity;
	isl_schedule_node *pos;

	if (!node)
		return NULL;

	reaching = isl_schedule_node_get_universe_domain(node);
	data.res =
	    isl_union_pw_multi_aff_empty(isl_union_set_get_space(reaching));
	identity = isl_union_set_identity_union_pw_multi_aff(reaching);
	data.contractions =
	    isl_union_pw_multi_aff_list_from_union_pw_multi_aff(identity);

	/* Traverse a private copy so that "node" itself is not consumed. */
	pos = traverse(isl_schedule_node_copy(node),
			&subtree_contraction_enter,
			&subtree_contraction_leave, &data);
	if (!pos)
		data.res = isl_union_pw_multi_aff_free(data.res);
	isl_schedule_node_free(pos);
	isl_union_pw_multi_aff_list_free(data.contractions);

	return data.res;
}

/* Check whether the "n" closest ancestors of "node" have the types
 * listed in "types", where types[0] is the expected type of the parent
 * of "node", types[1] that of its grandparent, and so on.
 *
 * Return isl_bool_false if "node" has fewer than "n" ancestors or
 * if any of the types does not match, and isl_bool_error on failure.
 */
static isl_bool has_ancestors(__isl_keep isl_schedule_node *node,
	int n, enum isl_schedule_node_type *types)
{
	isl_size n_ancestor;
	int k;

	if (!node)
		return isl_bool_error;

	n_ancestor = isl_schedule_tree_list_n_schedule_tree(node->ancestors);
	if (n_ancestor < 0)
		return isl_bool_error;
	if (n > n_ancestor)
		return isl_bool_false;

	/* The parent is the last element in the list of ancestors. */
	for (k = 0; k < n; ++k) {
		enum isl_schedule_node_type found;
		isl_schedule_tree *ancestor;

		ancestor = isl_schedule_tree_list_get_schedule_tree(
					node->ancestors, n_ancestor - 1 - k);
		if (!ancestor)
			return isl_bool_error;
		found = isl_schedule_tree_get_type(ancestor);
		isl_schedule_tree_free(ancestor);
		if (found != types[k])
			return isl_bool_false;
	}

	return isl_bool_true;
}

/* Given a node "node" that appears in an extension (i.e., it is the child
 * of a filter in a sequence inside an extension node), check whether
 * the spaces of the elements introduced by "extension" are disjoint
 * from those of both the original extension and the domain elements
 * that reach that original extension.
 *
 * The check is performed by intersecting the range of "extension"
 * with the union of the universe of the reaching domain and
 * the universe of the range of the original extension, and
 * testing whether the result is empty.
 *
 * Return a negative value on error.
 */
static int is_disjoint_extension(__isl_keep isl_schedule_node *node,
	__isl_keep isl_union_map *extension)
{
	isl_union_map *old, *overlap;
	isl_union_set *taken;
	int level;
	int empty;

	/* Move up past the filter and the sequence to the extension node. */
	node = isl_schedule_node_copy(node);
	for (level = 0; level < 3; ++level)
		node = isl_schedule_node_parent(node);

	old = isl_schedule_node_extension_get_extension(node);
	taken = isl_schedule_node_get_universe_domain(node);
	isl_schedule_node_free(node);

	old = isl_union_map_universe(old);
	taken = isl_union_set_union(taken, isl_union_map_range(old));

	overlap = isl_union_map_copy(extension);
	overlap = isl_union_map_intersect_range(overlap, taken);
	empty = isl_union_map_is_empty(overlap);
	isl_union_map_free(overlap);

	return empty;
}

/* Given a node "node" that is governed by an extension node, add
 * "extension" to the extension of that extension node.
 *
 * In particular, "node" is the child of a filter in a sequence that
 * is in turn a child of an extension node.  The extension of that
 * extension node is replaced by its union with "extension".
 * "extension" is required to be disjoint from the extension
 * that was already there; an error is reported otherwise.
 *
 * Return a pointer to the parent of the original node (i.e., a filter).
 */
static __isl_give isl_schedule_node *extend_extension(
	__isl_take isl_schedule_node *node, __isl_take isl_union_map *extension)
{
	isl_union_map *current;
	isl_bool disjoint;
	isl_size pos;

	/* Move to the filter and remember its position in the sequence. */
	node = isl_schedule_node_parent(node);
	pos = isl_schedule_node_get_child_position(node);
	if (pos < 0)
		node = isl_schedule_node_free(node);

	/* Move on to the extension node and update its extension. */
	node = isl_schedule_node_parent(isl_schedule_node_parent(node));
	current = isl_schedule_node_extension_get_extension(node);
	disjoint = isl_union_map_is_disjoint(extension, current);
	node = isl_schedule_node_extension_set_extension(node,
				isl_union_map_union(extension, current));

	/* Move back down to the filter. */
	node = isl_schedule_node_child(isl_schedule_node_child(node, 0), pos);

	if (disjoint < 0)
		return isl_schedule_node_free(node);
	if (!node)
		return NULL;
	if (!disjoint)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"extension domain should be disjoint from earlier "
			"extensions", return isl_schedule_node_free(node));

	return node;
}

/* Return the universe of "uset" if this universe is disjoint from "ref".
 * Otherwise, return "uset" itself.
 *
 * "uset" itself is required to be disjoint from "ref".
 * If it is not, then an error is reported and NULL is returned.
 */
static __isl_give isl_union_set *replace_by_universe_if_disjoint(
	__isl_take isl_union_set *uset, __isl_keep isl_union_set *ref)
{
	isl_union_set *whole;
	int disjoint;

	disjoint = isl_union_set_is_disjoint(uset, ref);
	if (disjoint < 0)
		return isl_union_set_free(uset);
	if (!disjoint)
		isl_die(isl_union_set_get_ctx(uset), isl_error_invalid,
			"extension domain should be disjoint from "
			"current domain", return isl_union_set_free(uset));

	whole = isl_union_set_universe(isl_union_set_copy(uset));
	disjoint = isl_union_set_is_disjoint(whole, ref);
	if (disjoint < 0 || !disjoint) {
		/* The universe cannot be used; fall back to "uset" or fail. */
		isl_union_set_free(whole);
		return disjoint < 0 ? isl_union_set_free(uset) : uset;
	}

	isl_union_set_free(uset);
	return whole;
}

/* Insert an extension node with extension "extension" on top of "node",
 * together with a filter between the extension node and "node"
 * that separates "node" from the elements introduced by the extension.
 * Return a pointer to the inserted filter node.
 *
 * If "node" already appears in an extension (i.e., if it is the child
 * of a filter in a sequence inside an extension node), then that
 * extension is extended with "extension" instead and a pointer
 * to the original filter node is returned.
 * This is only done if none of the elements in the new extension live
 * in the same space as those of the original extension or the domain
 * elements reaching the original extension.  Otherwise, a new extension
 * node is inserted anyway, because the filters in the sequence child
 * of the original extension would need to be adjusted to filter out
 * the elements of the new extension.
 */
static __isl_give isl_schedule_node *insert_extension(
	__isl_take isl_schedule_node *node, __isl_take isl_union_map *extension)
{
	enum isl_schedule_node_type ancestors[] = {
		isl_schedule_node_filter,
		isl_schedule_node_sequence,
		isl_schedule_node_extension
	};
	isl_union_set *added;
	isl_union_set *filter;
	isl_bool in_ext;
	int disjoint;

	in_ext = has_ancestors(node, 3, ancestors);
	if (in_ext < 0)
		goto error;
	if (in_ext) {
		disjoint = is_disjoint_extension(node, extension);
		if (disjoint < 0)
			goto error;
		if (disjoint)
			return extend_extension(node, extension);
	}

	/* Filter "node" on its own domain, or on the corresponding
	 * universe if that does not overlap with the new elements.
	 */
	added = isl_union_map_range(isl_union_map_copy(extension));
	filter = isl_schedule_node_get_domain(node);
	filter = replace_by_universe_if_disjoint(filter, added);
	isl_union_set_free(added);

	node = isl_schedule_node_insert_filter(node, filter);
	node = isl_schedule_node_insert_extension(node, extension);
	return isl_schedule_node_child(node, 0);
error:
	isl_schedule_node_free(node);
	isl_union_map_free(extension);
	return NULL;
}

/* Replace the subtree that "node" points to by "tree" (which has
 * a sequence root with two children).  If, however, the parent of "node"
 * is a sequence as well, then the children of "tree" are spliced
 * into that parent at the position of "node" instead.
 * Return a pointer to the child of the "tree_pos" (filter) child of "tree"
 * in the updated schedule tree.
 */
static __isl_give isl_schedule_node *graft_or_splice(
	__isl_take isl_schedule_node *node, __isl_take isl_schedule_tree *tree,
	int tree_pos)
{
	isl_size pos = 0;

	if (isl_schedule_node_get_parent_type(node) !=
	    isl_schedule_node_sequence) {
		node = isl_schedule_node_graft_tree(node, tree);
	} else {
		pos = isl_schedule_node_get_child_position(node);
		if (pos < 0)
			node = isl_schedule_node_free(node);
		node = isl_schedule_node_sequence_splice(
				isl_schedule_node_parent(node), pos, tree);
	}

	/* Select the requested filter and then move to its child. */
	node = isl_schedule_node_child(node, pos + tree_pos);
	return isl_schedule_node_child(node, 0);
}

/* Insert a node "graft" into the schedule tree of "node" such that it
 * is executed before (if "before" is set) or after (if "before" is not set)
 * the node that "node" points to.
 * The root of "graft" is an extension node.
 * Return a pointer to the node that "node" pointed to.
 *
 * First, an extension node is inserted on top of "node" (or an
 * already present extension node is extended), with a filter on "node"
 * separating it from the extension.
 * Next, the child of the root of "graft" (if any) is put underneath
 * a filter that separates it from the original domain elements, and
 * the original and the new tree are combined in a sequence.
 * If an existing extension node was extended, then the children of this
 * sequence are spliced into the sequence of that extension node
 * at the position where "node" appears in the original extension.
 * Otherwise, the sequence pair is attached to the new extension node.
 */
static __isl_give isl_schedule_node *graft_extension(
	__isl_take isl_schedule_node *node, __isl_take isl_schedule_node *graft,
	int before)
{
	isl_union_map *extension;
	isl_union_set *added, *reaching;
	isl_schedule_tree *tree, *tree_graft;

	extension = isl_schedule_node_extension_get_extension(graft);
	added = isl_union_map_range(isl_union_map_copy(extension));
	reaching = isl_schedule_node_get_universe_domain(node);
	node = insert_extension(node, extension);

	added = replace_by_universe_if_disjoint(added, reaching);
	isl_union_set_free(reaching);

	tree = isl_schedule_node_get_tree(node);
	if (isl_schedule_node_has_children(graft)) {
		graft = isl_schedule_node_child(graft, 0);
		tree_graft = isl_schedule_node_get_tree(graft);
		tree_graft = isl_schedule_tree_insert_filter(tree_graft, added);
	} else {
		/* Nothing below the extension: a filter alone suffices. */
		tree_graft = isl_schedule_tree_from_filter(added);
	}

	tree = before ? isl_schedule_tree_sequence_pair(tree_graft, tree) :
			isl_schedule_tree_sequence_pair(tree, tree_graft);
	node = graft_or_splice(node, tree, before);

	isl_schedule_node_free(graft);

	return node;
}

/* Replace the root domain node of "node" by an extension node suitable
 * for insertion at "pos".
 * That is, create an extension node that maps the outer band nodes
 * at "pos" (a set space with as many dimensions as the schedule depth
 * of "pos") to the domain of the root node of "node" and attach
 * the child of this root node to the extension node.
 *
 * The subtree of "node" is not allowed to be anchored.
 */
static __isl_give isl_schedule_node *extension_from_domain(
	__isl_take isl_schedule_node *node, __isl_keep isl_schedule_node *pos)
{
	isl_size depth;
	isl_bool anchored;
	isl_space *space;
	isl_union_set *domain;
	isl_union_set *outer;
	isl_union_map *ext;
	isl_schedule_tree *tree;
	isl_schedule_node *res;

	depth = isl_schedule_node_get_schedule_depth(pos);
	anchored = isl_schedule_node_is_subtree_anchored(node);
	if (depth < 0 || anchored < 0)
		return isl_schedule_node_free(node);
	if (anchored)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_unsupported,
			"cannot graft anchored tree with domain root",
			return isl_schedule_node_free(node));

	domain = isl_schedule_node_domain_get_domain(node);
	space = isl_space_set_from_params(isl_union_set_get_space(domain));
	space = isl_space_add_dims(space, isl_dim_set, depth);
	outer = isl_union_set_from_set(isl_set_universe(space));
	ext = isl_union_map_from_domain_and_range(outer, domain);
	res = isl_schedule_node_from_extension(ext);

	node = isl_schedule_node_child(node, 0);
	if (!node)
		return isl_schedule_node_free(res);
	if (isl_schedule_tree_is_leaf(node->tree)) {
		/* The domain node has no real child; nothing to attach. */
		isl_schedule_node_free(node);
		return res;
	}

	tree = isl_schedule_node_get_tree(node);
	res = isl_schedule_node_child(res, 0);
	res = isl_schedule_node_graft_tree(res, tree);
	res = isl_schedule_node_parent(res);
	isl_schedule_node_free(node);

	return res;
}

/* Insert a node "graft" into the schedule tree of "node" such that it
 * is executed before (if "before" is set) or after (if "before" is not set)
 * the node that "node" points to.
 * The root of "graft" may be either a domain or an extension node.
 * In the latter case, the domain of the extension needs to correspond
 * to the outer band nodes of "node".
 * The elements of the domain or the range of the extension may not
 * intersect with the domain elements that reach "node".
 * The schedule tree of "graft" may not be anchored.
 *
 * A domain root is first turned into an extension root.
 * The schedule tree of "node" is then modified to include an extension
 * node corresponding to the root node of "graft" as a child of the original
 * parent of "node".  The original node that "node" points to and the
 * child of the root node of "graft" are attached to this extension node
 * through a sequence, with appropriate filters and with the child
 * of "graft" appearing before or after the original "node".
 *
 * If "node" already appears inside a sequence that is the child of
 * an extension node and if the spaces of the new domain elements
 * do not overlap with those of the original domain elements,
 * then that extension node is extended with the new extension
 * rather than introducing a new segment of extension and sequence nodes.
 *
 * Return a pointer to the same node in the modified tree that
 * "node" pointed to in the original tree.
 */
static __isl_give isl_schedule_node *isl_schedule_node_graft_before_or_after(
	__isl_take isl_schedule_node *node, __isl_take isl_schedule_node *graft,
	int before)
{
	if (!node || !graft || check_insert(node) < 0)
		goto error;

	if (isl_schedule_node_get_type(graft) == isl_schedule_node_domain) {
		graft = extension_from_domain(graft, node);
		if (!graft)
			goto error;
	}

	if (isl_schedule_node_get_type(graft) == isl_schedule_node_extension)
		return graft_extension(node, graft, before);

	isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
		"expecting domain or extension as root of graft",
		goto error);
error:
	isl_schedule_node_free(node);
	isl_schedule_node_free(graft);
	return NULL;
}

/* Insert a node "graft" into the schedule tree of "node" such that it
 * is executed before the node that "node" points to.
 * The root of "graft" may be either a domain or an extension node.
 * In the latter case, the domain of the extension needs to correspond
 * to the outer band nodes of "node".
 * The elements of the domain or the range of the extension may not
 * intersect with the domain elements that reach "node".
 * The schedule tree of "graft" may not be anchored.
 *
 * Return a pointer to the same node in the modified tree that
 * "node" pointed to in the original tree.
 */
__isl_give isl_schedule_node *isl_schedule_node_graft_before(
	__isl_take isl_schedule_node *node, __isl_take isl_schedule_node *graft)
{
	const int before = 1;

	return isl_schedule_node_graft_before_or_after(node, graft, before);
}

/* Insert a node "graft" into the schedule tree of "node" such that it
 * is executed after the node that "node" points to.
 * The root of "graft" may be either a domain or an extension node.
 * In the latter case, the domain of the extension needs to correspond
 * to the outer band nodes of "node".
 * The elements of the domain or the range of the extension may not
 * intersect with the domain elements that reach "node".
 * The schedule tree of "graft" may not be anchored.
 *
 * Return a pointer to the same node in the modified tree that
 * "node" pointed to in the original tree.
 */
__isl_give isl_schedule_node *isl_schedule_node_graft_after(
	__isl_take isl_schedule_node *node,
	__isl_take isl_schedule_node *graft)
{
	const int before = 0;

	return isl_schedule_node_graft_before_or_after(node, graft, before);
}

/* Split the domain elements that reach "node" into those that satisfy
 * "filter" and those that do not.  Arrange for the first subset to be
 * executed before or after the second subset, depending on the value
 * of "before".
 * Return a pointer to the tree corresponding to the second subset,
 * except when one of the two subsets is empty, in which case
 * the original pointer is returned.
 * If both subsets are non-empty, then a sequence node is introduced
 * to impose the order.  If "node" is the child of a filter
 * in a sequence, then this filter child is replaced by two children
 * in that sequence instead.
 * The children in the sequence are copies of the original subtree,
 * simplified with respect to their filters.
 */
static __isl_give isl_schedule_node *isl_schedule_node_order_before_or_after(
	__isl_take isl_schedule_node *node, __isl_take isl_union_set *filter,
	int before)
{
	enum isl_schedule_node_type ancestors[] = {
		isl_schedule_node_filter,
		isl_schedule_node_sequence
	};
	isl_union_set *node_domain, *parent_filter;
	isl_union_set *node_filter = NULL;
	isl_schedule_node *node2;
	isl_schedule_tree *tree1, *tree2;
	isl_bool empty1, empty2;
	isl_bool in_seq;

	if (!node || !filter || check_insert(node) < 0)
		goto error;

	in_seq = has_ancestors(node, 2, ancestors);
	if (in_seq < 0)
		goto error;

	/* "filter" selects the first subset; "node_filter" the remainder. */
	node_domain = isl_schedule_node_get_domain(node);
	filter = isl_union_set_gist(filter, isl_union_set_copy(node_domain));
	node_filter = isl_union_set_subtract(isl_union_set_copy(node_domain),
						isl_union_set_copy(filter));
	node_filter = isl_union_set_gist(node_filter, node_domain);

	empty1 = isl_union_set_is_empty(filter);
	empty2 = isl_union_set_is_empty(node_filter);
	if (empty1 < 0 || empty2 < 0)
		goto error;
	if (empty1 || empty2) {
		/* Nothing to order. */
		isl_union_set_free(filter);
		isl_union_set_free(node_filter);
		return node;
	}

	if (in_seq) {
		/* Operate on the filter child of the sequence instead and
		 * take its filter into account.
		 */
		node = isl_schedule_node_parent(node);
		parent_filter = isl_schedule_node_filter_get_filter(node);
		node_filter = isl_union_set_intersect(node_filter,
					    isl_union_set_copy(parent_filter));
		filter = isl_union_set_intersect(filter, parent_filter);
	}

	node2 = isl_schedule_node_copy(node);
	node = isl_schedule_node_gist(node, isl_union_set_copy(node_filter));
	node2 = isl_schedule_node_gist(node2, isl_union_set_copy(filter));
	tree1 = isl_schedule_node_get_tree(node);
	tree2 = isl_schedule_node_get_tree(node2);
	tree1 = isl_schedule_tree_insert_filter(tree1, node_filter);
	tree2 = isl_schedule_tree_insert_filter(tree2, filter);
	isl_schedule_node_free(node2);

	tree1 = before ? isl_schedule_tree_sequence_pair(tree2, tree1) :
			 isl_schedule_tree_sequence_pair(tree1, tree2);

	return graft_or_splice(node, tree1, before ? 1 : 0);
error:
	isl_schedule_node_free(node);
	isl_union_set_free(filter);
	isl_union_set_free(node_filter);
	return NULL;
}

/* Split the domain elements that reach "node" into those that satisfy
 * "filter" and those that do not.  Arrange for the first subset to be
 * executed before the second subset.
 * Return a pointer to the tree corresponding to the second subset,
 * except when one of the two subsets is empty, in which case
 * the original pointer is returned.
 */
__isl_give isl_schedule_node *isl_schedule_node_order_before(
	__isl_take isl_schedule_node *node, __isl_take isl_union_set *filter)
{
	const int before = 1;

	return isl_schedule_node_order_before_or_after(node, filter, before);
}

/* Split the domain elements that reach "node" into those that satisfy
 * "filter" and those that do not.  Arrange for the first subset to be
 * executed after the second subset.
 * Return a pointer to the tree corresponding to the second subset,
 * except when one of the two subsets is empty, in which case
 * the original pointer is returned.
 */
__isl_give isl_schedule_node *isl_schedule_node_order_after(
	__isl_take isl_schedule_node *node, __isl_take isl_union_set *filter)
{
	const int before = 0;

	return isl_schedule_node_order_before_or_after(node, filter, before);
}

/* Reset the user pointer on all identifiers of parameters and tuples
 * in the schedule node "node".
 *
 * The tree that "node" points to is extracted, updated and
 * then grafted back at the same position.
 */
__isl_give isl_schedule_node *isl_schedule_node_reset_user(
	__isl_take isl_schedule_node *node)
{
	isl_schedule_tree *updated;

	updated = isl_schedule_tree_reset_user(
				isl_schedule_node_get_tree(node));

	return isl_schedule_node_graft_tree(node, updated);
}

/* Align the parameters of the schedule node "node" to those of "space".
 *
 * The tree that "node" points to is extracted, updated and
 * then grafted back at the same position.
 */
__isl_give isl_schedule_node *isl_schedule_node_align_params(
	__isl_take isl_schedule_node *node, __isl_take isl_space *space)
{
	isl_schedule_tree *aligned;

	aligned = isl_schedule_tree_align_params(
				isl_schedule_node_get_tree(node), space);

	return isl_schedule_node_graft_tree(node, aligned);
}

/* Compute the pullback of schedule node "node"
 * by the function represented by "upma".
 * In other words, plug in "upma" in the iteration domains
 * of schedule node "node".
 * We currently do not handle expansion nodes.
 *
 * The tree that "node" points to is extracted, updated and
 * then grafted back at the same position.
 *
 * Note that this is only a helper function for
 * isl_schedule_pullback_union_pw_multi_aff.  In order to maintain consistency,
 * this function should not be called on a single node without also
 * calling it on all the other nodes.
 */
__isl_give isl_schedule_node *isl_schedule_node_pullback_union_pw_multi_aff(
	__isl_take isl_schedule_node *node,
	__isl_take isl_union_pw_multi_aff *upma)
{
	isl_schedule_tree *pulled;

	pulled = isl_schedule_tree_pullback_union_pw_multi_aff(
				isl_schedule_node_get_tree(node), upma);

	return isl_schedule_node_graft_tree(node, pulled);
}

/* Internal data structure for isl_schedule_node_expand.
 * "tree" is the tree that needs to be plugged in in all the leaves.
 * A copy of it is grafted at each leaf by the "expand" callback.
 * "domain" is the set of domain elements in the original leaves
 * to which the tree applies.  Elements reaching a leaf that
 * are not in "domain" are kept in a separate leaf by "expand".
 */
struct isl_schedule_expand_data {
	/* subtree to plug in at every leaf */
	isl_schedule_tree *tree;
	/* leaf domain elements that "tree" applies to */
	isl_union_set *domain;
};

/* If "node" is a leaf, then replace it by (a copy of) data->tree,
 * simplified within its new context.
 * Nodes of any other type are returned unchanged.
 *
 * If some of the domain elements that reach the leaf are not
 * in data->domain, then the tree should not apply to them.
 * In that case, a set node is constructed with two children:
 * data->tree underneath a filter selecting data->domain and
 * a leaf underneath a filter selecting the remaining elements.
 * This set node is then plugged in instead.
 */
static __isl_give isl_schedule_node *expand(__isl_take isl_schedule_node *node,
	void *user)
{
	struct isl_schedule_expand_data *data = user;
	isl_schedule_tree *tree, *leaf;
	isl_union_set *domain, *left;
	isl_bool empty;

	if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf)
		return node;

	domain = isl_schedule_node_get_domain(node);
	tree = isl_schedule_tree_copy(data->tree);

	/* "left" holds the elements that data->tree does not apply to. */
	left = isl_union_set_subtract(isl_union_set_copy(domain),
					isl_union_set_copy(data->domain));
	empty = isl_union_set_is_empty(left);
	if (empty < 0) {
		node = isl_schedule_node_free(node);
		isl_union_set_free(left);
	} else if (empty) {
		isl_union_set_free(left);
	} else {
		leaf = isl_schedule_node_get_leaf(node);
		leaf = isl_schedule_tree_insert_filter(leaf, left);
		tree = isl_schedule_tree_insert_filter(tree,
					isl_union_set_copy(data->domain));
		tree = isl_schedule_tree_set_pair(tree, leaf);
	}

	node = isl_schedule_node_graft_tree(node, tree);

	return isl_schedule_node_gist(node, domain);
}

/* Expand the tree rooted at "node" by extending all leaves
 * with an expansion node with as child "tree".
 * The expansion is determined by "contraction" and "domain".
 * That is, the elements of "domain" are contracted according
 * to "contraction".  The expansion relation is then the inverse
 * of "contraction" with its range intersected with "domain".
 *
 * Insert the appropriate expansion node on top of "tree" and
 * then plug in the result in all leaves of "node".
 *
 * If "node", "contraction" or "tree" is NULL, then "node" is freed,
 * but the remaining steps are still performed on the other arguments.
 * The temporary data.domain and data.tree are freed before returning.
 */
__isl_give isl_schedule_node *isl_schedule_node_expand(
	__isl_take isl_schedule_node *node,
	__isl_take isl_union_pw_multi_aff *contraction,
	__isl_take isl_union_set *domain,
	__isl_take isl_schedule_tree *tree)
{
	struct isl_schedule_expand_data data;
	isl_union_map *expansion;
	isl_union_pw_multi_aff *copy;

	if (!node || !contraction || !tree)
		node = isl_schedule_node_free(node);

	/* expansion = (contraction)^-1 with range restricted to "domain" */
	copy = isl_union_pw_multi_aff_copy(contraction);
	expansion = isl_union_map_from_union_pw_multi_aff(copy);
	expansion = isl_union_map_reverse(expansion);
	expansion = isl_union_map_intersect_range(expansion, domain);
	/* The domain of the expansion is what "expand" compares
	 * the leaf domains against.
	 */
	data.domain = isl_union_map_domain(isl_union_map_copy(expansion));

	tree = isl_schedule_tree_insert_expansion(tree, contraction, expansion);
	data.tree = tree;

	node = isl_schedule_node_map_descendant_bottom_up(node, &expand, &data);
	isl_union_set_free(data.domain);
	isl_schedule_tree_free(data.tree);
	return node;
}

/* Given a node "node" and one of its ancestors "ancestor",
 * both pointing into the same schedule tree, return the index
 * of the child of "ancestor" whose subtree contains "node".
 *
 * An error is reported if the two nodes do not refer to the same schedule,
 * if "ancestor" does not lie strictly above "node" or if the tree
 * found among the ancestors of "node" at the depth of "ancestor"
 * is not the tree of "ancestor".
 *
 * Return isl_size_error on error.
 */
isl_size isl_schedule_node_get_ancestor_child_position(
	__isl_keep isl_schedule_node *node,
	__isl_keep isl_schedule_node *ancestor)
{
	isl_size ancestor_depth, node_depth;
	isl_schedule_tree *at_depth;
	int is_ancestor;

	ancestor_depth = isl_schedule_node_get_tree_depth(ancestor);
	node_depth = isl_schedule_node_get_tree_depth(node);
	if (ancestor_depth < 0 || node_depth < 0)
		return isl_size_error;

	if (node->schedule != ancestor->schedule)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"not a descendant", return isl_size_error);

	if (ancestor_depth >= node_depth)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"not a descendant", return isl_size_error);

	/* Only the identity of the tree matters, so the extra reference
	 * can be dropped as soon as the pointers have been compared.
	 */
	at_depth = isl_schedule_tree_list_get_schedule_tree(node->ancestors,
							ancestor_depth);
	is_ancestor = at_depth == ancestor->tree;
	isl_schedule_tree_free(at_depth);
	if (!is_ancestor)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
			"not a descendant", return isl_size_error);

	return node->child_pos[ancestor_depth];
}

/* Return the closest ancestor shared by "node1" and "node2",
 * which are required to point to the same schedule tree.
 *
 * The root of the schedule is an ancestor of both nodes, so
 * a shared ancestor always exists.  Starting from the root,
 * the child positions of the two nodes are compared until they
 * differ for the first time.  The ancestor at that depth is the result.
 * If the positions never differ along the path to the shallower node,
 * then that node is itself an ancestor of the other one and
 * it is returned instead.
 *
 * The nodes are swapped, if needed, such that "node1" is
 * the shallower of the two.
 */
__isl_give isl_schedule_node *isl_schedule_node_get_shared_ancestor(
	__isl_keep isl_schedule_node *node1,
	__isl_keep isl_schedule_node *node2)
{
	isl_size depth1, depth2;
	int common;

	depth1 = isl_schedule_node_get_tree_depth(node1);
	depth2 = isl_schedule_node_get_tree_depth(node2);
	if (depth1 < 0 || depth2 < 0)
		return NULL;
	if (node1->schedule != node2->schedule)
		isl_die(isl_schedule_node_get_ctx(node1), isl_error_invalid,
			"not part of same schedule", return NULL);
	if (depth2 < depth1)
		return isl_schedule_node_get_shared_ancestor(node2, node1);
	if (depth1 == 0 || isl_schedule_node_is_equal(node1, node2))
		return isl_schedule_node_copy(node1);

	/* Length of the common prefix of the two child position paths. */
	common = 0;
	while (common < depth1 &&
	    node1->child_pos[common] == node2->child_pos[common])
		++common;

	return isl_schedule_node_ancestor(isl_schedule_node_copy(node1),
					depth1 - common);
}

/* Print "node" to "p".
 *
 * The entire schedule tree is printed starting from its root, with
 * the number of ancestors of "node" and its child positions passed
 * along to isl_printer_print_schedule_tree_mark.
 * "p" is freed if "node" is NULL or if the number of ancestors
 * cannot be determined.
 */
__isl_give isl_printer *isl_printer_print_schedule_node(
	__isl_take isl_printer *p, __isl_keep isl_schedule_node *node)
{
	isl_size n_ancestor;

	n_ancestor = isl_size_error;
	if (node)
		n_ancestor =
		    isl_schedule_tree_list_n_schedule_tree(node->ancestors);
	if (n_ancestor < 0)
		return isl_printer_free(p);

	return isl_printer_print_schedule_tree_mark(p, node->schedule->root,
					n_ancestor, node->child_pos);
}

/* Print "node" to stderr in YAML block style.
 * Nothing is printed if "node" is NULL.
 */
void isl_schedule_node_dump(__isl_keep isl_schedule_node *node)
{
	isl_printer *p;

	if (!node)
		return;

	p = isl_printer_to_file(isl_schedule_node_get_ctx(node), stderr);
	p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
	p = isl_printer_print_schedule_node(p, node);

	isl_printer_free(p);
}

/* Return a string representation of "node", or NULL if "node" is NULL.
 * The block format is used for printing because the schedule node
 * would otherwise look identical to the entire schedule.
 */
__isl_give char *isl_schedule_node_to_str(__isl_keep isl_schedule_node *node)
{
	isl_printer *p;
	char *str;

	if (!node)
		return NULL;

	p = isl_printer_to_str(isl_schedule_node_get_ctx(node));
	p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
	p = isl_printer_print_schedule_node(p, node);
	str = isl_printer_get_str(p);
	isl_printer_free(p);

	return str;
}

/* AutoSA Extended */
/* Return the space_time property of the band member at position "pos"
 * of the band node "node".
 * Return autosa_loop_error if "node" is NULL.
 */
enum autosa_loop_type isl_schedule_node_band_member_get_space_time(
  __isl_keep isl_schedule_node *node, int pos)
{
  return node ? isl_schedule_tree_band_member_get_space_time(node->tree, pos)
              : autosa_loop_error;
}

/* Set the space_time property of the band member at position "pos"
 * of the band node "node" to "loop_type".
 * If the member already has this property value, then "node"
 * is returned unchanged.
 */
__isl_give isl_schedule_node *isl_schedule_node_band_member_set_space_time(
  __isl_take isl_schedule_node *node, int pos, enum autosa_loop_type loop_type)
{
  isl_schedule_tree *updated;

  if (!node)
    return NULL;
  if (isl_schedule_node_band_member_get_space_time(node, pos) == loop_type)
    return node;

  updated = isl_schedule_tree_copy(node->tree);
  updated = isl_schedule_tree_band_member_set_space_time(updated, pos,
                                                         loop_type);

  return isl_schedule_node_graft_tree(node, updated);
}

/* Return the pe_opt property of the band member at position "pos"
 * of the band node "node".
 * Return autosa_loop_error if "node" is NULL.
 */
enum autosa_loop_type isl_schedule_node_band_member_get_pe_opt(
  __isl_keep isl_schedule_node *node, int pos)
{
  return node ? isl_schedule_tree_band_member_get_pe_opt(node->tree, pos)
              : autosa_loop_error;
}

/* Set the pe_opt property of the band member at position "pos"
 * of the band node "node" to "loop_type".
 * If the member already has this property value, then "node"
 * is returned unchanged.
 */
__isl_give isl_schedule_node *isl_schedule_node_band_member_set_pe_opt(
  __isl_take isl_schedule_node *node, int pos, enum autosa_loop_type loop_type)
{
  isl_schedule_tree *updated;

  if (!node)
    return NULL;
  if (isl_schedule_node_band_member_get_pe_opt(node, pos) == loop_type)
    return node;

  updated = isl_schedule_tree_copy(node->tree);
  updated = isl_schedule_tree_band_member_set_pe_opt(updated, pos, loop_type);

  return isl_schedule_node_graft_tree(node, updated);
}

/* Return the sched_pos property of the band member at position "pos"
 * of the band node "node".
 * Return -1 if "node" is NULL.
 */
int isl_schedule_node_band_member_get_sched_pos(
  __isl_keep isl_schedule_node *node, int pos)
{
  return node ? isl_schedule_tree_band_member_get_sched_pos(node->tree, pos)
              : -1;
}

/* Set the sched_pos property of the band member at position "pos"
 * of the band node "node" to "sched_pos".
 * If the member already has this property value, then "node"
 * is returned unchanged.
 */
__isl_give isl_schedule_node *isl_schedule_node_band_member_set_sched_pos(
  __isl_take isl_schedule_node *node, int pos, int sched_pos)
{
  isl_schedule_tree *updated;

  if (!node)
    return NULL;
  if (isl_schedule_node_band_member_get_sched_pos(node, pos) == sched_pos)
    return node;

  updated = isl_schedule_tree_copy(node->tree);
  updated = isl_schedule_tree_band_member_set_sched_pos(updated, pos,
                                                        sched_pos);

  return isl_schedule_node_graft_tree(node, updated);
}

/* Return the "iter" pointer of the band member at position "pos"
 * of the band node "node", as reported by
 * isl_schedule_tree_band_member_get_iter.
 * Return NULL if "node" is NULL.
 */
void *isl_schedule_node_band_member_get_iter(__isl_keep isl_schedule_node *node, int pos)
{
  return node ? isl_schedule_tree_band_member_get_iter(node->tree, pos)
              : NULL;
}

/* Set the "iter" pointer of the band member at position "pos"
 * of the band node "node" to "iter".
 * If the member already holds this pointer, then "node"
 * is returned unchanged.
 */
__isl_give isl_schedule_node *isl_schedule_node_band_member_set_iter(
  __isl_take isl_schedule_node *node, int pos, void *iter) 
{
  isl_schedule_tree *updated;

  if (!node)
    return NULL;
  if (isl_schedule_node_band_member_get_iter(node, pos) == iter)
    return node;

  updated = isl_schedule_tree_copy(node->tree);
  updated = isl_schedule_tree_band_member_set_iter(updated, pos, iter);

  return isl_schedule_node_graft_tree(node, updated);
}
/* AutoSA Extended */

================================================
FILE: autosa_scripts/ppcg_changes/isl/isl_schedule_tree.c
================================================
/*
 * Copyright 2013-2014 Ecole Normale Superieure
 * Copyright 2014      INRIA Rocquencourt
 * Copyright 2016      INRIA Paris
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege,
 * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
 * and Inria Paris - Rocquencourt, Domaine de Voluceau - Rocquencourt,
 * B.P. 105 - 78153 Le Chesnay, France
 * and Centre de Recherche Inria de Paris, 2 rue Simone Iff - Voie DQ12,
 * CS 42112, 75589 Paris Cedex 12, France
 */

#include <isl/id.h>
#include <isl/val.h>
#include <isl/space.h>
#include <isl/map.h>
#include <isl_schedule_band.h>
#include <isl_schedule_private.h>

#undef EL
#define EL isl_schedule_tree

#include <isl_list_templ.h>

#undef EL_BASE
#define EL_BASE schedule_tree

#include <isl_list_templ.c>

/* Is the root of "tree" a leaf node?
 * A NULL "tree" has type isl_schedule_node_error and
 * is therefore not considered to be a leaf.
 */
int isl_schedule_tree_is_leaf(__isl_keep isl_schedule_tree *tree)
{
	enum isl_schedule_node_type type;

	type = isl_schedule_tree_get_type(tree);
	return type == isl_schedule_node_leaf;
}

/* Allocate a schedule tree consisting of a single node of type "type".
 * Filling in the type specific fields and the children is left
 * to the caller.
 * No tree is created for type isl_schedule_node_error.
 *
 * The new tree holds a reference to "ctx" and is initially marked
 * as not containing any anchored nodes.  The caller needs
 * to update the anchored field if this is not appropriate.
 */
static __isl_give isl_schedule_tree *isl_schedule_tree_alloc(isl_ctx *ctx,
	enum isl_schedule_node_type type)
{
	isl_schedule_tree *node;

	if (type == isl_schedule_node_error)
		return NULL;

	node = isl_calloc_type(ctx, isl_schedule_tree);
	if (!node)
		return NULL;

	isl_ctx_ref(ctx);
	node->ctx = ctx;
	node->ref = 1;
	node->type = type;
	node->anchored = 0;

	return node;
}

/* Return a fresh copy of "tree".
 */
__isl_take isl_schedule_tree *isl_schedule_tree_dup(
	__isl_keep isl_schedule_tree *tree)
{
	isl_ctx *ctx;
	isl_schedule_tree *dup;

	if (!tree)
		return NULL;

	ctx = isl_schedule_tree_get_ctx(tree);
	dup = isl_schedule_tree_alloc(ctx, tree->type);
	if (!dup)
		return NULL;

	switch (tree->type) {
	case isl_schedule_node_error:
		isl_die(ctx, isl_error_internal,
			"allocation should have failed",
			return isl_schedule_tree_free(dup));
	case isl_schedule_node_band:
		dup->band = isl_schedule_band_copy(tree->band);
		if (!dup->band)
			return isl_schedule_tree_free(dup);
		break;
	case isl_schedule_node_context:
		dup->context = isl_set_copy(tree->context);
		if (!dup->context)
			return isl_schedule_tree_free(dup);
		break;
	case isl_schedule_node_domain:
		dup->domain = isl_union_set_copy(tree->domain);
		if (!dup->domain)
			return isl_schedule_tree_free(dup);
		break;
	case isl_schedule_node_expansion:
		dup->contraction =
			isl_union_pw_multi_aff_copy(tree->contraction);
		dup->expansion = isl_union_map_copy(tree->expansion);
		if (!dup->contraction || !dup->expansion)
			return isl_schedule_tree_free(dup);
		break;
	case isl_schedule_node_extension:
		dup->extension = isl_union_map_copy(tree->extension);
		if (!dup->extension)
			return isl_schedule_tree_free(dup);
		break;
	case isl_schedule_node_filter:
		dup->filter = isl_union_set_copy(tree->filter);
		if (!dup->filter)
			return isl_schedule_tree_free(dup);
		break;
	case isl_schedule_node_guard:
		dup->guard = isl_set_copy(tree->guard);
		if (!dup->guard)
			return isl_schedule_tree_free(dup);
		break;
	case isl_schedule_node_mark:
		dup->mark = isl_id_copy(tree->mark);
		if (!dup->mark)
			return isl_schedule_tree_free(dup);
		break;
	case isl_schedule_node_leaf:
	case isl_schedule_node_sequence:
	case isl_schedule_node_set:
		break;
	}

	if (tree->children) {
		dup->children = isl_schedule_tree_list_copy(tree->children);
		if (!dup->children)
			return isl_schedule_tree_free(dup);
	}
	dup->anchored = tree->anchored;

	return dup;
}

/* Return a schedule tree that is equal to "tree" and
 * that is referenced only once, such that it can be modified.
 *
 * If "tree" has a single reference, then it is returned as is.
 * Otherwise, the reference held by the caller is dropped and
 * a duplicate is returned instead.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_cow(
	__isl_take isl_schedule_tree *tree)
{
	if (!tree)
		return NULL;

	if (tree->ref != 1) {
		tree->ref--;
		tree = isl_schedule_tree_dup(tree);
	}

	return tree;
}

/* Return an additional reference to "tree" by incrementing
 * its reference count.
 * NULL is returned if "tree" is NULL.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_copy(
	__isl_keep isl_schedule_tree *tree)
{
	if (tree)
		tree->ref++;

	return tree;
}

/* Drop a reference to "tree" and return NULL.
 *
 * When the last reference is dropped, the type specific field(s)
 * of the root node, the list of children and the reference
 * to the isl_ctx are released before the node itself is freed.
 */
__isl_null isl_schedule_tree *isl_schedule_tree_free(
	__isl_take isl_schedule_tree *tree)
{
	if (!tree || --tree->ref > 0)
		return NULL;

	switch (tree->type) {
	case isl_schedule_node_error:
	case isl_schedule_node_leaf:
	case isl_schedule_node_sequence:
	case isl_schedule_node_set:
		/* Nothing type specific to release. */
		break;
	case isl_schedule_node_band:
		isl_schedule_band_free(tree->band);
		break;
	case isl_schedule_node_context:
		isl_set_free(tree->context);
		break;
	case isl_schedule_node_domain:
		isl_union_set_free(tree->domain);
		break;
	case isl_schedule_node_expansion:
		isl_union_pw_multi_aff_free(tree->contraction);
		isl_union_map_free(tree->expansion);
		break;
	case isl_schedule_node_extension:
		isl_union_map_free(tree->extension);
		break;
	case isl_schedule_node_filter:
		isl_union_set_free(tree->filter);
		break;
	case isl_schedule_node_guard:
		isl_set_free(tree->guard);
		break;
	case isl_schedule_node_mark:
		isl_id_free(tree->mark);
		break;
	}

	isl_schedule_tree_list_free(tree->children);
	isl_ctx_deref(tree->ctx);
	free(tree);

	return NULL;
}

/* Create and return a new leaf schedule tree.
 *
 * The tree is allocated in "ctx" with type isl_schedule_node_leaf and
 * has no type specific fields or children set.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_leaf(isl_ctx *ctx)
{
	return isl_schedule_tree_alloc(ctx, isl_schedule_node_leaf);
}

/* Construct a band schedule tree without children
 * that takes over the reference to "band".
 * The anchoredness of the tree is that of "band".
 * "band" is freed if the tree cannot be allocated.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_from_band(
	__isl_take isl_schedule_band *band)
{
	isl_schedule_tree *tree;

	if (!band)
		return NULL;

	tree = isl_schedule_tree_alloc(isl_schedule_band_get_ctx(band),
					isl_schedule_node_band);
	if (!tree) {
		isl_schedule_band_free(band);
		return NULL;
	}

	tree->band = band;
	tree->anchored = isl_schedule_band_is_anchored(band);

	return tree;
}

/* Construct a context schedule tree without children
 * that takes over the reference to "context".
 * The tree is marked anchored because the context references
 * the outer schedule dimension.
 * "context" is freed if the tree cannot be allocated.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_from_context(
	__isl_take isl_set *context)
{
	isl_schedule_tree *tree;

	if (!context)
		return NULL;

	tree = isl_schedule_tree_alloc(isl_set_get_ctx(context),
					isl_schedule_node_context);
	if (!tree) {
		isl_set_free(context);
		return NULL;
	}

	tree->context = context;
	tree->anchored = 1;

	return tree;
}

/* Construct a domain schedule tree without children
 * that takes over the reference to "domain".
 * "domain" is freed if the tree cannot be allocated.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_from_domain(
	__isl_take isl_union_set *domain)
{
	isl_schedule_tree *tree;

	if (!domain)
		return NULL;

	tree = isl_schedule_tree_alloc(isl_union_set_get_ctx(domain),
					isl_schedule_node_domain);
	if (!tree) {
		isl_union_set_free(domain);
		return NULL;
	}

	tree->domain = domain;

	return tree;
}

/* Construct an expansion schedule tree without children
 * that takes over the references to "contraction" and "expansion".
 * Both arguments are freed if either of them is NULL or
 * if the tree cannot be allocated.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_from_expansion(
	__isl_take isl_union_pw_multi_aff *contraction,
	__isl_take isl_union_map *expansion)
{
	isl_schedule_tree *tree = NULL;

	if (contraction && expansion)
		tree = isl_schedule_tree_alloc(
					isl_union_map_get_ctx(expansion),
					isl_schedule_node_expansion);
	if (!tree) {
		isl_union_pw_multi_aff_free(contraction);
		isl_union_map_free(expansion);
		return NULL;
	}

	tree->contraction = contraction;
	tree->expansion = expansion;

	return tree;
}

/* Construct an extension schedule tree without children
 * that takes over the reference to "extension".
 * The tree is marked anchored because the domain of the extension
 * refers to the outer schedule dimension.
 * "extension" is freed if the tree cannot be allocated.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_from_extension(
	__isl_take isl_union_map *extension)
{
	isl_schedule_tree *tree;

	if (!extension)
		return NULL;

	tree = isl_schedule_tree_alloc(isl_union_map_get_ctx(extension),
					isl_schedule_node_extension);
	if (!tree) {
		isl_union_map_free(extension);
		return NULL;
	}

	tree->extension = extension;
	tree->anchored = 1;

	return tree;
}

/* Construct a filter schedule tree without children
 * that takes over the reference to "filter".
 * "filter" is freed if the tree cannot be allocated.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_from_filter(
	__isl_take isl_union_set *filter)
{
	isl_schedule_tree *tree;

	if (!filter)
		return NULL;

	tree = isl_schedule_tree_alloc(isl_union_set_get_ctx(filter),
					isl_schedule_node_filter);
	if (!tree) {
		isl_union_set_free(filter);
		return NULL;
	}

	tree->filter = filter;

	return tree;
}

/* Construct a guard schedule tree without children
 * that takes over the reference to "guard".
 * The tree is marked anchored because the guard references
 * the outer schedule dimension.
 * "guard" is freed if the tree cannot be allocated.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_from_guard(
	__isl_take isl_set *guard)
{
	isl_schedule_tree *tree;

	if (!guard)
		return NULL;

	tree = isl_schedule_tree_alloc(isl_set_get_ctx(guard),
					isl_schedule_node_guard);
	if (!tree) {
		isl_set_free(guard);
		return NULL;
	}

	tree->guard = guard;
	tree->anchored = 1;

	return tree;
}

/* Construct a mark schedule tree without children
 * that takes over the reference to the identifier "mark".
 * "mark" is freed if the tree cannot be allocated.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_from_mark(
	__isl_take isl_id *mark)
{
	isl_schedule_tree *tree;

	if (!mark)
		return NULL;

	tree = isl_schedule_tree_alloc(isl_id_get_ctx(mark),
					isl_schedule_node_mark);
	if (!tree) {
		isl_id_free(mark);
		return NULL;
	}

	tree->mark = mark;

	return tree;
}

/* Does any node in "tree" depend on its position
 * in the complete schedule tree?
 * The answer is read off from the anchored field of "tree".
 * Return isl_bool_error if "tree" is NULL.
 */
isl_bool isl_schedule_tree_is_subtree_anchored(
	__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return isl_bool_error;

	return isl_bool_ok(tree->anchored);
}

/* Does the root node of "tree" itself depend on its position
 * in the complete schedule tree?
 *
 * Context, extension and guard nodes are always anchored.
 * For band nodes, the answer depends on the band
 * (through its associated AST build options).
 * All other node types are never anchored.
 *
 * Return -1 on error.
 */
int isl_schedule_tree_is_anchored(__isl_keep isl_schedule_tree *tree)
{
	enum isl_schedule_node_type type;

	if (!tree)
		return -1;

	type = isl_schedule_tree_get_type(tree);
	switch (type) {
	case isl_schedule_node_band:
		return isl_schedule_band_is_anchored(tree->band);
	case isl_schedule_node_context:
	case isl_schedule_node_extension:
	case isl_schedule_node_guard:
		return 1;
	case isl_schedule_node_domain:
	case isl_schedule_node_expansion:
	case isl_schedule_node_filter:
	case isl_schedule_node_leaf:
	case isl_schedule_node_mark:
	case isl_schedule_node_sequence:
	case isl_schedule_node_set:
		return 0;
	case isl_schedule_node_error:
		return -1;
	}

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_internal,
		"unhandled case", return -1);
}

/* Recompute the anchored field of "tree".
 * The tree is anchored if its root node is itself anchored or
 * if any of its children has its anchored field set.
 *
 * This function should be called whenever the children of a tree node
 * are changed or the anchoredness of the tree root itself changes.
 *
 * "tree" is only modified (after making sure it has a single reference)
 * if the value of the field actually changes.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_update_anchored(
	__isl_take isl_schedule_tree *tree)
{
	isl_size n_child;
	int pos;
	int anchored;

	anchored = isl_schedule_tree_is_anchored(tree);
	n_child = isl_schedule_tree_n_children(tree);
	if (anchored < 0 || n_child < 0)
		return isl_schedule_tree_free(tree);

	/* Stop at the first anchored child. */
	pos = 0;
	while (!anchored && pos < n_child) {
		isl_schedule_tree *child;

		child = isl_schedule_tree_get_child(tree, pos);
		if (!child)
			return isl_schedule_tree_free(tree);
		anchored = child->anchored;
		isl_schedule_tree_free(child);
		++pos;
	}

	if (anchored != tree->anchored) {
		tree = isl_schedule_tree_cow(tree);
		if (tree)
			tree->anchored = anchored;
	}

	return tree;
}

/* Construct a tree with a root node of type "type"
 * (isl_schedule_node_sequence or isl_schedule_node_set) and
 * with the elements of "list" as children.
 * The anchored field of the result is derived from the children.
 * "list" is freed if the tree cannot be allocated.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_from_children(
	enum isl_schedule_node_type type,
	__isl_take isl_schedule_tree_list *list)
{
	isl_schedule_tree *tree;

	if (!list)
		return NULL;

	tree = isl_schedule_tree_alloc(isl_schedule_tree_list_get_ctx(list),
					type);
	if (!tree) {
		isl_schedule_tree_list_free(list);
		return NULL;
	}

	tree->children = list;

	return isl_schedule_tree_update_anchored(tree);
}

/* Construct a tree with a root node of type "type" and as children
 * "tree1" and "tree2".
 * If the root of one (or both) of the input trees is itself of type "type",
 * then the tree is replaced by its children.
 *
 * The list of children is built up from "tree1" followed by "tree2"
 * and then handed to isl_schedule_tree_from_children.
 * Both input trees are freed if either of them is NULL.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_from_pair(
	enum isl_schedule_node_type type, __isl_take isl_schedule_tree *tree1,
	__isl_take isl_schedule_tree *tree2)
{
	isl_ctx *ctx;
	isl_schedule_tree_list *list;

	if (!tree1 || !tree2)
		goto error;

	ctx = isl_schedule_tree_get_ctx(tree1);
	if (isl_schedule_tree_get_type(tree1) == type) {
		/* Reuse the children of "tree1" instead of "tree1" itself. */
		list = isl_schedule_tree_list_copy(tree1->children);
		isl_schedule_tree_free(tree1);
	} else {
		list = isl_schedule_tree_list_alloc(ctx, 2);
		list = isl_schedule_tree_list_add(list, tree1);
	}
	if (isl_schedule_tree_get_type(tree2) == type) {
		isl_schedule_tree_list *children;

		/* Append the children of "tree2" instead of "tree2" itself. */
		children = isl_schedule_tree_list_copy(tree2->children);
		list = isl_schedule_tree_list_concat(list, children);
		isl_schedule_tree_free(tree2);
	} else {
		list = isl_schedule_tree_list_add(list, tree2);
	}

	return isl_schedule_tree_from_children(type, list);
error:
	isl_schedule_tree_free(tree1);
	isl_schedule_tree_free(tree2);
	return NULL;
}

/* Construct a tree with a sequence root node and as children
 * "tree1" and "tree2".
 * If the root of one (or both) of the input trees is itself a sequence,
 * then the tree is replaced by its children.
 *
 * This is isl_schedule_tree_from_pair specialized
 * to isl_schedule_node_sequence.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_sequence_pair(
	__isl_take isl_schedule_tree *tree1,
	__isl_take isl_schedule_tree *tree2)
{
	return isl_schedule_tree_from_pair(isl_schedule_node_sequence,
						tree1, tree2);
}

/* Construct a tree with a set root node and as children
 * "tree1" and "tree2".
 * If the root of one (or both) of the input trees is itself a set,
 * then the tree is replaced by its children.
 *
 * This is isl_schedule_tree_from_pair specialized
 * to isl_schedule_node_set.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_set_pair(
	__isl_take isl_schedule_tree *tree1,
	__isl_take isl_schedule_tree *tree2)
{
	return isl_schedule_tree_from_pair(isl_schedule_node_set, tree1, tree2);
}

/* Return the isl_ctx in which "tree" was created,
 * or NULL if "tree" is NULL.
 */
isl_ctx *isl_schedule_tree_get_ctx(__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return NULL;

	return tree->ctx;
}

/* Return the type of the root node of "tree".
 * Return isl_schedule_node_error if "tree" is NULL.
 */
enum isl_schedule_node_type isl_schedule_tree_get_type(
	__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return isl_schedule_node_error;

	return tree->type;
}

/* Are "tree1" and "tree2" obviously equal to each other?
 *
 * Two trees are considered equal if they are the same object or
 * if their root nodes have the same type, the type specific fields
 * of the roots are equal and the trees have the same number of
 * (explicit) children that are pairwise equal according to
 * this same test.
 * Mark nodes are compared by the address of their identifiers.
 *
 * Return isl_bool_error if either tree is NULL or
 * if any of the comparisons fails.
 */
isl_bool isl_schedule_tree_plain_is_equal(__isl_keep isl_schedule_tree *tree1,
	__isl_keep isl_schedule_tree *tree2)
{
	isl_bool equal;
	int i;
	isl_size n1, n2;

	if (!tree1 || !tree2)
		return isl_bool_error;
	if (tree1 == tree2)
		return isl_bool_true;
	if (tree1->type != tree2->type)
		return isl_bool_false;

	/* Compare the type specific fields of the two root nodes. */
	switch (tree1->type) {
	case isl_schedule_node_band:
		equal = isl_schedule_band_plain_is_equal(tree1->band,
							tree2->band);
		break;
	case isl_schedule_node_context:
		equal = isl_set_is_equal(tree1->context, tree2->context);
		break;
	case isl_schedule_node_domain:
		equal = isl_union_set_is_equal(tree1->domain, tree2->domain);
		break;
	case isl_schedule_node_expansion:
		equal = isl_union_map_is_equal(tree1->expansion,
						tree2->expansion);
		/* Only look at the contractions if the expansions agree. */
		if (equal >= 0 && equal)
			equal = isl_union_pw_multi_aff_plain_is_equal(
				    tree1->contraction, tree2->contraction);
		break;
	case isl_schedule_node_extension:
		equal = isl_union_map_is_equal(tree1->extension,
						tree2->extension);
		break;
	case isl_schedule_node_filter:
		equal = isl_union_set_is_equal(tree1->filter, tree2->filter);
		break;
	case isl_schedule_node_guard:
		equal = isl_set_is_equal(tree1->guard, tree2->guard);
		break;
	case isl_schedule_node_mark:
		equal = isl_bool_ok(tree1->mark == tree2->mark);
		break;
	case isl_schedule_node_leaf:
	case isl_schedule_node_sequence:
	case isl_schedule_node_set:
		/* No type specific fields to compare. */
		equal = isl_bool_true;
		break;
	case isl_schedule_node_error:
		equal = isl_bool_error;
		break;
	}

	if (equal < 0 || !equal)
		return equal;

	/* The roots agree; now compare the children position by position. */
	n1 = isl_schedule_tree_n_children(tree1);
	n2 = isl_schedule_tree_n_children(tree2);
	if (n1 < 0 || n2 < 0)
		return isl_bool_error;
	if (n1 != n2)
		return isl_bool_false;
	for (i = 0; i < n1; ++i) {
		isl_schedule_tree *child1, *child2;

		child1 = isl_schedule_tree_get_child(tree1, i);
		child2 = isl_schedule_tree_get_child(tree2, i);
		equal = isl_schedule_tree_plain_is_equal(child1, child2);
		isl_schedule_tree_free(child1);
		isl_schedule_tree_free(child2);

		if (equal < 0 || !equal)
			return equal;
	}

	return isl_bool_true;
}

/* Does "tree" have any explicit children, i.e.,
 * children other than an implicit leaf?
 * Return -1 if "tree" is NULL.
 */
int isl_schedule_tree_has_children(__isl_keep isl_schedule_tree *tree)
{
	return tree ? tree->children != NULL : -1;
}

/* Count the explicit children of "tree"; implicit leaves are not counted.
 * A NULL "children" field means that there are no explicit children,
 * in which case zero is returned.
 * Return isl_size_error if "tree" is NULL.
 */
isl_size isl_schedule_tree_n_children(__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return isl_size_error;

	return tree->children ?
		isl_schedule_tree_list_n_schedule_tree(tree->children) : 0;
}

/* Return a copy of the explicit child of "tree" at position "pos".
 * It is an internal error to call this function on a tree
 * without explicit children.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_get_child(
	__isl_keep isl_schedule_tree *tree, int pos)
{
	if (!tree)
		return NULL;
	if (tree->children)
		return isl_schedule_tree_list_get_schedule_tree(
							tree->children, pos);

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_internal,
		"schedule tree has no explicit children", return NULL);
}

/* Consume "tree" and return a copy of its explicit child
 * at position "pos".
 */
__isl_give isl_schedule_tree *isl_schedule_tree_child(
	__isl_take isl_schedule_tree *tree, int pos)
{
	isl_schedule_tree *res = isl_schedule_tree_get_child(tree, pos);

	isl_schedule_tree_free(tree);
	return res;
}

/* Drop the entire list of explicit children of "tree".
 * The "children" field is set to the result of freeing the list.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_reset_children(
	__isl_take isl_schedule_tree *tree)
{
	tree = isl_schedule_tree_cow(tree);
	if (tree)
		tree->children = isl_schedule_tree_list_free(tree->children);

	return tree;
}

/* Remove the explicit child at position "pos" from "tree".
 * If that child is the only one, then the list of children is removed
 * altogether.
 * It is an error if "tree" has no explicit children or
 * if "pos" does not refer to one of them.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_drop_child(
	__isl_take isl_schedule_tree *tree, int pos)
{
	isl_size n_child;

	tree = isl_schedule_tree_cow(tree);
	n_child = isl_schedule_tree_n_children(tree);
	if (n_child < 0)
		return isl_schedule_tree_free(tree);
	if (n_child == 0)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"tree does not have any explicit children",
			return isl_schedule_tree_free(tree));
	if (!(pos >= 0 && pos < n_child))
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"position out of bounds",
			return isl_schedule_tree_free(tree));
	if (n_child == 1)
		return isl_schedule_tree_reset_children(tree);

	tree->children = isl_schedule_tree_list_drop(tree->children, pos, 1);

	return tree->children ? tree : isl_schedule_tree_free(tree);
}

/* Replace the child at position "pos" of "tree" by "child".
 *
 * If the new child is a leaf, then it is not explicitly
 * recorded in the list of children.  Instead, the list of children
 * (which is assumed to have only one element) is removed.
 * Note that the children of set and sequence nodes are always
 * filters, so they cannot be replaced by empty trees.
 *
 * Both "tree" and "child" are consumed, also on error.
 * NULL is returned on error.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_replace_child(
	__isl_take isl_schedule_tree *tree, int pos,
	__isl_take isl_schedule_tree *child)
{
	tree = isl_schedule_tree_cow(tree);
	if (!tree || !child)
		goto error;

	if (isl_schedule_tree_is_leaf(child)) {
		isl_size n;

		/* A leaf is never stored, so it is released right away.
		 * From this point on, only "tree" is still owned here and
		 * the error paths below must not free "child" again.
		 */
		isl_schedule_tree_free(child);
		if (!tree->children && pos == 0)
			return tree;
		n = isl_schedule_tree_n_children(tree);
		if (n < 0)
			return isl_schedule_tree_free(tree);
		if (n != 1)
			isl_die(isl_schedule_tree_get_ctx(tree),
				isl_error_internal,
				"can only replace single child by leaf",
				return isl_schedule_tree_free(tree));
		return isl_schedule_tree_reset_children(tree);
	}

	/* A tree without explicit children gets a fresh single-element list;
	 * otherwise the element at "pos" of the existing list is replaced.
	 */
	if (!tree->children && pos == 0)
		tree->children =
			isl_schedule_tree_list_from_schedule_tree(child);
	else
		tree->children = isl_schedule_tree_list_set_schedule_tree(
				tree->children, pos, child);

	if (!tree->children)
		return isl_schedule_tree_free(tree);
	tree = isl_schedule_tree_update_anchored(tree);

	return tree;
error:
	isl_schedule_tree_free(tree);
	isl_schedule_tree_free(child);
	return NULL;
}

/* Replace the (explicit) children of "tree" by "children".
 * Both arguments are consumed, also on error.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_set_children(
	__isl_take isl_schedule_tree *tree,
	__isl_take isl_schedule_tree_list *children)
{
	tree = isl_schedule_tree_cow(tree);
	if (tree && children) {
		isl_schedule_tree_list_free(tree->children);
		tree->children = children;
		return tree;
	}

	isl_schedule_tree_free(tree);
	isl_schedule_tree_list_free(children);
	return NULL;
}

/* Construct a band schedule tree from "band" and
 * attach "tree" as its only child.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_insert_band(
	__isl_take isl_schedule_tree *tree, __isl_take isl_schedule_band *band)
{
	return isl_schedule_tree_replace_child(
			isl_schedule_tree_from_band(band), 0, tree);
}

/* Construct a context schedule tree from "context" and
 * attach "tree" as its only child.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_insert_context(
	__isl_take isl_schedule_tree *tree, __isl_take isl_set *context)
{
	return isl_schedule_tree_replace_child(
			isl_schedule_tree_from_context(context), 0, tree);
}

/* Construct a domain schedule tree from "domain" and
 * attach "tree" as its only child.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_insert_domain(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *domain)
{
	return isl_schedule_tree_replace_child(
			isl_schedule_tree_from_domain(domain), 0, tree);
}

/* Construct an expansion schedule tree from "contraction" and
 * "expansion" and attach "tree" as its only child.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_insert_expansion(
	__isl_take isl_schedule_tree *tree,
	__isl_take isl_union_pw_multi_aff *contraction,
	__isl_take isl_union_map *expansion)
{
	return isl_schedule_tree_replace_child(
		isl_schedule_tree_from_expansion(contraction, expansion),
		0, tree);
}

/* Construct an extension schedule tree from "extension" and
 * attach "tree" as its only child.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_insert_extension(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_map *extension)
{
	return isl_schedule_tree_replace_child(
			isl_schedule_tree_from_extension(extension), 0, tree);
}

/* Put a filter node with filter set "filter" on top of "tree".
 *
 * If the root of "tree" is already a filter node, then no new node
 * is created.  Instead, the filter of that root is replaced
 * by its intersection with "filter".
 */
__isl_give isl_schedule_tree *isl_schedule_tree_insert_filter(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *filter)
{
	isl_union_set *merged;

	if (isl_schedule_tree_get_type(tree) != isl_schedule_node_filter) {
		isl_schedule_tree *res;

		res = isl_schedule_tree_from_filter(filter);
		return isl_schedule_tree_replace_child(res, 0, tree);
	}

	merged = isl_schedule_tree_filter_get_filter(tree);
	merged = isl_union_set_intersect(merged, filter);
	return isl_schedule_tree_filter_set_filter(tree, merged);
}

/* Insert a filter node with filter set "filter" on top of
 * every explicit child of "tree".
 * Each child receives its own copy of "filter";
 * the original is released at the end.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_children_insert_filter(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *filter)
{
	int pos;
	isl_size n_child;

	n_child = isl_schedule_tree_n_children(tree);
	if (n_child < 0 || !filter) {
		isl_union_set_free(filter);
		isl_schedule_tree_free(tree);
		return NULL;
	}

	for (pos = 0; pos < n_child; ++pos) {
		isl_schedule_tree *sub;

		sub = isl_schedule_tree_get_child(tree, pos);
		sub = isl_schedule_tree_insert_filter(sub,
						isl_union_set_copy(filter));
		tree = isl_schedule_tree_replace_child(tree, pos, sub);
	}

	isl_union_set_free(filter);
	return tree;
}

/* Construct a guard schedule tree from "guard" and
 * attach "tree" as its only child.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_insert_guard(
	__isl_take isl_schedule_tree *tree, __isl_take isl_set *guard)
{
	return isl_schedule_tree_replace_child(
			isl_schedule_tree_from_guard(guard), 0, tree);
}

/* Construct a mark schedule tree from the identifier "mark" and
 * attach "tree" as its only child.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_insert_mark(
	__isl_take isl_schedule_tree *tree, __isl_take isl_id *mark)
{
	return isl_schedule_tree_replace_child(
			isl_schedule_tree_from_mark(mark), 0, tree);
}

/* Return the number of members of the band at the root of "tree".
 * It is an error if the root is not a band node.
 */
isl_size isl_schedule_tree_band_n_member(__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return isl_size_error;
	if (tree->type == isl_schedule_node_band)
		return isl_schedule_band_n_member(tree->band);

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not a band node", return isl_size_error);
}

/* Is member "pos" of the band at the root of "tree" marked coincident?
 * It is an error if the root is not a band node.
 */
isl_bool isl_schedule_tree_band_member_get_coincident(
	__isl_keep isl_schedule_tree *tree, int pos)
{
	if (!tree)
		return isl_bool_error;
	if (tree->type == isl_schedule_node_band)
		return isl_schedule_band_member_get_coincident(tree->band,
								pos);

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not a band node", return isl_bool_error);
}

/* Set the coincidence marking of member "pos" of the band
 * at the root of "tree" to "coincident".
 * If the member already carries that marking, then "tree" is returned
 * without being modified.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_member_set_coincident(
	__isl_take isl_schedule_tree *tree, int pos, int coincident)
{
	isl_bool current;

	if (!tree)
		return NULL;
	if (tree->type != isl_schedule_node_band)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not a band node", return isl_schedule_tree_free(tree));

	current = isl_schedule_tree_band_member_get_coincident(tree, pos);
	if (current != coincident) {
		tree = isl_schedule_tree_cow(tree);
		if (!tree)
			return NULL;

		tree->band = isl_schedule_band_member_set_coincident(
						tree->band, pos, coincident);
		if (!tree->band)
			return isl_schedule_tree_free(tree);
	}

	return tree;
}

/* Is the band at the root of "tree" marked permutable?
 * It is an error if the root is not a band node.
 */
isl_bool isl_schedule_tree_band_get_permutable(
	__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return isl_bool_error;
	if (tree->type == isl_schedule_node_band)
		return isl_schedule_band_get_permutable(tree->band);

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not a band node", return isl_bool_error);
}

/* Set the permutability marking of the band at the root of "tree"
 * to "permutable".
 * If the band already carries that marking, then "tree" is returned
 * without being modified.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_set_permutable(
	__isl_take isl_schedule_tree *tree, int permutable)
{
	isl_bool current;

	if (!tree)
		return NULL;
	if (tree->type != isl_schedule_node_band)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not a band node", return isl_schedule_tree_free(tree));

	current = isl_schedule_tree_band_get_permutable(tree);
	if (current != permutable) {
		tree = isl_schedule_tree_cow(tree);
		if (!tree)
			return NULL;

		tree->band = isl_schedule_band_set_permutable(tree->band,
								permutable);
		if (!tree->band)
			return isl_schedule_tree_free(tree);
	}

	return tree;
}

/* Return the schedule space of the band at the root of "tree".
 * It is an error if the root is not a band node.
 */
__isl_give isl_space *isl_schedule_tree_band_get_space(
	__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return NULL;
	if (tree->type == isl_schedule_node_band)
		return isl_schedule_band_get_space(tree->band);

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not a band node", return NULL);
}

/* Intersect the domain of the band schedule of the band tree root
 * with "domain".
 *
 * Both "tree" and "domain" are consumed, also on error.
 * The tree is passed through isl_schedule_tree_cow before its band
 * is replaced, just like in the other functions in this file that
 * modify a tree in place, so that a tree that is still referenced
 * elsewhere is not modified behind the back of its other users.
 * NULL is returned on error, in particular if the root of "tree"
 * is not a band node.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_intersect_domain(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *domain)
{
	tree = isl_schedule_tree_cow(tree);
	if (!tree || !domain)
		goto error;

	if (tree->type != isl_schedule_node_band)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not a band node", goto error);

	/* "domain" is handed over to the band operation. */
	tree->band = isl_schedule_band_intersect_domain(tree->band, domain);
	if (!tree->band)
		return isl_schedule_tree_free(tree);

	return tree;
error:
	isl_schedule_tree_free(tree);
	isl_union_set_free(domain);
	return NULL;
}

/* Return the partial schedule of the band at the root of "tree",
 * i.e., the schedule of this band in isolation.
 * It is an error if the root is not a band node.
 */
__isl_give isl_multi_union_pw_aff *isl_schedule_tree_band_get_partial_schedule(
	__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return NULL;
	if (tree->type == isl_schedule_node_band)
		return isl_schedule_band_get_partial_schedule(tree->band);

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not a band node", return NULL);
}

/* Replace the schedule of the band tree root by "schedule".
 *
 * Both "tree" and "schedule" are consumed, also on error.
 * NULL is returned on error, in particular if the root of "tree"
 * is not a band node or if the band could not be updated.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_set_partial_schedule(
	__isl_take isl_schedule_tree *tree,
	__isl_take isl_multi_union_pw_aff *schedule)
{
	tree = isl_schedule_tree_cow(tree);
	if (!tree || !schedule)
		goto error;

	/* Release both arguments if the root has the wrong type. */
	if (tree->type != isl_schedule_node_band)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not a band node", goto error);

	/* "schedule" is handed over to the band operation. */
	tree->band = isl_schedule_band_set_partial_schedule(tree->band,
								schedule);
	if (!tree->band)
		return isl_schedule_tree_free(tree);

	return tree;
error:
	isl_schedule_tree_free(tree);
	isl_multi_union_pw_aff_free(schedule);
	return NULL;
}

/* Return the loop AST generation type of member "pos" of the band
 * at the root of "tree".
 * It is an error if the root is not a band node.
 */
enum isl_ast_loop_type isl_schedule_tree_band_member_get_ast_loop_type(
	__isl_keep isl_schedule_tree *tree, int pos)
{
	if (!tree)
		return isl_ast_loop_error;
	if (tree->type == isl_schedule_node_band)
		return isl_schedule_band_member_get_ast_loop_type(tree->band,
									pos);

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not a band node", return isl_ast_loop_error);
}

/* Change the loop AST generation type of member "pos" of the band
 * at the root of "tree" to "type".
 * It is an error if the root is not a band node.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_member_set_ast_loop_type(
	__isl_take isl_schedule_tree *tree, int pos,
	enum isl_ast_loop_type type)
{
	tree = isl_schedule_tree_cow(tree);
	if (!tree)
		return NULL;

	if (tree->type == isl_schedule_node_band) {
		tree->band = isl_schedule_band_member_set_ast_loop_type(
							tree->band, pos, type);
		return tree->band ? tree : isl_schedule_tree_free(tree);
	}

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not a band node", return isl_schedule_tree_free(tree));
}

/* Return the loop AST generation type that applies to the isolated part
 * for member "pos" of the band at the root of "tree".
 * It is an error if the root is not a band node.
 */
enum isl_ast_loop_type isl_schedule_tree_band_member_get_isolate_ast_loop_type(
	__isl_keep isl_schedule_tree *tree, int pos)
{
	if (!tree)
		return isl_ast_loop_error;
	if (tree->type == isl_schedule_node_band)
		return isl_schedule_band_member_get_isolate_ast_loop_type(
							tree->band, pos);

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not a band node", return isl_ast_loop_error);
}

/* Change the loop AST generation type that applies to the isolated part
 * for member "pos" of the band at the root of "tree" to "type".
 * It is an error if the root is not a band node.
 */
__isl_give isl_schedule_tree *
isl_schedule_tree_band_member_set_isolate_ast_loop_type(
	__isl_take isl_schedule_tree *tree, int pos,
	enum isl_ast_loop_type type)
{
	tree = isl_schedule_tree_cow(tree);
	if (!tree)
		return NULL;

	if (tree->type == isl_schedule_node_band) {
		tree->band =
		    isl_schedule_band_member_set_isolate_ast_loop_type(
							tree->band, pos, type);
		return tree->band ? tree : isl_schedule_tree_free(tree);
	}

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not a band node", return isl_schedule_tree_free(tree));
}

/* Return the AST build options of the band at the root of "tree".
 * It is an error if the root is not a band node.
 */
__isl_give isl_union_set *isl_schedule_tree_band_get_ast_build_options(
	__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return NULL;
	if (tree->type == isl_schedule_node_band)
		return isl_schedule_band_get_ast_build_options(tree->band);

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not a band node", return NULL);
}

/* Replace the AST build options of the band at the root of "tree"
 * by "options".
 * The anchored field is only recomputed if the anchoredness
 * of the root node itself is different after the replacement.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_set_ast_build_options(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *options)
{
	int anchored_before;

	tree = isl_schedule_tree_cow(tree);
	if (!tree || !options)
		goto error;
	if (tree->type != isl_schedule_node_band)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not a band node", goto error);

	anchored_before = isl_schedule_tree_is_anchored(tree);
	tree->band = isl_schedule_band_set_ast_build_options(tree->band,
								options);
	if (!tree->band)
		return isl_schedule_tree_free(tree);
	if (isl_schedule_tree_is_anchored(tree) == anchored_before)
		return tree;

	return isl_schedule_tree_update_anchored(tree);
error:
	isl_schedule_tree_free(tree);
	isl_union_set_free(options);
	return NULL;
}

/* Return the "isolate" option of the band at the root of "tree",
 * where the band is assumed to appear at schedule depth "depth".
 * It is an error if the root is not a band node.
 */
__isl_give isl_set *isl_schedule_tree_band_get_ast_isolate_option(
	__isl_keep isl_schedule_tree *tree, int depth)
{
	if (!tree)
		return NULL;
	if (tree->type == isl_schedule_node_band)
		return isl_schedule_band_get_ast_isolate_option(tree->band,
								depth);

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not a band node", return NULL);
}

/* Return a copy of the context stored in the context node
 * at the root of "tree".
 * It is an error if the root is not a context node.
 */
__isl_give isl_set *isl_schedule_tree_context_get_context(
	__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return NULL;
	if (tree->type == isl_schedule_node_context)
		return isl_set_copy(tree->context);

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not a context node", return NULL);
}

/* Return a copy of the domain stored in the domain node
 * at the root of "tree".
 * It is an error if the root is not a domain node.
 */
__isl_give isl_union_set *isl_schedule_tree_domain_get_domain(
	__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return NULL;
	if (tree->type == isl_schedule_node_domain)
		return isl_union_set_copy(tree->domain);

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not a domain node", return NULL);
}

/* Store "domain" as the domain of the domain node at the root of "tree",
 * releasing the domain that was stored there before.
 * Both arguments are consumed, also on error.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_domain_set_domain(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *domain)
{
	tree = isl_schedule_tree_cow(tree);
	if (!tree || !domain)
		goto error;

	if (tree->type == isl_schedule_node_domain) {
		isl_union_set_free(tree->domain);
		tree->domain = domain;
		return tree;
	}

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not a domain node", goto error);
error:
	isl_schedule_tree_free(tree);
	isl_union_set_free(domain);
	return NULL;
}

/* Return a copy of the contraction stored in the expansion node
 * at the root of "tree".
 * It is an error if the root is not an expansion node.
 */
__isl_give isl_union_pw_multi_aff *isl_schedule_tree_expansion_get_contraction(
	__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return NULL;
	if (tree->type == isl_schedule_node_expansion)
		return isl_union_pw_multi_aff_copy(tree->contraction);

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not an expansion node", return NULL);
}

/* Return a copy of the expansion stored in the expansion node
 * at the root of "tree".
 * It is an error if the root is not an expansion node.
 */
__isl_give isl_union_map *isl_schedule_tree_expansion_get_expansion(
	__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return NULL;
	if (tree->type == isl_schedule_node_expansion)
		return isl_union_map_copy(tree->expansion);

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not an expansion node", return NULL);
}

/* Replace the contraction and the expansion of the expansion tree root "tree"
 * by "contraction" and "expansion".
 *
 * All three arguments are consumed, also on error.
 * NULL is returned on error, in particular if the root of "tree"
 * is not an expansion node.
 */
__isl_give isl_schedule_tree *
isl_schedule_tree_expansion_set_contraction_and_expansion(
	__isl_take isl_schedule_tree *tree,
	__isl_take isl_union_pw_multi_aff *contraction,
	__isl_take isl_union_map *expansion)
{
	tree = isl_schedule_tree_cow(tree);
	if (!tree || !contraction || !expansion)
		goto error;

	/* Release all arguments if the root has the wrong type. */
	if (tree->type != isl_schedule_node_expansion)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not an expansion node", goto error);

	isl_union_pw_multi_aff_free(tree->contraction);
	tree->contraction = contraction;
	isl_union_map_free(tree->expansion);
	tree->expansion = expansion;

	return tree;
error:
	isl_schedule_tree_free(tree);
	isl_union_pw_multi_aff_free(contraction);
	isl_union_map_free(expansion);
	return NULL;
}

/* Return the extension of the extension tree root.
 *
 * "tree" is only inspected and is not freed by this function.
 * The result is a copy of the stored extension, or NULL if "tree"
 * is NULL or if its root is not an extension node.
 */
__isl_give isl_union_map *isl_schedule_tree_extension_get_extension(
	__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return NULL;

	if (tree->type != isl_schedule_node_extension)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not an extension node", return NULL);

	return isl_union_map_copy(tree->extension);
}

/* Replace the extension of extension tree root "tree" by "extension".
 *
 * Both arguments are consumed, also on error.
 * NULL is returned on error, in particular if the root of "tree"
 * is not an extension node.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_extension_set_extension(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_map *extension)
{
	tree = isl_schedule_tree_cow(tree);
	if (!tree || !extension)
		goto error;

	/* Release both arguments if the root has the wrong type. */
	if (tree->type != isl_schedule_node_extension)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not an extension node", goto error);
	isl_union_map_free(tree->extension);
	tree->extension = extension;

	return tree;
error:
	isl_schedule_tree_free(tree);
	isl_union_map_free(extension);
	return NULL;
}

/* Return a copy of the filter stored in the filter node
 * at the root of "tree".
 * It is an error if the root is not a filter node.
 */
__isl_give isl_union_set *isl_schedule_tree_filter_get_filter(
	__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return NULL;
	if (tree->type == isl_schedule_node_filter)
		return isl_union_set_copy(tree->filter);

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not a filter node", return NULL);
}

/* Replace the filter of the filter tree root by "filter".
 *
 * Both arguments are consumed, also on error.
 * NULL is returned on error, in particular if the root of "tree"
 * is not a filter node.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_filter_set_filter(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *filter)
{
	tree = isl_schedule_tree_cow(tree);
	if (!tree || !filter)
		goto error;

	/* Release both arguments if the root has the wrong type. */
	if (tree->type != isl_schedule_node_filter)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not a filter node", goto error);

	isl_union_set_free(tree->filter);
	tree->filter = filter;

	return tree;
error:
	isl_schedule_tree_free(tree);
	isl_union_set_free(filter);
	return NULL;
}

/* Return the guard of the guard tree root.
 *
 * "tree" is only inspected and is not freed by this function.
 * The result is a copy of the stored guard, or NULL if "tree"
 * is NULL or if its root is not a guard node.
 */
__isl_give isl_set *isl_schedule_tree_guard_get_guard(
	__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return NULL;

	if (tree->type != isl_schedule_node_guard)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not a guard node", return NULL);

	return isl_set_copy(tree->guard);
}

/* Return a copy of the identifier stored in the mark node
 * at the root of "tree".
 * It is an error if the root is not a mark node.
 */
__isl_give isl_id *isl_schedule_tree_mark_get_id(
	__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return NULL;
	if (tree->type == isl_schedule_node_mark)
		return isl_id_copy(tree->mark);

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
		"not a mark node", return NULL);
}

/* Callback that stores the range dimension of "map" in the isl_size
 * that "user" points to.
 * isl_stat_error is returned unconditionally such that
 * the enclosing search stops after the first map.
 */
static isl_stat set_range_dim(__isl_take isl_map *map, void *user)
{
	isl_size *res = user;

	*res = isl_map_dim(map, isl_dim_out);
	isl_map_free(map);

	return isl_stat_error;
}

/* Return the range dimension of "umap".
 * "umap" is assumed to be non-empty and all of its maps
 * are assumed to have the same range.
 *
 * The dimension is taken from the first map that is visited.
 */
static isl_size range_dim(__isl_keep isl_union_map *umap)
{
	isl_size n_map;
	isl_size res = isl_size_error;

	n_map = isl_union_map_n_map(umap);
	if (n_map < 0)
		return isl_size_error;
	if (n_map > 0) {
		isl_union_map_foreach_map(umap, &set_range_dim, &res);
		return res;
	}

	isl_die(isl_union_map_get_ctx(umap), isl_error_internal,
		"unexpected empty input", return isl_size_error);
}

/* Extend the range of "umap" with "extra" zero-valued dimensions and
 * return the result.
 *
 * A zero multi-value of dimension "extra" is defined over the domain of
 * the universe of "umap" and is then combined with "umap"
 * through a flat range product.
 */
static __isl_give isl_union_map *append_range(__isl_take isl_union_map *umap,
	int extra)
{
	isl_union_set *domain;
	isl_space *zero_space;
	isl_multi_val *zeros;
	isl_union_pw_multi_aff *upma;

	domain = isl_union_map_domain(
			isl_union_map_universe(isl_union_map_copy(umap)));
	zero_space = isl_union_set_get_space(domain);
	zero_space = isl_space_set_from_params(zero_space);
	zero_space = isl_space_add_dims(zero_space, isl_dim_set, extra);
	zeros = isl_multi_val_zero(zero_space);

	upma = isl_union_pw_multi_aff_multi_val_on_domain(domain, zeros);

	return isl_union_map_flat_range_product(umap,
			isl_union_map_from_union_pw_multi_aff(upma));
}

/* Can the root of "tree" be skipped while looking for the first
 * descendant with schedule information?
 * In other words, is it impossible to derive any information about
 * the iteration domain from this node?
 *
 * A band node can only be skipped if it has no members.
 * Context, guard and mark nodes can always be skipped.
 * Leaf and error nodes are not skipped because there is
 * no point in looking any deeper from these nodes.
 * Only partial iteration domain information can be extracted
 * from an extension node, but extension nodes are not supported
 * by the caller and it will error out on them.
 */
static isl_bool domain_less(__isl_keep isl_schedule_tree *tree)
{
	enum isl_schedule_node_type type = isl_schedule_tree_get_type(tree);

	if (type == isl_schedule_node_band) {
		isl_size n_member = isl_schedule_tree_band_n_member(tree);

		if (n_member < 0)
			return isl_bool_error;
		return isl_bool_ok(n_member == 0);
	}

	switch (type) {
	case isl_schedule_node_context:
	case isl_schedule_node_guard:
	case isl_schedule_node_mark:
		return isl_bool_true;
	case isl_schedule_node_domain:
	case isl_schedule_node_expansion:
	case isl_schedule_node_extension:
	case isl_schedule_node_filter:
	case isl_schedule_node_sequence:
	case isl_schedule_node_set:
	case isl_schedule_node_leaf:
	case isl_schedule_node_error:
		return isl_bool_false;
	default:
		break;
	}

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_internal,
		"unhandled case", return isl_bool_error);
}

/* Descend along the first child of "tree" until a node is reached
 * that carries schedule information and return the tree rooted
 * at that node.
 * If the descent runs out of explicit children before such a node
 * is found, then a copy of "leaf" is returned instead.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_first_schedule_descendant(
	__isl_take isl_schedule_tree *tree, __isl_keep isl_schedule_tree *leaf)
{
	for (;;) {
		isl_bool skip = domain_less(tree);

		if (skip < 0)
			return isl_schedule_tree_free(tree);
		if (skip != isl_bool_true)
			return tree;
		if (!isl_schedule_tree_has_children(tree)) {
			isl_schedule_tree_free(tree);
			return isl_schedule_tree_copy(leaf);
		}
		tree = isl_schedule_tree_child(tree, 0);
	}
}

static __isl_give isl_union_map *subtree_schedule_extend(
	__isl_keep isl_schedule_tree *tree, __isl_take isl_union_map *outer);

/* Extend the schedule map "outer" with the subtree schedule
 * of the first (and presumably only) explicit child of "tree".
 *
 * If "tree" has no explicit children, then there is nothing
 * to extend "outer" with and it is returned as is.
 * Otherwise, "outer" is extended with the subtree schedule
 * of the child at position 0.
 */
static __isl_give isl_union_map *subtree_schedule_extend_child(
	__isl_keep isl_schedule_tree *tree, __isl_take isl_union_map *outer)
{
	isl_schedule_tree *first;

	if (!tree)
		return isl_union_map_free(outer);

	if (isl_schedule_tree_has_children(tree)) {
		first = isl_schedule_tree_get_child(tree, 0);
		if (!first)
			return isl_union_map_free(outer);
		outer = subtree_schedule_extend(first, outer);
		isl_schedule_tree_free(first);
	}

	return outer;
}

/* Return the space of the filter of the first child of "tree".
 * The children of "tree" are assumed to be filter nodes.
 */
static __isl_give isl_space *extract_space_from_filter_child(
	__isl_keep isl_schedule_tree *tree)
{
	isl_schedule_tree *first;
	isl_union_set *filter;
	isl_space *res;

	first = isl_schedule_tree_list_get_schedule_tree(tree->children, 0);
	filter = isl_schedule_tree_filter_get_filter(first);
	isl_schedule_tree_free(first);

	res = isl_union_set_get_space(filter);
	isl_union_set_free(filter);

	return res;
}

/* Extend the schedule map "outer" with the subtree schedule
 * of a set or sequence node.
 *
 * The schedule for the set or sequence node itself is composed of
 * pieces of the form
 *
 *	filter -> []
 *
 * or
 *
 *	filter -> [index]
 *
 * The first form is used if there is only a single child or
 * if the current node is a set node and the schedule_separate_components
 * option is not set.
 *
 * Each of the pieces above is extended with the subtree schedule of
 * the child of the corresponding filter, if any, padded with zeros
 * to ensure that all pieces have the same range dimension.
 *
 * "tree" is only inspected, while "outer" is consumed.
 * The children of "tree" are accessed as filter nodes and
 * it is an internal error for "tree" not to have any explicit children.
 * Children for which the extended schedule is empty do not contribute
 * to the result.
 */
static __isl_give isl_union_map *subtree_schedule_extend_from_children(
	__isl_keep isl_schedule_tree *tree, __isl_take isl_union_map *outer)
{
	int i;
	isl_size n;
	isl_size dim;
	int separate;
	isl_ctx *ctx;
	isl_val *v = NULL;
	isl_multi_val *mv;
	isl_space *space;
	isl_union_map *umap;

	n = isl_schedule_tree_n_children(tree);
	if (n < 0)
		return isl_union_map_free(outer);
	if (n == 0)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_internal,
			"missing children", return isl_union_map_free(outer));

	ctx = isl_schedule_tree_get_ctx(tree);
	/* An index dimension is only added when there are several children
	 * and the node is either a sequence or
	 * the schedule_separate_components option is set.
	 */
	separate = n > 1 && (tree->type == isl_schedule_node_sequence ||
			    isl_options_get_schedule_separate_components(ctx));

	space = isl_space_params_alloc(ctx, 0);

	/* "umap" collects the pieces of the result, while "mv" is
	 * the template for the "[]" or "[index]" part of each piece.
	 */
	umap = isl_union_map_empty(isl_space_copy(space));
	space = isl_space_set_from_params(space);
	if (separate) {
		space = isl_space_add_dims(space, isl_dim_set, 1);
		v = isl_val_zero(ctx);
	}
	mv = isl_multi_val_zero(space);

	/* "dim" keeps track of the range dimension to which
	 * the pieces in "umap" have been padded so far.
	 */
	dim = isl_multi_val_dim(mv, isl_dim_set);
	if (dim < 0)
		umap = isl_union_map_free(umap);
	for (i = 0; i < n; ++i) {
		isl_multi_val *mv_copy;
		isl_union_pw_multi_aff *upma;
		isl_union_map *umap_i;
		isl_union_set *dom;
		isl_schedule_tree *child;
		isl_size dim_i;
		isl_bool empty;

		child = isl_schedule_tree_list_get_schedule_tree(
							tree->children, i);
		dom = isl_schedule_tree_filter_get_filter(child);

		/* Store the current index in "mv" and advance the counter
		 * for the next child.
		 */
		if (separate) {
			mv = isl_multi_val_set_val(mv, 0, isl_val_copy(v));
			v = isl_val_add_ui(v, 1);
		}
		/* Construct the piece "filter -> mv" with the parameters of
		 * "mv" aligned to those of the filter, append it to "outer"
		 * and extend the result with the subtree schedule
		 * of the child of the filter.
		 */
		mv_copy = isl_multi_val_copy(mv);
		space = isl_union_set_get_space(dom);
		mv_copy = isl_multi_val_align_params(mv_copy, space);
		upma = isl_union_pw_multi_aff_multi_val_on_domain(dom, mv_copy);
		umap_i = isl_union_map_from_union_pw_multi_aff(upma);
		umap_i = isl_union_map_flat_range_product(
					    isl_union_map_copy(outer), umap_i);
		umap_i = subtree_schedule_extend_child(child, umap_i);
		isl_schedule_tree_free(child);

		/* Skip empty pieces.  If the emptiness test fails, then
		 * "umap_i" is set to NULL such that the error is picked up
		 * by the range_dim call below.
		 */
		empty = isl_union_map_is_empty(umap_i);
		if (empty < 0)
			umap_i = isl_union_map_free(umap_i);
		else if (empty) {
			isl_union_map_free(umap_i);
			continue;
		}

		/* Pad whichever of "umap" and "umap_i" has the smaller
		 * range dimension such that the two can be combined.
		 */
		dim_i = range_dim(umap_i);
		if (dim_i < 0) {
			umap = isl_union_map_free(umap);
		} else if (dim < dim_i) {
			umap = append_range(umap, dim_i - dim);
			dim = dim_i;
		} else if (dim_i < dim) {
			umap_i = append_range(umap_i, dim - dim_i);
		}
		umap = isl_union_map_union(umap, umap_i);
	}

	isl_val_free(v);
	isl_multi_val_free(mv);
	isl_union_map_free(outer);

	return umap;
}

/* Extend the schedule map "outer" with the subtree schedule of "tree".
 *
 * If the root of the tree is a set or a sequence, then we extend
 * the schedule map in subtree_schedule_extend_from_children.
 * Otherwise, we extend the schedule map with the partial schedule
 * corresponding to the root of the tree and then continue with
 * the single child of this root.
 * In the special case of an expansion, the schedule map is "extended"
 * by applying the expansion to the domain of the schedule map.
 *
 * Context, guard and mark nodes as well as zero-dimensional bands
 * do not contribute any schedule information and are simply skipped.
 * Extension nodes are not supported and leaf nodes are expected
 * to have been handled by the caller; both result in an error.
 *
 * "outer" is owned by this function and is released on the error paths
 * that are handled here, including a NULL "tree" and a leaf root.
 * NULL is returned on error.
 */
static __isl_give isl_union_map *subtree_schedule_extend(
	__isl_keep isl_schedule_tree *tree, __isl_take isl_union_map *outer)
{
	isl_multi_union_pw_aff *mupa;
	/* Initialized so that a node type not covered by the switch
	 * cannot result in an indeterminate return value.
	 */
	isl_union_map *umap = NULL;
	isl_union_set *domain;
	isl_size n;

	if (!tree)
		return isl_union_map_free(outer);

	switch (tree->type) {
	case isl_schedule_node_error:
		return isl_union_map_free(outer);
	case isl_schedule_node_extension:
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"cannot construct subtree schedule of tree "
			"with extension nodes",
			return isl_union_map_free(outer));
	case isl_schedule_node_context:
	case isl_schedule_node_guard:
	case isl_schedule_node_mark:
		return subtree_schedule_extend_child(tree, outer);
	case isl_schedule_node_band:
		n = isl_schedule_tree_band_n_member(tree);
		if (n < 0)
			return isl_union_map_free(outer);
		if (n == 0)
			return subtree_schedule_extend_child(tree, outer);
		mupa = isl_schedule_band_get_partial_schedule(tree->band);
		umap = isl_union_map_from_multi_union_pw_aff(mupa);
		outer = isl_union_map_flat_range_product(outer, umap);
		umap = subtree_schedule_extend_child(tree, outer);
		break;
	case isl_schedule_node_domain:
		domain = isl_schedule_tree_domain_get_domain(tree);
		umap = isl_union_map_from_domain(domain);
		outer = isl_union_map_flat_range_product(outer, umap);
		umap = subtree_schedule_extend_child(tree, outer);
		break;
	case isl_schedule_node_expansion:
		umap = isl_schedule_tree_expansion_get_expansion(tree);
		outer = isl_union_map_apply_domain(outer, umap);
		umap = subtree_schedule_extend_child(tree, outer);
		break;
	case isl_schedule_node_filter:
		domain = isl_schedule_tree_filter_get_filter(tree);
		umap = isl_union_map_from_domain(domain);
		outer = isl_union_map_flat_range_product(outer, umap);
		umap = subtree_schedule_extend_child(tree, outer);
		break;
	case isl_schedule_node_leaf:
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_internal,
			"leaf node should be handled by caller",
			return isl_union_map_free(outer));
	case isl_schedule_node_set:
	case isl_schedule_node_sequence:
		umap = subtree_schedule_extend_from_children(tree, outer);
		break;
	}

	return umap;
}

/* Forward declaration: initial_domain and initial_domain_from_children
 * call each other.
 */
static __isl_give isl_union_set *initial_domain(
	__isl_keep isl_schedule_tree *tree);

/* Construct a universe domain for the tree root "tree" from its children.
 * "tree" is a set or sequence node, so each of its children is a filter.
 * The result is the union of the initial domains of all the children,
 * starting from an empty union set in the space obtained
 * from extract_space_from_filter_child.
 *
 * A root without any children is reported as an internal error.
 */
static __isl_give isl_union_set *initial_domain_from_children(
	__isl_keep isl_schedule_tree *tree)
{
	int pos;
	isl_size n_child;
	isl_union_set *res;

	n_child = isl_schedule_tree_n_children(tree);
	if (n_child < 0)
		return NULL;
	if (n_child == 0)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_internal,
			"missing children", return NULL);

	res = isl_union_set_empty(extract_space_from_filter_child(tree));

	for (pos = 0; pos < n_child; ++pos) {
		isl_schedule_tree *child;

		child = isl_schedule_tree_get_child(tree, pos);
		res = isl_union_set_union(res, initial_domain(child));
		isl_schedule_tree_free(child);
	}

	return res;
}

/* Extract a universe domain from the tree root "tree".
 * The caller is responsible for making sure that this node
 * would not be skipped by isl_schedule_tree_first_schedule_descendant
 * and that it is not a leaf node.
 *
 * Context, guard, mark, leaf and zero-dimensional band roots are
 * therefore reported as internal errors, while extension roots
 * are reported as invalid input.
 * NULL is returned in each of these cases and when "tree" is NULL.
 */
static __isl_give isl_union_set *initial_domain(
	__isl_keep isl_schedule_tree *tree)
{
	isl_size n_member;
	isl_union_set *universe;

	if (!tree)
		return NULL;

	switch (tree->type) {
	case isl_schedule_node_error:
		return NULL;
	case isl_schedule_node_context:
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_internal,
			"context node should be handled by caller",
			return NULL);
	case isl_schedule_node_guard:
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_internal,
			"guard node should be handled by caller",
			return NULL);
	case isl_schedule_node_mark:
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_internal,
			"mark node should be handled by caller",
			return NULL);
	case isl_schedule_node_leaf:
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_internal,
			"leaf node should be handled by caller", return NULL);
	case isl_schedule_node_extension:
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"cannot construct subtree schedule of tree "
			"with extension nodes", return NULL);
	case isl_schedule_node_band:
		n_member = isl_schedule_tree_band_n_member(tree);
		if (n_member < 0)
			return NULL;
		if (n_member == 0)
			isl_die(isl_schedule_tree_get_ctx(tree),
				isl_error_internal,
				"0D band should be handled by caller",
				return NULL);
		/* The domain of the partial schedule, turned into a universe. */
		universe = isl_multi_union_pw_aff_domain(
			isl_schedule_band_get_partial_schedule(tree->band));
		universe = isl_union_set_universe(universe);
		break;
	case isl_schedule_node_domain:
		universe = isl_union_set_universe(
				isl_schedule_tree_domain_get_domain(tree));
		break;
	case isl_schedule_node_filter:
		universe = isl_union_set_universe(
				isl_schedule_tree_filter_get_filter(tree));
		break;
	case isl_schedule_node_expansion:
		/* The domain of the universe of the expansion. */
		universe = isl_union_map_domain(isl_union_map_universe(
			    isl_schedule_tree_expansion_get_expansion(tree)));
		break;
	case isl_schedule_node_set:
	case isl_schedule_node_sequence:
		universe = initial_domain_from_children(tree);
		break;
	}

	return universe;
}

/* Return the subtree schedule of a node that contains some schedule
 * information, i.e., a node that would not be skipped by
 * isl_schedule_tree_first_schedule_descendant and that is not a leaf.
 *
 * If the tree contains any expansions, then the returned subtree
 * schedule is formulated in terms of the expanded domains.
 * The tree is not allowed to contain any extension nodes.
 *
 * The computation starts from a zero-dimensional schedule defined
 * on the initial (universe) domain of the root node.  This schedule
 * is then extended with the schedule information in the root node and
 * its descendants by subtree_schedule_extend.
 */
__isl_give isl_union_map *isl_schedule_tree_get_subtree_schedule_union_map(
	__isl_keep isl_schedule_tree *tree)
{
	isl_union_map *outer;

	outer = isl_union_map_from_domain(initial_domain(tree));

	return subtree_schedule_extend(tree, outer);
}

/* Multiply the partial schedule of the band root node of "tree"
 * with the factors in "mv".
 *
 * Both "tree" and "mv" are consumed.  NULL is returned on error,
 * in particular if the root of "tree" is not a band node.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_scale(
	__isl_take isl_schedule_tree *tree, __isl_take isl_multi_val *mv)
{
	isl_schedule_band *band;

	if (!tree || !mv)
		goto error;
	if (tree->type != isl_schedule_node_band)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not a band node", goto error);

	tree = isl_schedule_tree_cow(tree);
	if (!tree)
		goto error;

	/* "mv" is handed over to the band operation at this point. */
	band = isl_schedule_band_scale(tree->band, mv);
	tree->band = band;

	return band ? tree : isl_schedule_tree_free(tree);
error:
	isl_schedule_tree_free(tree);
	isl_multi_val_free(mv);
	return NULL;
}

/* Divide the partial schedule of the band root node of "tree"
 * by the factors in "mv".
 *
 * Both "tree" and "mv" are consumed.  NULL is returned on error,
 * in particular if the root of "tree" is not a band node.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_scale_down(
	__isl_take isl_schedule_tree *tree, __isl_take isl_multi_val *mv)
{
	isl_schedule_band *band;

	if (!tree || !mv)
		goto error;
	if (tree->type != isl_schedule_node_band)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not a band node", goto error);

	tree = isl_schedule_tree_cow(tree);
	if (!tree)
		goto error;

	/* "mv" is handed over to the band operation at this point. */
	band = isl_schedule_band_scale_down(tree->band, mv);
	tree->band = band;

	return band ? tree : isl_schedule_tree_free(tree);
error:
	isl_schedule_tree_free(tree);
	isl_multi_val_free(mv);
	return NULL;
}

/* Reduce the partial schedule of the band root node of "tree"
 * modulo the factors in "mv".
 *
 * Both "tree" and "mv" are consumed.  NULL is returned on error,
 * in particular if the root of "tree" is not a band node.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_mod(
	__isl_take isl_schedule_tree *tree, __isl_take isl_multi_val *mv)
{
	isl_schedule_band *band;

	if (!tree || !mv)
		goto error;
	if (tree->type != isl_schedule_node_band)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not a band node", goto error);

	tree = isl_schedule_tree_cow(tree);
	if (!tree)
		goto error;

	/* "mv" is handed over to the band operation at this point. */
	band = isl_schedule_band_mod(tree->band, mv);
	tree->band = band;

	return band ? tree : isl_schedule_tree_free(tree);
error:
	isl_schedule_tree_free(tree);
	isl_multi_val_free(mv);
	return NULL;
}

/* Shift the partial schedule of the band root node of "tree" by "shift".
 *
 * Both "tree" and "shift" are consumed.  NULL is returned on error,
 * in particular if the root of "tree" is not a band node.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_shift(
	__isl_take isl_schedule_tree *tree,
	__isl_take isl_multi_union_pw_aff *shift)
{
	isl_schedule_band *band;

	if (!tree || !shift)
		goto error;
	if (tree->type != isl_schedule_node_band)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not a band node", goto error);

	tree = isl_schedule_tree_cow(tree);
	if (!tree)
		goto error;

	/* "shift" is handed over to the band operation at this point. */
	band = isl_schedule_band_shift(tree->band, shift);
	tree->band = band;

	return band ? tree : isl_schedule_tree_free(tree);
error:
	isl_schedule_tree_free(tree);
	isl_multi_union_pw_aff_free(shift);
	return NULL;
}

/* Given two trees with sequence roots, replace the child at position
 * "pos" of "tree" with the children of "child".
 *
 * The new list of children consists of the children of "tree"
 * before position "pos", followed by all the children of "child",
 * followed by the children of "tree" after position "pos".
 * A new sequence node is constructed from this list.
 *
 * Both "tree" and "child" are consumed.  NULL is returned on error,
 * in particular if either root is not a sequence node or
 * if "pos" is out of bounds.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_sequence_splice(
	__isl_take isl_schedule_tree *tree, int pos,
	__isl_take isl_schedule_tree *child)
{
	isl_size n_child;
	isl_schedule_tree_list *before, *after, *spliced;

	tree = isl_schedule_tree_cow(tree);
	if (!tree || !child)
		goto error;
	if (isl_schedule_tree_get_type(tree) != isl_schedule_node_sequence)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not a sequence node", goto error);
	n_child = isl_schedule_tree_n_children(tree);
	if (n_child < 0)
		goto error;
	if (pos < 0 || pos >= n_child)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"position out of bounds", goto error);
	if (isl_schedule_tree_get_type(child) != isl_schedule_node_sequence)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not a sequence node", goto error);

	/* Children of "tree" at positions [0, pos). */
	before = isl_schedule_tree_list_drop(
			isl_schedule_tree_list_copy(tree->children),
			pos, n_child - pos);
	/* Children of "tree" at positions (pos, n_child). */
	after = isl_schedule_tree_list_drop(
			isl_schedule_tree_list_copy(tree->children),
			0, pos + 1);
	spliced = isl_schedule_tree_list_concat(before,
			isl_schedule_tree_list_copy(child->children));
	spliced = isl_schedule_tree_list_concat(spliced, after);

	isl_schedule_tree_free(tree);
	isl_schedule_tree_free(child);
	return isl_schedule_tree_from_children(isl_schedule_node_sequence,
						spliced);
error:
	isl_schedule_tree_free(tree);
	isl_schedule_tree_free(child);
	return NULL;
}

/* Tile the band root node of "tree" with tile sizes "sizes".
 *
 * The band node is duplicated.  The schedule of the original
 * is turned into the tile schedule, the schedule of the duplicate
 * is turned into the point schedule and the point band is then
 * attached as the child of the tile band.
 *
 * Both "tree" and "sizes" are consumed.  NULL is returned on error,
 * in particular if the root of "tree" is not a band node.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_tile(
	__isl_take isl_schedule_tree *tree, __isl_take isl_multi_val *sizes)
{
	isl_schedule_tree *point = NULL;

	if (!tree || !sizes)
		goto error;
	if (tree->type != isl_schedule_node_band)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not a band node", goto error);

	point = isl_schedule_tree_copy(tree);
	tree = isl_schedule_tree_cow(tree);
	point = isl_schedule_tree_cow(point);
	if (!tree || !point)
		goto error;

	/* A copy of "sizes" is used here because the point schedule
	 * computation below needs the sizes as well.
	 */
	tree->band = isl_schedule_band_tile(tree->band,
					    isl_multi_val_copy(sizes));
	if (!tree->band)
		goto error;
	/* The point schedule is computed with respect to the tile band. */
	point->band = isl_schedule_band_point(point->band, tree->band, sizes);
	if (!point->band)
		point = isl_schedule_tree_free(point);

	return isl_schedule_tree_replace_child(tree, 0, point);
error:
	isl_schedule_tree_free(point);
	isl_schedule_tree_free(tree);
	isl_multi_val_free(sizes);
	return NULL;
}

/* Given an isolate AST generation option "isolate" for a band of size pos + n,
 * return the corresponding option for a band covering the first "pos"
 * members.
 *
 * The input isolate option is of the form
 *
 *	isolate[[flattened outer bands] -> [pos; n]]
 *
 * The output isolate option is of the form
 *
 *	isolate[[flattened outer bands] -> [pos]]
 *
 * That is, the final "n" output dimensions of the wrapped relation
 * are projected out and the original tuple identifier is put back
 * on the result.  "isolate" itself is not modified.
 */
static __isl_give isl_set *isolate_initial(__isl_keep isl_set *isolate,
	int pos, int n)
{
	isl_id *tuple_id;
	isl_set *res;
	isl_map *map;

	res = isl_set_copy(isolate);
	tuple_id = isl_set_get_tuple_id(res);
	map = isl_map_project_out(isl_set_unwrap(res), isl_dim_out, pos, n);
	res = isl_set_set_tuple_id(isl_map_wrap(map), tuple_id);

	return res;
}

/* Given an isolate AST generation option "isolate" for a band of size pos + n,
 * return the corresponding option for a band covering the final "n"
 * members within a band covering the first "pos" members.
 *
 * The input isolate option is of the form
 *
 *	isolate[[flattened outer bands] -> [pos; n]]
 *
 * The output isolate option is of the form
 *
 *	isolate[[flattened outer bands; pos] -> [n]]
 *
 *
 * The range is first split into
 *
 *	isolate[[flattened outer bands] -> [[pos] -> [n]]]
 *
 * and then the first pos members are moved to the domain
 *
 *	isolate[[[flattened outer bands] -> [pos]] -> [n]]
 *
 * after which the domain is flattened to obtain the desired output.
 * "isolate" itself is not modified.
 */
static __isl_give isl_set *isolate_final(__isl_keep isl_set *isolate,
	int pos, int n)
{
	isl_id *tuple_id;
	isl_space *range;
	isl_multi_aff *head, *tail, *split;
	isl_set *res;
	isl_map *map;

	res = isl_set_copy(isolate);
	tuple_id = isl_set_get_tuple_id(res);
	map = isl_set_unwrap(res);
	range = isl_space_range(isl_map_get_space(map));

	/* "head" keeps the first "pos" members, "tail" the final "n". */
	head = isl_multi_aff_project_out_map(isl_space_copy(range),
						   isl_dim_set, pos, n);
	tail = isl_multi_aff_project_out_map(range, isl_dim_set, 0, pos);
	split = isl_multi_aff_range_product(head, tail);

	map = isl_map_apply_range(map, isl_map_from_multi_aff(split));
	map = isl_map_flatten_domain(isl_map_uncurry(map));
	res = isl_set_set_tuple_id(isl_map_wrap(map), tuple_id);

	return res;
}

/* Split the band root node of "tree" into two nested band nodes,
 * one with the first "pos" dimensions and
 * one with the remaining dimensions.
 * The tree is itself positioned at schedule depth "depth".
 *
 * The loop AST generation type options and the isolate option
 * are split over the two band nodes.
 *
 * "tree" is consumed.  NULL is returned on error, in particular
 * if the root of "tree" is not a band node or if "pos" does not
 * lie between 0 and the number of band members (both inclusive).
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_split(
	__isl_take isl_schedule_tree *tree, int pos, int depth)
{
	isl_size n;
	isl_set *isolate, *tree_isolate, *child_isolate;
	isl_schedule_tree *child;

	if (!tree)
		return NULL;
	if (tree->type != isl_schedule_node_band)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not a band node", return isl_schedule_tree_free(tree));

	n = isl_schedule_tree_band_n_member(tree);
	if (n < 0)
		return isl_schedule_tree_free(tree);
	if (pos < 0 || pos > n)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"position out of bounds",
			return isl_schedule_tree_free(tree));

	/* "tree" becomes the outer band, "child" the inner band. */
	child = isl_schedule_tree_copy(tree);
	tree = isl_schedule_tree_cow(tree);
	child = isl_schedule_tree_cow(child);
	if (!tree || !child)
		goto error;

	/* The isolate option is extracted before any members are dropped
	 * since it refers to the complete band of size "n".
	 */
	isolate = isl_schedule_tree_band_get_ast_isolate_option(tree, depth);
	tree_isolate = isolate_initial(isolate, pos, n - pos);
	child_isolate = isolate_final(isolate, pos, n - pos);
	/* The inner band loses the first "pos" members. */
	child->band = isl_schedule_band_drop(child->band, 0, pos);
	child->band = isl_schedule_band_replace_ast_build_option(child->band,
					isl_set_copy(isolate), child_isolate);
	/* The outer band loses the final "n - pos" members. */
	tree->band = isl_schedule_band_drop(tree->band, pos, n - pos);
	tree->band = isl_schedule_band_replace_ast_build_option(tree->band,
					isl_set_copy(isolate), tree_isolate);
	isl_set_free(isolate);
	if (!child->band || !tree->band)
		goto error;

	tree = isl_schedule_tree_replace_child(tree, 0, child);

	return tree;
error:
	isl_schedule_tree_free(child);
	isl_schedule_tree_free(tree);
	return NULL;
}

/* Attach "tree2" at each of the leaves of "tree1".
 *
 * If "tree1" does not have any explicit children, then make "tree2"
 * its single child.  Otherwise, attach "tree2" to the leaves of
 * each of the children of "tree1".
 *
 * Both "tree1" and "tree2" are consumed.  NULL is returned on error.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_append_to_leaves(
	__isl_take isl_schedule_tree *tree1,
	__isl_take isl_schedule_tree *tree2)
{
	int pos;
	isl_size n_child;

	n_child = isl_schedule_tree_n_children(tree1);
	if (n_child < 0 || !tree2)
		goto error;
	if (n_child == 0)
		return isl_schedule_tree_set_children(tree1,
			isl_schedule_tree_list_from_schedule_tree(tree2));

	/* Each child receives its own copy of "tree2". */
	for (pos = 0; pos < n_child; ++pos) {
		isl_schedule_tree *child;

		child = isl_schedule_tree_append_to_leaves(
				isl_schedule_tree_get_child(tree1, pos),
				isl_schedule_tree_copy(tree2));
		tree1 = isl_schedule_tree_replace_child(tree1, pos, child);
	}

	isl_schedule_tree_free(tree2);
	return tree1;
error:
	isl_schedule_tree_free(tree1);
	isl_schedule_tree_free(tree2);
	return NULL;
}

/* Reset the user pointer on all identifiers of parameters and tuples
 * in the root of "tree".
 *
 * Leaf, mark, sequence and set nodes are returned unchanged.
 * For the other node types, the object(s) stored in the root are reset.
 * "tree" is consumed and NULL is returned on error.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_reset_user(
	__isl_take isl_schedule_tree *tree)
{
	int failed = 0;

	if (isl_schedule_tree_is_leaf(tree))
		return tree;

	tree = isl_schedule_tree_cow(tree);
	if (!tree)
		return NULL;

	switch (tree->type) {
	case isl_schedule_node_leaf:
	case isl_schedule_node_mark:
	case isl_schedule_node_sequence:
	case isl_schedule_node_set:
		break;
	case isl_schedule_node_error:
		failed = 1;
		break;
	case isl_schedule_node_band:
		tree->band = isl_schedule_band_reset_user(tree->band);
		failed = !tree->band;
		break;
	case isl_schedule_node_context:
		tree->context = isl_set_reset_user(tree->context);
		failed = !tree->context;
		break;
	case isl_schedule_node_domain:
		tree->domain = isl_union_set_reset_user(tree->domain);
		failed = !tree->domain;
		break;
	case isl_schedule_node_expansion:
		tree->contraction =
			isl_union_pw_multi_aff_reset_user(tree->contraction);
		tree->expansion = isl_union_map_reset_user(tree->expansion);
		failed = !tree->contraction || !tree->expansion;
		break;
	case isl_schedule_node_extension:
		tree->extension = isl_union_map_reset_user(tree->extension);
		failed = !tree->extension;
		break;
	case isl_schedule_node_filter:
		tree->filter = isl_union_set_reset_user(tree->filter);
		failed = !tree->filter;
		break;
	case isl_schedule_node_guard:
		tree->guard = isl_set_reset_user(tree->guard);
		failed = !tree->guard;
		break;
	}

	if (failed)
		return isl_schedule_tree_free(tree);
	return tree;
}

/* Align the parameters of the root of "tree" to those of "space".
 *
 * Leaf, mark, sequence and set nodes are returned unchanged.
 * For the other node types, the object(s) stored in the root are aligned.
 * Both "tree" and "space" are consumed and NULL is returned on error.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_align_params(
	__isl_take isl_schedule_tree *tree, __isl_take isl_space *space)
{
	int failed = 0;

	if (!space)
		goto error;

	if (isl_schedule_tree_is_leaf(tree)) {
		isl_space_free(space);
		return tree;
	}

	tree = isl_schedule_tree_cow(tree);
	if (!tree)
		goto error;

	/* Except for the node types that do not use "space",
	 * "space" is handed over to the alignment function
	 * of the object stored in the root.
	 */
	switch (tree->type) {
	case isl_schedule_node_error:
		goto error;
	case isl_schedule_node_leaf:
	case isl_schedule_node_mark:
	case isl_schedule_node_sequence:
	case isl_schedule_node_set:
		isl_space_free(space);
		break;
	case isl_schedule_node_band:
		tree->band = isl_schedule_band_align_params(tree->band, space);
		failed = !tree->band;
		break;
	case isl_schedule_node_context:
		tree->context = isl_set_align_params(tree->context, space);
		failed = !tree->context;
		break;
	case isl_schedule_node_domain:
		tree->domain = isl_union_set_align_params(tree->domain, space);
		failed = !tree->domain;
		break;
	case isl_schedule_node_expansion:
		/* Two objects need to be aligned, so the first gets a copy. */
		tree->contraction =
			isl_union_pw_multi_aff_align_params(tree->contraction,
							isl_space_copy(space));
		tree->expansion = isl_union_map_align_params(tree->expansion,
								space);
		failed = !tree->contraction || !tree->expansion;
		break;
	case isl_schedule_node_extension:
		tree->extension = isl_union_map_align_params(tree->extension,
								space);
		failed = !tree->extension;
		break;
	case isl_schedule_node_filter:
		tree->filter = isl_union_set_align_params(tree->filter, space);
		failed = !tree->filter;
		break;
	case isl_schedule_node_guard:
		tree->guard = isl_set_align_params(tree->guard, space);
		failed = !tree->guard;
		break;
	}

	if (failed)
		return isl_schedule_tree_free(tree);
	return tree;
error:
	isl_space_free(space);
	isl_schedule_tree_free(tree);
	return NULL;
}

/* Does "tree" involve the iteration domain?
 * That is, does it need to be modified
 * by isl_schedule_tree_pullback_union_pw_multi_aff?
 *
 * Return 1 for band, domain, expansion, extension and filter nodes,
 * 0 for the other regular node types and -1 on error.
 */
static int involves_iteration_domain(__isl_keep isl_schedule_tree *tree)
{
	if (!tree)
		return -1;

	switch (tree->type) {
	case isl_schedule_node_context:
	case isl_schedule_node_guard:
	case isl_schedule_node_leaf:
	case isl_schedule_node_mark:
	case isl_schedule_node_sequence:
	case isl_schedule_node_set:
		return 0;
	case isl_schedule_node_band:
	case isl_schedule_node_domain:
	case isl_schedule_node_expansion:
	case isl_schedule_node_extension:
	case isl_schedule_node_filter:
		return 1;
	case isl_schedule_node_error:
		return -1;
	}

	isl_die(isl_schedule_tree_get_ctx(tree), isl_error_internal,
		"unhandled case", return -1);
}

/* Compute the pullback of the root node of "tree" by the function
 * represented by "upma".
 * In other words, plug in "upma" in the iteration domains of
 * the root node of "tree".
 * We currently do not handle expansion nodes.
 *
 * If the root node does not involve any iteration domains,
 * then "tree" is returned unchanged.  Otherwise, the object stored
 * in the root is transformed according to the type of the node.
 *
 * Both "tree" and "upma" are consumed.  NULL is returned on error.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_pullback_union_pw_multi_aff(
	__isl_take isl_schedule_tree *tree,
	__isl_take isl_union_pw_multi_aff *upma)
{
	int involves;

	if (!tree || !upma)
		goto error;

	involves = involves_iteration_domain(tree);
	if (involves < 0)
		goto error;
	if (!involves) {
		isl_union_pw_multi_aff_free(upma);
		return tree;
	}

	tree = isl_schedule_tree_cow(tree);
	if (!tree)
		goto error;

	switch (tree->type) {
	case isl_schedule_node_band:
		tree->band = isl_schedule_band_pullback_union_pw_multi_aff(
							    tree->band, upma);
		if (!tree->band)
			return isl_schedule_tree_free(tree);
		break;
	case isl_schedule_node_domain:
		tree->domain =
			isl_union_set_preimage_union_pw_multi_aff(tree->domain,
									upma);
		if (!tree->domain)
			return isl_schedule_tree_free(tree);
		break;
	case isl_schedule_node_expansion:
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_unsupported,
			"cannot pullback expansion node", goto error);
	case isl_schedule_node_extension:
		tree->extension =
			isl_union_map_preimage_range_union_pw_multi_aff(
			    tree->extension, upma);
		if (!tree->extension)
			return isl_schedule_tree_free(tree);
		break;
	case isl_schedule_node_filter:
		tree->filter =
			isl_union_set_preimage_union_pw_multi_aff(tree->filter,
									upma);
		if (!tree->filter)
			return isl_schedule_tree_free(tree);
		break;
	default:
		/* The remaining node types do not involve
		 * the iteration domain and have been handled above.
		 */
		break;
	}

	return tree;
error:
	isl_union_pw_multi_aff_free(upma);
	isl_schedule_tree_free(tree);
	return NULL;
}

/* Compute the gist of the band tree root with respect to "context".
 *
 * Both "tree" and "context" are consumed on every path, including
 * the case where "tree" is NULL.
 * NULL is returned on error, in particular if the root of "tree"
 * is not a band node.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_gist(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *context)
{
	/* "context" is owned by this function, so it needs to be released
	 * even if there is no tree to operate on.
	 */
	if (!tree)
		goto error;
	if (tree->type != isl_schedule_node_band)
		isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
			"not a band node", goto error);
	tree = isl_schedule_tree_cow(tree);
	if (!tree)
		goto error;

	/* "context" is handed over to the band operation at this point. */
	tree->band = isl_schedule_band_gist(tree->band, context);
	if (!tree->band)
		return isl_schedule_tree_free(tree);
	return tree;
error:
	isl_union_set_free(context);
	isl_schedule_tree_free(tree);
	return NULL;
}

/* Are any members in "band" marked coincident?
 *
 * Return isl_bool_error if the number of members cannot be determined
 * or if the coincidence of any inspected member cannot be determined.
 */
static isl_bool any_coincident(__isl_keep isl_schedule_band *band)
{
	int pos;
	isl_size n_member;

	n_member = isl_schedule_band_n_member(band);
	if (n_member < 0)
		return isl_bool_error;

	for (pos = 0; pos < n_member; ++pos) {
		isl_bool marked;

		marked = isl_schedule_band_member_get_coincident(band, pos);
		/* Stop at the first coincident member or at the first error. */
		if (marked != isl_bool_false)
			return marked;
	}

	return isl_bool_false;
}

/* AutoSA Extended */
/* Are any members in "band" marked as a space loop or as a time loop
 * in their space_time property?
 *
 * Return isl_bool_error if the number of members cannot be determined.
 */
static isl_bool any_space_time(__isl_keep isl_schedule_band *band)
{
	int pos;
	isl_size n_member;

	n_member = isl_schedule_band_n_member(band);
	if (n_member < 0)
		return isl_bool_error;

	for (pos = 0; pos < n_member; ++pos) {
		switch (isl_schedule_band_member_get_space_time(band, pos)) {
		case autosa_loop_time:
		case autosa_loop_space:
			return isl_bool_true;
		default:
			break;
		}
	}

	return isl_bool_false;
}

/* Are any members in "band" marked as a latency, simd or array_part loop
 * in their pe_opt property?
 *
 * Return isl_bool_error if the number of members cannot be determined.
 */
static isl_bool any_pe_opt(__isl_keep isl_schedule_band *band)
{
	int pos;
	isl_size n_member;

	n_member = isl_schedule_band_n_member(band);
	if (n_member < 0)
		return isl_bool_error;

	for (pos = 0; pos < n_member; ++pos) {
		switch (isl_schedule_band_member_get_pe_opt(band, pos)) {
		case autosa_loop_latency:
		case autosa_loop_simd:
		case autosa_loop_array_part:
			return isl_bool_true;
		default:
			break;
		}
	}

	return isl_bool_false;
}

/* Do any members in "band" have a sched_pos property that is
 * a valid position within the band, i.e., a value that is non-negative and
 * smaller than the number of band members?
 *
 * Return isl_bool_error if the number of members cannot be determined.
 */
static isl_bool any_sched_pos(__isl_keep isl_schedule_band *band)
{
	int pos;
	isl_size n_member;

	n_member = isl_schedule_band_n_member(band);
	if (n_member < 0)
		return isl_bool_error;

	for (pos = 0; pos < n_member; ++pos) {
		int sched_pos;

		sched_pos = isl_schedule_band_member_get_sched_pos(band, pos);
		if (sched_pos < 0 || sched_pos >= n_member)
			continue;
		return isl_bool_true;
	}

	return isl_bool_false;
}
/* AutoSA Extended */

/* Print the coincident property of member "pos" of "band" to "p"
 * as an integer.
 */
static __isl_give isl_printer *print_band_member_coincident(
	__isl_take isl_printer *p, __isl_keep isl_schedule_band *band, int pos)
{
	return isl_printer_print_int(p,
			isl_schedule_band_member_get_coincident(band, pos));
}

/* AutoSA Extended */
/* Print the space_time property of member "pos" of "band" to "p"
 * as one of "default", "error", "time", "space" or "unknown".
 */
static __isl_give isl_printer *print_band_member_space_time(
	__isl_take isl_printer *p, __isl_keep isl_schedule_band *band, int pos)
{
	const char *name;

	switch (isl_schedule_band_member_get_space_time(band, pos)) {
	case autosa_loop_default:
		name = "default";
		break;
	case autosa_loop_error:
		name = "error";
		break;
	case autosa_loop_time:
		name = "time";
		break;
	case autosa_loop_space:
		name = "space";
		break;
	default:
		name = "unknown";
		break;
	}

	return isl_printer_print_str(p, name);
}

/* Print the pe_opt property of member "pos" of "band" to "p"
 * as one of "default", "error", "latency", "simd", "array_part"
 * or "unknown".
 */
static __isl_give isl_printer *print_band_member_pe_opt(
	__isl_take isl_printer *p, __isl_keep isl_schedule_band *band, int pos)
{
	const char *name;

	switch (isl_schedule_band_member_get_pe_opt(band, pos)) {
	case autosa_loop_default:
		name = "default";
		break;
	case autosa_loop_error:
		name = "error";
		break;
	case autosa_loop_latency:
		name = "latency";
		break;
	case autosa_loop_simd:
		name = "simd";
		break;
	case autosa_loop_array_part:
		name = "array_part";
		break;
	default:
		name = "unknown";
		break;
	}

	return isl_printer_print_str(p, name);
}

/* Print the sched_pos property of member "pos" of "band" to "p"
 * as an integer.
 */
static __isl_give isl_printer *print_band_member_sched_pos(
	__isl_take isl_printer *p, __isl_keep isl_schedule_band *band, int pos)
{
	return isl_printer_print_int(p,
			isl_schedule_band_member_get_sched_pos(band, pos));
}
/* AutoSA Extended */

/* Print a per-member property of "band" to "p" as a YAML field
 * called "name", the value of which is a sequence with one element
 * for each member of "band".
 * "print_member" is called to print each individual element.
 *
 * The sequence is always printed in YAML flow style.
 * The YAML style of "p" is restored afterwards.
 */
static __isl_give isl_printer *print_band_member_property(
	__isl_take isl_printer *p, __isl_keep isl_schedule_band *band,
	const char *name,
	__isl_give isl_printer *(*print_member)(__isl_take isl_printer *p,
		__isl_keep isl_schedule_band *band, int pos))
{
	int i;
	isl_size n;
	int style;

	p = isl_printer_yaml_next(p);
	p = isl_printer_print_str(p, name);
	p = isl_printer_yaml_next(p);
	style = isl_printer_get_yaml_style(p);
	p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_FLOW);
	p = isl_printer_yaml_start_sequence(p);
	n = isl_schedule_band_n_member(band);
	if (n < 0)
		return isl_printer_free(p);
	for (i = 0; i < n; ++i) {
		p = print_member(p, band, i);
		p = isl_printer_yaml_next(p);
	}
	p = isl_printer_yaml_end_sequence(p);
	p = isl_printer_set_yaml_style(p, style);

	return p;
}

/* Print the band node "band" to "p".
 *
 * The partial schedule is always printed.
 * The permutable and coincident properties are only printed if they
 * are different from the defaults.
 * Similarly, the AutoSA specific space_time, pe_opt and sched_pos
 * properties are only printed if any_space_time, any_pe_opt or
 * any_sched_pos, respectively, finds a member with such a property.
 * These per-member properties are always printed in YAML flow style.
 * The AST build options are only printed if they are not empty.
 */
static __isl_give isl_printer *print_tree_band(__isl_take isl_printer *p,
	__isl_keep isl_schedule_band *band)
{
	isl_union_set *options;
	isl_bool empty;
	isl_bool coincident;
	/* AutoSA Extended */
	isl_bool pe_opt;
	isl_bool space_time;
	isl_bool sched_pos;
	/* AutoSA Extended */

	p = isl_printer_print_str(p, "schedule");
	p = isl_printer_yaml_next(p);
	p = isl_printer_print_str(p, "\"");
	p = isl_printer_print_multi_union_pw_aff(p, band->mupa);
	p = isl_printer_print_str(p, "\"");
	if (isl_schedule_band_get_permutable(band)) {
		p = isl_printer_yaml_next(p);
		p = isl_printer_print_str(p, "permutable");
		p = isl_printer_yaml_next(p);
		p = isl_printer_print_int(p, 1);
	}
	coincident = any_coincident(band);
	if (coincident < 0)
		return isl_printer_free(p);
	if (coincident)
		p = print_band_member_property(p, band, "coincident",
						&print_band_member_coincident);
	/* AutoSA Extended */
	space_time = any_space_time(band);
	if (space_time < 0)
		return isl_printer_free(p);
	if (space_time)
		p = print_band_member_property(p, band, "space_time",
						&print_band_member_space_time);
	pe_opt = any_pe_opt(band);
	if (pe_opt < 0)
		return isl_printer_free(p);
	if (pe_opt)
		p = print_band_member_property(p, band, "pe_opt",
						&print_band_member_pe_opt);
	sched_pos = any_sched_pos(band);
	if (sched_pos < 0)
		return isl_printer_free(p);
	if (sched_pos)
		p = print_band_member_property(p, band, "sched_pos",
						&print_band_member_sched_pos);
	/* AutoSA Extended */

	options = isl_schedule_band_get_ast_build_options(band);
	empty = isl_union_set_is_empty(options);
	if (empty < 0)
		p = isl_printer_free(p);
	/* Nothing is printed on error either since "empty" is then non-zero. */
	if (!empty) {
		p = isl_printer_yaml_next(p);
		p = isl_printer_print_str(p, "options");
		p = isl_printer_yaml_next(p);
		p = isl_printer_print_str(p, "\"");
		p = isl_printer_print_union_set(p, options);
		p = isl_printer_print_str(p, "\"");
	}
	isl_union_set_free(options);

	return p;
}

/* Instantiate the template in print_yaml_field_templ.c once
 * for each value of BASE below.
 * NOTE(review): each instantiation presumably defines
 * a print_yaml_field_<BASE> function, given the calls
 * to print_yaml_field_set, print_yaml_field_union_set,
 * print_yaml_field_union_map and print_yaml_field_union_pw_multi_aff
 * further down in this file -- confirm against the template.
 */

/* For BASE "str", the name isl_str is mapped to a plain C string type. */
#undef BASE
#define BASE str
#define isl_str const char
#include "print_yaml_field_templ.c"

#undef BASE
#define BASE set
#include "print_yaml_field_templ.c"

#undef BASE
#define BASE union_set
#include "print_yaml_field_templ.c"

#undef BASE
#define BASE union_map
#include "print_yaml_field_templ.c"

#undef BASE
#define BASE union_pw_multi_aff
#include "print_yaml_field_templ.c"

/* Print "tree" to "p".
 *
 * If "n_ancestor" is non-negative, then "child_pos" contains the child
 * positions of a descendant of the current node that should be marked
 * (by the comment "YOU ARE HERE").  In particular, if "n_ancestor"
 * is zero, then the current node should be marked.
 * The marking is only printed in YAML block format.
 *
 * Implicit leaf nodes are not printed, except if they correspond
 * to the node that should be marked.
 *
 * The node is printed as a YAML mapping.  The first entry identifies
 * the node type (together with its payload, if any).  It is followed
 * either by a "child" entry or, for sequence and set nodes,
 * by a YAML sequence holding the children.
 *
 * If the number of children cannot be determined, then the result
 * of isl_printer_free(p) is returned.
 */
__isl_give isl_printer *isl_printer_print_schedule_tree_mark(
	__isl_take isl_printer *p, __isl_keep isl_schedule_tree *tree,
	int n_ancestor, int *child_pos)
{
	int i;
	isl_size n;
	/* Set for sequence and set nodes, whose children form a YAML sequence. */
	int sequence = 0;
	int block;

	block = isl_printer_get_yaml_style(p) == ISL_YAML_STYLE_BLOCK;

	p = isl_printer_yaml_start_mapping(p);
	/* The current node is the one to be marked. */
	if (n_ancestor == 0 && block) {
		p = isl_printer_print_str(p, "# YOU ARE HERE");
		p = isl_printer_end_line(p);
		p = isl_printer_start_line(p);
	}
	/* Print the key (and value, if any) describing the node itself. */
	switch (tree->type) {
	case isl_schedule_node_error:
		p = isl_printer_print_str(p, "ERROR");
		p = isl_printer_yaml_next(p);
		break;
	case isl_schedule_node_leaf:
		p = isl_printer_print_str(p, "leaf");
		p = isl_printer_yaml_next(p);
		break;
	case isl_schedule_node_sequence:
		p = isl_printer_print_str(p, "sequence");
		p = isl_printer_yaml_next(p);
		sequence = 1;
		break;
	case isl_schedule_node_set:
		p = isl_printer_print_str(p, "set");
		p = isl_printer_yaml_next(p);
		sequence = 1;
		break;
	case isl_schedule_node_context:
		p = print_yaml_field_set(p, "context", tree->context);
		break;
	case isl_schedule_node_domain:
		p = print_yaml_field_union_set(p, "domain", tree->domain);
		break;
	case isl_schedule_node_expansion:
		/* An expansion node carries two fields; both are printed. */
		p = print_yaml_field_union_pw_multi_aff(p, "contraction",
							tree->contraction);
		p = print_yaml_field_union_map(p, "expansion", tree->expansion);
		break;
	case isl_schedule_node_extension:
		p = print_yaml_field_union_map(p, "extension", tree->extension);
		break;
	case isl_schedule_node_filter:
		p = print_yaml_field_union_set(p, "filter", tree->filter);
		break;
	case isl_schedule_node_guard:
		p = print_yaml_field_set(p, "guard", tree->guard);
		break;
	case isl_schedule_node_mark:
		p = print_yaml_field_str(p, "mark",
					isl_id_get_name(tree->mark));
		break;
	case isl_schedule_node_band:
		p = print_tree_band(p, tree->band);
		p = isl_printer_yaml_next(p);
		break;
	}

	n = isl_schedule_tree_n_children(tree);
	if (n < 0)
		return isl_printer_free(p);
	if (n == 0) {
		/* No explicit children.  If the node to be marked lies below
		 * this one, print a temporary leaf as "child" so that
		 * the mark can be attached to it.
		 */
		if (n_ancestor > 0 && block) {
			isl_schedule_tree *leaf;

			p = isl_printer_print_str(p, "child");
			p = isl_printer_yaml_next(p);
			leaf = isl_schedule_tree_leaf(isl_printer_get_ctx(p));
			p = isl_printer_print_schedule_tree_mark(p,
					leaf, 0, NULL);
			isl_schedule_tree_free(leaf);
			p = isl_printer_yaml_next(p);
		}
		return isl_printer_yaml_end_mapping(p);
	}

	if (sequence) {
		p = isl_printer_yaml_start_sequence(p);
	} else {
		p = isl_printer_print_str(p, "child");
		p = isl_printer_yaml_next(p);
	}

	for (i = 0; i < n; ++i) {
		isl_schedule_tree *t;

		t = isl_schedule_tree_get_child(tree, i);
		/* Only the child on the path to the marked node is passed
		 * the remaining path; all other children are printed
		 * without any mark (n_ancestor equal to -1).
		 */
		if (n_ancestor > 0 && child_pos[0] == i)
			p = isl_printer_print_schedule_tree_mark(p, t,
						n_ancestor - 1, child_pos + 1);
		else
			p = isl_printer_print_schedule_tree_mark(p, t,
						-1, NULL);
		isl_schedule_tree_free(t);

		p = isl_printer_yaml_next(p);
	}

	if (sequence)
		p = isl_printer_yaml_end_sequence(p);
	p = isl_printer_yaml_end_mapping(p);

	return p;
}

/* Print "tree" to "p" without marking any node.
 *
 * This is isl_printer_print_schedule_tree_mark with a negative
 * number of ancestors and no child positions.
 */
__isl_give isl_printer *isl_printer_print_schedule_tree(
	__isl_take isl_printer *p, __isl_keep isl_schedule_tree *tree)
{
	const int no_mark = -1;

	p = isl_printer_print_schedule_tree_mark(p, tree, no_mark, NULL);

	return p;
}

/* Print "tree" to stderr in YAML block style.
 * Nothing is printed if "tree" is NULL.
 */
void isl_schedule_tree_dump(__isl_keep isl_schedule_tree *tree)
{
	isl_printer *p;

	if (!tree)
		return;

	p = isl_printer_to_file(isl_schedule_tree_get_ctx(tree), stderr);
	p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
	p = isl_printer_print_schedule_tree(p, tree);

	isl_printer_free(p);
}

/* AutoSA Extended */
/* Look up the space_time property of the band member at position "pos"
 * of the band that forms the root of "tree".
 *
 * autosa_loop_error is returned if "tree" is NULL or if its root
 * is not a band node.  In the latter case, an isl_error_invalid error
 * is reported as well.
 */
enum autosa_loop_type isl_schedule_tree_band_member_get_space_time(
  __isl_keep isl_schedule_tree *tree, int pos)
{
  if (!tree)
    return autosa_loop_error;
  if (tree->type == isl_schedule_node_band)
    return isl_schedule_band_member_get_space_time(tree->band, pos);

  isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
      "not a band node", return autosa_loop_error);
}

/* Set the space_time property of the band member at position "pos"
 * of the band tree root "tree" to "loop_type".
 *
 * If the member already has this value, then "tree" is returned as is.
 * Otherwise, the band is updated on the tree returned
 * by isl_schedule_tree_cow.
 * NULL is returned on error; a non-band "tree" is reported
 * as an isl_error_invalid error and freed.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_member_set_space_time(
  __isl_take isl_schedule_tree *tree, int pos, enum autosa_loop_type loop_type)
{
  enum autosa_loop_type current;

  if (!tree)
    return NULL;
  if (tree->type != isl_schedule_node_band)
    isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
        "not a band node", return isl_schedule_tree_free(tree));

  current = isl_schedule_tree_band_member_get_space_time(tree, pos);
  if (current == loop_type)
    return tree;

  tree = isl_schedule_tree_cow(tree);
  if (!tree)
    return NULL;

  tree->band = isl_schedule_band_member_set_space_time(tree->band, pos,
      loop_type);

  return tree->band ? tree : isl_schedule_tree_free(tree);
}

/* Return the pe_opt property of the band member at position
 * "pos" of the band tree root.
 *
 * autosa_loop_error is returned if "tree" is NULL or if its root
 * is not a band node.  In the latter case, an isl_error_invalid error
 * is reported as well.
 */
enum autosa_loop_type isl_schedule_tree_band_member_get_pe_opt(
  __isl_keep isl_schedule_tree *tree, int pos)
{
  /* The function returns an enum autosa_loop_type, so the error value
   * of that enum is used (rather than isl_size_error), as in
   * isl_schedule_tree_band_member_get_space_time.
   */
  if (!tree)
    return autosa_loop_error;

  if (tree->type != isl_schedule_node_band)
    isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
        "not a band node", return autosa_loop_error);

  return isl_schedule_band_member_get_pe_opt(tree->band, pos);
}

/* Set the pe_opt property of the band member at position "pos"
 * of the band tree root "tree" to "loop_type".
 *
 * If the member already has this value, then "tree" is returned as is.
 * Otherwise, the band is updated on the tree returned
 * by isl_schedule_tree_cow.
 * NULL is returned on error; a non-band "tree" is reported
 * as an isl_error_invalid error and freed.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_member_set_pe_opt(
  __isl_take isl_schedule_tree *tree, int pos, enum autosa_loop_type loop_type)
{
  enum autosa_loop_type current;

  if (!tree)
    return NULL;
  if (tree->type != isl_schedule_node_band)
    isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
        "not a band node", return isl_schedule_tree_free(tree));

  current = isl_schedule_tree_band_member_get_pe_opt(tree, pos);
  if (current == loop_type)
    return tree;

  tree = isl_schedule_tree_cow(tree);
  if (!tree)
    return NULL;

  tree->band = isl_schedule_band_member_set_pe_opt(tree->band, pos,
      loop_type);

  return tree->band ? tree : isl_schedule_tree_free(tree);
}

/* Return the sched_pos property of the band member at position
 * "pos" of the band tree root.
 *
 * isl_size_error is returned if "tree" is NULL or if its root
 * is not a band node.  In the latter case, an isl_error_invalid error
 * is reported as well.
 */
int isl_schedule_tree_band_member_get_sched_pos(
  __isl_keep isl_schedule_tree *tree, int pos)
{
  if (!tree)
    return isl_size_error;

  /* The result is a plain int, so both failure paths use the same
   * integer error value instead of a constant of the unrelated
   * enum autosa_loop_type.
   */
  if (tree->type != isl_schedule_node_band)
    isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
        "not a band node", return isl_size_error);

  return isl_schedule_band_member_get_sched_pos(tree->band, pos);
}

/* Set the sched_pos property of the band member at position "pos"
 * of the band tree root "tree" to "sched_pos".
 *
 * If the member already has this value, then "tree" is returned as is.
 * Otherwise, the band is updated on the tree returned
 * by isl_schedule_tree_cow.
 * NULL is returned on error; a non-band "tree" is reported
 * as an isl_error_invalid error and freed.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_member_set_sched_pos(
  __isl_take isl_schedule_tree *tree, int pos, int sched_pos)
{
  int current;

  if (!tree)
    return NULL;
  if (tree->type != isl_schedule_node_band)
    isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
        "not a band node", return isl_schedule_tree_free(tree));

  current = isl_schedule_tree_band_member_get_sched_pos(tree, pos);
  if (current == sched_pos)
    return tree;

  tree = isl_schedule_tree_cow(tree);
  if (!tree)
    return NULL;

  tree->band = isl_schedule_band_member_set_sched_pos(tree->band, pos,
      sched_pos);

  return tree->band ? tree : isl_schedule_tree_free(tree);
}

/* Look up the iter property of the band member at position "pos"
 * of the band that forms the root of "tree".
 *
 * NULL is returned if "tree" is NULL or if its root is not a band node.
 * In the latter case, an isl_error_invalid error is reported as well.
 */
void *isl_schedule_tree_band_member_get_iter(
  __isl_keep isl_schedule_tree *tree, int pos)
{
  if (!tree)
    return NULL;
  if (tree->type == isl_schedule_node_band)
    return isl_schedule_band_member_get_iter(tree->band, pos);

  isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
      "not a band node", return NULL);
}

/* Set the iter property of the band member at position "pos"
 * of the band tree root "tree" to "iter".
 *
 * If the member already holds this very pointer, then "tree"
 * is returned as is.  Otherwise, the band is updated on the tree
 * returned by isl_schedule_tree_cow.
 * NULL is returned on error; a non-band "tree" is reported
 * as an isl_error_invalid error and freed.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_member_set_iter(
  __isl_take isl_schedule_tree *tree, int pos, void *iter)
{
  void *current;

  if (!tree)
    return NULL;
  if (tree->type != isl_schedule_node_band)
    isl_die(isl_schedule_tree_get_ctx(tree), isl_error_invalid,
        "not a band node", return isl_schedule_tree_free(tree));

  current = isl_schedule_tree_band_member_get_iter(tree, pos);
  if (current == iter)
    return tree;

  tree = isl_schedule_tree_cow(tree);
  if (!tree)
    return NULL;

  tree->band = isl_schedule_band_member_set_iter(tree->band, pos,
      iter);

  return tree->band ? tree : isl_schedule_tree_free(tree);
}
/* AutoSA Extended */

================================================
FILE: autosa_scripts/ppcg_changes/isl/isl_schedule_tree.h
================================================
#ifndef ISL_SCHEDLUE_TREE_H
#define ISL_SCHEDLUE_TREE_H

#include <isl_schedule_band.h>
#include <isl/schedule.h>
#include <isl/set.h>
#include <isl/union_set.h>

/* Forward declaration and typedef, so that the type can be named
 * before the structure itself is defined below.
 */
struct isl_schedule_tree;
typedef struct isl_schedule_tree isl_schedule_tree;

/* Presumably declares the isl_schedule_tree_list type that is used
 * for the "children" field of struct isl_schedule_tree.
 */
ISL_DECLARE_LIST(schedule_tree)

/* A schedule (sub)tree.
 *
 * The leaves of a tree are not explicitly represented inside
 * the isl_schedule_tree, except when the tree consists of only a leaf.
 *
 * The "band" field is valid when type is isl_schedule_node_band.
 * The "context" field is valid when type is isl_schedule_node_context
 * and represents constraints on the flat product of the outer band nodes,
 * possibly introducing additional parameters.
 * The "domain" field is valid when type is isl_schedule_node_domain
 * and introduces the statement instances scheduled by the tree.
 *
 * The "contraction" and "expansion" fields are valid when type
 * is isl_schedule_node_expansion.
 * "expansion" expands the reaching domain elements to one or more
 * domain elements for the subtree.
 * "contraction" maps these elements back to the corresponding
 * reaching domain element.  It does not involve any domain constraints.
 *
 * The "extension" field is valid when type is isl_schedule_node_extension
 * and maps outer schedule dimensions (the flat product of the outer
 * band nodes) to additional iteration domains.
 *
 * The "filter" field is valid when type is isl_schedule_node_filter
 * and represents the statement instances selected by the node.
 *
 * The "guard" field is valid when type is isl_schedule_node_guard
 * and represents constraints on the flat product of the outer band nodes
 * that need to be enforced by the outer nodes in the generated AST.
 *
 * The "mark" field is valid when type is isl_schedule_node_mark and
 * identifies the mark.
 *
 * The "children" field is valid for all types except
 * isl_schedule_node_leaf.  This field is NULL if there are
 * no children (except for the implicit leaves).
 *
 * anchored is set if the node or any of its descendants depends
 * on its position in the schedule tree.
 */
struct isl_schedule_tree {
	/* presumably a reference count -- confirm in the implementation */
	int ref;
	isl_ctx *ctx;
	int anchored;
	enum isl_schedule_node_type type;
	/* Node specific data; "type" determines which member is valid. */
	union {
		isl_schedule_band *band;
		isl_set *context;
		isl_union_set *domain;
		/* both valid for isl_schedule_node_expansion */
		struct {
			isl_union_pw_multi_aff *contraction;
			isl_union_map *expansion;
		};
		isl_union_map *extension;
		isl_union_set *filter;
		isl_set *guard;
		isl_id *mark;
	};
	isl_schedule_tree_list *children;
};

/* Basic queries on a tree, leaf handling, comparison and copy/free.
 * These are declarations only.
 * As described in the isl manual, __isl_keep marks an argument that is
 * only borrowed, __isl_take an argument whose ownership is handed over,
 * __isl_give a result owned by the caller and __isl_null a NULL result.
 */
isl_ctx *isl_schedule_tree_get_ctx(__isl_keep isl_schedule_tree *tree);
enum isl_schedule_node_type isl_schedule_tree_get_type(
	__isl_keep isl_schedule_tree *tree);

/* The leaf constructor only takes a context. */
__isl_give isl_schedule_tree *isl_schedule_tree_leaf(isl_ctx *ctx);
int isl_schedule_tree_is_leaf(__isl_keep isl_schedule_tree *tree);

isl_bool isl_schedule_tree_plain_is_equal(__isl_keep isl_schedule_tree *tree1,
	__isl_keep isl_schedule_tree *tree2);

__isl_give isl_schedule_tree *isl_schedule_tree_copy(
	__isl_keep isl_schedule_tree *tree);
__isl_null isl_schedule_tree *isl_schedule_tree_free(
	__isl_take isl_schedule_tree *tree);

/* Constructors.  Each takes ownership of its isl object arguments
 * (__isl_take) and hands a new tree to the caller (__isl_give).
 * The argument of each isl_schedule_tree_from_<kind> corresponds to
 * the field of the same kind in struct isl_schedule_tree.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_from_band(
	__isl_take isl_schedule_band *band);
__isl_give isl_schedule_tree *isl_schedule_tree_from_context(
	__isl_take isl_set *context);
__isl_give isl_schedule_tree *isl_schedule_tree_from_domain(
	__isl_take isl_union_set *domain);
__isl_give isl_schedule_tree *isl_schedule_tree_from_expansion(
	__isl_take isl_union_pw_multi_aff *contraction,
	__isl_take isl_union_map *expansion);
__isl_give isl_schedule_tree *isl_schedule_tree_from_extension(
	__isl_take isl_union_map *extension);
__isl_give isl_schedule_tree *isl_schedule_tree_from_filter(
	__isl_take isl_union_set *filter);
__isl_give isl_schedule_tree *isl_schedule_tree_from_guard(
	__isl_take isl_set *guard);
/* Constructors that start from a list or a pair of existing trees. */
__isl_give isl_schedule_tree *isl_schedule_tree_from_children(
	enum isl_schedule_node_type type,
	__isl_take isl_schedule_tree_list *list);
__isl_give isl_schedule_tree *isl_schedule_tree_from_pair(
	enum isl_schedule_node_type type, __isl_take isl_schedule_tree *tree1,
	__isl_take isl_schedule_tree *tree2);
__isl_give isl_schedule_tree *isl_schedule_tree_sequence_pair(
	__isl_take isl_schedule_tree *tree1,
	__isl_take isl_schedule_tree *tree2);
__isl_give isl_schedule_tree *isl_schedule_tree_set_pair(
	__isl_take isl_schedule_tree *tree1,
	__isl_take isl_schedule_tree *tree2);

/* Presumably reports the "anchored" property of struct isl_schedule_tree. */
isl_bool isl_schedule_tree_is_subtree_anchored(
	__isl_keep isl_schedule_tree *tree);

/* Accessors for the node specific data of the root of a tree.
 * Getters only borrow the tree (__isl_keep), while setters take
 * ownership of the tree and return the resulting tree.
 */

/* Band nodes. */
__isl_give isl_space *isl_schedule_tree_band_get_space(
	__isl_keep isl_schedule_tree *tree);
__isl_give isl_schedule_tree *isl_schedule_tree_band_intersect_domain(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *domain);
__isl_give isl_multi_union_pw_aff *isl_schedule_tree_band_get_partial_schedule(
	__isl_keep isl_schedule_tree *tree);
__isl_give isl_schedule_tree *isl_schedule_tree_band_set_partial_schedule(
	__isl_take isl_schedule_tree *tree,
	__isl_take isl_multi_union_pw_aff *schedule);
enum isl_ast_loop_type isl_schedule_tree_band_member_get_ast_loop_type(
	__isl_keep isl_schedule_tree *tree, int pos);
__isl_give isl_schedule_tree *isl_schedule_tree_band_member_set_ast_loop_type(
	__isl_take isl_schedule_tree *tree, int pos,
	enum isl_ast_loop_type type);
enum isl_ast_loop_type isl_schedule_tree_band_member_get_isolate_ast_loop_type(
	__isl_keep isl_schedule_tree *tree, int pos);
__isl_give isl_schedule_tree *
isl_schedule_tree_band_member_set_isolate_ast_loop_type(
	__isl_take isl_schedule_tree *tree, int pos,
	enum isl_ast_loop_type type);
__isl_give isl_union_set *isl_schedule_tree_band_get_ast_build_options(
	__isl_keep isl_schedule_tree *tree);
__isl_give isl_schedule_tree *isl_schedule_tree_band_set_ast_build_options(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *options);
__isl_give isl_set *isl_schedule_tree_band_get_ast_isolate_option(
	__isl_keep isl_schedule_tree *tree, int depth);
/* Context, domain, expansion, extension, filter, guard and mark nodes. */
__isl_give isl_set *isl_schedule_tree_context_get_context(
	__isl_keep isl_schedule_tree *tree);
__isl_give isl_union_set *isl_schedule_tree_domain_get_domain(
	__isl_keep isl_schedule_tree *tree);
__isl_give isl_schedule_tree *isl_schedule_tree_domain_set_domain(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *domain);
__isl_give isl_union_pw_multi_aff *isl_schedule_tree_expansion_get_contraction(
	__isl_keep isl_schedule_tree *tree);
__isl_give isl_union_map *isl_schedule_tree_expansion_get_expansion(
	__isl_keep isl_schedule_tree *tree);
__isl_give isl_schedule_tree *
isl_schedule_tree_expansion_set_contraction_and_expansion(
	__isl_take isl_schedule_tree *tree,
	__isl_take isl_union_pw_multi_aff *contraction,
	__isl_take isl_union_map *expansion);
__isl_give isl_union_map *isl_schedule_tree_extension_get_extension(
	__isl_keep isl_schedule_tree *tree);
__isl_give isl_schedule_tree *isl_schedule_tree_extension_set_extension(
	__isl_take isl_schedule_tree *tree,
	__isl_take isl_union_map *extension);
__isl_give isl_union_set *isl_schedule_tree_filter_get_filter(
	__isl_keep isl_schedule_tree *tree);
__isl_give isl_schedule_tree *isl_schedule_tree_filter_set_filter(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *filter);
__isl_give isl_set *isl_schedule_tree_guard_get_guard(
	__isl_keep isl_schedule_tree *tree);
__isl_give isl_id *isl_schedule_tree_mark_get_id(
	__isl_keep isl_schedule_tree *tree);

/* Queries on the subtree rooted at a tree (declarations only). */
__isl_give isl_schedule_tree *isl_schedule_tree_first_schedule_descendant(
	__isl_take isl_schedule_tree *tree, __isl_keep isl_schedule_tree *leaf);
__isl_give isl_union_map *isl_schedule_tree_get_subtree_schedule_union_map(
	__isl_keep isl_schedule_tree *tree);

/* Number of members of a band; the result type is isl_size. */
isl_size isl_schedule_tree_band_n_member(__isl_keep isl_schedule_tree *tree);

/* The "coincident" property is accessed per band member ("pos"),
 * while "permutable" is accessed for the band as a whole.
 */
isl_bool isl_schedule_tree_band_member_get_coincident(
	__isl_keep isl_schedule_tree *tree, int pos);
__isl_give isl_schedule_tree *isl_schedule_tree_band_member_set_coincident(
	__isl_take isl_schedule_tree *tree, int pos, int coincident);
isl_bool isl_schedule_tree_band_get_permutable(
	__isl_keep isl_schedule_tree *tree);
__isl_give isl_schedule_tree *isl_schedule_tree_band_set_permutable(
	__isl_take isl_schedule_tree *tree, int permutable);

/* Access to the children of a tree. */
int isl_schedule_tree_has_children(__isl_keep isl_schedule_tree *tree);
isl_size isl_schedule_tree_n_children(__isl_keep isl_schedule_tree *tree);
__isl_give isl_schedule_tree *isl_schedule_tree_get_child(
	__isl_keep isl_schedule_tree *tree, int pos);

/* Insertion functions.  Each takes ownership of "tree" and of
 * the node specific data and returns the resulting tree.
 * NOTE(review): where exactly the new node ends up is determined
 * by the definitions, which are not part of this header.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_insert_band(
	__isl_take isl_schedule_tree *tree, __isl_take isl_schedule_band *band);
__isl_give isl_schedule_tree *isl_schedule_tree_insert_context(
	__isl_take isl_schedule_tree *tree, __isl_take isl_set *context);
__isl_give isl_schedule_tree *isl_schedule_tree_insert_domain(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *domain);
__isl_give isl_schedule_tree *isl_schedule_tree_insert_expansion(
	__isl_take isl_schedule_tree *tree,
	__isl_take isl_union_pw_multi_aff *contraction,
	__isl_take isl_union_map *expansion);
__isl_give isl_schedule_tree *isl_schedule_tree_insert_extension(
	__isl_take isl_schedule_tree *tree,
	__isl_take isl_union_map *extension);
__isl_give isl_schedule_tree *isl_schedule_tree_insert_filter(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *filter);
__isl_give isl_schedule_tree *isl_schedule_tree_children_insert_filter(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *filter);
__isl_give isl_schedule_tree *isl_schedule_tree_insert_guard(
	__isl_take isl_schedule_tree *tree, __isl_take isl_set *guard);
__isl_give isl_schedule_tree *isl_schedule_tree_insert_mark(
	__isl_take isl_schedule_tree *tree, __isl_take isl_id *mark);

/* Combines two trees; both arguments are consumed. */
__isl_give isl_schedule_tree *isl_schedule_tree_append_to_leaves(
	__isl_take isl_schedule_tree *tree1,
	__isl_take isl_schedule_tree *tree2);

/* Transformations of a band tree root.  Each takes ownership of "tree"
 * (and of any isl object argument) and returns the resulting tree.
 */
__isl_give isl_schedule_tree *isl_schedule_tree_band_scale(
	__isl_take isl_schedule_tree *tree, __isl_take isl_multi_val *mv);
__isl_give isl_schedule_tree *isl_schedule_tree_band_scale_down(
	__isl_take isl_schedule_tree *tree, __isl_take isl_multi_val *mv);
__isl_give isl_schedule_tree *isl_schedule_tree_band_mod(
	__isl_take isl_schedule_tree *tree, __isl_take isl_multi_val *mv);
__isl_give isl_schedule_tree *isl_schedule_tree_band_tile(
	__isl_take isl_schedule_tree *tree, __isl_take isl_multi_val *sizes);
__isl_give isl_schedule_tree *isl_schedule_tree_band_shift(
	__isl_take isl_schedule_tree *tree,
	__isl_take isl_multi_union_pw_aff *shift);
__isl_give isl_schedule_tree *isl_schedule_tree_band_split(
	__isl_take isl_schedule_tree *tree, int pos, int depth);
__isl_give isl_schedule_tree *isl_schedule_tree_band_gist(
	__isl_take isl_schedule_tree *tree, __isl_take isl_union_set *context);

/* Operations on the children of a tree; "pos" selects a child. */
__isl_give isl_schedule_tree *isl_schedule_tree_child(
	__isl_take isl_schedule_tree *tree, int pos);
__isl_give isl_schedule_tree *isl_schedule_tree_reset_children(
	__isl_take isl_schedule_tree *tree);
__isl_give isl_schedule_tree *isl_schedule_tree_drop_child(
	__isl_take isl_schedule_tree *tree, int pos);
__isl_give isl_schedule_tree *isl_schedule_tree_replace_child(
	__isl_take isl_schedule_tree *tree, int pos,
	__isl_take isl_schedule_tree *new_child);
__isl_give isl_schedule_tree *isl_schedule_tree_sequence_splice(
	__isl_take isl_schedule_tree *tree, int pos,
	__isl_take isl_schedule_tree *child);

/* Operations that rewrite the isl objects stored in the tree. */
__isl_give isl_schedule_tree *isl_schedule_tree_reset_user(
	__isl_take isl_schedule_tree *tree);
__isl_give isl_schedule_tree *isl_schedule_tree_align_params(
	__isl_take isl_schedule_tree *tree, __isl_take isl_space *space);
__isl_give isl_schedule_tree *isl_schedule_tree_pullback_union_pw_multi_aff(
	__isl_take isl_schedule_tree *tree,
	__isl_take isl_union_pw_multi_aff *upma);

/* Printing.  The printer is consumed and the updated printer is returned,
 * while "tree" is only borrowed.
 * For the "_mark" variant, "n_ancestor" and "child_pos" describe
 * the path to a node that should be marked in the output.
 */
__isl_give isl_printer *isl_printer_print_schedule_tree(
	__isl_take isl_printer *p, __isl_keep isl_schedule_tree *tree);
__isl_give isl_printer *isl_printer_print_schedule_tree_mark(
	__isl_take isl_printer *p, __isl_keep isl_schedule_tree *tree,
	int n_ancestor, int *child_pos);

/* AutoSA Extended */
/* The result of isl_schedule_tree_dup is handed to the caller,
 * so it is annotated __isl_give (__isl_take is meant for arguments).
 */
__isl_give isl_schedule_tree *isl_schedule_tree_dup(
	__isl_keep isl_schedule_tree *tree);
/* Getter/setter pairs for the AutoSA properties (space_time, pe_opt,
 * sched_pos and iter) of the band member at position "pos" of
 * a band tree root.  Getters only borrow "tree"; setters take
 * ownership of "tree" and return the resulting tree.
 */
enum autosa_loop_type isl_schedule_tree_band_member_get_space_time(
	__isl_keep isl_schedule_tree *tree, int pos);
__isl_give isl_schedule_tree *isl_schedule_tree_band_member_set_space_time(
	__isl_take isl_schedule_tree *tree, int pos, enum autosa_loop_type loop_type);
enum autosa_loop_type isl_schedule_tree_band_member_get_pe_opt(
	__isl_keep isl_schedule_tree *tree, int pos);
__isl_give isl_schedule_tree *isl_schedule_tree_band_member_set_pe_opt(
	__isl_take isl_schedule_tree *tree, int pos, enum autosa_loop_type loop_type);
int isl_schedule_tree_band_member_get_sched_pos(
	__isl_keep isl_schedule_tree *tree, int pos);
__isl_give isl_schedule_tree *isl_schedule_tree_band_member_set_sched_pos(
	__isl_take isl_schedule_tree *tree, int pos, int sched_pos);
void *isl_schedule_tree_band_member_get_iter(
	__isl_keep isl_schedule_tree *tree, int pos);
__isl_give isl_schedule_tree *isl_schedule_tree_band_member_set_iter(
	__isl_take isl_schedule_tree *tree, int pos, void *iter);
/* AutoSA Extended */

#endif


================================================
FILE: autosa_scripts/ppcg_changes/isl/schedule.h
================================================
#ifndef ISL_SCHEDULE_H
#define ISL_SCHEDULE_H

#include <isl/union_set_type.h>
#include <isl/union_map_type.h>
#include <isl/schedule_type.h>
#include <isl/aff_type.h>
#include <isl/space_type.h>
#include <isl/set_type.h>
#include <isl/list.h>
#include <isl/printer_type.h>

#if defined(__cplusplus)
extern "C" {
#endif

/* Opaque type; only a forward declaration is visible in this header. */
struct __isl_export isl_schedule_constraints;
typedef struct isl_schedule_constraints isl_schedule_constraints;

/* Setter/getter pairs for the scheduler options of an isl_ctx.
 * Each setter takes the context and an int value and returns an isl_stat;
 * each getter returns the current value as an int.
 */
isl_stat isl_options_set_schedule_max_coefficient(isl_ctx *ctx, int val);
int isl_options_get_schedule_max_coefficient(isl_ctx *ctx);

isl_stat isl_options_set_schedule_max_constant_term(isl_ctx *ctx, int val);
int isl_options_get_schedule_max_constant_term(isl_ctx *ctx);

isl_stat isl_options_set_schedule_maximize_band_depth(isl_ctx *ctx, int val);
int isl_options_get_schedule_maximize_band_depth(isl_ctx *ctx);

isl_stat isl_options_set_schedule_maximize_coincidence(isl_ctx *ctx, int val);
int isl_options_get_schedule_maximize_coincidence(isl_ctx *ctx);

isl_stat isl_options_set_schedule_outer_coincidence(isl_ctx *ctx, int val);
int isl_options_get_schedule_outer_coincidence(isl_ctx *ctx);

isl_stat isl_options_set_schedule_split_scaled(isl_ctx *ctx, int val);
int isl_options_get_schedule_split_scaled(isl_ctx *ctx);

isl_stat isl_options_set_schedule_treat_coalescing(isl_ctx *ctx, int val);
int isl_options_get_schedule_treat_coalescing(isl_ctx *ctx);

isl_stat isl_options_set_schedule_separate_components(isl_ctx *ctx, int val);
int isl_options_get_schedule_separate_components(isl_ctx *ctx);

isl_stat isl_options_set_schedule_serialize_sccs(isl_ctx *ctx, int val);
int isl_options_get_schedule_serialize_sccs(isl_ctx *ctx);

isl_stat isl_options_set_schedule_whole_component(isl_ctx *ctx, int val);
int isl_options_get_schedule_whole_component(isl_ctx *ctx);

isl_stat isl_options_set_schedule_carry_self_first(isl_ctx *ctx, int val);
int isl_options_get_schedule_carry_self_first(isl_ctx *ctx);

/* Interface of isl_schedule_constraints (declarations only).
 * NOTE(review): __isl_export and __isl_constructor are presumably
 * annotations for the isl bindings generator -- confirm in isl/ctx.h.
 */

/* Construction and modification.  The setters take ownership of "sc"
 * and of the new value and return the updated constraints.
 */
__isl_give isl_schedule_constraints *isl_schedule_constraints_copy(
	__isl_keep isl_schedule_constraints *sc);
__isl_export
__isl_give isl_schedule_constraints *isl_schedule_constraints_on_domain(
	__isl_take isl_union_set *domain);
__isl_export
__isl_give isl_schedule_constraints *isl_schedule_constraints_set_context(
	__isl_take isl_schedule_constraints *sc, __isl_take isl_set *context);
__isl_export
__isl_give isl_schedule_constraints *isl_schedule_constraints_set_validity(
	__isl_take isl_schedule_constraints *sc,
	__isl_take isl_union_map *validity);
__isl_export
__isl_give isl_schedule_constraints *isl_schedule_constraints_set_coincidence(
	__isl_take isl_schedule_constraints *sc,
	__isl_take isl_union_map *coincidence);
__isl_export
__isl_give isl_schedule_constraints *isl_schedule_constraints_set_proximity(
	__isl_take isl_schedule_constraints *sc,
	__isl_take isl_union_map *proximity);
__isl_export
__isl_give isl_schedule_constraints *
isl_schedule_constraints_set_conditional_validity(
	__isl_take isl_schedule_constraints *sc,
	__isl_take isl_union_map *condition,
	__isl_take isl_union_map *validity);
__isl_null isl_schedule_constraints *isl_schedule_constraints_free(
	__isl_take isl_schedule_constraints *sc);

/* Getters.  "sc" is only borrowed; the results are owned by the caller. */
isl_ctx *isl_schedule_constraints_get_ctx(
	__isl_keep isl_schedule_constraints *sc);
__isl_export
__isl_give isl_union_set *isl_schedule_constraints_get_domain(
	__isl_keep isl_schedule_constraints *sc);
__isl_export
__isl_give isl_set *isl_schedule_constraints_get_context(
	__isl_keep isl_schedule_constraints *sc);
__isl_export
__isl_give isl_union_map *isl_schedule_constraints_get_validity(
	__isl_keep isl_schedule_constraints *sc);
__isl_export
__isl_give isl_union_map *isl_schedule_constraints_get_coincidence(
	__isl_keep isl_schedule_constraints *sc);
__isl_export
__isl_give isl_union_map *isl_schedule_constraints_get_proximity(
	__isl_keep isl_schedule_constraints *sc);
__isl_export
__isl_give isl_union_map *isl_schedule_constraints_get_conditional_validity(
	__isl_keep isl_schedule_constraints *sc);
__isl_export
__isl_give isl_union_map *
isl_schedule_constraints_get_conditional_validity_condition(
	__isl_keep isl_schedule_constraints *sc);

__isl_give isl_schedule_constraints *isl_schedule_constraints_apply(
	__isl_take isl_schedule_constraints *sc,
	__isl_take isl_union_map *umap);

/* Reading from a string or file, printing, dumping and
 * conversion to a string.
 */
__isl_constructor
__isl_give isl_schedule_constraints *isl_schedule_constraints_read_from_str(
	isl_ctx *ctx, const char *str);
__isl_give isl_schedule_constraints *isl_schedule_constraints_read_from_file(
	isl_ctx *ctx, FILE *input);
__isl_give isl_printer *isl_printer_print_schedule_constraints(
	__isl_take isl_printer *p, __isl_keep isl_schedule_constraints *sc);
void isl_schedule_constraints_dump(__isl_keep isl_schedule_constraints *sc);
__isl_give char *isl_schedule_constraints_to_str(
	__isl_keep isl_schedule_constraints *sc);

/* Consumes the constraints and returns a schedule. */
__isl_export
__isl_give isl_schedule *isl_schedule_constraints_compute_schedule(
	__isl_take isl_schedule_constraints *sc);

/* Interface of isl_schedule (declarations only). */

/* Takes a domain together with validity and proximity relations
 * (all consumed) and returns a schedule.
 */
__isl_give isl_schedule *isl_union_set_compute_schedule(
	__isl_take isl_union_set *domain,
	__isl_take isl_union_map *validity,
	__isl_take isl_union_map *proximity);

/* Construction, copying, freeing and basic queries. */
__isl_give isl_schedule *isl_schedule_empty(__isl_take isl_space *space);
__isl_export
__isl_give isl_schedule *isl_schedule_from_domain(
	__isl_take isl_union_set *domain);
__isl_give isl_schedule *isl_schedule_copy(__isl_keep isl_schedule *sched);
__isl_null isl_schedule *isl_schedule_free(__isl_take isl_schedule *sched);
__isl_export
__isl_give isl_union_map *isl_schedule_get_map(__isl_keep isl_schedule *sched);

isl_ctx *isl_schedule_get_ctx(__isl_keep isl_schedule *sched);
isl_bool isl_schedule_plain_is_equal(__isl_keep isl_schedule *schedule1,
	__isl_keep isl_schedule *schedule2);

__isl_export
__isl_give isl_schedule_node *isl_schedule_get_root(
	__isl_keep isl_schedule *schedule);
__isl_give isl_union_set *isl_schedule_get_domain(
	__isl_keep isl_schedule *schedule);

/* Traversal of the schedule nodes.  "fn" is called with a node and
 * the "user" pointer that was passed in by the caller.
 */
isl_stat isl_schedule_foreach_schedule_node_top_down(
	__isl_keep isl_schedule *sched,
	isl_bool (*fn)(__isl_keep isl_schedule_node *node, void *user),
	void *user);
__isl_give isl_schedule *isl_schedule_map_schedule_node_bottom_up(
	__isl_take isl_schedule *schedule,
	__isl_give isl_schedule_node *(*fn)(
		__isl_take isl_schedule_node *node, void *user), void *user);

/* Operations that take ownership of their schedule argument(s)
 * and return the resulting schedule.
 */
__isl_give isl_schedule *isl_schedule_insert_context(
	__isl_take isl_schedule *schedule, __isl_take isl_set *context);
__isl_give isl_schedule *isl_schedule_insert_partial_schedule(
	__isl_take isl_schedule *schedule,
	__isl_take isl_multi_union_pw_aff *partial);
__isl_give isl_schedule *isl_schedule_insert_guard(
	__isl_take isl_schedule *schedule, __isl_take isl_set *guard);
__isl_give isl_schedule *isl_schedule_sequence(
	__isl_take isl_schedule *schedule1, __isl_take isl_schedule *schedule2);
__isl_give isl_schedule *isl_schedule_set(
	__isl_take isl_schedule *schedule1, __isl_take isl_schedule *schedule2);
__isl_give isl_schedule *isl_schedule_intersect_domain(
	__isl_take isl_schedule *schedule, __isl_take isl_union_set *domain);
__isl_give isl_schedule *isl_schedule_gist_domain_params(
	__isl_take isl_schedule *schedule, __isl_take isl_set *context);

__isl_give isl_schedule *isl_schedule_reset_user(
	__isl_take isl_schedule *schedule);
__isl_give isl_schedule *isl_schedule_align_params(
	__isl_take isl_schedule *schedule, __isl_take isl_space *space);
__isl_overload
__isl_give isl_schedule *isl_schedule_pullback_union_pw_multi_aff(
	__isl_take isl_schedule *schedule,
	__isl_take isl_union_pw_multi_aff *upma);
__isl_give isl_schedule *isl_schedule_expand(__isl_take isl_schedule *schedule,
	__isl_take isl_union_pw_multi_aff *contraction,
	__isl_take isl_schedule *expansion);

/* Reading from a file or string, printing, dumping and
 * conversion to a string.
 */
__isl_give isl_schedule *isl_schedule_read_from_file(isl_ctx *ctx, FILE *input);
__isl_constructor
__isl_give isl_schedule *isl_schedule_read_from_str(isl_ctx *ctx,
	const char *str);
__isl_give isl_printer *isl_printer_print_schedule(__isl_take isl_printer *p,
	__isl_keep isl_schedule *schedule);
void isl_schedule_dump(__isl_keep isl_schedule *schedule);
__isl_give char *isl_schedule_to_str(__isl_keep isl_schedule *schedule);

/* AutoSA Extended */
/* Unlike isl_schedule_copy, this is an AutoSA addition.
 * NOTE(review): presumably returns a duplicate of "sched" that is
 * independent of the original -- confirm against the definition.
 */
__isl_give isl_schedule *isl_schedule_dup(__isl_keep isl_schedule *sched);
/* AutoSA Extended */

#if defined(__cplusplus)
}
#endif

#endif


================================================
FILE: autosa_scripts/ppcg_changes/isl/schedule_node.h
================================================
#ifndef ISL_SCHEDULE_NODE_H
#define ISL_SCHEDULE_NODE_H

#include <isl/schedule_type.h>
#include <isl/union_set_type.h>
#include <isl/aff_type.h>
#include <isl/ast_type.h>
#include <isl/val_type.h>
#include <isl/space_type.h>
#include <isl/id_type.h>
#include <isl/set.h>

#if defined(__cplusplus)
extern "C" {
#endif

/* Construction, copying, freeing and basic queries on schedule nodes
 * (declarations only).
 */
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_from_domain(
	__isl_take isl_union_set *domain);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_from_extension(
	__isl_take isl_union_map *extension);
__isl_give isl_schedule_node *isl_schedule_node_copy(
	__isl_keep isl_schedule_node *node);
__isl_null isl_schedule_node *isl_schedule_node_free(
	__isl_take isl_schedule_node *node);

__isl_export
isl_bool isl_schedule_node_is_equal(__isl_keep isl_schedule_node *node1,
	__isl_keep isl_schedule_node *node2);

isl_ctx *isl_schedule_node_get_ctx(__isl_keep isl_schedule_node *node);
/* Node type queries, for the node itself and for its parent. */
__isl_subclass(isl_schedule_node)
enum isl_schedule_node_type isl_schedule_node_get_type(
	__isl_keep isl_schedule_node *node);
enum isl_schedule_node_type isl_schedule_node_get_parent_type(
	__isl_keep isl_schedule_node *node);
__isl_export
__isl_give isl_schedule *isl_schedule_node_get_schedule(
	__isl_keep isl_schedule_node *node);

__isl_export
isl_stat isl_schedule_node_foreach_descendant_top_down(
	__isl_keep isl_schedule_node *node,
	isl_bool (*fn)(__isl_keep isl_schedule_node *node, void *user),
	void *user);
__isl_export
isl_bool isl_schedule_node_every_descendant(__isl_keep isl_schedule_node *node,
	isl_bool (*test)(__isl_keep isl_schedule_node *node, void *user),
	void *user);
__isl_export
isl_stat isl_schedule_node_foreach_ancestor_top_down(
	__isl_keep isl_schedule_node *node,
	isl_stat (*fn)(__isl_keep isl_schedule_node *node, void *user),
	void *user);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_map_descendant_bottom_up(
	__isl_take isl_schedule_node *node,
	__isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node,
		void *user), void *user);

__isl_export
isl_size isl_schedule_node_get_tree_depth(__isl_keep isl_schedule_node *node);
__isl_export
isl_bool isl_schedule_node_has_parent(__isl_keep isl_schedule_node *node);
__isl_export
isl_bool isl_schedule_node_has_children(__isl_keep isl_schedule_node *node);
__isl_export
isl_bool isl_schedule_node_has_previous_sibling(
	__isl_keep isl_schedule_node *node);
__isl_export
isl_bool isl_schedule_node_has_next_sibling(__isl_keep isl_schedule_node *node);
__isl_export
isl_size isl_schedule_node_n_children(__isl_keep isl_schedule_node *node);
__isl_export
isl_size isl_schedule_node_get_child_position(
	__isl_keep isl_schedule_node *node);
__isl_export
isl_size isl_schedule_node_get_ancestor_child_position(
	__isl_keep isl_schedule_node *node,
	__isl_keep isl_schedule_node *ancestor);
__isl_give isl_schedule_node *isl_schedule_node_get_child(
	__isl_keep isl_schedule_node *node, int pos);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_get_shared_ancestor(
	__isl_keep isl_schedule_node *node1,
	__isl_keep isl_schedule_node *node2);

__isl_export
__isl_give isl_schedule_node *isl_schedule_node_root(
	__isl_take isl_schedule_node *node);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_parent(
	__isl_take isl_schedule_node *node);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_ancestor(
	__isl_take isl_schedule_node *node, int generation);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_child(
	__isl_take isl_schedule_node *node, int pos);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_first_child(
	__isl_take isl_schedule_node *node);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_previous_sibling(
	__isl_take isl_schedule_node *node);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_next_sibling(
	__isl_take isl_schedule_node *node);

__isl_export
isl_bool isl_schedule_node_is_subtree_anchored(
	__isl_keep isl_schedule_node *node);

__isl_give isl_schedule_node *isl_schedule_node_group(
	__isl_take isl_schedule_node *node, __isl_take isl_id *group_id);

__isl_give isl_schedule_node *isl_schedule_node_sequence_splice_child(
	__isl_take isl_schedule_node *node, int pos);

__isl_give isl_space *isl_schedule_node_band_get_space(
	__isl_keep isl_schedule_node *node);
__isl_export
__isl_give isl_multi_union_pw_aff *isl_schedule_node_band_get_partial_schedule(
	__isl_keep isl_schedule_node *node);
__isl_give isl_union_map *isl_schedule_node_band_get_partial_schedule_union_map(
	__isl_keep isl_schedule_node *node);
enum isl_ast_loop_type isl_schedule_node_band_member_get_ast_loop_type(
	__isl_keep isl_schedule_node *node, int pos);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_band_member_set_ast_loop_type(
	__isl_take isl_schedule_node *node, int pos,
	enum isl_ast_loop_type type);
enum isl_ast_loop_type isl_schedule_node_band_member_get_isolate_ast_loop_type(
	__isl_keep isl_schedule_node *node, int pos);
__isl_give isl_schedule_node *
isl_schedule_node_band_member_set_isolate_ast_loop_type(
	__isl_take isl_schedule_node *node, int pos,
	enum isl_ast_loop_type type);
__isl_export
__isl_give isl_union_set *isl_schedule_node_band_get_ast_build_options(
	__isl_keep isl_schedule_node *node);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_band_set_ast_build_options(
	__isl_take isl_schedule_node *node, __isl_take isl_union_set *options);
__isl_export
__isl_give isl_set *isl_schedule_node_band_get_ast_isolate_option(
	__isl_keep isl_schedule_node *node);
__isl_export
isl_size isl_schedule_node_band_n_member(__isl_keep isl_schedule_node *node);
__isl_export
isl_bool isl_schedule_node_band_member_get_coincident(
	__isl_keep isl_schedule_node *node, int pos);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_band_member_set_coincident(
	__isl_take isl_schedule_node *node, int pos, int coincident);
__isl_export
isl_bool isl_schedule_node_band_get_permutable(
	__isl_keep isl_schedule_node *node);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_band_set_permutable(
	__isl_take isl_schedule_node *node, int permutable);

isl_stat isl_options_set_tile_scale_tile_loops(isl_ctx *ctx, int val);
int isl_options_get_tile_scale_tile_loops(isl_ctx *ctx);
isl_stat isl_options_set_tile_shift_point_loops(isl_ctx *ctx, int val);
int isl_options_get_tile_shift_point_loops(isl_ctx *ctx);

__isl_export
__isl_give isl_schedule_node *isl_schedule_node_band_scale(
	__isl_take isl_schedule_node *node, __isl_take isl_multi_val *mv);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_band_scale_down(
	__isl_take isl_schedule_node *node, __isl_take isl_multi_val *mv);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_band_mod(
	__isl_take isl_schedule_node *node, __isl_take isl_multi_val *mv);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_band_shift(
	__isl_take isl_schedule_node *node,
	__isl_take isl_multi_union_pw_aff *shift);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_band_tile(
	__isl_take isl_schedule_node *node, __isl_take isl_multi_val *sizes);
__isl_give isl_schedule_node *isl_schedule_node_band_sink(
	__isl_take isl_schedule_node *node);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_band_split(
	__isl_take isl_schedule_node *node, int pos);

__isl_export
__isl_give isl_set *isl_schedule_node_context_get_context(
	__isl_keep isl_schedule_node *node);
__isl_export
__isl_give isl_union_set *isl_schedule_node_domain_get_domain(
	__isl_keep isl_schedule_node *node);
__isl_export
__isl_give isl_union_map *isl_schedule_node_expansion_get_expansion(
	__isl_keep isl_schedule_node *node);
__isl_export
__isl_give isl_union_pw_multi_aff *isl_schedule_node_expansion_get_contraction(
	__isl_keep isl_schedule_node *node);
__isl_export
__isl_give isl_union_map *isl_schedule_node_extension_get_extension(
	__isl_keep isl_schedule_node *node);
__isl_export
__isl_give isl_union_set *isl_schedule_node_filter_get_filter(
	__isl_keep isl_schedule_node *node);
__isl_export
__isl_give isl_set *isl_schedule_node_guard_get_guard(
	__isl_keep isl_schedule_node *node);
__isl_give isl_id *isl_schedule_node_mark_get_id(
	__isl_keep isl_schedule_node *node);

isl_size isl_schedule_node_get_schedule_depth(
	__isl_keep isl_schedule_node *node);
__isl_give isl_union_set *isl_schedule_node_get_domain(
	__isl_keep isl_schedule_node *node);
__isl_give isl_union_set *isl_schedule_node_get_universe_domain(
	__isl_keep isl_schedule_node *node);
__isl_export
__isl_give isl_multi_union_pw_aff *
isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(
	__isl_keep isl_schedule_node *node);
__isl_export
__isl_give isl_union_pw_multi_aff *
isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(
	__isl_keep isl_schedule_node *node);
__isl_export
__isl_give isl_union_map *isl_schedule_node_get_prefix_schedule_union_map(
	__isl_keep isl_schedule_node *node);
__isl_give isl_union_map *isl_schedule_node_get_prefix_schedule_relation(
	__isl_keep isl_schedule_node *node);
__isl_give isl_union_map *isl_schedule_node_get_subtree_schedule_union_map(
	__isl_keep isl_schedule_node *node);
__isl_give isl_union_map *isl_schedule_node_get_subtree_expansion(
	__isl_keep isl_schedule_node *node);
__isl_give isl_union_pw_multi_aff *isl_schedule_node_get_subtree_contraction(
	__isl_keep isl_schedule_node *node);

__isl_export
__isl_give isl_schedule_node *isl_schedule_node_insert_context(
	__isl_take isl_schedule_node *node, __isl_take isl_set *context);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_insert_partial_schedule(
	__isl_take isl_schedule_node *node,
	__isl_take isl_multi_union_pw_aff *schedule);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_insert_filter(
	__isl_take isl_schedule_node *node, __isl_take isl_union_set *filter);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_insert_guard(
	__isl_take isl_schedule_node *node, __isl_take isl_set *context);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_insert_mark(
	__isl_take isl_schedule_node *node, __isl_take isl_id *mark);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_insert_sequence(
	__isl_take isl_schedule_node *node,
	__isl_take isl_union_set_list *filters);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_insert_set(
	__isl_take isl_schedule_node *node,
	__isl_take isl_union_set_list *filters);

__isl_give isl_schedule_node *isl_schedule_node_cut(
	__isl_take isl_schedule_node *node);
__isl_give isl_schedule_node *isl_schedule_node_delete(
	__isl_take isl_schedule_node *node);

__isl_export
__isl_give isl_schedule_node *isl_schedule_node_order_before(
	__isl_take isl_schedule_node *node, __isl_take isl_union_set *filter);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_order_after(
	__isl_take isl_schedule_node *node, __isl_take isl_union_set *filter);

__isl_export
__isl_give isl_schedule_node *isl_schedule_node_graft_before(
	__isl_take isl_schedule_node *node,
	__isl_take isl_schedule_node *graft);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_graft_after(
	__isl_take isl_schedule_node *node,
	__isl_take isl_schedule_node *graft);

__isl_give isl_schedule_node *isl_schedule_node_reset_user(
	__isl_take isl_schedule_node *node);
__isl_give isl_schedule_node *isl_schedule_node_align_params(
	__isl_take isl_schedule_node *node, __isl_take isl_space *space);

__isl_give isl_printer *isl_printer_print_schedule_node(
	__isl_take isl_printer *p, __isl_keep isl_schedule_node *node);
void isl_schedule_node_dump(__isl_keep isl_schedule_node *node);
__isl_give char *isl_schedule_node_to_str(__isl_keep isl_schedule_node *node);

/* AutoSA Extended */
/* Per-band-member properties added by AutoSA.  Every accessor addresses
 * the band member at position "pos" of the band node "node".  Getters only
 * borrow "node" (__isl_keep); setters consume "node" (__isl_take) and hand
 * back the updated node (__isl_give).
 * NOTE(review): "enum autosa_loop_type" is not declared by any header
 * included above; it must be visible before this header is used -- confirm
 * where it is defined.
 */

/* Space/time property of a band member. */
__isl_export
enum autosa_loop_type isl_schedule_node_band_member_get_space_time(
	__isl_keep isl_schedule_node *node, int pos);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_band_member_set_space_time(
	__isl_take isl_schedule_node *node, int pos, enum autosa_loop_type loop_type);

/* PE optimization property of a band member. */
__isl_export
enum autosa_loop_type isl_schedule_node_band_member_get_pe_opt(
	__isl_keep isl_schedule_node *node, int pos);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_band_member_set_pe_opt(
	__isl_take isl_schedule_node *node, int pos, enum autosa_loop_type loop_type);

/* Integer "sched_pos" value attached to a band member. */
__isl_export
int isl_schedule_node_band_member_get_sched_pos(
	__isl_keep isl_schedule_node *node, int pos);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_band_member_set_sched_pos(
	__isl_take isl_schedule_node *node, int pos, int sched_pos);

/* Opaque "iter" pointer attached to a band member.
 * NOTE(review): ownership of the pointed-to object is not expressed by the
 * signature -- confirm who frees it.
 */
__isl_export
void *isl_schedule_node_band_member_get_iter(
	__isl_keep isl_schedule_node *node, int pos);
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_band_member_set_iter(
	__isl_take isl_schedule_node *node, int pos, void *iter);

/* Duplicate a schedule node.  "node" is only borrowed (__isl_keep); the
 * result is a new reference owned by the caller, hence __isl_give
 * (__isl_take is an annotation for consumed arguments, not return values).
 */
__isl_export
__isl_give isl_schedule_node *isl_schedule_node_dup(
	__isl_keep isl_schedule_node *node);
/* AutoSA Extended */

#if defined(__cplusplus)
}
#endif

#endif


================================================
FILE: autosa_scripts/ppcg_changes/isl/vec.h
================================================
/*
 * Copyright 2008-2009 Katholieke Universiteit Leuven
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege, K.U.Leuven, Departement
 * Computerwetenschappen, Celestijnenlaan 200A, B-3001 Leuven, Belgium
 */

#ifndef ISL_VEC_H
#define ISL_VEC_H

#include <stdio.h>

#include <isl/ctx.h>
#include <isl/val_type.h>
#include <isl/printer.h>

#if defined(__cplusplus)
extern "C" {
#endif

struct isl_vec;
typedef struct isl_vec isl_vec;

__isl_give isl_vec *isl_vec_alloc(isl_ctx *ctx, unsigned size);
__isl_give isl_vec *isl_vec_zero(isl_ctx *ctx, unsigned size);
__isl_give isl_vec *isl_vec_copy(__isl_keep isl_vec *vec);
__isl_null isl_vec *isl_vec_free(__isl_take isl_vec *vec);

isl_ctx *isl_vec_get_ctx(__isl_keep isl_vec *vec);

isl_size isl_vec_size(__isl_keep isl_vec *vec);
__isl_give isl_val *isl_vec_get_element_val(__isl_keep isl_vec *vec, int pos);
__isl_give isl_vec *isl_vec_set_element_si(__isl_take isl_vec *vec,
	int pos, int v);
__isl_give isl_vec *isl_vec_set_element_val(__isl_take isl_vec *vec,
	int pos, __isl_take isl_val *v);

isl_bool isl_vec_is_equal(__isl_keep isl_vec *vec1, __isl_keep isl_vec *vec2);
int isl_vec_cmp_element(__isl_keep isl_vec *vec1, __isl_keep isl_vec *vec2,
	int pos);

void isl_vec_dump(__isl_keep isl_vec *vec);
__isl_give isl_printer *isl_printer_print_vec(__isl_take isl_printer *printer,
	__isl_keep isl_vec *vec);

__isl_give isl_vec *isl_vec_ceil(__isl_take isl_vec *vec);
struct isl_vec *isl_vec_normalize(struct isl_vec *vec);
__isl_give isl_vec *isl_vec_set_si(__isl_take isl_vec *vec, int v);
__isl_give isl_vec *isl_vec_set_val(__isl_take isl_vec *vec,
	__isl_take isl_val *v);
__isl_give isl_vec *isl_vec_clr(__isl_take isl_vec *vec);
__isl_give isl_vec *isl_vec_neg(__isl_take isl_vec *vec);
__isl_give isl_vec *isl_vec_add(__isl_take isl_vec *vec1,
	__isl_take isl_vec *vec2);
__isl_give isl_vec *isl_vec_extend(__isl_take isl_vec *vec, unsigned size);
__isl_give isl_vec *isl_vec_zero_extend(__isl_take isl_vec *vec, unsigned size);
__isl_give isl_vec *isl_vec_concat(__isl_take isl_vec *vec1,
	__isl_take isl_vec *vec2);

__isl_give isl_vec *isl_vec_sort(__isl_take isl_vec *vec);

__isl_give isl_vec *isl_vec_read_from_file(isl_ctx *ctx, FILE *input);

__isl_give isl_vec *isl_vec_drop_els(__isl_take isl_vec *vec,
	unsigned pos, unsigned n);
__isl_give isl_vec *isl_vec_add_els(__isl_take isl_vec *vec, unsigned n);
__isl_give isl_vec *isl_vec_insert_els(__isl_take isl_vec *vec,
	unsigned pos, unsigned n);
__isl_give isl_vec *isl_vec_insert_zero_els(__isl_take isl_vec *vec,
	unsigned pos, unsigned n);
__isl_give isl_vec *isl_vec_move_els(__isl_take isl_vec *vec,
	unsigned dst_col, unsigned src_col, unsigned n);

/* AutoSA Extended */
/* Duplicate a vector.  "vec" is only borrowed (__isl_keep); the result is a
 * new reference that the caller must release (__isl_give).
 * NOTE(review): presumably this produces an independent copy, as opposed to
 * isl_vec_copy above -- confirm against the implementation.
 */
__isl_give isl_vec *isl_vec_dup(__isl_keep isl_vec *vec);
/* AutoSA Extended */

#if defined(__cplusplus)
}
#endif

#endif


================================================
FILE: autosa_scripts/ppcg_changes/ppcg/files.txt
================================================
cpu.h
cuda.h
opencl.h
ppcg_options.h
ppcg_options.c
ppcg.c
ppcg.h
util.h
print.h
schedule.h
gpu.h


================================================
FILE: autosa_scripts/resource_model.py
================================================
import os
import json
import re
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
import joblib
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from scipy.stats.mstats import gmean
from statistics import mean
import shutil
import math
import pprint
import argparse

# Helper functions to predict certain modules
def BRAM_predict_HLS(dw, depth, use_18K=0):
    """ Estimate how many BRAM blocks a memory needs on Xilinx platforms.

    The memory is tiled with 18-bit x 1024-entry units when the port is at
    most 18 bits wide or when `use_18K` is set, and with 36-bit x 512-entry
    units otherwise. The estimate is the number of units across the width
    times the number of units along the depth.

    Parameters
    ----------
    dw: int
        BRAM port width
    depth: int
        BRAM depth
    use_18K: int
        Force the estimator to use the BRAM18K model. (for HLS FIFOs)

    Returns
    -------
    The estimated block count (a numpy float produced by np.ceil).
    """
    if dw <= 18 or use_18K:
        unit_width, unit_depth = 18, 1024
    else:
        unit_width, unit_depth = 36, 512

    blocks_across = np.ceil(float(dw) / unit_width)
    blocks_deep = np.ceil(float(depth) / unit_depth)
    return blocks_across * blocks_deep

def URAM_predict_HLS(dw, depth):
    """ Estimate how many URAM blocks a memory needs on Xilinx platforms.

    The memory is tiled with 72-bit x 4096-entry units; the estimate is the
    number of units across the width times the number along the depth.

    Parameters
    ----------
    dw: int
        URAM port width
    depth: int
        URAM depth

    Returns
    -------
    The estimated block count (a numpy float produced by np.ceil).
    """
    blocks_across = np.ceil(float(dw) / 72)
    blocks_deep = np.ceil(float(depth) / 4096)
    return blocks_across * blocks_deep

def BRAM_array_predict_HLS(dw, depth, n_part):
    """ Estimate the BRAM usage of a partitioned array on Xilinx platforms.

    The array depth is split evenly (rounded up) across the partitions, each
    partition is estimated with BRAM_predict_HLS, and the per-partition
    estimate is multiplied by the number of partitions.

    Parameters
    ----------
    dw: int
        BRAM port width (in bytes)
    depth: int
        BRAM depth
    n_part: int
        number of partitions
    """
    port_bits = dw * 8
    depth_per_part = np.ceil(float(depth) / n_part)
    return n_part * BRAM_predict_HLS(port_bits, depth_per_part)

def FF_array_predict_HLS(dw, depth):
    """ Estimate the FF usage of an array stored in flip-flops.

    One flip-flop is counted per stored bit.

    Parameters
    ----------
    dw: int
        Array port width (in bytes)
    depth : int
        Array depth
    """
    port_bits = dw * 8
    return port_bits * depth

def URAM_array_predict_HLS(dw, depth, n_part):
    """ Estimate the URAM usage of a partitioned array on Xilinx platforms.

    The array depth is split evenly (rounded up) across the partitions, each
    partition is estimated with URAM_predict_HLS, and the per-partition
    estimate is multiplied by the number of partitions.

    Parameters
    ----------
    dw: int
        URAM port width; it is multiplied by 8 before being handed to
        URAM_predict_HLS (presumably a width in bytes, as for the BRAM
        array model).
    depth: int
        URAM depth
    n_part: int
        number of partitions
    """
    port_bits = dw * 8
    depth_per_part = np.ceil(float(depth) / n_part)
    return n_part * URAM_predict_HLS(port_bits, depth_per_part)

def FIFO_predict_xilinx(dw, depth):
    """ Predict the resource usage of FIFO modules on Xilinx platforms.

    FIFOs holding at most 512 bits in total are modelled without BRAM; larger
    ones get a BRAM18K estimate plus width-dependent FF/LUT costs.

    Parameters
    ----------
    dw: int
        fifo data width
    depth: int
        fifo depth

    Returns
    -------
    dict with the keys 'BRAM18K', 'DSP', 'FF' and 'LUT'.
    """
    if dw * depth <= 512:
        bram, ff, lut = 0, 5, dw + 12
    else:
        # Note: the current codegen implements FIFOs with SRLs; this model
        # nevertheless reports a BRAM18K estimate for large FIFOs.
        bram = BRAM_predict_HLS(dw, depth, 1)
        ff = dw + 10
        lut = int(0.9687 * dw + 13.982)

    # FIFOs never consume DSPs in this model.
    return {'BRAM18K': bram, 'DSP': 0, 'FF': ff, 'LUT': lut}

def extract_axi_res_from_hls_rpt(rpt_path):
    """ Sum the resource usage of the AXI modules listed in a text HLS report.

    Every line containing 'kernel0_gmem_' is treated as a '|'-separated table
    row; fields 3, 5 and 6 of that row are accumulated as BRAM18K, FF and LUT
    respectively.

    Parameters
    ----------
    rpt_path: str
        The path of HLS report

    Returns
    -------
    BRAM18K, FF, LUT
    """
    bram_sum = 0
    ff_sum = 0
    lut_sum = 0
    with open(rpt_path) as rpt_file:
        for row in rpt_file:
            if 'kernel0_gmem_' not in row:
                continue
            cells = row.split('|')
            bram_sum += float(cells[3])
            ff_sum += float(cells[5])
            lut_sum += float(cells[6])
    return bram_sum, ff_sum, lut_sum

def extract_design_info(design_dir, synth=0):
    """ Extract the design information.

    Load design_info.json and design_info.dat under the directory
    'resource_est'. If synth is set to 1, also load the HLS reports under
    'hls_prj/solution1/syn/report' and attach the reported resource usage.
    Return a dictionary that contains all the information above.
    - FF: int
    - LUT: int
    - BRAM18K: int
    - DSP: int
    - URAM: int
    - fifos:
      - fifo_name:
        - fifo_cnt: int
        - fifo_width: int
        - fifo_depth: int
    - modules:
      - module_name:
        - module_cnt: int
        - FF, LUT, BRAM18K, URAM, DSP: int
        - data_pack_inter, data_pack_intra: int
        - ele_type: str
        - ele_size: int
        - local_buffers
        - unroll: int

    All resource fields are None when the design is not synthesized, and the
    per-module resource fields are None for modules without their own HLS
    report (inlined modules). FIFOs and modules whose count is 0 are removed.

    Parameters
    ----------
    design_dir: str
        The design directory.
    synth: int
        Is the design synthesized or not.

    Raises
    ------
    KeyError
        If design_info.dat names a module that is absent from
        design_info.json, or (synth only) if no report for the top module
        'kernel' is found.
    """
    res_keys = ('FF', 'LUT', 'BRAM18K', 'URAM', 'DSP')

    # Load the design info
    f_dir = f'{design_dir}/resource_est/design_info.json'
    with open(f_dir, 'r') as f:
        design_info = json.load(f)
    design_info['fifos'] = {}
    f_dir = f'{design_dir}/resource_est/design_info.dat'
    with open(f_dir, 'r') as f:
        lines = f.readlines()
    for line in lines:
        fields = line.strip().split(':')
        if fields[0] == 'fifo':
            fifo_name = fields[1]
            fifo_cnt = int(fields[2])
            fifo_w = int(fields[3])
            if fifo_cnt == 0:
                # Unused FIFO: make sure it is not listed.
                design_info['fifos'].pop(fifo_name, None)
            else:
                design_info['fifos'][fifo_name] = {
                    'fifo_cnt': fifo_cnt,
                    'fifo_width': fifo_w,
                    'fifo_depth': 2  # default
                }
        elif fields[0] == 'module':
            module_name = fields[1]
            module_cnt = int(fields[2])
            design_info['modules'][module_name]['module_cnt'] = module_cnt
            if module_cnt == 0:
                # Unused module: drop it from the design description.
                design_info['modules'].pop(module_name)

    if not synth:
        for module_info in design_info['modules'].values():
            for key in res_keys:
                module_info[key] = None
        for key in res_keys:
            design_info[key] = None
        return design_info

    # Load the HLS project
    hls_rpts = {}
    hls_prj_dir = f'{design_dir}/hls_prj'
    hls_rpts_dir = f'{hls_prj_dir}/solution1/syn/report'
    for rpt_name in os.listdir(hls_rpts_dir):
        if not rpt_name.endswith('_csynth.xml'):
            continue
        with open(hls_rpts_dir + '/' + rpt_name, 'r') as f:
            root = ET.parse(f).getroot()
        module_name = rpt_name[:-11]
        # For duplicate modules, get rid of the digits suffix. The emptiness
        # guard keeps an all-digit name from indexing an empty string.
        while module_name and module_name[-1].isdigit():
            module_name = module_name[:-1]
        hls_rpts[module_name] = root

    # Extract the resource info from the hls report
    for module, module_info in design_info['modules'].items():
        if module in hls_rpts:
            rpt = hls_rpts[module]
        elif f'{module}_wrapper' in hls_rpts:
            # It is possible the module is wrapped.
            # Look for the wrapper module.
            rpt = hls_rpts[module + '_wrapper']
        else:
            # The module is inlined
            rpt = None

        # Compare against None explicitly: the truth value of an ElementTree
        # element depends on whether it has children, not on its presence.
        if rpt is not None:
            res = extract_resource_info_from_hls_rpt(rpt)
            for key in res_keys:
                module_info[key] = res[key]
            # Exclude the FF storage of local buffers if existing
            for local_buffer in module_info.get('local_buffers', []):
                if local_buffer['mem_type'] == 'FF':
                    module_info['FF'] -= FF_array_predict_HLS(
                        local_buffer['port_width'],
                        local_buffer['buffer_depth'])
        else:
            # For inlined module, its resource usage is included in the parent module.
            for key in res_keys:
                module_info[key] = None

    # Top module
    res = extract_resource_info_from_hls_rpt(hls_rpts['kernel'])
    # For the top module, we will also parse the text report and exclude the
    # BRAM/FF/LUT usage of the AXI modules.
    top_module_rpt_name = 'kernel0_csynth.rpt'
    axi_bram, axi_ff, axi_lut = extract_axi_res_from_hls_rpt(f'{hls_rpts_dir}/{top_module_rpt_name}')
    res['BRAM18K'] -= axi_bram
    res['FF'] -= axi_ff
    res['LUT'] -= axi_lut

    for key in res_keys:
        design_info[key] = res[key]

    return design_info

def extract_resource_info_from_hls_rpt(rpt):
    """ Extract the resource info from the HLS rpt.

    Walks every 'AreaEstimates' element of the report and reads the entries
    of its direct 'Resources' children. Resources that are not listed stay
    at 0.

    Parameters
    ----------
    rpt:
        HLS report in XML format (an ElementTree element)

    Returns
    -------
    dict with the keys 'BRAM18K', 'DSP', 'URAM', 'FF' and 'LUT'.
    """
    # Report tag -> key used in the returned dictionary.
    tag_to_key = {
        'BRAM_18K': 'BRAM18K',
        'URAM': 'URAM',
        'DSP48E': 'DSP',
        'FF': 'FF',
        'LUT': 'LUT'
    }
    usage = {'BRAM18K': 0, 'DSP': 0, 'URAM': 0, 'FF': 0, 'LUT': 0}

    for estimates in rpt.iter('AreaEstimates'):
        for section in estimates:
            if section.tag != 'Resources':
                continue
            for entry in section:
                key = tag_to_key.get(entry.tag)
                if key is not None:
                    usage[key] = int(entry.text)

    return usage

def convert_design_infos_to_df(design_infos):
    """ Convert the design infos into a dataframe.

    One row is produced per design. Columns cover the design-level resources,
    three attributes per FIFO, and per module either the IO attributes
    (modules whose name contains 'IO') or the unroll factor (all other
    modules) followed by the module count and module resources. Modules whose
    name contains 'wrapper' are left out. A FIFO or module missing from a
    design contributes None to its columns.

    Parameters
    ----------
    design_infos: list
        A list containing all design informations.

    Returns
    -------
    modules, fifos, df: the module name list, the FIFO name list and the
    dataframe.
    """
    design_res = ('FF', 'LUT', 'DSP', 'BRAM18K', 'URAM')
    fifo_attrs = ('fifo_cnt', 'fifo_width', 'fifo_depth')
    io_attrs = ('data_pack_inter', 'data_pack_intra', 'ele_size')
    pe_attrs = ('unroll',)
    module_res = ('FF', 'LUT', 'BRAM18K', 'URAM', 'DSP')

    # Collect the names seen across all designs, in first-seen order.
    modules = []
    fifos = []
    for design_info in design_infos:
        fs = design_info['fifos']
        ms = design_info['modules']
        for fifo_name in fs:
            if fifo_name not in fifos:
                fifos.append(fifo_name)
        for module_name in ms:
            if module_name not in modules and 'wrapper' not in module_name:
                modules.append(module_name)

    # Create the (empty) columns; their creation order is the column order.
    info_dict = {col: [] for col in design_res}
    for fifo in fifos:
        for attr in fifo_attrs:
            info_dict[fifo + '_' + attr] = []
    for module in modules:
        # IO_module:
        #   module_cnt, data_pack_inter, data_pack_intra, ele_type, ele_size
        #   [local_buffers_local_X]_{port_width, buffer_depth, partition_number}
        # PE_module:
        #   module_cnt, unroll
        own_attrs = io_attrs if 'IO' in module else pe_attrs
        for attr in own_attrs + ('module_cnt',) + module_res:
            info_dict[module + '_' + attr] = []

    # Fill in one row per design.
    for design_info in design_infos:
        for col in design_res:
            info_dict[col].append(design_info[col])

        fs = design_info['fifos']
        ms = design_info['modules']
        for fifo in fifos:
            present = fifo in fs
            for attr in fifo_attrs:
                info_dict[fifo + '_' + attr].append(fs[fifo][attr] if present else None)

        for module in modules:
            present = module in ms
            own_attrs = io_attrs if 'IO' in module else pe_attrs
            for attr in ('module_cnt',) + own_attrs + module_res:
                info_dict[module + '_' + attr].append(ms[module][attr] if present else None)

    df = pd.DataFrame(info_dict)
    return modules, fifos, df

def df_feature_extract(df, module):
    """ Expand the dataframe to include new features for the module.

    For modules whose name contains 'IO', a column holding the row-wise ratio
    data_pack_inter / data_pack_intra is added to `df` in place. Other
    modules leave the dataframe untouched.

    Parameters
    ----------
    df: dataframe
    module: str

    Returns
    -------
    The same dataframe object that was passed in.
    """
    if 'IO' not in module:
        return df

    inter_col = module + '_data_pack_inter'
    intra_col = module + '_data_pack_intra'

    def pack_ratio(row):
        return float(row[inter_col]) / float(row[intra_col])

    df[inter_col + '/' + intra_col] = df.apply(pack_ratio, axis=1)
    return df

def get_feature_set(module):
    """ Return the feature column names used by the resource models.

    Modules whose name contains 'IO' use the inter data-pack factor and the
    inter/intra data-pack ratio; all other modules use the unroll factor.

    Parameters
    ----------
    module: str
        Module name.
    """
    if 'IO' in module:
        return [
            f'{module}_data_pack_inter',
            f'{module}_data_pack_inter/{module}_data_pack_intra',
        ]
    return [f'{module}_unroll']

def _log_validation_metrics(res_name, y_true, y_pred, logger):
    """ Log MAE/MSE/MAPE for one resource type and return the MAPE. """
    logger.info('======== ' + res_name + ' ========')
    logger.info(f'Mean Absolute Error: {metrics.mean_absolute_error(y_true, y_pred)}')
    logger.info(f'Mean Squared Error: {metrics.mean_squared_error(y_true, y_pred)}')
    mape = mean_absolute_percentage_error(y_true, y_pred)
    logger.info(f'Mean Absolute Percentage Error: {mape}')
    return mape

def _fit_and_validate_linear_model(module, res_name, feature_set, X_train, y_train, \
    X_test, y_test, work_dir, logger):
    """ Fit, save and validate the linear model of one (module, resource) pair.

    Returns the validation MAPE, or None when no model could be trained or no
    validation sample exists for this module.
    """
    pred_set = [module + '_' + res_name]

    # Only designs that actually contain the module carry a usage value.
    y_train_module = y_train.loc[:, pred_set].dropna()
    X_train_module = X_train.loc[y_train_module.index, feature_set]
    model = None
    if X_train_module.shape[0] > 0:
        model = LinearRegression()
        model.fit(X_train_module.to_numpy(), y_train_module.to_numpy())
        model_name = module + '_' + res_name + '_model'
        joblib.dump(model, work_dir + '/' + model_name + '.pkl')

    y_test_module = y_test.loc[:, pred_set].dropna()
    X_test_module = X_test.loc[y_test_module.index, feature_set]
    # Without a freshly trained model there is nothing to validate. Reusing the
    # model of a previous resource/module would report meaningless errors.
    if model is None or X_test_module.shape[0] == 0:
        return None

    y_pred_module = model.predict(X_test_module.to_numpy())
    return _log_validation_metrics(res_name, y_test_module.to_numpy(), y_pred_module, logger)

def _predict_local_buffer_usage(module_info, mem_type):
    """ Sum the predicted usage of all local buffers of type mem_type ('BRAM' or 'URAM'). """
    usage = 0
    if 'local_buffers' not in module_info:
        return usage

    for local_buffer in module_info['local_buffers']:
        if local_buffer['mem_type'] != mem_type:
            continue
        if mem_type == 'URAM':
            usage += URAM_array_predict_HLS(local_buffer['port_width'], \
                local_buffer['buffer_depth'], local_buffer['partition_number'])
        elif 'array_map' in local_buffer:
            # For horizontal mapping, we will merge two ping/pong buffers to one
            usage += BRAM_array_predict_HLS(local_buffer['port_width'], \
                local_buffer['buffer_depth'] * 2, local_buffer['partition_number']) / 2
        else:
            usage += BRAM_array_predict_HLS(local_buffer['port_width'], \
                local_buffer['buffer_depth'], local_buffer['partition_number'])
    return usage

def _validate_memory_model(module, res_name, mem_type, y_test, design_infos, logger):
    """ Validate the analytical BRAM/URAM model of one module on the validation set.

    Returns the validation MAPE, or None when no validation sample exists.
    """
    y_test_module = y_test.loc[:, [module + '_' + res_name]].dropna()
    if y_test_module.shape[0] == 0:
        return None

    y_pred_module = np.zeros((y_test_module.shape[0], 1), dtype=float)
    for cnt, index in enumerate(y_test_module.index):
        module_info = design_infos[index]['modules'][module]
        y_pred_module[cnt] = _predict_local_buffer_usage(module_info, mem_type)

    return _log_validation_metrics(res_name, y_test_module.to_numpy(), y_pred_module, logger)

def _log_mape_summary(title, mape_by_res, logger):
    """ Log the arithmetic mean of the collected MAPEs for every resource type. """
    logger.info(title)
    for res_name in ('FF', 'LUT', 'DSP', 'BRAM18K', 'URAM'):
        values = mape_by_res[res_name]
        # A resource type may have no validation sample at all; report NaN
        # instead of failing on the mean of an empty list.
        avg = mean(values) if values else float('nan')
        logger.info('%s Mean Absoulate Percentage Error (Arith. Mean): %.2f%%' %(res_name, avg))

def train(df, modules, fifos, design_infos, work_dir, logger):
    """ Train the resource models for each module.

    FF/LUT/DSP are modeled by one linear regression per module, saved as
    '<work_dir>/<module>_<resource>_model.pkl'. BRAM18K/URAM are predicted
    analytically from the local buffers and are only validated here. After the
    module-level validation, the whole-design prediction is validated on the
    same held-out designs.

    Parameters
    ----------
    df: dataframe
        A dataframe that containing all designs
    modules: list
        Module name list.
    fifos: list
        FIFO name list.
    design_infos: list
        A list containing all design informations.
    work_dir: str
        Directory to save the trained models.
    logger:
        Logger.
    """
    # Split the training set and validation set.
    feature_set = []
    pred_set = []
    for module in modules:
        # Expand the dataframe if necessary
        df = df_feature_extract(df, module)
        feature_set += get_feature_set(module)
        for res_name in ('FF', 'LUT', 'BRAM18K', 'URAM', 'DSP'):
            pred_set.append(module + '_' + res_name)

    X = df.loc[:, feature_set]
    y = df.loc[:, pred_set]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    logger.info(f'#Training samples: {X_train.shape[0]}')
    logger.info(f'#Validation samples: {X_test.shape[0]}')

    # Module-level evaluation metrics, one list of MAPEs per resource type.
    module_mape = {'FF': [], 'LUT': [], 'DSP': [], 'BRAM18K': [], 'URAM': []}

    for module in modules:
        logger.info('Training resource model for module: ' + module)
        module_feature_set = get_feature_set(module)

        # FF, LUT, DSP: learned linear models
        for res_name in ('FF', 'LUT', 'DSP'):
            mape = _fit_and_validate_linear_model(module, res_name, module_feature_set, \
                X_train, y_train, X_test, y_test, work_dir, logger)
            if mape is not None:
                module_mape[res_name].append(mape)

        # BRAM18K, URAM: analytical models derived from the local buffers
        for res_name, mem_type in (('BRAM18K', 'BRAM'), ('URAM', 'URAM')):
            mape = _validate_memory_model(module, res_name, mem_type, y_test, design_infos, logger)
            if mape is not None:
                module_mape[res_name].append(mape)

    _log_mape_summary('======== Module-Level Resource Model Validation Results ========', \
        module_mape, logger)

    # Validate on the whole design.
    df_test = df.loc[y_test.index.values.tolist(), :]
    design_mape = {'FF': [], 'LUT': [], 'DSP': [], 'BRAM18K': [], 'URAM': []}

    for index in df_test.index:
        design_info = design_infos[index]
        df_design = df_test.loc[[index], :]
        res = predict_design_resource_usage(df_design, modules, fifos, design_info, work_dir)

        for res_name in ('FF', 'LUT', 'DSP', 'BRAM18K', 'URAM'):
            design_mape[res_name].append(
                mean_absolute_percentage_error(float(design_info[res_name]), res[res_name]))

    _log_mape_summary('======== Design-Level Resource Model Validation Results ========', \
        design_mape, logger)

def _predict_with_saved_model(df, feature_set, joblib_file):
    """ Evaluate a saved regression model on a single-design dataframe.

    Returns the prediction as a Python scalar, or None if the model file does
    not exist.
    """
    X = df.loc[:, feature_set]
    if not os.path.isfile(joblib_file):
        return None
    # NOTE: joblib files are pickles; only load models from a trusted directory.
    model = joblib.load(joblib_file)
    # ndarray.item() replaces np.asscalar(), which was removed from NumPy.
    return model.predict(X.to_numpy()).item()

def predict_design_resource_usage(df, modules, fifos, design_info, prj_dir, \
    target=['FF', 'LUT', 'DSP', 'BRAM18K', 'URAM']):
    """ Predict the resource usage for a single design on Xilinx platforms

    FIFO usage is looked up with FIFO_predict_xilinx. Module FF/LUT/DSP usage
    comes from the saved regression models in prj_dir (a missing model
    contributes 0). Module BRAM18K/URAM usage is derived from the local buffers.
    Modules whose name contains 'inter_trans' or 'intra_trans' are excluded
    from the design total.

    Parameters
    ----------
    df: dataframe
        A dataframe storing the information for the current design.
    modules: list
        A list containing all module names.
    fifos: list
        A list containing all FIFO names.
    design_info: dict
        A dictionary containing the design information.
    prj_dir: str
        Directory to the resource models.
    target: list
        Resource types to predict.

    Returns
    -------
    ret: dict
        Integer usage for 'FF', 'LUT', 'DSP', 'BRAM18K' and 'URAM'. Resource
        types that are not in target are reported as 0.
    """
    resource = {'FF': 0, 'LUT': 0, 'DSP': 0, 'BRAM18K': 0, 'URAM': 0}
    resource_all = {}

    # Predict FIFOs
    for fifo in fifos:
        if fifo not in design_info['fifos']:
            continue
        fifo_info = design_info['fifos'][fifo]
        # Query the library to get the data
        fifo_w = fifo_info['fifo_width'] * 8
        fifo_depth = fifo_info['fifo_depth']
        resource_info = FIFO_predict_xilinx(fifo_w, fifo_depth)
        resource_all[fifo] = {
            'FF': resource_info['FF'], 'LUT': resource_info['LUT'], \
            'BRAM18K': resource_info['BRAM18K'], 'URAM': 0, 'DSP': resource_info['DSP'], \
            'n': fifo_info['fifo_cnt']}

    # Predict modules
    for module in modules:
        if module not in design_info['modules']:
            continue
        module_info = design_info['modules'][module]
        df = df_feature_extract(df, module)
        module_feature_set = get_feature_set(module)
        if 'local_buffers' in module_info:
            local_buffers = module_info['local_buffers']
        else:
            local_buffers = []

        FF = 0
        if 'FF' in target:
            pred = _predict_with_saved_model(
                df, module_feature_set, prj_dir + '/' + module + '_FF_model' + '.pkl')
            if pred is not None:
                FF = pred
                # Add back the FF arrays if existing
                for local_buffer in local_buffers:
                    if local_buffer['mem_type'] == 'FF':
                        FF += FF_array_predict_HLS(local_buffer['port_width'], \
                                                   local_buffer['buffer_depth'])

        LUT = 0
        if 'LUT' in target:
            pred = _predict_with_saved_model(
                df, module_feature_set, prj_dir + '/' + module + '_LUT_model' + '.pkl')
            if pred is not None:
                LUT = pred

        DSP = 0
        if 'DSP' in target:
            pred = _predict_with_saved_model(
                df, module_feature_set, prj_dir + '/' + module + '_DSP_model' + '.pkl')
            if pred is not None:
                DSP = pred

        BRAM = 0
        if 'BRAM18K' in target:
            for local_buffer in local_buffers:
                if local_buffer['mem_type'] != 'BRAM':
                    continue
                if 'array_map' in local_buffer:
                    # For horizontal mapping, we will merge two ping/pong buffers to one
                    BRAM += BRAM_array_predict_HLS(local_buffer['port_width'], \
                        local_buffer['buffer_depth'] * 2, local_buffer['partition_number']) / 2
                else:
                    BRAM += BRAM_array_predict_HLS(local_buffer['port_width'], \
                        local_buffer['buffer_depth'], local_buffer['partition_number'])

        URAM = 0
        if 'URAM' in target:
            for local_buffer in local_buffers:
                if local_buffer['mem_type'] == 'URAM':
                    URAM += URAM_array_predict_HLS(local_buffer['port_width'], \
                        local_buffer['buffer_depth'], local_buffer['partition_number'])

        resource_all[module] = {
            'FF': FF, 'LUT': LUT, 'BRAM18K': BRAM, 'URAM': URAM, 'DSP': DSP, \
            'n': module_info['module_cnt']}

    # Aggregate the resource
    for inst in resource_all:
        # Inner modules (inter_trans/intra_trans) are not added to the total.
        if 'inter_trans' in inst or 'intra_trans' in inst:
            continue

        usage = resource_all[inst]
        for r in ('FF', 'LUT', 'DSP', 'BRAM18K', 'URAM'):
            resource[r] += usage[r] * usage['n']

    ret = {}
    for r in resource:
        if r in target:
            ret[r] = int(resource[r])
        else:
            ret[r] = 0

    return ret

def mean_absolute_percentage_error(y_true, y_pred):
    """ Compute the mean absolute percentage error (in percent).

    Where the true value is 0 the relative error is undefined; the absolute
    predicted value is used as the error for those entries instead.

    Parameters
    ----------
    y_true: ndarray or scalar
        Ground-truth value(s).
    y_pred: ndarray or scalar
        Predicted value(s). The array path is only taken when both arguments
        are ndarrays.

    Returns
    -------
    mape: float
    """
    if isinstance(y_true, np.ndarray) and isinstance(y_pred, np.ndarray):
        # Work on float copies: np.divide cannot write its float result into an
        # integer `out` buffer, so integer inputs would raise otherwise.
        y_true = y_true.astype(float)
        y_pred = y_pred.astype(float)
        # `out` pre-fills the entries skipped by `where` (y_true == 0) with -y_pred.
        error = np.divide((y_true - y_pred), y_true, out=(-y_pred), where=y_true!=0)
        return np.mean(np.abs(error)) * 100
    else:
        # scalar
        if y_true == 0:
            return abs(y_pred) * 100
        else:
            return abs((y_true - y_pred) / y_true) * 100

def resource_valid(res, hw_info, range, target):
    """ Check that every targeted resource lies inside its allowed utilization window.

    Parameters
    ----------
    res: dict
        A dict containing the resource usage of the current design.
    hw_info: dict
        A dict containing the hardware platform information.
    range: dict
        Maps each resource name to a (lower, upper) pair of ratios. The usage
        must satisfy hw_info[r] * lower <= usage <= hw_info[r] * upper.
        (The name shadows the builtin; it is kept for caller compatibility.)
    target: list
        A list containing the hw resource target to predict. Resources that are
        not listed here are not checked.

    Returns
    -------
    ret: boolean
        False as soon as one targeted resource is out of range, True otherwise.
    """
    for name, usage in res.items():
        if name not in target:
            continue
        capacity = hw_info[name]
        bounds = range[name]
        too_high = usage > capacity * bounds[1]
        if too_high or usage < capacity * bounds[0]:
            return False
    return True

def compute_res_util_score(res, hw_info):
    """ Compute a weighted utilization score for the current design.

    Each resource present in res contributes weight * usage / capacity, where
    the usage is truncated to an integer first. The weights are:
    URAM, DSP, BRAM18K: 0.3
    LUT: 0.2
    FF: 0.1

    Parameters
    ----------
    res: dict
        Resource usage of the design. Missing resource types are ignored.
    hw_info: dict
        Capacity of the hardware platform for each resource type.

    Returns
    -------
    score: float
        The weighted sum (the integer 0 if res holds none of the resources).
    """
    weighted_resources = (
        ('FF', 0.1),
        ('LUT', 0.2),
        ('BRAM18K', 0.3),
        ('DSP', 0.3),
        ('URAM', 0.3),
    )

    score = 0
    for name, weight in weighted_resources:
        if name in res:
            score += weight * float(int(res[name])) / hw_info[name]

    return score

def unit_test_predict_design_resource(design_dir, hw_info, model_path):
    """ Predict the resource usage of one design and print it against the platform capacity.

    Parameters
    ----------
    design_dir: str
        Directory of the design to extract the design information from.
    hw_info: dict
        A dict containing the hardware platform information.
    model_path: str
        Root directory of the resource models; the models of the design are
        read from '<model_path>/kernel<kernel_id>'.
    """
    design_info = extract_design_info(design_dir, 0)
    modules, fifos, df = convert_design_infos_to_df([design_info])
    res_model_path = f"{model_path}/kernel{design_info['kernel_id']}"
    res = predict_design_resource_usage(df, modules, fifos, design_info, res_model_path)

    # Report "used/available (ratio)" per resource type.
    for r in ('FF', 'LUT', 'BRAM18K', 'DSP'):
        used = res[r]
        available = hw_info[r]
        print(f"{r}: {used}/{available} ({used/available:.2f})")

if __name__ == "__main__":
    # Command-line entry point: predict and print the resource usage of one design.
    parser = argparse.ArgumentParser(description="==== AutoSA Resource Model ====")
    # All three options are mandatory.
    for flag, help_text in (('-d', 'design directory'),
                            ('-i', 'hardware info'),
                            ('-m', 'resource model path')):
        parser.add_argument(flag, required=True, help=help_text)
    args = parser.parse_args()

    # The hardware info is a JSON file describing the platform capacity.
    with open(args.i, 'r') as hw_info_file:
        hw_info = json.load(hw_info_file)

    unit_test_predict_design_resource(args.d, hw_info, args.m)

================================================
FILE: autosa_scripts/tapa_scripts/CMakeLists.txt
================================================
# Build script for a TAPA kernel project: a plain executable built from the
# host and kernel sources, plus hardware targets when the SDx package is found.
cmake_minimum_required(VERSION 3.13)
# CMP0076 NEW: target_sources() converts relative paths to absolute ones.
cmake_policy(SET CMP0076 NEW)

project(kernel)

# Executable built from both the host and the kernel source, linked against TAPA.
add_executable(kernel)
target_sources(kernel PRIVATE kernel_host.cpp kernel_kernel.cpp)
# NOTE(review): tapa::tapa is referenced here before find_package(TAPA) below.
target_link_libraries(kernel PUBLIC tapa::tapa)
target_compile_features(kernel PUBLIC cxx_std_11)
# NOTE(review): absolute, machine-specific path to the Vitis HLS 2020.2 headers;
# adjust to the local tool installation.
include_directories(/opt/tools/xilinx/Vitis_HLS/2020.2/include)

# Register the executable as a test.
# NOTE(review): no enable_testing() call is visible in this file.
add_test(NAME kernel COMMAND kernel)

# Required dependencies.
find_package(gflags REQUIRED)
find_package(TAPA REQUIRED)
find_package(FRT REQUIRED)
set(TAPA tapa::tapa)

# SDx is optional; the hardware targets below exist only when it is found.
find_package(SDx)
if(SDx_FOUND)
  # Kernel object target generated from the kernel source for top function
  # kernel0 on the given platform; also names the generated FRT interface source.
  # NOTE(review): add_tapa_target is not defined in this file - presumably
  # provided by the TAPA package; confirm.
  add_tapa_target(
    kernel-hw-xo
    INPUT kernel_kernel.cpp
    FRT_INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/kernel.frt.cpp
    TOP kernel0
    PLATFORM xilinx_u250_xdma_201830_2)

  # Link targets for the kernel object; the names of the created targets are
  # returned in the variables hw_emu_xclbin and hw_xclbin.
  add_xocc_hw_link_targets(
    ${CMAKE_CURRENT_BINARY_DIR}
    INPUT kernel-hw-xo
    HW_EMU_XCLBIN
    hw_emu_xclbin
    HW_XCLBIN
    hw_xclbin)

  # Host executable built from the host source and the generated FRT interface.
  add_executable(kernel-frt)
  target_include_directories(kernel-frt PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
  target_sources(kernel-frt PRIVATE kernel_host.cpp
                                  ${CMAKE_CURRENT_BINARY_DIR}/kernel.frt.cpp)
  target_link_libraries(kernel-frt PRIVATE ${TAPA} frt::frt)

  # Run the host executable with the TAPAB environment variable set to the
  # FILE_NAME property of the hardware-emulation target.
  add_custom_target(
    kernel-cosim
    COMMAND TAPAB=$<TARGET_PROPERTY:${hw_emu_xclbin},FILE_NAME>
            $<TARGET_FILE:kernel-frt>
    DEPENDS kernel-frt ${hw_emu_xclbin}
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
  # Same as kernel-cosim, but with the FILE_NAME property of the hardware target.
  add_custom_target(
    kernel-hw
    COMMAND TAPAB=$<TARGET_PROPERTY:${hw_xclbin},FILE_NAME>
            $<TARGET_FILE:kernel-frt>
    DEPENDS kernel-frt ${hw_xclbin}
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})

  # The cosim test builds the kernel-cosim target, which in turn runs the host.
  add_test(NAME kernel-cosim COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR}
                                   --target kernel-cosim)
endif()


================================================
FILE: autosa_scripts/tuner/constraint.py
================================================
import json

class Constraint(object):
    """ Hardware resource constraints loaded from a JSON file.

    The file maps each resource name to an object with a "total" and a "ratio"
    entry. For every resource <res>, hw_cst holds two entries:
    hw_cst[<res>] = total * ratio, and hw_cst['<res>_total'] = total.
    """
    def __init__(self, cst_path):
        """ Load the constraints from the JSON file at cst_path. """
        with open(cst_path) as cst_file:
            spec = json.load(cst_file)

        self.hw_cst = {}
        for name, entry in spec.items():
            total = entry["total"]
            self.hw_cst[name] = total * entry["ratio"]
            self.hw_cst[f'{name}_total'] = total

    def __repr__(self):
        """ Short tag built from the truncated BRAM18K and DSP budgets, e.g. 'b3763d8601'. """
        bram = int(self.hw_cst['BRAM18K'])
        dsp = int(self.hw_cst['DSP'])
        return "b{}d{}".format(bram, dsp)

================================================
FILE: autosa_scripts/tuner/cst/hw_cst.json
================================================
{
  "BRAM18K": {
    "total": 5376,
    "ratio": 0.7
  },
  "DSP": {
    "total": 12288,
    "ratio": 0.7
  },
  "FF": {
    "total": 3456000,
    "ratio": 0.7
  },
  "LUT": {
    "total": 1728000,
    "ratio": 0.7
  },
  "URAM": {
    "total": 1280,
    "ratio": 0.7
  }
}


================================================
FILE: autosa_scripts/tuner/design.py
================================================
import numpy as np
import json
import sys
import os
from numpy import ceil, floor

class Design(object):
    def __init__(self, name):
        self.name = name # design name        
        self.est_resource_func = None
        self.est_latency_func = None
        self.infer_params_func = None
        self.random_sampling_func = None
        self.bound_check_func = None
        self.params_config = None      
        self.desp = None  

    def print_resource_est_func(self, f, desp):
        f.write("def est_resource(params):\n")
        # Load parameters
        f.write("\t")
        is_first = True
        for p in desp["params"]:
            if not is_first:
                f.write(", ")
            f.write(p["name"])
            is_first = False
        f.write(" = ")
        is_first = True
        for p in desp["params"]:
            if not is_first:
                f.write(", ")
            f.write(f'params[\"{p["name"]}\"]')
            is_first = False
        f.write("\n\n")

        f.write("\t# DSP\n")
        f.write(f"\tDSP = {desp['compute']['PE']['num']} * ")
        f.write(f"{desp['compute']['PE']['unroll_factor']} * ")
        if desp["compute"]["PE"]["ele_type"] == "float":
            f.write(f"5\n")
        else:
            raise RuntimeError(f"Unsupported data type {desp['compute']['PE']['ele_type']} in resource estimation")        
        f.write("\n")

        # Print function est_BRAM18K
        f.write("\t# BRAM18K\n")
        f.write("\tdef est_BRAM18K(ele_size, ele_num, pack):\n")
        f.write(f"\t\treturn ceil(ele_size*8*pack / 18) * ceil(ele_num/pack/1024)\n\n")

        # Check if drain module can be merged.
        # Note: It should be supported in the codegen of AutoSA. However, currently, 
        # we move it here in the tuner.
        out_module = {}
        out_drain_module = {}
        for module in desp["memory"]:
            module_mem = desp["memory"][module]
            if module.endswith('_out'):
                item = {'buf_size': module_mem['buf_size'], 
                        'num': module_mem['num']}
                if module.find('drain') != -1:
                    item['merged'] = 0
                    out_drain_module[module_mem['array']] = item
                else:                    
                    if module_mem['array'] not in out_module:
                        out_module[module_mem['array']] = [item]
                    else:
                        out_module[module_mem['array']].append(item)
        for array in out_drain_module:
            if array in out_module:
                for m in out_module[array]:                
                    if m['buf_size'] == out_drain_module[array]['buf_size'] and \
                       m['num'] == out_drain_module[array]['num']:
                       out_drain_module[array]['merged'] = 1

        for module in desp["memory"]:
            module_mem = desp["memory"][module]
            if module.find('drain') != -1 and out_drain_module[module_mem['array']]['merged'] == 1:
                continue
            f.write(f"\t{module}_unit_memory = est_BRAM18K({module_mem['ele_size']}, ")
            f.write(f"{module_mem['buf_size']}, ")
            if "data_pack_factor" in module_mem:
                f.write(f"{module_mem['data_pack_factor']})\n")
            else:
                f.write(f"1)\n")        
        #f.write("\tprint(A_IO_L1_in_unit_memory)\n")
        #f.write("\tprint(A_IO_L2_in_unit_memory)\n")
        #f.write("\tprint(B_IO_L2_in_unit_memory)\n")        
        #f.write("\tprint(PE_unit_memory)\n")
        #f.write("\tprint(C_1_IO_L2_out_unit_memory)\n")        
        #f.write("\tprint(C_drain_IO_L1_out_unit_memory)\n")

        f.write("\tBRAM18K = ")
        is_first = True
        for module in desp["memory"]:
            module_mem = desp["memory"][module]
            if module.find('drain') != -1 and out_drain_module[module_mem['array']]['merged'] == 1:
                continue
            if not is_first:
                f.write(" + ")            
            f.write(f"{module}_unit_memory")
            if module_mem["double_buffer"]:
                f.write(f" * 2")
            else:
                f.write(f" * 1")
            f.write(f" * {module_mem['num']}")            
            is_first = False            
        f.write("\n\n")

        #for module in desp["memory"]:
        #    module_mem = desp["memory"][module]
        #    f.write(f"\tprint({module_mem['num']})\n")

        f.write("\treturn {\"DSP\": DSP, \"BRAM18K\": BRAM18K}\n")
        f.write("\n")

    def print_latency_est_func(self, f, desp):
        f.write("def est_latency(params):\n")
        # Load parameters
        f.write("\t")
        is_first = True
        for p in desp["params"]:
            if not is_first:
                f.write(", ")
            f.write(p["name"])
            is_first = False
        f.write(" = ")
        is_first = True
        for p in desp["params"]:
            if not is_first:
                f.write(", ")
            f.write(f'params[\"{p["name"]}\"]')
            is_first = False
        f.write("\n\n")

        def extract_latency_expr(lat, info):
            ret = ""
            if lat["type"] == "block":
                info["has_for_child"] = 0
                no_for_child = True
                is_first = True
                ret += "("
                for child in lat["child"]:
                    if not is_first:
                        ret += " + "                    
                    ret += extract_latency_expr(child, info)                    
                    if info["has_for_child"] == 1:
                        no_for_child = False
                    is_first = False
                ret += ")"
                if no_for_child:
                    ret = "1"
            elif lat["type"] == "for":                
                child = lat["child"]
                expr = extract_latency_expr(child, info)                
                if info["valid"]:
                    ret = lat["bounds"][1] + " * " + expr
                else:
                    ret = expr
                info["has_for_child"] = 1
            elif lat["type"] == "mark":      
                if info["under_mark"] and lat["content"] == info["under_mark"]:
                    info["valid"] = True
                if lat["content"] == "simd":
                    if info["valid"]:
                        ret = "1"
                    else:
                        ret = "0"
                else:
                    child = lat["child"]
                    ret = extract_latency_expr(child, info)
                if info["under_mark"] and lat["content"] == info["under_mark"]:
                    info["valid"] = False
            elif lat["type"] == "user":
                user_expr = lat["child"]["user_expr"]
                if 'inter_intra' in user_expr or 'intra_inter' in user_expr:                    
                    if user_expr[:-2].split(".")[-1] == "1":
                        double_buffer = 1
                    else:
                        double_buffer = 0                    
                    # Plug in submodule latency
                    if f"{info['name']}_inter" in info["modules"]:
                        inter_expr = info["modules"][f"{info['name']}_inter"]
                    else:
                        inter_expr = None
                    if f"{info['name']}_intra" in info["modules"]:
                        intra_expr = info["modules"][f"{info['name']}_intra"]
                    else:
                        intra_expr = None

                    if inter_expr and intra_expr:
                        if info["in"] == 1 or info["in"] == 0:
                            ret = inter_expr
                        else:
                            if double_buffer:
                                ret = f"max({inter_expr}, {intra_expr})"
                            else:
                                ret = f"({inter_expr} + {intra_expr})"
                        info["has_for_child"] = 1
                    else:                        
                        ret = "1"                        
                    if not info["valid"]:
                        ret = "0"
                elif "inter_trans" in user_expr:
                    # Plug in submodule latency
                    if f"{info['name']}_inter" in info["modules"]:
                        ret = info["modules"][f"{info['name']}_inter"]
                    else:
                        ret = "1"
                    if not info["valid"]:
                        ret = "0"
                elif "intra_trans" in user_expr:
                    # Plug in submodule latency                    
                    if f"{info['name']}_intra" in info["modules"]:
                        ret = info["modules"][f"{info['name']}_intra"]
                    else:
                        ret = "1"
                    if not info["valid"]:
                        ret = "0"
                else:
                    ret = "1"
            elif lat["type"] == "if":
                # Only examine the first child
                child = lat["child"][0]
                ret = extract_latency_expr(child, info)
            elif lat["type"] == "array_tile":      
                if info["module_attr"]["to_dram"] == 1 and info["module_attr"]["serialize"] == 0:
                    # Consider the DRAM latency here.
                    ret = "(" + f"{lat['size']}/{lat['last_dim']}*(20+{lat['last_dim']}/(512/8/{lat['ele_size']}))" + ")"
                else:
                    ret = "(" + lat["size"] + "/" + lat["data_pack_factor"] + ")"
            else:
                raise RuntimeError(f"Unsupported latency node type {lat['type']}")

            return ret

        # Latency prologue
        info = {"has_for_child": 0, "name": None, "modules": {}}
        for i in range(2):
            for module in desp["latency"]:
                if desp["attr"][module]["in"] != 1:
                    continue
                if "inter" in module or "intra" in module:                    
                    # Keep all the latency AST under the mark.
                    info["valid"] = True
                    info["under_mark"] = None
                    info["in"] = 1
                else:
                    # Only keep the latency AST under the mark.
                    info["valid"] = False
                    info["under_mark"] = "array"
                    info["in"] = 1
                module_lat = desp["latency"][module]  
                info["name"] = module     
                info["module_attr"] = desp["attr"][module]
                info["modules"][module] = extract_latency_expr(module_lat, info)
        for module in info["modules"]:
            if "inter" in module or "intra" in module:
                continue
            f.write(f"\t{module}_single_latency = ")                        
            f.write(info["modules"][module])
            f.write(f"\n")        
        f.write("\tlatency_prologue = max(")
        is_first = True
        for module in info["modules"]:
            if "inter" in module or "intra" in module:
                continue            
            if not is_first:
                f.write(", ")
            f.write(f"{module}_single_latency")
            is_first = False
        f.write(")\n\n")

        # Latency epilogue
        info = {"has_for_child": 0, "name": None, "modules": {}}
        for i in range(2):
            for module in desp["latency"]:
                if desp["attr"][module]["in"] != 0:
                    continue
                if "inter" in module or "intra" in module:
                    info["valid"] = True
                    info["under_mark"] = None
                    info["in"] = 0
                else:
                    info["valid"] = False
                    info["under_mark"] = "array"
                    info["in"] = 0
                module_lat = desp["latency"][module]  
                info["name"] = module                
                info["module_attr"] = desp["attr"][module]
                info["modules"][module] = extract_latency_expr(module_lat, info)
        for module in info["modules"]:
            if "inter" in module or "intra" in module:
                continue
            f.write(f"\t{module}_single_latency = ")                        
            f.write(info["modules"][module])
            f.write(f"\n")        
        cnt = 0
        for module in info["modules"]:
            if "inter" in module or "intra" in module:
                continue    
            cnt += 1
        if cnt == 1:
            f.write("\tlatency_epilogue = ")
        else:
            f.write("\tlatency_epilogue = max(")
        is_first = True
        for module in info["modules"]:
            if "inter" in module or "intra" in module:
                continue            
            if not is_first:
                f.write(", ")
            f.write(f"{module}_single_latency")
            is_first = False
        if cnt == 1:            
            f.write("\n\n")
        else:
            f.write(")\n\n")

        # Latency main
        info = {"has_for_child": 0, "name": None, "modules": {}}
        for i in range(2):
            # Run second time to fill in the incomplete expression            
            for module in desp["latency"]:
                module_lat = desp["latency"][module]  
                info["name"] = module
                info["valid"] = True
                info["under_mark"] = None
                info["in"] = -1
                info["module_attr"] = desp["attr"][module]
                info["modules"][module] = extract_latency_expr(module_lat, info)            
        for module in info["modules"]:
            if "inter" in module or "intra" in module:
                continue
            f.write(f"\t{module}_latency = ")                        
            f.write(info["modules"][module])
            f.write(f"\n")        
        f.write("\tlatency_main = max(")
        is_first = True
        for module in info["modules"]:
            if "inter" in module or "intra" in module:
                continue            
            if not is_first:
                f.write(", ")
            f.write(f"{module}_latency")
            is_first = False
        f.write(")\n\n")

        #f.write("\tprint(latency_prologue, latency_main, latency_epilogue)\n\n")

        f.write("\tlatency = latency_prologue + latency_main + latency_epilogue\n\n")
        
        f.write("\treturn latency\n")
        f.write("\n")

    def print_infer_params_func(self, f, desp):
        f.write("def infer_params(params):\n")
        # Load parameters
        f.write("\t")
        is_first = True
        for p in desp["params"]:
            if "tags" in p and "auto_infer" in p["tags"]:
                continue
            if not is_first:
                f.write(", ")            
            f.write(p["name"])
            is_first = False
        f.write(" = ")
        is_first = True
        for p in desp["params"]:
            if "tags" in p and "auto_infer" in p["tags"]:
                continue
            if not is_first:
                f.write(", ")            
            f.write(f'params[\"{p["name"]}\"]')
            is_first = False
        f.write("\n\n")

        for p in desp["params"]:
            if "tags" in p and "auto_infer" in p["tags"]:
                f.write(f"\t{p['name']}_choices = [n*{p['bounds'][0]} for n in range(1, {p['bounds'][1]}//{p['bounds'][0]}+1) if {p['bounds'][1]}%(n*{p['bounds'][0]})==0]\n")
                f.write(f"\tif len({p['name']}_choices) == 0:\n")
                f.write(f"\t\treturn None\n")
                f.write(f"\tparams[\"{p['name']}\"] = max({p['name']}_choices)\n")
        f.write("\n")                
        f.write("\treturn params\n\n")

    def print_random_sampling_func(self, f, desp):
        """Write the source of a ``random_sampling(params)`` function to ``f``.

        Reads ``self.params_config`` (its "external" and "tunable" groups) and
        ``desp["memory"]``. The generated function copies the external
        parameters into locals, then loops: it draws a value for every tunable
        parameter, stores it in ``params``, and leaves the loop either
        unconditionally (no "PE" entry in ``desp["memory"]``) or once the
        product of the latency tiling factors is at least 8 times the SIMD
        factor. It returns ``params``.

        Raises:
            RuntimeError: if ``desp["memory"]["PE"]["ele_type"]`` is present
                and is not ``"float"``.
        """
        f.write("def random_sampling(params):\n")
        # Helper emitted into the generated function: True when x is not a
        # power of two.
        f.write(f"\tdef filter_non_power_of_two(x):\n")
        f.write(f"\t\tif np.log2(x) != int(np.log2(x)):\n")
        f.write(f"\t\t\treturn True\n")
        f.write(f"\t\treturn False\n\n")
        # Print the task params
        for p in self.params_config["external"]:
            f.write(f"\t{p} = params[\"{p}\"]\n")
        f.write("\twhile True:\n")
        params_to_process = []
        for param in self.params_config["tunable"]:
            params_to_process.append(self.params_config["tunable"][param])
        # First, parameters without a "divisors" entry: uniform integer in
        # [bounds[0], bounds[1]].
        # NOTE(review): entries are removed from params_to_process while it is
        # being iterated, so the element following each removal is skipped in
        # that pass; the enclosing loop re-scans until a pass emits nothing.
        # The order of the emitted lines therefore depends on this skipping.
        while True:
            update = False
            for param in params_to_process:
                if "divisors" not in param: 
                    f.write(f"\t\tsample = random.randint(int({param['bounds'][0]}), int({param['bounds'][1]}))\n")
                    f.write(f"\t\t{param['name']} = sample\n")
                    f.write(f"\t\tparams[\"{param['name']}\"] = sample\n")
                    params_to_process.remove(param)
                    update = True
            if not update:
                break
        # Then, parameters with a "divisors" entry: one divisor of bounds[1],
        # optionally restricted to powers of two.
        # NOTE(review): params_to_process holds the parameter dicts, while
        # param["divisors"] is compared against them as a whole; whether this
        # membership test can ever be False (and so enforce an ordering)
        # depends on what "divisors" holds - confirm against the descriptor.
        # If it were False for every remaining entry this loop would not end.
        while len(params_to_process) > 0:            
            for param in params_to_process:                
                if "divisors" in param and param["divisors"] not in params_to_process:                    
                    if "tags" in param and "power_of_two" in param["tags"]:
                        f.write(f"\t\tsample = random.sample(utils.get_divisors(int({param['bounds'][1]}), filter_non_power_of_two), 1)[-1]\n")
                    else:
                        f.write(f"\t\tsample = random.sample(utils.get_divisors(int({param['bounds'][1]}), None), 1)[-1]\n")
                    f.write(f"\t\t{param['name']} = sample\n")
                    f.write(f"\t\tparams[\"{param['name']}\"] = sample\n")
                    params_to_process.remove(param)
        # Latency hiding
        if "PE" not in desp["memory"]:        
            # No PE memory entry: accept the first sample.
            f.write(f"\t\tbreak\n")
        else:
            f.write(f"\t\tlatency_factors = 1\n")
            for p, param in self.params_config["tunable"].items():
                if param["attr"] == "latency_tiling_factor":
                    f.write(f"\t\tlatency_factors *= {param['name']}\n")
                if param["attr"] == "SIMD_tiling_factor":
                    f.write(f"\t\tsimd_factor = {param['name']}\n")
            # NOTE(review): the generated code reads simd_factor, which is only
            # assigned when some tunable parameter has attr
            # "SIMD_tiling_factor" - presumably every design has one; confirm.
            data_type = desp["memory"]["PE"]["ele_type"]
            if data_type == "float":
                # Resample until the latency factors reach 8x the SIMD factor.
                f.write(f"\t\tif latency_factors >= 8 * simd_factor:\n")
                f.write(f"\t\t\tbreak\n")
            else:
                raise RuntimeError(f"Unsupported data type in random sample generation: {data_type}")
        f.write("\n")                
        f.write("\treturn params\n\n")        

    def print_bound_check_func(self, f, desp):
        f.write("def bound_check(params):\n")
        f.write(f"\tdef filter_non_power_of_two(x):\n")
        f.write(f"\t\tif np.log2(x) != int(np.log2(x)):\n")
        f.write(f"\t\t\treturn True\n")
        f.write(f"\t\treturn False\n\n")
        # Load parameters
        f.write("\t")
        is_first = True
        for p in desp["params"]:
            if not is_first:
                f.write(", ")
            f.write(p["name"])
            is_first = False
        f.write(" = ")
        is_first = True
        for p in desp["params"]:
            if not is_first:
                f.write(", ")
            f.write(f'params[\"{p["name"]}\"]')
            is_first = False
        f.write("\n\n")
        for p in desp["params"]:
            if "bounds" in p:
                f.write(f"\tif {p['name']} < {p['bounds'][0]}:\n")
                f.write(f"\t\treturn False\n")
                f.write(f"\tif {p['name']} > {p['bounds'][1]}:\n")
                f.write(f"\t\treturn False\n")
            if "tags" in p and "power_of_two" in p["tags"]:
                f.write(f"\tif filter_non_power_of_two({p['name']}):\n")
                f.write(f"\t\treturn False\n")
        # Latency hiding
        if "PE" in desp["memory"]:
            f.write(f"\tlatency_factors = 1\n")
            for p, param in self.params_config["tunable"].items():
                if param["attr"] == "latency_tiling_factor":
                    f.write(f"\tlatency_factors *= {param['name']}\n")
                if param["attr"] == "SIMD_tiling_factor":
                    f.write(f"\tsimd_factor = {param['name']}\n")
            data_type = desp["memory"]["PE"]["ele_type"]
            if data_type == "float":
                f.write(f"\tif latency_factors < 8 * simd_factor:\n")
                f.write(f"\t\treturn False\n")
            else:
                raise RuntimeError(f"Unsupported data type in random sample generation: {data_type}")
        
        f.write("\treturn True\n\n")        

    def register(self, desp, py_f):
        """ Register the design in the descriptor file
        Generate all the necessary functions for evaluating the performance of the 
        target design.         
        """        
        #print(desp["compute"])        
        with open(py_f, 'w') as f:
            f.write("from math import ceil\n")
            f.write("import numpy as np\n")
            f.write("import random\n")
            f.write("import utils\n\n")

            # Generate resource est func        
            self.print_resource_est_func(f, desp)

            # Generate latency est func
            self.print_latency_est_func(f, desp)

            # Tuning parameters
            #self.params_config = desp["params"]
            self.params_config = {"external": {}, "tunable": {}, "infer": {}}
            for param in desp["params"]:
                if param["tunable"]:
                    self.params_config["tunable"][param["name"]] = param
                else:
                    if "external" in param["tags"]:
                        self.params_config["external"][param["name"]] = param
                    elif "auto_infer" in param["tags"]:
                        self.params_config["infer"][param["name"]] = param
        
            # Generate infer parameter func
            self.print_infer_params_func(f, desp)

            # Generate the random sampling func
            self.print_random_sampling_func(f, desp)

            # Generate the bound check func
            self.print_bound_check_func(f, desp)

        sys.path.append(os.path.dirname(py_f))
        basename = os.path.basename(py_f).split(".")[0]        
        module = __import__(basename)
        self.est_resource_func = module.est_resource
        self.est_latency_func = module.est_latency
        self.infer_params_func = module.infer_params
        self.random_sampling_func = module.random_sampling
        self.bound_check_func = module.bound_check
        self.desp = desp

    def est_latency(self, params):
        if not self.est_latency_func:
            raise RuntimeError(f"Latency function for design {self.name} undefined")
        else:
            return self.est_latency_func(params)
    
    def est_resource(self, params):
        if not self.est_latency_func:
            raise RuntimeError(f"Resource function for design {self.name} undefined")
        else:
            return self.est_resource_func(params)

    def infer_params(self, params):
        if not self.infer_params_func:
            raise RuntimeError(f"Internal parameter inference function for design {self.name} undefined")
        else:
            return self.infer_params_func(params)

    def random_sampling(self, params):
        if not self.random_sampling_func:
            raise RuntimeError(f"Random sampling function for design {self.name} undefined")
        else:
            return self.random_sampling_func(params)

    def bound_check(self, params):
        if not self.bound_check_func:
            raise RuntimeError(f"Bound check function for design {self.name} undefined")
        else:
            return self.bound_check_func(params)            

================================================
FILE: autosa_scripts/tuner/main.py
================================================
import argparse
from datetime import datetime
import logging
import numpy as np
import os
import pickle
import concurrent.futures
import json
import pprint

from design import Design
from constraint import Constraint
from search_task import SearchTask
import utils
import tuner

if __name__ == "__main__":
    # Command-line driver: register every design found in --designs, run the
    # genetic search for each (task, design) pair, and dump the results.
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default="outdir", help="output directory")
    parser.add_argument('--db', type=str, default="db", help="search database")
    parser.add_argument('--objective', type=str, default="latency", help="optimization target")
    parser.add_argument('--cst', type=str, default="hw_cst", help="hardware constraint")
    parser.add_argument('--stop-after-epochs', type=int, default=-1, help="number of epochs of the unit searching task")
    # The help text used to repeat the epochs description.
    parser.add_argument('--stop-after-time', type=int, default=-1, help="time budget of the unit searching task")
    parser.add_argument('--use-db', type=int, default=1, help="use database")
    parser.add_argument('--n-thread', type=int, default=16, help="number of threads to use for searching")
    parser.add_argument('--designs', type=str, default="designs", help="systolic array design directory")
    parser.add_argument('--task', type=str, default="mm", help="search task")

    args = parser.parse_args()

    search_obj = args.objective

    # Set up the working directory: <outdir>/O_<objective>-C_<config>-T_<date>-<time>
    now = datetime.now()
    outdir = args.outdir
    os.makedirs(outdir, exist_ok=True)
    explore_config = ""
    exp_name = f"O_{args.objective}-C_{explore_config}-T_{now.date()}-{now.time()}"
    outdir = f"{outdir}/{exp_name}"
    os.makedirs(outdir, exist_ok=True)
    logger = utils.init_logger(outdir)

    # Load the constraints
    cst = Constraint(f'cst/{args.cst}.json')

    # Set up the searching algorithm stop criteria. An epoch limit takes
    # precedence over a time limit; with neither given, max_time is 60.
    max_epochs = -1
    max_time = -1
    if args.stop_after_epochs > 0:
        max_epochs = args.stop_after_epochs
    elif args.stop_after_time > 0:
        max_time = args.stop_after_time
    else:
        max_time = 60

    # Set up the parallel executor
    # TODO

    # Register designs: one generated module per JSON descriptor.
    design_dir = args.designs
    os.makedirs(f"{design_dir}/register", exist_ok=True)
    designs = []
    for f in os.listdir(design_dir):
        if f.endswith(".json"):
            with open(f'{design_dir}/{f}', 'r') as json_f:
                desp = json.load(json_f)
            design = Design(f.split(".")[0])
            design.register(desp, f"{design_dir}/register/{design.name}.py")
            designs.append(design)
    if len(designs) == 0:
        raise RuntimeError("No design found")

    # Load task
    with open(f'task/{args.task}.json') as f:
        data = json.load(f)
    tasks = list(data["tasks"])

    # Start searching
    counter = utils.PerfCounter(logger)
    counter.init_counter("Total Search Time")
    all_records = []
    for task in tasks:
        search_record = utils.SearchRecord().reset()
        for design in designs:
            search_task = SearchTask(design, task)
            record = tuner.genetic_search(search_task, cst, search_obj, logger, max_epochs, max_time)
            all_records.append(record)
            search_record.update(record)
        task["search results"] = search_record

    counter.update_counter("Total Search Time")
    counter.print_counter("Total Search Time")

    print(all_records)

    # Display and dump the search history.
    # results.log holds every task with its search result. It used to dump
    # only the loop variable `task`, i.e. the last task (and raised NameError
    # when the task list was empty).
    with open(f"{outdir}/results.log", 'w') as f:
        f.write(pprint.pformat(tasks, indent=4))
    with open(f"{outdir}/history.log", 'w') as f:
        f.write(pprint.pformat(all_records, indent=4))

================================================
FILE: autosa_scripts/tuner/search_task.py
================================================
import json
import random
import numpy as np
import bisect
#from sympy import *

import utils

class SearchTask(object):
    """One search problem: a registered design evaluated on one task.

    ``design`` must provide ``params_config`` and the ``infer_params``,
    ``bound_check``, ``est_latency``, ``est_resource`` and ``random_sampling``
    methods; ``task`` is a dict with a ``"params"`` mapping.
    """

    def __init__(self, design, task):
        self.design = design
        self.task = task

    def adjust_params(self, params):
        """Adjust the parameters based on their constraints.

        Every tunable parameter is rounded up to an even number. A tunable
        parameter with a ``divisors`` entry is then replaced by the divisor
        of ``params[divisors[0]]`` closest to it (only power-of-two divisors
        when tagged ``power_of_two``). ``params`` is modified in place and
        returned.
        """
        def filter_non_power_of_two(x):
            if np.log2(x) != int(np.log2(x)):
                return True
            return False

        tunable = self.design.params_config["tunable"]

        # Making all factors to be even numbers to have more divisors
        for p in tunable:
            params[p] = int(np.ceil(params[p] / 2) * 2)

        # Making all divisor factors to be divisors of the dependent variable
        for p, param in tunable.items():
            if "divisors" not in param:
                continue
            if "tags" in param and "power_of_two" in param["tags"]:
                choices = utils.get_divisors(params[param["divisors"][0]], filter_non_power_of_two)
            else:
                choices = utils.get_divisors(params[param["divisors"][0]], None)
            # NOTE(review): bisect assumes get_divisors returns the divisors
            # in ascending order - confirm against utils.
            idx = bisect.bisect(choices, params[p])
            if idx >= len(choices):
                idx -= 1
            # choices[idx - 1] is the neighbour on the other side of the
            # value. The test used to be `idx > 1`, which never considered
            # choices[0] and so skipped the smallest divisor even when it
            # was the closest one.
            if idx > 0:
                if abs(choices[idx - 1] - params[p]) < abs(choices[idx] - params[p]):
                    idx -= 1
            params[p] = choices[idx]

        return params

    def generate_random_sample(self):
        """Generate a random sample in the design space.

        A copy of the task parameters is handed to the design's sampler.
        """
        task_params = {}
        for name in self.task["params"]:
            task_params[name] = self.task["params"][name]
        return self.design.random_sampling(task_params)

    def evaluate(self, params, metric="latency"):
        """Score ``params`` and return ``(reward, resource)``.

        For the ``"latency"`` metric the reward is ``1 / latency``. The result
        is ``(0, None)`` when parameter inference fails, the bound check
        fails, or the estimated latency is falsy.

        Raises:
            RuntimeError: for any metric other than ``"latency"``.
        """
        if metric != "latency":
            raise RuntimeError(f"Not supported metric: {metric}")

        params = self.design.infer_params(params)
        if not params:
            return 0, None
        if not self.design.bound_check(params):
            return 0, None
        latency = self.design.est_latency(params)
        resource = self.design.est_resource(params)
        if not latency:
            return 0, None
        return 1 / latency, resource

================================================
FILE: autosa_scripts/tuner/task/cnn.json
================================================
{
  "tasks": [
    {
      "name": "conv",
      "params": {
        "o": 6,
        "i": 1,
        "r": 5,
        "c": 5,
        "p": 3,
        "q": 3
      }      
    }
  ]
}


================================================
FILE: autosa_scripts/tuner/task/mm.json
================================================
{
  "tasks": [
    {
      "name": "gemm1",
      "params": {
        "p0": 1024,
        "p1": 1024,
        "p2": 1024
      }      
    }
  ]
}


================================================
FILE: autosa_scripts/tuner/task/mm2.json
================================================
{
  "tasks": [
    {
      "name": "gemm1",
      "params": {
        "p0": 1024,
        "p1": 1024,
        "p2": 1024
      }      
    },
    {
      "name": "gemm2",
      "params": {
        "p0": 512,
        "p1": 512,
        "p2": 512
      }      
    }
  ]
}


================================================
FILE: autosa_scripts/tuner/tuner.py
================================================
import numpy as np

import utils
import random

class Tuner(object):
    """Common state shared by the concrete tuners.

    Stores the search task, the hardware constraint object, the search
    objective, the logger and the stop limits, together with the best
    reward / parameters seen so far.
    """

    def __init__(self, task, cst, obj, logger, max_epoch, max_time):
        # Search inputs.
        self.task, self.cst, self.obj = task, cst, obj
        self.logger = logger
        # Stop limits.
        self.max_epoch, self.max_time = max_epoch, max_time
        # Best result found so far (a reward of 0 means "nothing valid yet").
        self.best_reward = 0
        self.best_task_params = None
        self.best_search_record = utils.SearchRecord().reset()

    def overuse_constraint(self, used_cst):
        """Return True when a design must be rejected on resource grounds.

        A missing/empty ``used_cst`` is treated as over-use so that the
        design is excluded; otherwise the 'BRAM18K' and 'DSP' entries are
        compared against ``self.cst.hw_cst``.
        """
        if not used_cst:
            return True

        budget = self.cst.hw_cst
        return any(used_cst[res] > budget[res] for res in ('BRAM18K', 'DSP'))

class GeneticTuner(Tuner):
    """Tuner that explores the design space with a genetic algorithm.

    An individual is a 1-D array with one entry per tunable parameter of
    ``task.design``. ``param_idx_map`` / ``idx_param_map`` (filled in by
    ``search``) translate between parameter names and array columns.
    """

    def __init__(self, task, cst, obj, logger, max_epoch, max_time, params):
        """Set up the tuner.

        The search stops on the number of evaluated samples when
        ``max_epoch`` is positive, and on elapsed time (compared against
        ``max_time``) otherwise.

        ``params`` is the dict of GA settings. Keys read by this class:
        "population_size", "parents_ratio", "mutation_probability",
        "epsilon" and, optionally, "ancestor".
        """
        super().__init__(task, cst, obj, logger, max_epoch, max_time)
        self.params = params
        self.epoch = 0
        if max_epoch > 0:
            self.stop_criteria = "epoch"
            self.max_epoch = max_epoch
        else:
            self.stop_criteria = "time"
            self.max_time = max_time
        self.counter = utils.PerfCounter(self.logger)
        self.search_time = None
        self.param_idx_map = {}
        self.idx_param_map = {}

    def _decode_individual(self, idv):
        """Turn one individual into a full {param name: value} dict.

        Tunable values come from ``idv``; external values come from the
        task description.
        """
        task_params = {}
        for param in self.task.design.params_config["tunable"].values():
            task_params[param["name"]] = idv[self.param_idx_map[param["name"]]]
        for param in self.task.design.params_config["external"].values():
            task_params[param["name"]] = self.task.task["params"][param["name"]]
        return task_params

    def _build_split_chains(self, task_params):
        """Build one split chain per external parameter.

        A chain follows the "split_by" links starting at an external
        parameter and has the form
        [{"params": [p0, p3, p7], "factors": [ceil(p0/p3), p3/p7, p7]}, ...]
        """
        tunable = self.task.design.params_config["tunable"]
        split_chains = []
        for param in self.task.design.params_config["external"].values():
            chain = {"params": [param["name"]], "factors": []}
            cur_param = param
            while "split_by" in cur_param:
                splitter = tunable[cur_param["split_by"]]
                # Round up only when the current parameter is listed among
                # the splitter's "divisors".
                use_ceil = "divisors" in splitter \
                    and cur_param["name"] in splitter["divisors"]
                chain["params"].append(cur_param["split_by"])
                factor = task_params[cur_param["name"]] / task_params[cur_param["split_by"]]
                if use_ceil:
                    factor = np.ceil(factor)
                chain["factors"].append(int(factor))
                cur_param = splitter
            chain["factors"].append(int(task_params[cur_param["name"]]))
            split_chains.append(chain)
        return split_chains

    def select_parents(self, population, fitness, num_parents):
        """ Select "num_parents" parents with the highest fitness score.
        """
        fitness_idx_sorted = np.argsort(-fitness)
        parents = population[fitness_idx_sorted[:num_parents]][:]
        return parents

    def crossover(self, pool, num_children):
        """ Perform crossover between pairs of individuals from "pool".

        Each tunable parameter that declares "divisors" is inherited
        together with its first divisor entry from the same, randomly
        chosen, parent.

        Raises:
            RuntimeError: if the tunable parameters cannot all be covered
                by such (parameter, divisor) pairs.
        """
        tunable = self.task.design.params_config["tunable"]
        children = np.empty((num_children, len(tunable)))
        # Build the parameter dependency chain
        param_deps = {}
        param_cnt = 0
        for param in tunable.values():
            if "divisors" in param:
                param_deps[param["name"]] = param["divisors"][0]
                param_cnt += 2
        if param_cnt != len(tunable):
            raise RuntimeError("Not all tuning parameters can be handled by crossover")

        for i in range(num_children):
            # First parent is picked round-robin, the second one at random.
            parents_idx = [i % pool.shape[0], np.random.randint(0, pool.shape[0])]
            for param in param_deps:
                donor = pool[parents_idx[np.random.randint(0, 2)]]
                own_col = self.param_idx_map[param]
                dep_col = self.param_idx_map[param_deps[param]]
                children[i][own_col] = donor[own_col]
                children[i][dep_col] = donor[dep_col]

        return children

    def mutation(self, pool):
        """ Perform mutation (in place) and return "pool".

        Each individual is mutated with probability
        params["mutation_probability"]. A mutated individual is replaced by
        a fresh random sample with probability params["epsilon"]; otherwise
        the factors of its split chains are perturbed.
        """
        # Cumulative thresholds of the three chain-mutation policies:
        # 0: grow one factor and shrink another, 1: no-op, 2: move a prime
        # factor from one position to another.
        mutation_policy_probs = np.cumsum([0.2, 0, 0.8])

        for p_idx in range(pool.shape[0]):
            if random.random() >= self.params["mutation_probability"]:
                continue

            if random.random() < self.params["epsilon"]:
                task_params = self.task.generate_random_sample()
                for i in range(pool.shape[1]):
                    pool[p_idx][i] = task_params[self.idx_param_map[i]]
                continue

            task_params = self._decode_individual(pool[p_idx][:])
            split_chains = self._build_split_chains(task_params)

            # Mutation
            for chain in split_chains:
                if len(chain["factors"]) <= 1:
                    continue
                src_idx, dst_idx = random.sample(range(0, len(chain["factors"])), 2)
                # A single draw selects the policy. Drawing again for every
                # branch would let the zero-weight policy fire.
                policy_draw = random.random()
                if policy_draw < mutation_policy_probs[0]:
                    if chain["factors"][dst_idx] == 1:
                        continue
                    inc_stride = max(1, int(chain["factors"][src_idx] * random.random() * 1.0))
                    dec_stride = max(1, int(chain["factors"][dst_idx] - chain["factors"][src_idx] * chain["factors"][dst_idx] / (chain["factors"][src_idx] + inc_stride)))
                    chain["factors"][src_idx] += inc_stride
                    chain["factors"][dst_idx] -= dec_stride
                    chain["factors"][dst_idx] = max(1, chain["factors"][dst_idx])
                elif policy_draw < mutation_policy_probs[1]:
                    pass
                else:
                    factor = chain["factors"][src_idx]
                    if factor == 1:
                        continue
                    divs = utils.factorization(factor)
                    div = random.choice(divs)
                    # "div" is a prime factor of "factor", so the integer
                    # division is exact.
                    chain["factors"][src_idx] //= div
                    chain["factors"][dst_idx] *= div

            # Revert to the params: walk each chain from its innermost
            # factor outwards, multiplying the factors back together.
            for chain in split_chains:
                factor = chain["factors"][-1]
                param = chain["params"][-1]
                if param in self.param_idx_map:
                    pool[p_idx][self.param_idx_map[param]] = factor
                for idx in range(len(chain["factors"]) - 2, -1, -1):
                    param = chain["params"][idx]
                    factor *= chain["factors"][idx]
                    if param in self.param_idx_map:
                        pool[p_idx][self.param_idx_map[param]] = factor

        return pool

    def search(self):
        """ Search the design space using genetic algorithms.

        The algorithm is configured by the following entries of self.params:
        @ population_size: the number of trial solutions in each generation.
        @ mutation_probability: the chance of each child to be mutated.
        @ epsilon: the chance of a mutated child to be replaced by a random
        sample instead of having its split factors perturbed.
        @ parents_ratio: the ratio of population filled by the members of the
        previous generation.

        The best reward, constraint usage and parameters are stored on the
        tuner; the elapsed time is stored in self.search_time.
        """
        self.counter.init_counter('Search Time')
        if self.stop_criteria == "time":
            self.counter.init_counter('time')

        tunable = self.task.design.params_config["tunable"]

        # Init the stats
        num_pop = int(self.params["population_size"])
        num_gen = int(self.max_epoch // num_pop)
        num_parents = int(num_pop * self.params["parents_ratio"])
        self.logger.info(f'Number of generations: {num_gen}')
        self.logger.info(f'Number of population: {num_pop}')
        self.logger.info(f'Number of parents: {num_parents}')

        # Init the population
        population = np.empty((num_pop, len(tunable)), dtype=int)
        if "ancestor" in self.params and self.params["ancestor"] is not None:
            # NOTE(review): seeding from an ancestor is not implemented; the
            # population keeps whatever np.empty() returned.
            pass
        else:
            # Initialize the population randomly
            for pop_cnt in range(num_pop):
                task_params = self.task.generate_random_sample()
                param_arr = [task_params[param["name"]] for param in tunable.values()]
                population[pop_cnt] = np.array(param_arr, dtype=int)
        for idx, param in enumerate(tunable.values()):
            self.param_idx_map[param["name"]] = idx
            self.idx_param_map[idx] = param["name"]

        fitness = np.empty(num_pop, dtype=float)
        for i in range(num_pop):
            task_params = self._decode_individual(population[i])
            reward, used_constraint = self.task.evaluate(task_params, self.obj)
            if self.overuse_constraint(used_constraint):
                reward = 0
            fitness[i] = reward

        while True:
            # Select the parents
            parents = self.select_parents(population, fitness, num_parents)
            # Crossover
            children = self.crossover(parents, num_pop - num_parents)
            # Mutation
            children = self.mutation(children)
            # Compose the new generation
            population[0:parents.shape[0], :] = parents
            population[parents.shape[0]:, :] = children
            # Update the fitness
            for i in range(num_pop):
                task_params = self._decode_individual(population[i])
                task_params = self.task.adjust_params(task_params)
                reward, used_constraint = self.task.evaluate(task_params, self.obj)
                if self.overuse_constraint(used_constraint):
                    reward = 0
                fitness[i] = reward
                # Update the record
                if reward > self.best_reward:
                    self.best_reward = reward
                    self.best_cst = used_constraint
                    self.best_task_params = task_params
                    self.logger.info(f'Epoch {self.epoch}: new best reward: {self.best_reward} ({1/self.best_reward:.0f})')
                    self.best_search_record = utils.SearchRecord().extract_from_tuner(self)

            self.epoch += num_pop
            # The counter lives on the instance; a bare "epoch" is undefined.
            if self.stop_criteria == "epoch" and self.epoch > self.max_epoch:
                break
            if self.stop_criteria == "time":
                self.counter.update_counter('time')
                if self.counter.get_counter('time') > self.max_time:
                    break

        self.counter.update_counter('Search Time')
        self.search_time = self.counter.get_counter('Search Time')
        return

def genetic_search(task, cst, obj, logger, max_epochs, max_time):
    """Run one genetic search with the default GA settings.

    Builds a GeneticTuner for the given task/constraint/objective, runs its
    search, and returns a fresh SearchRecord extracted from the tuner.
    """
    ga_settings = dict(
        population_size=200,
        mutation_probability=0.5,
        parents_ratio=0.3,
        epsilon=0.1,
        ancestor=None,
    )

    ga_tuner = GeneticTuner(task, cst, obj, logger, max_epochs, max_time, ga_settings)
    ga_tuner.search()
    return utils.SearchRecord().extract_from_tuner(ga_tuner)

================================================
FILE: autosa_scripts/tuner/unit_test.py
================================================
import argparse
from datetime import datetime
import logging
import numpy as np
import os
import pickle
import concurrent.futures
import json
import pprint

from design import Design
from constraint import Constraint
from search_task import SearchTask
import utils
import tuner

if __name__ == "__main__":
    # Manual smoke test: register every design description found in
    # "design_dir", then evaluate one hand-picked parameter assignment on the
    # first design for every task listed in task/mm.json.
    cst = Constraint('cst/hw_cst.json')
    # Only used by the (disabled) genetic search call at the end of the loop.
    max_epochs = -1
    max_time = 20
    search_obj = "latency"

    # Set up the working directory
    now = datetime.now()
    outdir = "outdir"
    os.makedirs(outdir, exist_ok=True)
    explore_config = ""
    exp_name = f"O_{search_obj}-C_{explore_config}-T_{now.date()}-{now.time()}"
    outdir = f"{outdir}/{exp_name}"
    os.makedirs(outdir, exist_ok=True)
    logger = utils.init_logger(outdir)

    # NOTE(review): machine-specific absolute path; adjust before running.
    design_dir = "/curr/jaywang/research/autosa/AutoSA/autosa.tmp/output/tuning"
    designs = []
    for f in os.listdir(design_dir):
        if f.endswith(".json"):
            with open(f'{design_dir}/{f}', 'r') as json_f:
                desp = json.load(json_f)
            design = Design(f.split(".")[0])
            design.register(desp, f"{design_dir}/register/{design.name}.py")
            designs.append(design)
    if len(designs) == 0:
        raise RuntimeError("No design found")

    # Load task
    with open('task/mm.json') as f:
        data = json.load(f)
    tasks = list(data["tasks"])

    # Start searching
    for task in tasks:
        search_record = utils.SearchRecord().reset()
        # Only the first registered design is exercised.
        for design in [designs[0]]:
            print(design.name)
            search_task = SearchTask(design, task)
            task_params = {
                "p0": 1024, "p1": 1024, "p2": 1024,
                "p3": 342, "p4": 56, "p5": 148,
                "p6": 19, "p7": 2, "p8": 8
            }
            reward, resource = search_task.evaluate(task_params)
            # The tuner code treats a reward of 0 as "no valid design", so
            # guard the inversion instead of dying with ZeroDivisionError.
            if reward:
                print(1/reward)
            else:
                print("Invalid design point (reward is 0)")
            print(resource)
            #search_record.update(tuner.genetic_search(search_task, cst, search_obj, logger, max_epochs, max_time))
        #task["search results"] = search_record

    #for task in tasks:
    #    logger.info(pprint.pformat(task, indent=4))

================================================
FILE: autosa_scripts/tuner/utils.py
================================================
import time
import functools
import math
import logging
import itertools
from datetime import datetime
from subprocess import Popen, PIPE
import json
import pprint
import concurrent.futures
import queue

def factorization(x):
    """Return the prime factors of x in non-decreasing order.

    Factors are repeated according to their multiplicity, e.g. 12 yields
    [2, 2, 3] and 1 yields []. Only integer arithmetic is used, so the result
    stays exact for integers of any size.

    Raises:
        RuntimeError: if x is 0.
        ValueError: if x is negative.
    """
    if x == 0:
        raise RuntimeError("Factorization of 0")
    if x < 0:
        raise ValueError("Factorization of a negative number")

    prime_factors = []
    while x % 2 == 0:
        prime_factors.append(2)
        # Floor division keeps x an exact integer; true division would turn
        # it into a float and lose precision above 2**53.
        x //= 2

    # Trial division by odd candidates. The bound follows the shrinking x, so
    # the loop ends as soon as the remaining cofactor is 1 or a prime.
    i = 3
    while i * i <= x:
        while x % i == 0:
            prime_factors.append(i)
            x //= i
        i += 2

    # Whatever is left above 2 is a single prime factor.
    if x > 2:
        prime_factors.append(int(x))

    return prime_factors

def get_divisors(x, filter=None):
    """ Return the divisors of the integer x in ascending order.

    When "filter" is given, it is called on every divisor and the divisors
    for which it returns a true value are dropped.
    """
    def keep(candidate):
        # Without a filter everything is kept; otherwise keep what the
        # filter does not flag.
        return True if not filter else not filter(candidate)

    small, large = [], []
    for cand in range(1, int(math.sqrt(x) + 1)):
        if x % cand:
            continue
        if keep(cand):
            small.append(int(cand))
        # The matching divisor above sqrt(x); skipped for perfect squares so
        # the root is not listed twice.
        if cand * cand != x:
            partner = int(x / cand)
            if keep(partner):
                large.append(partner)

    # "large" was collected in descending order.
    large.reverse()
    return small + large

class PerfCounter(object):
    """Set of named stopwatches based on time.perf_counter().

    Every counter stores the time of its last (re)start and the elapsed time
    accumulated so far; reports go to the supplied logger.
    """

    def __init__(self, logger):
        self.logger = logger
        self.counters = {}

    def _record(self, name):
        """Return the record of counter "name" or raise if it is unknown."""
        if name not in self.counters:
            raise RuntimeError(f"Counter {name} is not defined")
        return self.counters[name]

    def init_counter(self, name):
        """Create (or reset) counter "name" and start it now."""
        self.counters[name] = dict(start=time.perf_counter(), elapsed=0)

    def update_counter(self, name):
        """Add the time since the last start/update to counter "name"."""
        record = self._record(name)
        now = time.perf_counter()
        record['elapsed'] += now - record['start']
        record['start'] = now

    def get_counter(self, name):
        """Return the elapsed time accumulated by counter "name"."""
        return self._record(name)['elapsed']

    def print_counter(self, name):
        """Log the elapsed time of counter "name"."""
        elapsed = self._record(name)['elapsed']
        self.logger.info(f'[Event: {name}] Total elapsed time: {elapsed:.4f} s')

    def print_counters(self):
        """Log the elapsed time of every counter."""
        for name, record in self.counters.items():
            elapsed = record['elapsed']
            self.logger.info(f'[Event: {name}] Total elapsed time: {elapsed:.4f} s')

def init_logger(outdir):
    """Return the 'AutoSA-Tuner' logger writing to the console and a file.

    Handlers left on the logger by an earlier call are closed and removed
    first. Messages of level INFO and above go to a stream handler and are
    appended to "<outdir>/tuning.log".
    """
    logger = logging.getLogger('AutoSA-Tuner')
    # Drop stale handlers so repeated calls do not duplicate the output.
    for stale in list(logger.handlers):
        stale.close()
        logger.removeHandler(stale)

    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '[%(name)s %(asctime)s] %(levelname)s: %(message)s',
        '%Y-%m-%d %H:%M:%S')

    # Console handler first, then the log file (opened in append mode).
    handlers = (
        logging.StreamHandler(),
        logging.FileHandler(f'{outdir}/tuning.log', 'a'),
    )
    for handler in handlers:
        handler.setLevel(level=logging.INFO)
        handler.setFormatter(formatter)
    for handler in handlers:
        logger.addHandler(handler)

    return logger

class SearchRecord(object):
    """Best-so-far result of a tuning search.

    ``max`` selects the comparison direction used by ``update``: when it is
    1 a larger reward wins (initial reward 0); for any other value a smaller
    reward wins (initial reward ``inf``).
    """

    # Fields copied from a better record by ``update``, in copy order.
    _COPIED_FIELDS = ("cst", "reward", "latency", "dsp_eff", "design",
                      "ops", "task_params", "task_name")

    def __init__(self, max=1):
        self.cst = None
        self.max = max
        self.reward = 0 if self.max == 1 else float("inf")
        self.latency = 0
        self.dsp_eff = 0
        self.design = -1
        self.ops = 0
        self.task_params = {}
        self.task_name = None
        self.metric = None
        self.tuning_params = {}

    def reset(self):
        """Restore the initial values and return ``self``.

        ``max`` and ``tuning_params`` are left untouched.
        """
        self.cst = None
        self.reward = 0 if self.max == 1 else float("inf")
        self.latency = 0
        self.dsp_eff = 0
        self.design = -1
        self.ops = 0
        self.task_params = {}
        self.task_name = None
        self.metric = None

        return self

    def update(self, new_record):
        """Adopt ``new_record``'s result if its reward is strictly better.

        ``metric`` and ``tuning_params`` are not copied.

        Raises:
            RuntimeError: if the two records disagree on ``max``.
        """
        if self.max != new_record.max:
            raise RuntimeError("Inconsistent search record configuration")

        if self.max == 1:
            is_better = new_record.reward > self.reward
        else:
            is_better = new_record.reward < self.reward
        if not is_better:
            return

        for field in self._COPIED_FIELDS:
            setattr(self, field, getattr(new_record, field))

    def extract_from_tuner(self, tuner):
        """Fill this record from a tuner's best result and return ``self``.

        Nothing is changed when ``tuner.best_task_params`` is empty/falsy.
        For the "latency" objective, latency is the reciprocal of the reward.

        Raises:
            RuntimeError: if ``tuner.obj`` is not "latency" (``cst`` and
                ``reward`` have already been overwritten at that point).
        """
        if not tuner.best_task_params:
            return self

        self.cst = tuner.best_cst
        self.reward = tuner.best_reward
        if tuner.obj != "latency":
            raise RuntimeError("Unsupported search objective")
        self.latency = 1 / self.reward
        self.design = tuner.task.design.name
        self.task_params = tuner.best_task_params
        self.task_name = tuner.task.task["name"]

        return self

    def __repr__(self):
        """Multi-line summary: reward, cst, latency, design, task name and params."""
        pieces = [
            f"\nreward: {self.reward}",
            f"\ncst: {pprint.pformat(self.cst, indent=4)}",
            f"\nlatency: {self.latency}",
            f"\ndesign: {self.design}",
            f"\ntask_name: {self.task_name}",
            f"\ntask_params: \n{pprint.pformat(self.task_params, indent=4)}",
            "\n",
        ]
        return "".join(pieces)

================================================
FILE: autosa_scripts/tuning_scripts/cnn.sh
================================================
#!/bin/bash
#
# Tuning sweep for the CNN test kernel: runs ./autosa with --tuning-method=1
# for a selection of space_time candidates under several loop-permute orders.
# All paths are relative to the directory two levels above the one this
# script is launched from.

# Stop here if the directory change fails; otherwise every ./autosa call
# below would be resolved against the wrong directory.
cd ../../ || exit 1
# <[i,r,c],o>
# <[o,r,c],i>
# <[o,i],[r,c]>

# Value passed to --select-rar-dep by the enabled runs.
RAR_DEP="{kernel[]->__pet_ref_3[1]}"

# run_cnn SPACE_TIME [FLAG...]
# Invokes AutoSA on the CNN kernel with the options shared by every run,
# selecting space_time[SPACE_TIME]; FLAGs are appended verbatim.
run_cnn() {
    local space_time=$1
    shift
    ./autosa ./autosa_tests/cnn/kernel.c \
        --config=./autosa_config/autosa_config.json \
        --target=autosa_hls_c \
        --output-dir=./autosa.tmp/output \
        --sa-sizes="{kernel[]->space_time[${space_time}]}" \
        --simd-info=./autosa_tests/cnn/simd_info.json \
        --host-serialize \
        --hls \
        --tuning-method=1 \
        --param-names=./autosa_tests/cnn/param_names.json \
        "$@"
}

# Commented-out calls are the space_time candidates currently excluded
# from the sweep.
for loop_order in 1
do
    #run_cnn 0 --explore-loop-permute --loop-permute-order="$loop_order"
    #run_cnn 1 --explore-loop-permute --loop-permute-order="$loop_order"
    run_cnn 2 --select-rar-dep="$RAR_DEP" --explore-loop-permute --loop-permute-order="$loop_order"
    #run_cnn 3 --local-reduce --reduce-op="+" --simd-touch-space --explore-loop-permute --loop-permute-order="$loop_order"
    #run_cnn 4 --explore-loop-permute --loop-permute-order="$loop_order"
    run_cnn 5 --select-rar-dep="$RAR_DEP" --explore-loop-permute --loop-permute-order="$loop_order"
    #run_cnn 6 --local-reduce --reduce-op="+" --simd-touch-space --explore-loop-permute --loop-permute-order="$loop_order"
    #run_cnn 7 --explore-loop-permute --loop-permute-order="$loop_order"
    #run_cnn 8 --local-reduce --reduce-op="+" --simd-touch-space --explore-loop-permute --loop-permute-order="$loop_order"
    run_cnn 9 --select-rar-dep="$RAR_DEP" --local-reduce --reduce-op="+" --simd-touch-space --explore-loop-permute --loop-permute-order="$loop_order"
done

for loop_order in 0 2
do
    #run_cnn 0 --explore-loop-permute --loop-permute-order="$loop_order"
    #run_cnn 1 --explore-loop-permute --loop-permute-order="$loop_order"
    run_cnn 2 --select-rar-dep="$RAR_DEP" --explore-loop-permute --loop-permute-order="$loop_order"
    #run_cnn 3 --simd-touch-space --explore-loop-permute --loop-permute-order="$loop_order"
    #run_cnn 4 --explore-loop-permute --loop-permute-order="$loop_order"
    run_cnn 5 --select-rar-dep="$RAR_DEP" --explore-loop-permute --loop-permute-order="$loop_order"
    #run_cnn 6 --simd-touch-space --explore-loop-permute --loop-permute-order="$loop_order"
    #run_cnn 7 --explore-loop-permute --loop-permute-order="$loop_order"
    #run_cnn 8 --simd-touch-space --explore-loop-permute --loop-permute-order="$loop_order"
    run_cnn 9 --simd-touch-space --explore-loop-permute --select-rar-dep="$RAR_DEP" --loop-permute-order="$loop_order"
done
cd -


================================================
FILE: autosa_scripts/tuning_scripts/gemm.sh
================================================
#!/bin/bash
#
# Tuning sweep for the matrix-multiplication test kernel: runs ./autosa with
# --tuning-method=1 for space_time candidates 0-5 under three loop-permute
# orders. All paths are relative to the directory two levels above the one
# this script is launched from.

# Stop here if the directory change fails; otherwise every ./autosa call
# below would be resolved against the wrong directory.
cd ../../ || exit 1

# run_mm SPACE_TIME [FLAG...]
# Invokes AutoSA on the mm kernel with the options shared by every run,
# selecting space_time[SPACE_TIME]; FLAGs are appended verbatim.
run_mm() {
    local space_time=$1
    shift
    ./autosa ./autosa_tests/mm/kernel.c \
        --config=./autosa_config/autosa_config.json \
        --target=autosa_hls_c \
        --output-dir=./autosa.tmp/output \
        --sa-sizes="{kernel[]->space_time[${space_time}]}" \
        --simd-info=./autosa_tests/mm/simd_info.json \
        --host-serialize \
        --hls \
        --tuning-method=1 \
        --param-names=./autosa_tests/mm/param_names.json \
        "$@"
}

# <[i,j],k>
run_mm 0 --explore-loop-permute --loop-permute-order=2
run_mm 1 --explore-loop-permute --loop-permute-order=2
run_mm 2 --local-reduce --reduce-op="+" --simd-touch-space --explore-loop-permute --loop-permute-order=2
run_mm 3 --explore-loop-permute --loop-permute-order=2
run_mm 4 --local-reduce --reduce-op="+" --simd-touch-space --explore-loop-permute --loop-permute-order=2
run_mm 5 --local-reduce --reduce-op="+" --simd-touch-space --explore-loop-permute --loop-permute-order=2

# <[i,k],j>
run_mm 0 --explore-loop-permute --loop-permute-order=0
run_mm 1 --explore-loop-permute --loop-permute-order=0
run_mm 2 --simd-touch-space --explore-loop-permute --loop-permute-order=0
run_mm 3 --explore-loop-permute --loop-permute-order=0
run_mm 4 --simd-touch-space --explore-loop-permute --loop-permute-order=0
run_mm 5 --simd-touch-space --explore-loop-permute --loop-permute-order=0

# <[k,j],i>
# The commented-out calls are the local-reduce variants, currently replaced
# by the plain --simd-touch-space runs that follow each of them.
run_mm 0 --explore-loop-permute --loop-permute-order=1
run_mm 1 --explore-loop-permute --loop-permute-order=1
#run_mm 2 --local-reduce --reduce-op="+" --simd-touch-space --explore-loop-permute --loop-permute-order=1
run_mm 2 --simd-touch-space --explore-loop-permute --loop-permute-order=1
run_mm 3 --explore-loop-permute --loop-permute-order=1
#run_mm 4 --local-reduce --reduce-op="+" --simd-touch-space --explore-loop-permute --loop-permute-order=1
run_mm 4 --simd-touch-space --explore-loop-permute --loop-permute-order=1
#run_mm 5 --local-reduce --reduce-op="+" --simd-touch-space --explore-loop-permute --loop-permute-order=1
run_mm 5 --simd-touch-space --explore-loop-permute --loop-permute-order=1
cd -

================================================
FILE: autosa_scripts/tuning_scripts/model_validate.sh
================================================
# Generates the mm designs used for model validation: six dataflows
# (space_time[0..5]) with fixed tiling factors, each built under three loop
# permutations. Paths are relative to the current directory.

# run_mm SA_SIZES [FLAG...]
# Invokes AutoSA on the mm kernel with the options shared by every run;
# SA_SIZES is the value of --sa-sizes and FLAGs are appended verbatim.
run_mm() {
    sa_sizes=$1
    shift
    ./autosa ./autosa_tests/mm/kernel.c \
        --config=./autosa_config/autosa_config.json \
        --target=autosa_hls_c \
        --output-dir=./autosa.tmp/output \
        --sa-sizes="$sa_sizes" \
        --simd-info=./autosa_tests/mm/simd_info.json \
        --host-serialize \
        --hls \
        "$@"
}

# run_dataflows [PERMUTE_FLAG...]
# Builds all six dataflows; PERMUTE_FLAGs go last on every command line.
run_dataflows() {
    # Dataflow [i]
    run_mm "{kernel[]->space_time[0];kernel[]->array_part[32,32,32];kernel[]->latency[8,8];kernel[]->simd[2]}" \
        "$@"

    # Dataflow [j]
    run_mm "{kernel[]->space_time[1];kernel[]->array_part[32,32,32];kernel[]->latency[8,8];kernel[]->simd[2]}" \
        "$@"

    # Dataflow [k]
    run_mm "{kernel[]->space_time[2];kernel[]->array_part[4,32,32];kernel[]->latency[8,8];kernel[]->simd[2]}" \
        --local-reduce --reduce-op="+" --simd-touch-space --array-contraction \
        "$@"

    # Dataflow [i,j]
    run_mm "{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" \
        "$@"

    # Dataflow [i,k]
    run_mm "{kernel[]->space_time[4];kernel[]->array_part[32,4,32];kernel[]->latency[16,16];kernel[]->simd[2]}" \
        --local-reduce --reduce-op="+" --simd-touch-space --array-contraction \
        "$@"

    # Dataflow [j,k]
    run_mm "{kernel[]->space_time[5];kernel[]->array_part[32,4,32];kernel[]->latency[16,16];kernel[]->simd[2]}" \
        --local-reduce --reduce-op="+" --simd-touch-space --array-contraction \
        "$@"
}

# Permutation <[i,j],k>
run_dataflows

#####################################################

# Permutation <[i,k],j>
run_dataflows --explore-loop-permute --loop-permute-order=0

#####################################################

# Permutation <[k,j],i>
run_dataflows --explore-loop-permute --loop-permute-order=1

================================================
FILE: autosa_scripts/vitis_scripts/Makefile
================================================
# Build flow for an AutoSA-generated Vitis design: compiles the kernel source
# into a .xo with v++, links it into a .xclbin, and builds the host program
# with g++. Requires XILINX_VITIS and XILINX_XRT to be set in the environment.
VPP := $(XILINX_VITIS)/bin/v++
EMCONFIGUTIL := $(XILINX_VITIS)/bin/emconfigutil
# Passed to v++ as the -t target and exported as XCL_EMULATION_MODE by `check`
MODE := hw
PLATFORM := xilinx_u250_xdma_201830_2

# sources
KERNEL_SRC := src/kernel_kernel.cpp
HOST_SRC := src/kernel_host.cpp

# targets
HOST_EXE := host.exe

XOS := kernel0.$(MODE).xo
XCLBIN := kernel0.$(MODE).xclbin
EMCONFIG_FILE := emconfig.json

# Linker options to map kernel ports to DDR banks
VPP_LINK_OPTS := --config connectivity.cfg

VPP_COMMON_OPTS := -s -t $(MODE) --platform $(PLATFORM) -R2 -O3 --kernel_frequency 250 --vivado.prop=run.impl_1.STRATEGY=Performance_EarlyBlockPlacement
CFLAGS := -g -std=c++11 -I$(XILINX_XRT)/include
LFLAGS := -L$(XILINX_XRT)/lib -lxilinxopencl -lpthread -lrt
NUMDEVICES := 1

# run time args
EXE_OPT := kernel0.$(MODE).xclbin

# primary build targets
# clean and check produce no file of their own name, so they are phony too;
# a stray file called `clean` or `check` must not be mistaken for the target.
.PHONY: xclbin app all clean check

xclbin:  $(XCLBIN)
app: $(HOST_EXE)

all: xclbin app

# The leading '-' lets make continue even if the removal reports an error.
clean:
	-$(RM) $(EMCONFIG_FILE) $(HOST_EXE) $(XCLBIN) *.xclbin *.xo $(XOS)

# kernel rules
# Compile step: the old object is removed first, then kernel0 is compiled.
$(XOS): $(KERNEL_SRC)
	$(RM) $@
	$(VPP) $(VPP_COMMON_OPTS) -c -k kernel0 -o $@ $+


# Link step: applies the DDR bank mapping from connectivity.cfg.
$(XCLBIN): $(XOS)
	$(VPP) $(VPP_COMMON_OPTS) -l -o $@ $+ $(VPP_LINK_OPTS)

# host rules
$(HOST_EXE): $(HOST_SRC)
	g++ $(CFLAGS) -o $@ $+ $(LFLAGS)
	@echo 'Compiled Host Executable: $(HOST_EXE)'

$(EMCONFIG_FILE):
	$(EMCONFIGUTIL) --nd $(NUMDEVICES) --od . --platform $(PLATFORM)

# Runs the host executable against the built xclbin.
check: $(XCLBIN) $(HOST_EXE) $(EMCONFIG_FILE)
	XCL_EMULATION_MODE=${MODE} ./$(HOST_EXE) $(EXE_OPT)


================================================
FILE: autosa_scripts/vitis_scripts/connectivity.cfg
================================================
# v++ link-time configuration (passed via --config): assigns the A, B and C
# arguments of kernel instance kernel0_1 to DDR[0], DDR[1] and DDR[2].
[connectivity]
sp=kernel0_1.A:DDR[0]
sp=kernel0_1.B:DDR[1] 
sp=kernel0_1.C:DDR[2]


================================================
FILE: autosa_tests/cnn/Makefile
================================================
# Build flow for the AutoSA-generated CNN design: compiles the kernel source
# into a .xo with v++, links it into a .xclbin, and builds the host program
# with g++. Requires XILINX_VITIS and XILINX_XRT to be set in the environment.
VPP := $(XILINX_VITIS)/bin/v++
EMCONFIGUTIL := $(XILINX_VITIS)/bin/emconfigutil
# Passed to v++ as the -t target and exported as XCL_EMULATION_MODE by `check`
MODE := hw
# Alternative platform, kept for reference:
#PLATFORM := xilinx_u200_qdma_201920_1
PLATFORM := xilinx_u250_xdma_201830_2

# sources
KERNEL_SRC := src/kernel_kernel.cpp
HOST_SRC := src/kernel_host.cpp

# targets
HOST_EXE := host.exe

XOS := kernel0.$(MODE).xo
XCLBIN := kernel0.$(MODE).xclbin
EMCONFIG_FILE := emconfig.json

# Linker options to map kernel ports to DDR banks
VPP_LINK_OPTS := --config connectivity.cfg

VPP_COMMON_OPTS := -s -t $(MODE) --platform $(PLATFORM) -R2 -O3 --kernel_frequency 250 --vivado.prop=run.impl_1.STRATEGY=Performance_EarlyBlockPlacement
CFLAGS := -g -std=c++11 -I$(XILINX_XRT)/include
LFLAGS := -L$(XILINX_XRT)/lib -lxilinxopencl -lpthread -lrt
NUMDEVICES := 1

# run time args
EXE_OPT := kernel0.$(MODE).xclbin

# primary build targets
# clean and check produce no file of their own name, so they are phony too;
# a stray file called `clean` or `check` must not be mistaken for the target.
.PHONY: xclbin app all clean check

xclbin:  $(XCLBIN)
app: $(HOST_EXE)

all: xclbin app

# The leading '-' lets make continue even if the removal reports an error.
clean:
	-$(RM) $(EMCONFIG_FILE) $(HOST_EXE) $(XCLBIN) *.xclbin *.xo $(XOS)

# kernel rules
# Compile step: the old object is removed first, then kernel0 is compiled.
$(XOS): $(KERNEL_SRC)
	$(RM) $@
	$(VPP) $(VPP_COMMON_OPTS) -c -k kernel0 -o $@ $+


# Link step: applies the DDR bank mapping from connectivity.cfg.
$(XCLBIN): $(XOS)
	$(VPP) $(VPP_COMMON_OPTS) -l -o $@ $+ $(VPP_LINK_OPTS)

# host rules
$(HOST_EXE): $(HOST_SRC)
	g++ $(CFLAGS) -o $@ $+ $(LFLAGS)
	@echo 'Compiled Host Executable: $(HOST_EXE)'

$(EMCONFIG_FILE):
	$(EMCONFIGUTIL) --nd $(NUMDEVICES) --od . --platform $(PLATFORM)

# Runs the host executable against the built xclbin.
check: $(XCLBIN) $(HOST_EXE) $(EMCONFIG_FILE)
	XCL_EMULATION_MODE=${MODE} ./$(HOST_EXE) $(EXE_OPT)


================================================
FILE: autosa_tests/cnn/README.md
================================================
# Convolutional Neural Network (Single Layer, Small)

Board        | Software Version
-------------|-----------------
Xilinx Alveo U250 | Xilinx Vitis 2019.2

__Files__:
```
autosa_tests/cnn/kernel.c
autosa_tests/cnn/kernel.h
autosa_tests/cnn/simd_info.json
autosa_tests/cnn/Makefile
autosa_tests/cnn/connectivity.cfg
```

__Command__:
```bash
./autosa ./autosa_tests/cnn/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[4];kernel[]->array_part[8,8,4,8];kernel[]->latency[4,2,4];kernel[]->simd[1,1,1,2]}" --simd-info=./autosa_tests/cnn/simd_info.json --host-serialize
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` and `connectivity.cfg` to the directory `autosa.tmp/output`.

```
cp autosa_tests/cnn/Makefile autosa.tmp/output/
cp autosa_tests/cnn/connectivity.cfg autosa.tmp/output/
```

Execute the makefile to build the design.

```
cd autosa.tmp/output
make all
```


================================================
FILE: autosa_tests/cnn/connectivity.cfg
================================================
# v++ link-time configuration (passed via --config): assigns the cin, w and
# cout arguments of kernel instance kernel0_1 to DDR[0], DDR[1] and DDR[2].
[connectivity]
sp=kernel0_1.cin:DDR[0]
sp=kernel0_1.w:DDR[1] 
sp=kernel0_1.cout:DDR[2]


================================================
FILE: autosa_tests/cnn/hls_script.tcl
================================================
############################################################
## This file is generated automatically by Vivado HLS.
## Please DO NOT edit it.
## Copyright (C) 1986-2019 Xilinx, Inc. All Rights Reserved.
############################################################
# Vivado HLS batch script for the CNN design: sets up project `hls_prj` with
# `kernel0` as the top function and runs C simulation only. The later flow
# stages are left commented out and can be enabled as needed.
open_project hls_prj
set_top kernel0
# Design sources; kernel_host.cpp is added with -tb, i.e. as the test bench.
add_files src/kernel_kernel.h
add_files src/kernel_kernel.cpp
add_files -tb src/kernel_host.cpp
open_solution "solution1"
# NOTE(review): this part is a U200 device, while the Makefile and README in
# this directory target the U250 — confirm this is intended.
set_part {xcu200-fsgd2104-2-e}
# Target clock period of 5 (presumably ns, i.e. 200 MHz — confirm).
create_clock -period 5 -name default
config_compile -name_max_length 50
#source "./prj/solution1/directives.tcl"
# C simulation
csim_design
# C synthesis, co-simulation variants and IP export (disabled)
#csynth_design
#cosim_design
#cosim_design -trace_level all
#cosim_design -setup -trace_level all
#export_design -format ip_catalog
exit


================================================
FILE: autosa_tests/cnn/kernel.c
================================================
#include "kernel.h"

/*
 * Self-checking host program for a single convolution layer.
 *
 * The loop nest between "#pragma scop" and "#pragma endscop" is the kernel
 * region. The same computation is repeated into cout_golden afterwards and
 * the two results are compared element by element.
 *
 * Returns 0 when every output matches within the threshold, -1 otherwise.
 */
int main(int argc, char **argv){
  data_t cin[R + K - 1][C + K - 1][I];
  data_t w[O][K][K][I];
  data_t cout[R][C][O];
  data_t cout_golden[R][C][O];

  // data initialization
  for (int i = 0 ; i < I; i++)
    for (int r = 0; r < R + K - 1; r++)
      for (int c = 0; c < C + K - 1; c++) {
        cin[r][c][i] = i;
      }

  for (int o = 0; o < O; o++)
    for (int i = 0; i < I; i++)
      for (int p = 0; p < K; p++)
        for (int q = 0; q < K; q++) {
          w[o][p][q][i] = o;
        }

  // The kernel region below only accumulates into cout, so cout has to start
  // from zero. Without this the first read of cout[r][c][o] would be of an
  // uninitialized object.
  for (int r = 0; r < R; r++)
    for (int c = 0; c < C; c++)
      for (int o = 0; o < O; o++) {
        cout[r][c][o] = 0;
      }

#pragma scop
  for (int o = 0; o < O; o++)
    for (int r = 0; r < R; r++)
      for (int c = 0; c < C; c++) {
        // cout is zeroed before the scop; only the accumulation is part of
        // the kernel region.
        for (int i = 0; i < I; i++)
          for (int p = 0; p < K; p++)
            for (int q = 0; q < K; q++) {
              cout[r][c][o] = cout[r][c][o] + cin[r + p][c + q][i] * w[o][p][q][i];
            }
      }
#pragma endscop

  // Reference result computed with the same loop nest.
  for (int o = 0; o < O; o++)
    for (int r = 0; r < R; r++)
      for (int c = 0; c < C; c++) {
        cout_golden[r][c][o] = 0;
        for (int i = 0; i < I; i++)
          for (int p = 0; p < K; p++)
            for (int q = 0; q < K; q++) {
              cout_golden[r][c][o] = cout_golden[r][c][o] + cin[r + p][c + q][i] * w[o][p][q][i];
            }
      }

  // Count the outputs that differ from the reference by more than thres.
  int err = 0;
  float thres = 0.001;
  for (int o = 0; o < O; o++)
    for (int r = 0; r < R; r++)
      for (int c = 0; c < C; c++) {
        if (fabs((float)cout_golden[r][c][o] - (float)cout[r][c][o]) > thres) {
          err++;
        }
      }

  // Report the verdict; previously err was computed and then discarded.
  if (err) {
    printf("Test failed with %d errors!\n", err);
    return -1;
  } else {
    printf("Test passed!\n");
    return 0;
  }
}


================================================
FILE: autosa_tests/cnn/kernel.h
================================================
/*
 * Configuration shared by the CNN test program in kernel.c: the element type
 * and the extents of the loop nest / arrays.
 */
#ifndef AUTOSA_TESTS_CNN_KERNEL_H
#define AUTOSA_TESTS_CNN_KERNEL_H

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Element type of the cin, w and cout arrays. */
typedef float data_t;

/*
 * Array extents used by kernel.c:
 *   cin[R + K - 1][C + K - 1][I], w[O][K][K][I], cout[R][C][O]
 * (presumably O/I = output/input channels, R/C = output rows/columns and
 * K = kernel window size -- the names are not spelled out in the sources).
 */
#define O 16
#define I 16
#define R 16
#define C 16
#define K 3

/* Alternative, smaller configuration. */
//#define O 6
//#define I 1
//#define R 5
//#define C 5
//#define K 3

#endif /* AUTOSA_TESTS_CNN_KERNEL_H */


================================================
FILE: autosa_tests/cnn/param_names.json
================================================
{
  "kernel0": ["q", "p", "o", "r", "c", "i"],
  "kernel1": ["q", "p", "o", "r", "c", "i"],
  "kernel2": ["q", "p", "o", "r", "c", "i"],
  "kernel3": ["q", "p", "o", "r", "c", "i"],
  "kernel4": ["q", "p", "o", "r", "c", "i"],
  "kernel5": ["q", "p", "o", "r", "c", "i"],
  "kernel6": ["q", "p", "o", "r", "c", "i"],
  "kernel7": ["q", "p", "o", "r", "c", "i"],
  "kernel8": ["q", "p", "o", "r", "c", "i"],
  "kernel9": ["q", "p", "o", "r", "c", "i"]
}


================================================
FILE: autosa_tests/cnn/simd_info.json
================================================
{
  "kernel0": {
    "reduction": ["y", "y", "y"]
  },
  "kernel1": {
    "reduction": ["y", "y", "y"]
  },
  "kernel2": {
    "reduction": ["y", "y", "y"]
  },
  "kernel3": {
    "reduction": ["y", "y", "y"]
  },
  "kernel4": {
    "reduction": ["y", "y", "y"]
  },
  "kernel5": {
    "reduction": ["y", "y", "y"]
  },
  "kernel6": {
    "reduction": ["y", "y", "y"]
  },
  "kernel7": {
    "reduction": ["y", "y", "y"]
  },
  "kernel8": {
    "reduction": ["y", "y", "y"]
  },
  "kernel9": {
    "reduction": ["y", "y", "y"]
  }
}


================================================
FILE: autosa_tests/dnn_ops/dc_simd_info.json
================================================
{
  "kernel4": {
    "reduction": ["y", "y"]
  }
}


================================================
FILE: autosa_tests/dnn_ops/fc_simd_info.json
================================================
{
  "kernel2": {
    "reduction": ["y"]
  }
}


================================================
FILE: autosa_tests/dnn_ops/hls_script.tcl
================================================
############################################################
## This file is generated automatically by Vivado HLS.
## Please DO NOT edit it.
## Copyright (C) 1986-2019 Xilinx, Inc. All Rights Reserved.
############################################################
# Sets up project "hls_prj" with top function kernel0 and runs C simulation
# only; the synthesis, co-simulation and export commands at the bottom are
# left commented out.
open_project hls_prj
set_top kernel0
# Design sources; kernel_host.cpp is registered as the testbench (-tb).
add_files src/kernel_kernel.h
add_files src/kernel_kernel.cpp
add_files -tb src/kernel_host.cpp
open_solution "solution1"
set_part {xcu200-fsgd2104-2-e}
# Clock period of 5 (presumably ns, i.e. 200 MHz -- TODO confirm).
create_clock -period 5 -name default
config_compile -name_max_length 50
#source "./prj/solution1/directives.tcl"
csim_design
#csynth_design
#cosim_design
#cosim_design -trace_level all
#cosim_design -setup -trace_level all
#export_design -format ip_catalog
exit


================================================
FILE: autosa_tests/dnn_ops/kernel.c
================================================
// In this example, we compile three different operators that are found often in 
// DNNs, including: point-wise conv, depth-wise conv, and FC.

#include "kernel.h"

/*
 * Self-checking host program for one DNN operator.
 *
 * kernel.h selects the operator by defining PC, DC or FC. Each section
 * declares its arrays, fills the inputs, runs the kernel region (between
 * "#pragma scop" and "#pragma endscop"), recomputes the same result into a
 * *_golden array and compares the two.
 *
 * Each section declares `err` and `thres` directly in main's scope and ends
 * with a return, so the sections are not written to be enabled together.
 *
 * Returns 0 when all outputs match within the threshold, -1 otherwise.
 */
int main(int argc, char **argv){
#ifdef PC	
  // Point-wise CONV
  // NOTE(review): the loops below run a PC_K x PC_K window over all PC_I
  // input channels, and kernel.h sets PC_K to 3 -- confirm this is the
  // intended "point-wise" configuration.
  data_t pc_cin[PC_R + PC_K - 1][PC_C + PC_K - 1][PC_I];
  data_t pc_w[PC_O][PC_K][PC_K][PC_I];
  data_t pc_cout[PC_R][PC_C][PC_O];
  data_t pc_cout_golden[PC_R][PC_C][PC_O];

  // Inputs: every element of channel i holds the value i.
  for (int i = 0; i < PC_I; i++)
    for (int r = 0; r < PC_R + PC_K - 1; r++)
      for (int c = 0; c < PC_C + PC_K - 1; c++) {
        pc_cin[r][c][i] = i;
      }

	// Weights: every element of filter o holds the value o.
	for (int o = 0; o < PC_O; o++)
		for (int i = 0; i < PC_I; i++)
			for (int p = 0; p < PC_K; p++)
				for (int q = 0; q < PC_K; q++) {
					pc_w[o][p][q][i] = o;
				}

#pragma scop
  for (int o = 0; o < PC_O; o++)
    for (int r = 0; r < PC_R; r++)
      for (int c = 0; c < PC_C; c++) {
        pc_cout[r][c][o] = 0;
        for (int i = 0; i < PC_I; i++)
          for (int p = 0; p < PC_K; p++)
            for (int q = 0; q < PC_K; q++) {
              pc_cout[r][c][o] = pc_cout[r][c][o] + pc_cin[r + p][c + q][i] * pc_w[o][p][q][i];
            }
      }	
#pragma endscop

  // Reference result computed with the same loop nest.
  for (int o = 0; o < PC_O; o++)
    for (int r = 0; r < PC_R; r++)
      for (int c = 0; c < PC_C; c++) {
        pc_cout_golden[r][c][o] = 0;
        for (int i = 0; i < PC_I; i++)
          for (int p = 0; p < PC_K; p++)
            for (int q = 0; q < PC_K; q++) {
              pc_cout_golden[r][c][o] = pc_cout_golden[r][c][o] + pc_cin[r + p][c + q][i] * pc_w[o][p][q][i];
            }
      }

  // Count outputs that differ from the reference by more than thres.
  // Unlike the DC and FC sections, mismatches are only counted, not printed.
  int err = 0;
  float thres = 0.001;
  for (int o = 0; o < PC_O; o++)
    for (int r = 0; r < PC_R; r++)
      for (int c = 0; c < PC_C; c++) {
        if (fabs((float)pc_cout_golden[r][c][o] - (float)pc_cout[r][c][o]) > thres) {
          err++;
        }
      }

  if (err) {
    printf("Test failed with %d errors!\n", err);
    return -1;
  } else {
    printf("Test passed!\n");
    return 0;
  }
#endif

#ifdef DC
  // Depth-wise CONV
  // The last dimension of dc_cin and dc_w has extent DC_I but is indexed with
  // o (range DC_O) below, so DC_O must not exceed DC_I; kernel.h sets both
  // to 16.
  data_t dc_cin[DC_R + DC_K - 1][DC_C + DC_K - 1][DC_I];
  data_t dc_w[DC_K][DC_K][DC_I];
  data_t dc_cout[DC_R][DC_C][DC_O];
  data_t dc_cout_golden[DC_R][DC_C][DC_O];

  // Inputs: every element of channel i holds the value i.
  for (int i = 0; i < DC_I; i++)
    for (int r = 0; r < DC_R + DC_K - 1; r++)
      for (int c = 0; c < DC_C + DC_K - 1; c++) {
        dc_cin[r][c][i] = i;
      }
	
	// Weights: every element of channel i holds the value i.
	for (int i = 0; i < DC_I; i++)
		for (int p = 0; p < DC_K; p++)
			for (int q = 0; q < DC_K; q++) {
				dc_w[p][q][i] = i;
			}

#pragma scop
  for (int o = 0; o < DC_O; o++)
    for (int r = 0; r < DC_R; r++)
      for (int c = 0; c < DC_C; c++) {
        dc_cout[r][c][o] = 0;        
        for (int p = 0; p < DC_K; p++)
          for (int q = 0; q < DC_K; q++) {
            dc_cout[r][c][o] = dc_cout[r][c][o] + dc_cin[r + p][c + q][o] * dc_w[p][q][o];
          }
      }	
#pragma endscop

  // Reference result computed with the same loop nest.
  for (int o = 0; o < DC_O; o++)
    for (int r = 0; r < DC_R; r++)
      for (int c = 0; c < DC_C; c++) {
        dc_cout_golden[r][c][o] = 0;        
        for (int p = 0; p < DC_K; p++)
          for (int q = 0; q < DC_K; q++) {
            dc_cout_golden[r][c][o] = dc_cout_golden[r][c][o] + dc_cin[r + p][c + q][o] * dc_w[p][q][o];
          }
      }	

  // Count and print outputs that differ from the reference by more than thres.
  int err = 0;
  float thres = 0.001;
  for (int o = 0; o < DC_O; o++)
    for (int r = 0; r < DC_R; r++)
      for (int c = 0; c < DC_C; c++) {
        if (fabs((float)dc_cout_golden[r][c][o] - (float)dc_cout[r][c][o]) > thres) {
          err++;
					printf("(golden, hw)@(%d, %d, %d): (%f, %f)\n", o, r, c, (float)dc_cout_golden[r][c][o], (float)dc_cout[r][c][o]);
        }
      }

  if (err) {
    printf("Test failed with %d errors!\n", err);
    return -1;
  } else {
    printf("Test passed!\n");
    return 0;
  }
#endif

#ifdef FC
  // Fully-connected Layers
  // Computes fc_cout[i] = sum over j of fc_cin[i][j] * fc_w[j].
  data_t fc_cin[FC_I][FC_J];
  data_t fc_w[FC_J];
  data_t fc_cout[FC_I];
  data_t fc_cout_golden[FC_I];

  // Inputs: every element of row i holds the value i.
  for (int i = 0; i < FC_I; i++)
    for (int j = 0; j < FC_J; j++) {
      fc_cin[i][j] = i;
    }
	
	// Weights: element j holds the value j.
	for (int j = 0; j < FC_J; j++) {
		fc_w[j] = j;
	}

#pragma scop
  for (int i = 0; i < FC_I; i++) {
		fc_cout[i] = 0;       
    for (int j = 0; j < FC_J; j++) {
      fc_cout[i] = fc_cout[i] + fc_cin[i][j] * fc_w[j];
    }
  }
#pragma endscop

  // Reference result computed with the same loop nest.
  for (int i = 0; i < FC_I; i++) {
		fc_cout_golden[i] = 0;       
    for (int j = 0; j < FC_J; j++) {
      fc_cout_golden[i] = fc_cout_golden[i] + fc_cin[i][j] * fc_w[j];
    }
  }	

  // Count and print outputs that differ from the reference by more than thres.
  int err = 0;
  float thres = 0.001;
  for (int i = 0; i < FC_I; i++)    
    if (fabs((float)fc_cout_golden[i] - (float)fc_cout[i]) > thres) {
      err++;
			printf("(golden, hw)@(%d): (%f, %f)\n", i, (float)fc_cout_golden[i], (float)fc_cout[i]);
    }    

  if (err) {
    printf("Test failed with %d errors!\n", err);
    return -1;
  } else {
    printf("Test passed!\n");
    return 0;
  }
#endif
}

================================================
FILE: autosa_tests/dnn_ops/kernel.h
================================================
/*
 * Configuration shared by the DNN operator test program in kernel.c: which
 * operator is built, the element type, and the array extents per operator.
 */
#ifndef AUTOSA_TESTS_DNN_OPS_KERNEL_H
#define AUTOSA_TESTS_DNN_OPS_KERNEL_H

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/*
 * Operator selection: kernel.c compiles the section guarded by the macro
 * defined here. Its sections are written to be enabled one at a time.
 */
//#define PC
//#define DC
#define FC

/* Element type of all input, weight and output arrays. */
typedef float data_t;

// point-wise conv
// kernel.c arrays: pc_cin[PC_R + PC_K - 1][PC_C + PC_K - 1][PC_I],
// pc_w[PC_O][PC_K][PC_K][PC_I], pc_cout[PC_R][PC_C][PC_O]
#define PC_O 16
#define PC_I 16
#define PC_R 8
#define PC_C 8
#define PC_K 3

// depth-wise conv
// kernel.c indexes the DC_I-sized dimension of dc_cin and dc_w with an index
// that ranges over DC_O, so DC_O must not exceed DC_I.
#define DC_O 16
#define DC_I 16
#define DC_R 8
#define DC_C 8
#define DC_K 3

// fc
// kernel.c arrays: fc_cin[FC_I][FC_J], fc_w[FC_J], fc_cout[FC_I]
#define FC_I 16
#define FC_J 16

#endif /* AUTOSA_TESTS_DNN_OPS_KERNEL_H */


================================================
FILE: autosa_tests/dnn_ops/pc_simd_info.json
================================================
{
  "kernel4": {
    "reduction": ["y", "y", "y"]
  },
  "kernel5": {
    "reduction": ["y", "y", "y"]
  } 
}


================================================
FILE: autosa_tests/large/cnn/Makefile
================================================
# Builds the kernel0 design (xo -> xclbin) with v++ and the host executable
# with g++. XILINX_VITIS and XILINX_XRT must be set in the environment.
VPP := $(XILINX_VITIS)/bin/v++
EMCONFIGUTIL := $(XILINX_VITIS)/bin/emconfigutil
MODE := hw
#PLATFORM := xilinx_u200_qdma_201920_1
PLATFORM := xilinx_u250_xdma_201830_2

# sources
KERNEL_SRC := src/kernel_kernel.cpp
HOST_SRC := src/kernel_host.cpp

# targets
HOST_EXE := host.exe

XOS := kernel0.$(MODE).xo
XCLBIN := kernel0.$(MODE).xclbin
EMCONFIG_FILE := emconfig.json

# Linker options to map kernel ports to DDR banks
VPP_LINK_OPTS := --config connectivity.cfg

VPP_COMMON_OPTS := -s -t $(MODE) --platform $(PLATFORM) -R2 -O3 --kernel_frequency 250 --vivado.prop=run.impl_1.STRATEGY=Performance_EarlyBlockPlacement
CFLAGS := -g -std=c++11 -I$(XILINX_XRT)/include
LFLAGS := -L$(XILINX_XRT)/lib -lxilinxopencl -lpthread -lrt
NUMDEVICES := 1

# run time args
EXE_OPT := kernel0.$(MODE).xclbin

# primary build targets
# clean and check produce no file of their own name, so they are declared
# phony as well; otherwise a stray file called "clean" or "check" would
# silently disable them.
.PHONY: xclbin app all clean check

xclbin:  $(XCLBIN)
app: $(HOST_EXE)

all: xclbin app

clean:
	-$(RM) $(EMCONFIG_FILE) $(HOST_EXE) $(XCLBIN) *.xclbin *.xo $(XOS)

# kernel rules
# Compile the kernel source into an .xo object.
$(XOS): $(KERNEL_SRC)
	$(RM) $@
	$(VPP) $(VPP_COMMON_OPTS) -c -k kernel0 -o $@ $+


# Link the .xo into the .xclbin, applying the DDR mapping from connectivity.cfg.
$(XCLBIN): $(XOS)
	$(VPP) $(VPP_COMMON_OPTS) -l -o $@ $+ $(VPP_LINK_OPTS)

# host rules
$(HOST_EXE): $(HOST_SRC)
	g++ $(CFLAGS) -o $@ $+ $(LFLAGS)
	@echo 'Compiled Host Executable: $(HOST_EXE)'

$(EMCONFIG_FILE):
	$(EMCONFIGUTIL) --nd $(NUMDEVICES) --od . --platform $(PLATFORM)

# Run the host executable against the built xclbin.
# NOTE(review): XCL_EMULATION_MODE is set to $(MODE), which is "hw" by
# default -- confirm this is intended for a hardware run.
check: $(XCLBIN) $(HOST_EXE) $(EMCONFIG_FILE)
	XCL_EMULATION_MODE=${MODE} ./$(HOST_EXE) $(EXE_OPT)


================================================
FILE: autosa_tests/large/cnn/README.md
================================================
# Convolutional Neural Network (Single Layer, Large)

Board        | Software Version
-------------|-----------------
Xilinx Alveo U250 | Xilinx Vitis 2019.2

__Files__:
```
autosa_tests/large/cnn/kernel.c
autosa_tests/large/cnn/kernel.h
autosa_tests/large/cnn/simd_info.json
autosa_tests/large/cnn/Makefile
autosa_tests/large/cnn/connectivity.cfg
```

__Command__:
```bash
./autosa ./autosa_tests/large/cnn/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[4];kernel[]->array_part[64,56,14,64];kernel[]->latency[4,4,7];kernel[]->simd[1,1,8]}" --simd-info=./autosa_tests/large/cnn/simd_info.json
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` and `connectivity.cfg` to the directory `autosa.tmp/output`.

```
cp autosa_tests/large/cnn/Makefile autosa.tmp/output/
cp autosa_tests/large/cnn/connectivity.cfg autosa.tmp/output/
```

Execute the makefile to build the design.

```
cd autosa.tmp/output
make all
```

================================================
FILE: autosa_tests/large/cnn/connectivity.cfg
================================================
# Binds each of the kernel0_1 arguments cin, w and cout to its own DDR bank
# (banks 0, 1 and 3 respectively; bank 2 is not used).
[connectivity]
sp=kernel0_1.cin:DDR[0]
sp=kernel0_1.w:DDR[1] 
sp=kernel0_1.cout:DDR[3]


================================================
FILE: autosa_tests/large/cnn/hls_script.tcl
================================================
############################################################
## This file is generated automatically by Vivado HLS.
## Please DO NOT edit it.
## Copyright (C) 1986-2019 Xilinx, Inc. All Rights Reserved.
############################################################
# Sets up project "hls_prj" with top function kernel0 and runs C simulation
# only; the synthesis, co-simulation and export commands at the bottom are
# left commented out.
open_project hls_prj
set_top kernel0
# Design sources; kernel_host.cpp is registered as the testbench (-tb).
add_files src/kernel_kernel.h
add_files src/kernel_kernel.cpp
add_files -tb src/kernel_host.cpp
open_solution "solution1"
# NOTE(review): this is an xcu200 part, while the README of this test lists an
# Alveo U250 board -- confirm the intended target.
set_part {xcu200-fsgd2104-2-e}
# Clock period of 5 (presumably ns, i.e. 200 MHz -- TODO confirm).
create_clock -period 5 -name default
config_compile -name_max_length 50
#source "./prj/solution1/directives.tcl"
csim_design
#csynth_design
#cosim_design
#cosim_design -trace_level all
#cosim_design -setup -trace_level all
#export_design -format ip_catalog
exit


================================================
FILE: autosa_tests/large/cnn/kernel.c
================================================
#include "kernel.h"

/*
 * Self-checking host program for a single (large) convolution layer.
 *
 * The loop nest between "#pragma scop" and "#pragma endscop" is the kernel
 * region. The same computation is repeated into cout_golden afterwards and
 * the two results are compared element by element.
 *
 * Returns 0 when every output matches within the threshold, -1 otherwise.
 */
int main(int argc, char **argv){
  // declarations
  // The arrays are static so that they are not placed on the stack.
  static data_t cin[R + K - 1][C + K - 1][I];
  static data_t w[O][K][K][I];
  static data_t cout[R][C][O];
  static data_t cout_golden[R][C][O];

  // data initialization
  for (int i = 0 ; i < I; i++)
    for (int r = 0; r < R + K - 1; r++)
      for (int c = 0; c < C + K - 1; c++) {
        cin[r][c][i] = 1;
      }

  for (int o = 0; o < O; o++)
    for (int i = 0; i < I; i++)
      for (int p = 0; p < K; p++)
        for (int q = 0; q < K; q++) {
          w[o][p][q][i] = 1;
        }

  // The window loops are bounded by K (the extent used to size cin and w)
  // rather than a literal 3, so that changing K in kernel.h keeps the
  // computation and the array sizes in agreement.
#pragma scop
  for (int o = 0; o < O; o++)
    for (int r = 0; r < R; r++)
      for (int c = 0; c < C; c++) {
        cout[r][c][o] = 0;
        for (int i = 0; i < I; i++)
          for (int p = 0; p < K; p++)
            for (int q = 0; q < K; q++) {
              cout[r][c][o] = cout[r][c][o] + cin[r + p][c + q][i] * w[o][p][q][i];
            }
      }
#pragma endscop

  // Reference result computed with the same loop nest.
  for (int o = 0; o < O; o++)
    for (int r = 0; r < R; r++)
      for (int c = 0; c < C; c++) {
        cout_golden[r][c][o] = 0;
        for (int i = 0; i < I; i++)
          for (int p = 0; p < K; p++)
            for (int q = 0; q < K; q++) {
              cout_golden[r][c][o] = cout_golden[r][c][o] + cin[r + p][c + q][i] * w[o][p][q][i];
            }
      }

  // Count the outputs that differ from the reference by more than thres.
  int err = 0;
  float thres = 0.001;
  for (int o = 0; o < O; o++)
    for (int r = 0; r < R; r++)
      for (int c = 0; c < C; c++) {
        if (fabs((float)cout_golden[r][c][o] - (float)cout[r][c][o]) > thres) {
          err++;
        }
      }

  if (err) {
    printf("Test failed with %d errors!\n", err);
    return -1;
  } else {
    printf("Test passed!\n");
    return 0;
  }
}


================================================
FILE: autosa_tests/large/cnn/kernel.h
================================================
/*
 * Configuration shared by the large CNN test program in kernel.c: the
 * element type and the extents of the loop nest / arrays.
 */
#ifndef AUTOSA_TESTS_LARGE_CNN_KERNEL_H
#define AUTOSA_TESTS_LARGE_CNN_KERNEL_H

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Element type of the cin, w and cout arrays. */
typedef float data_t;

/*
 * Array extents used by kernel.c:
 *   cin[R + K - 1][C + K - 1][I], w[O][K][K][I], cout[R][C][O]
 * The commented-out values are alternatives that were tried for O and R.
 */
//#define O 512
#define O 640
#define I 512
//#define R 60
#define R 56
#define C 56
#define K 3

/* Alternative configuration. */
//#define O 264
//#define I 256
//#define R 224
//#define C 224
//#define K 5

#endif /* AUTOSA_TESTS_LARGE_CNN_KERNEL_H */


================================================
FILE: autosa_tests/large/cnn/simd_info.json
================================================
{
  "kernel4": {
    "reduction": ["y", "y", "y"]
  },
  "kernel5": {
    "reduction": ["y", "y", "y"]
  } 
}


================================================
FILE: autosa_tests/large/cnn/step1-run-hls.tcl
================================================
# Step 1: run HLS C synthesis (csynth_design) on the generated kernel source.
# The project name "kernel0" and the solution name "solution" are reused by
# step2-autobridge.py (project_path / solution_path) and step3-pack-xo.tcl,
# so keep them in sync when changing them here.
open_project kernel0
set_top kernel0
add_files "src/kernel_kernel.cpp"
#add_files -tb PATH_TO_TESTBENCH_FILE

open_solution solution

#u250
set_part xcu250-figd2104-2L-e

# u280
#set_part xcu280-fsvh2892-2L-e

# 300 MHz
create_clock -period 3.333

config_dataflow -strict_mode warning
set_clock_uncertainty 27.000000%
config_rtl -enable_maxiConservative=1
config_interface -m_axi_addr64

# to enable integration with Vitis
config_sdx -target xocc

# C simulation is skipped; only synthesis is run.
#csim_design
csynth_design
close_project
exit


================================================
FILE: autosa_tests/large/cnn/step2-autobridge.py
================================================
#! /usr/bin/python3.6

# add the path to where you place the autobridge source code
import sys
sys.path.append('../src')

import graph
from formator import FormatHLS
import collections
import os
import subprocess

"""
AutoBridge divides the target device as follows and assign each HLS function to one slot
For more details pls refer to the paper

      u250                     u280
   -----------
 3 |    |    |
   |----|----|              |----|----|
 2 |    |    |            2 |    |    |
   |----|----|              |----|----|
 1 |    |    |            1 |    |    |
   |----|----|              |----|----|
 0 |    |    |            0 |    |    |
   -----------              -----------
     0    1                   0    1
"""

################### Modify Accordingly ###############################

# (1) fill basic information
# NOTE: project_path is a machine-specific absolute path; change it to the HLS
# project directory on your machine before running.
project_path = '/home/jaywang/doc_examples/cnn_large_ab/kernel0' # path to your hls project
top_name = 'kernel0' # name of the top function in your hls design
# 'solution' must match the solution name used when the HLS project was created.
solution_path = f'{project_path}/solution/'
# name of the project copy that is created inside target_dir
project_name = 'kernel0'
board_name = 'u250' # or 'u280'
# The results are saved to target_dir, which is assigned further down (right
# below the "DON'T TOUCH" banner). Your HLS project will be copied there and
# your top RTL will be replaced.
# Note that if the directory already exists, it is removed and recreated.

# (2) specify how your designs connect to the external memory
""" Example:

void kernel0(ap_uint<512> *p1, ap_uint<512> *p2)
{
  #pragma HLS INTERFACE m_axi port=p1 offset=slave bundle=gmem_A
  #pragma HLS INTERFACE m_axi port=p2 offset=slave bundle=gmem_B

  load_p1 (p1, ...);
  load_p2 (p2, ...);
}

--------------------------------------

In this example, the pointer p1 and p2 will become M_AXI controllers to connect to the dedicated DDR IP.
If you want p1 to connect to DDR 2 in the 2-nd SLR, then you need to specify that the corresponding RTL controller must be floorplanned at the 2-nd SLR
Meanwhile, your function load_p1() will talk to the M_AXI controller also through AXI interface which cannot be easily pipelined.
Thus the RTL module corresponds to load_p1() must also be in the 2-nd SLR in this example.
Since load_p1() will communicate with the rest of your design using FIFO interface, you don't need to specify the location of other modules

(transparent)|                        (user visible)
             |
   Vitis     |                    what your HLS design becomes
             |
             | M_AXI                     AXI                        FIFO
DDR IP  <--- | ----> M_AXI controller <-------> your first module <-------> your other modules
(fixed loc)  |         (p1)                       (load_p1)
             |
             | M_AXI                     AXI                        FIFO
DDR IP  <--- | ----> M_AXI controller <-------> your first module <-------> your other modules
(fixed loc)  |         (p2)                       (load_p2)
             |
             | S_AXI
PCIe    <--- | ----> S_AXI controller
             |
"""

# on the left side or the right side of an SLR
# (x coordinate: column 0 or 1 in the slot diagram near the top of this file)
DDR_loc_2d_x = collections.defaultdict(dict)

# on which SLR
# (y coordinate: row 0..3 in the slot diagram near the top of this file)
DDR_loc_2d_y = collections.defaultdict(dict)

# use DDR 0, 1, 3
# Each key names one module to pin to a slot (presumably the instance name in
# the generated top-level RTL -- verify against the HLS output). For every
# array both the "*_serialize_U0" module and the matching "*_m_axi_U"
# controller are pinned to the same slot.

# cin: SLR 0, left side
DDR_loc_2d_y['cin_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_x['cin_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_cin_m_axi_U'] = 0
DDR_loc_2d_x['kernel0_gmem_cin_m_axi_U'] = 0

# w: SLR 1, left side
DDR_loc_2d_y['w_IO_L3_in_serialize_U0'] = 1
DDR_loc_2d_x['w_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_w_m_axi_U'] = 1
DDR_loc_2d_x['kernel0_gmem_w_m_axi_U'] = 0

# cout: SLR 3, left side
DDR_loc_2d_y['cout_drain_IO_L3_out_serialize_U0'] = 3
DDR_loc_2d_x['cout_drain_IO_L3_out_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_cout_m_axi_U'] = 3
DDR_loc_2d_x['kernel0_gmem_cout_m_axi_U'] = 0

# control interface and kernel entry: SLR 1, right side
DDR_loc_2d_y['kernel0_control_s_axi_U'] = 1
DDR_loc_2d_x['kernel0_control_s_axi_U'] = 1
DDR_loc_2d_y['kernel0_entry12_U0'] = 1
DDR_loc_2d_x['kernel0_entry12_U0'] = 1

# (3) specify DDR information
# If you instantiate a DDR controller, it will consume non-trivial amount of resource
# to make the floorplanning better, you need to specify which DDRs have been enabled
# In this design cin, w and cout use DDR 0, 1 and 3, so only DDR 2 is disabled.
# If you want to use all DDRs, for example, you need to set it as [1, 1, 1, 1]
DDR_enable = [1, 1, 0, 1]

# (4) specify how much resource can be used in each slot
# In this way you could force the design to be placed evenly across the device and avoid local congestion
""" Example:
   -----------
 3 |0.76|0.62|
   |----|----|
 2 |0.74|0.61|
   |----|----|
 1 |0.75|0.6 |
   |----|----|
 0 | 0.7|0.6 |
   -----------
     0    1
"""
# Four pairs of usage limits, matching the 4 x 2 slot grid (presumably one
# pair per SLR in the order SLR 0..3 -- TODO confirm the ordering).
# The commented-out line is an earlier setting that allowed 0.85 instead of
# 0.82 for the second entry of each pair.
#max_usage_ratio_2d = [ [0.9, 0.85], [0.9, 0.85], [0.9, 0.85], [0.9, 0.85] ]
max_usage_ratio_2d = [ [0.9, 0.82], [0.9, 0.82], [0.9, 0.82], [0.9, 0.82] ]


##################### DON'T TOUCH THE SECTION BELOW #################################
# Output directory (machine-specific absolute path). It is deleted and
# recreated below if it already exists.
target_dir = '/home/jaywang/doc_examples/cnn_large_ab/autobridge_v4'

# Collect the HLS reports, scheduling database and top-level Verilog of the
# synthesized solution together with the floorplanning settings chosen above.
formator = FormatHLS(
  rpt_path = f'{solution_path}/syn/report/',
  hls_sche_path = f'{solution_path}/.autopilot/db/',
  top_hdl_path = f'{solution_path}/syn/verilog/{top_name}_{top_name}.v',
  top_name = top_name,
  DDR_loc_2d_x = DDR_loc_2d_x,
  DDR_loc_2d_y = DDR_loc_2d_y,
  DDR_enable = DDR_enable,
  max_usage_ratio_2d = max_usage_ratio_2d,
  board_name = board_name,
  target_dir = target_dir,
  relay_station_count = lambda x : 2 * x, # how many levels of relay stations to add for x-unit of crossing
  max_search_time = 600,
  NaiveBalance = True)

# run floorplanning
g = graph.Graph(formator)

# move results to target dir
# None of the subprocess calls below checks its return code, so a failed
# step does not stop the script.
if (os.path.isdir(target_dir)):
  subprocess.run(['rm', '-rf', f'{target_dir}'])
subprocess.run(['mkdir', f'{target_dir}/'])
subprocess.run(['cp', '-r', project_path, f'{target_dir}/{project_name}'])
# keep a copy of this script next to the results
subprocess.run(['cp', os.path.realpath(__file__), f'{target_dir}/archived_source.txt'])
subprocess.run(['chmod', '+w', '-R', f'{target_dir}'])
# These files are taken from the current working directory (presumably written
# there by the floorplanning run above -- verify).
subprocess.run(['cp', 'constraint.tcl', target_dir])
subprocess.run(['cp', 'pack_xo.tcl', target_dir])
subprocess.run(['cp', 'autobridge.log', target_dir])
# overwrite the top-level RTL inside the copied project
subprocess.run(['cp', f'{top_name}_{top_name}.v', f'{target_dir}/{project_name}/solution/syn/verilog/'])

# clean up
# remove intermediate files from the current working directory
os.system('rm *.lp')
subprocess.run(['rm', 'parser.out'])
subprocess.run(['rm', 'parsetab.py'])
subprocess.run(['rm', '-rf', '__pycache__'])


================================================
FILE: autosa_tests/large/cnn/step3-pack-xo.tcl
================================================
# Step 3: re-open the "kernel0" project / "solution" solution created by
# step1-run-hls.tcl and export the Verilog design as kernel0.xo, the file that
# step4-run-vitis.sh links.
open_project kernel0
open_solution solution
export_design -rtl verilog -format ip_catalog -xo kernel0.xo

close_project
puts "Pack XO successfully"
exit


================================================
FILE: autosa_tests/large/cnn/step4-run-vitis.sh
================================================
#!/bin/bash
# Step 4: link kernel0.xo into an xclbin with v++, applying the floorplan
# constraints from constraint.tcl in the current directory.
# Exits with status 1 if the constraint file is missing.

OUTPUT_DIR="$(pwd)/vitis_run"

# name of the top function
TOP=kernel0

# choose the target device
PLATFORM=xilinx_u250_xdma_201830_2
#PLATFORM=xilinx_u280_xdma_201920_3

XO="$(pwd)/kernel0.xo"

# For different approaches see UG904-vivado-implementation
#STRATEGY="Default"
STRATEGY="EarlyBlockPlacement"

# remove the unused '--connectivity.sp' option for v++ if some DDRs are not used
# Example: if we map p1 to DDR 3 and p2 to DDR 0
#
# void kernel0(ap_uint<512> *p1, ap_uint<512> *p2)
# {
#   #pragma HLS INTERFACE m_axi port=p1 offset=slave bundle=gmem_A
#   #pragma HLS INTERFACE m_axi port=p2 offset=slave bundle=gmem_B
#
#   load_p1 (p1, ...);
#   load_p2 (p2, ...);
# }
#
# ARG_FOR_DDR_0=p2
# ARG_FOR_DDR_3=p1
# Should remove '--connectivity.sp' for DDR1 and DDR2

# NOTE: the variable names below are 1-based while the DDR banks in the v++
# options are 0-based: ARG_FOR_DDR_1 -> DDR[0], ARG_FOR_DDR_2 -> DDR[1],
# ARG_FOR_DDR_4 -> DDR[3]. DDR[2] is not used by this design.
ARG_FOR_DDR_1=cin
ARG_FOR_DDR_2=w
#ARG_FOR_DDR_3="YOUR_HLS_ARGUMENT_NAME_FOR_DDR_3"
ARG_FOR_DDR_4=cout

# the constraint file containing the floorplan results
# WARNING: must use absolute address
CONSTRAINT="$(pwd)/constraint.tcl"
if [ ! -f "$CONSTRAINT" ]; then
    # Report on stderr and fail, so that callers can detect the problem.
    echo "no constraint file found" >&2
    exit 1
fi

# All expansions are quoted: paths may contain spaces, and an unquoted
# "DDR[0]" is a glob pattern the shell could expand against file names.
v++ \
  --link \
  --output "${OUTPUT_DIR}/${TOP}_${PLATFORM}.xclbin" \
  --kernel "${TOP}" \
  --platform "${PLATFORM}" \
  --target hw \
  --report_level 2 \
  --temp_dir "${OUTPUT_DIR}/${TOP}_${PLATFORM}.temp" \
  --optimize 3 \
  --connectivity.nk "${TOP}:1:${TOP}_1" \
  --max_memory_ports "${TOP}" \
  --save-temps \
  "${XO}" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_1}:DDR[0]" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_2}:DDR[1]" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_4}:DDR[3]" \
  --kernel_frequency 300 \
  --vivado.prop "run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=${STRATEGY}" \
  --vivado.prop "run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=${CONSTRAINT}"


================================================
FILE: autosa_tests/large/mm/Makefile
================================================
# Builds the kernel0 design (xo -> xclbin) with v++ and the host executable
# with g++. XILINX_VITIS and XILINX_XRT must be set in the environment.
VPP := $(XILINX_VITIS)/bin/v++
EMCONFIGUTIL := $(XILINX_VITIS)/bin/emconfigutil
MODE := hw
#PLATFORM := xilinx_u200_qdma_201920_1
PLATFORM := xilinx_u250_xdma_201830_2

# sources
KERNEL_SRC := src/kernel_kernel.cpp
HOST_SRC := src/kernel_host.cpp

# targets
HOST_EXE := host.exe

XOS := kernel0.$(MODE).xo
XCLBIN := kernel0.$(MODE).xclbin
EMCONFIG_FILE := emconfig.json

# Linker options to map kernel ports to DDR banks
VPP_LINK_OPTS := --config connectivity.cfg

VPP_COMMON_OPTS := -s -t $(MODE) --platform $(PLATFORM) -R2 -O3 --kernel_frequency 300 --vivado.prop=run.impl_1.STRATEGY=Performance_EarlyBlockPlacement
CFLAGS := -g -std=c++11 -I$(XILINX_XRT)/include
LFLAGS := -L$(XILINX_XRT)/lib -lxilinxopencl -lpthread -lrt
NUMDEVICES := 1

# run time args
EXE_OPT := kernel0.$(MODE).xclbin

# primary build targets
# clean and check produce no file of their own name, so they are declared
# phony as well; otherwise a stray file called "clean" or "check" would
# silently disable them.
.PHONY: xclbin app all clean check

xclbin:  $(XCLBIN)
app: $(HOST_EXE)

all: xclbin app

clean:
	-$(RM) $(EMCONFIG_FILE) $(HOST_EXE) $(XCLBIN) *.xclbin *.xo $(XOS)

# kernel rules
# Compile the kernel source into an .xo object.
$(XOS): $(KERNEL_SRC)
	$(RM) $@
	$(VPP) $(VPP_COMMON_OPTS) -c -k kernel0 -o $@ $+


# Link the .xo into the .xclbin, applying the DDR mapping from connectivity.cfg.
$(XCLBIN): $(XOS)
	$(VPP) $(VPP_COMMON_OPTS) -l -o $@ $+ $(VPP_LINK_OPTS)

# host rules
$(HOST_EXE): $(HOST_SRC)
	g++ $(CFLAGS) -o $@ $+ $(LFLAGS)
	@echo 'Compiled Host Executable: $(HOST_EXE)'

$(EMCONFIG_FILE):
	$(EMCONFIGUTIL) --nd $(NUMDEVICES) --od . --platform $(PLATFORM)

# Run the host executable against the built xclbin.
# NOTE(review): XCL_EMULATION_MODE is set to $(MODE), which is "hw" by
# default -- confirm this is intended for a hardware run.
check: $(XCLBIN) $(HOST_EXE) $(EMCONFIG_FILE)
	XCL_EMULATION_MODE=${MODE} ./$(HOST_EXE) $(EXE_OPT)


================================================
FILE: autosa_tests/large/mm/README.md
================================================
# Matrix Multiplication (Large)

Board        | Software Version
-------------|-----------------
Xilinx Alveo U250 | Xilinx Vitis 2019.2

__Files__:
```
autosa_tests/large/mm/kernel.c
autosa_tests/large/mm/kernel.h
autosa_tests/large/mm/simd_info.json
autosa_tests/large/mm/Makefile
autosa_tests/large/mm/connectivity.cfg
```

__Command__:
```bash
./autosa ./autosa_tests/large/mm/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[260,256,512];kernel[]->latency[20,16];kernel[]->simd[8]}" --simd-info=./autosa_tests/large/mm/simd_info.json --host-serialize
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` and `connectivity.cfg` to the directory `autosa.tmp/output`.

```
cp autosa_tests/large/mm/Makefile autosa.tmp/output/
cp autosa_tests/large/mm/connectivity.cfg autosa.tmp/output/
```

Execute the makefile to build the design.

```
cd autosa.tmp/output
make all
```

================================================
FILE: autosa_tests/large/mm/connectivity.cfg
================================================
# Binds each of the kernel0_1 arguments A, B and C to its own DDR bank
# (banks 0, 1 and 3 respectively; bank 2 is not used).
[connectivity]
sp=kernel0_1.A:DDR[0]
sp=kernel0_1.B:DDR[1] 
sp=kernel0_1.C:DDR[3]


================================================
FILE: autosa_tests/large/mm/hls_script.tcl
================================================
############################################################
## This file is generated automatically by Vivado HLS.
## Please DO NOT edit it.
## Copyright (C) 1986-2019 Xilinx, Inc. All Rights Reserved.
############################################################
# Sets up project "hls_prj" with top function kernel0 and runs C simulation
# only; the synthesis, co-simulation and export commands at the bottom are
# left commented out.
open_project hls_prj
set_top kernel0
# Design sources; kernel_host.cpp is registered as the testbench (-tb).
add_files src/kernel_kernel.h
add_files src/kernel_kernel.cpp
add_files -tb src/kernel_host.cpp
open_solution "solution1"
# NOTE(review): this is an xcu200 part, while the README of this test lists an
# Alveo U250 board -- confirm the intended target.
set_part {xcu200-fsgd2104-2-e}
# Clock period of 5 (presumably ns, i.e. 200 MHz -- TODO confirm).
create_clock -period 5 -name default
config_compile -name_max_length 50
#source "./prj/solution1/directives.tcl"
csim_design
#csynth_design
#cosim_design
#cosim_design -trace_level all
#cosim_design -setup -trace_level all
#export_design -format ip_catalog
exit


================================================
FILE: autosa_tests/large/mm/kernel.c
================================================
#include "kernel.h"

//#define LAYOUT1
#define LAYOUT2
//#define LAYOUT3

/*
 * Self-checking host program for matrix multiplication.
 *
 * The storage layout of A and B is selected by the LAYOUT2 / LAYOUT3 macros
 * defined above main:
 *   LAYOUT2: A[I][K], B[J][K]   C[i][j] += A[i][k] * B[j][k]
 *   LAYOUT3: A[K][I], B[K][J]   C[i][j] += A[k][i] * B[k][j]
 * NOTE(review): no code exists here for LAYOUT1; exactly one of LAYOUT2 and
 * LAYOUT3 has to be defined for this file to compile.
 *
 * The loop nest between "#pragma scop" and "#pragma endscop" is the kernel
 * region; the result is compared against C_golden, which is computed with
 * the same loops. The verdict is printed and main always returns 0.
 */
int main(int argc, char **argv) {
#ifdef LAYOUT2
  static data_t A[I][K], B[J][K], C[I][J], C_golden[I][J]; // gemm0,3
#endif
#ifdef LAYOUT3
  static data_t A[K][I], B[K][J], C[I][J], C_golden[I][J]; // gemm4
#endif

  // Fill the inputs with small pseudo-random integers (0..7).
  // The previous expression, (data_t)rand() / RAND_MAX, is an integer
  // division when data_t is an integer type and therefore produced 0 for
  // (almost) every element, which made the comparison below vacuous. Small
  // integers are valid for both integer and floating-point data_t and keep
  // the accumulated sums exactly representable.
  for (int i = 0; i < I; i++)
    for (int k = 0; k < K; k++) {
#ifdef LAYOUT2
      A[i][k] = (data_t)(rand() % 8);
#endif
#ifdef LAYOUT3
      A[k][i] = (data_t)(rand() % 8);
#endif
    }

  for (int j = 0; j < J; j++)
    for (int k = 0; k < K; k++) {
#ifdef LAYOUT2
      B[j][k] = (data_t)(rand() % 8);
#endif
#ifdef LAYOUT3
      B[k][j] = (data_t)(rand() % 8);
#endif
    }

#pragma scop
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C[i][j] = 0;
      for (int k = 0; k < K; k++) {
#ifdef LAYOUT2
        C[i][j] = C[i][j] + A[i][k] * B[j][k];
#endif
#ifdef LAYOUT3
        C[i][j] = C[i][j] + A[k][i] * B[k][j];
#endif
      }
    }
#pragma endscop

  // Reference result computed with the same loop nest.
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C_golden[i][j] = 0;
      for (int k = 0; k < K; k++) {
#ifdef LAYOUT2
        C_golden[i][j] = C_golden[i][j] + A[i][k] * B[j][k];
#endif
#ifdef LAYOUT3
        C_golden[i][j] = C_golden[i][j] + A[k][i] * B[k][j];
#endif
      }
    }

  // Count the outputs that differ from the reference by more than 0.001.
  int err = 0;
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      if (fabs((float)C_golden[i][j] - (float)C[i][j]) > 0.001) {
        err++;
      }
    }

  if (err) {
    printf("Failed with %d errors!\n", err);
  } else {
    printf("Passed!\n");
  }

  return 0;
}


================================================
FILE: autosa_tests/large/mm/kernel.h
================================================
#include "stdio.h"
#include "stdlib.h"
#include "math.h"

// Element type of the A, B, C and C_golden arrays declared in kernel.c.
//typedef float data_t;
typedef int data_t;

// Problem size: kernel.c computes C[I][J] with a reduction over K.
// Exactly one I/J/K triple should be active; the others are alternative sizes.
//#define I 1024
//#define J 1024
//#define K 1024

//#define I 1040
//#define J 1024
//#define K 1024

#define I 208
#define J 512
#define K 256

//#define I 1032
//#define J 1024
//#define K 1024

//#define I 1024
//#define J 1032
//#define K 1024

//#define I 1024
//#define J 1024
//#define K 1032

//#define I 1060
//#define J 1024
//#define K 1024

//#define I 1040
//#define J 1024
//#define K 1024

//#define I 1024
//#define J 1056
//#define K 1080


================================================
FILE: autosa_tests/large/mm/simd_info.json
================================================
{
  "kernel0": {
    "reduction": ["y"]
  },
  "kernel1": {
    "reduction": ["y"]
  },
  "kernel2": {
    "reduction": ["y"]
  }, 
  "kernel3": {
    "reduction": ["y"]
  },
  "kernel4": {
    "reduction": ["y"]
  },
  "kernel5": {
    "reduction": ["y"]
  }
}


================================================
FILE: autosa_tests/large/mm/step1-run-hls.tcl
================================================
# Step 1 of the flow: Vivado HLS batch script.
# Creates project "kernel0" with top function kernel0 from
# src/kernel_kernel.cpp and runs C synthesis only; C simulation stays disabled.
open_project kernel0
set_top kernel0
add_files "src/kernel_kernel.cpp"
#add_files -tb PATH_TO_TESTBENCH_FILE

open_solution solution

# Target part: keep exactly one set_part line active.
# u250
set_part xcu250-figd2104-2L-e

# u280
#set_part xcu280-fsvh2892-2L-e

# 300 MHz (3.333 ns clock period)
create_clock -period 3.333

config_dataflow -strict_mode warning
set_clock_uncertainty 27.000000%
config_rtl -enable_maxiConservative=1
config_interface -m_axi_addr64

# to enable integration with Vitis
config_sdx -target xocc

#csim_design
csynth_design
close_project
exit


================================================
FILE: autosa_tests/large/mm/step2-autobridge.py
================================================
#! /usr/bin/python3.6

# add the path to where you place the autobridge source code
import sys
sys.path.append('../src')

import graph
from formator import FormatHLS
import collections
import os
import subprocess

"""
AutoBridge divides the target device as follows and assign each HLS function to one slot
For more details pls refer to the paper

      u250                     u280
   -----------
 3 |    |    |
   |----|----|              |----|----|
 2 |    |    |            2 |    |    |
   |----|----|              |----|----|
 1 |    |    |            1 |    |    |
   |----|----|              |----|----|
 0 |    |    |            0 |    |    |
   -----------              -----------
     0    1                   0    1
"""

################### Modify Accordingly ###############################

# (1) fill basic information
project_path = '/home/jaywang/doc_ab/use/kernel0' # path to your hls project
top_name = 'kernel0' # name of the top function in your hls design
solution_path = f'{project_path}/solution/'
project_name = 'kernel0'
board_name = 'u250' # or 'u280'
# where the results will be saved. Your HLS project will be copied there and your top RTL will be replaced.
# Note that if the directory already exists, we will try to reset the contents

# (2) specify how your designs connect to the external memory
""" Example:

void kernel0(ap_uint<512> *p1, ap_uint<512> *p2)
{
  #pragma HLS INTERFACE m_axi port=p1 offset=slave bundle=gmem_A
  #pragma HLS INTERFACE m_axi port=p2 offset=slave bundle=gmem_B

  load_p1 (p1, ...);
  load_p2 (p2, ...);
}

--------------------------------------

In this example, the pointer p1 and p2 will become M_AXI controllers to connect to the dedicated DDR IP.
If you want p1 to connect to DDR 2 in the 2-nd SLR, then you need to specify that the corresponding RTL controller must be floorplanned at the 2-nd SLR
Meanwhile, your function load_p1() will talk to the M_AXI controller also through AXI interface which cannot be easily pipelined.
Thus the RTL module corresponds to load_p1() must also be in the 2-nd SLR in this example.
Since load_p1() will communicate with the rest of your design using FIFO interface, you don't need to specify the location of other modules

(transparent)|                        (user visible)
             |
   Vitis     |                    what your HLS design becomes
             |
             | M_AXI                     AXI                        FIFO
DDR IP  <--- | ----> M_AXI controller <-------> your first module <-------> your other modules
(fixed loc)  |         (p1)                       (load_p1)
             |
             | M_AXI                     AXI                        FIFO
DDR IP  <--- | ----> M_AXI controller <-------> your first module <-------> your other modules
(fixed loc)  |         (p2)                       (load_p2)
             |
             | S_AXI
PCIe    <--- | ----> S_AXI controller
             |
"""

# Slot column of each pinned instance: the left or the right side of an SLR
# (the x axis, 0 or 1, of the slot diagram at the top of this file).
DDR_loc_2d_x = collections.defaultdict(dict)

# Slot row of each pinned instance: which SLR it is placed on (the y axis of
# the slot diagram).
DDR_loc_2d_y = collections.defaultdict(dict)

# This design uses DDR 0, 1 and 3.
# Each array gets two entries pinned to the same slot -- presumably the RTL
# instance of the module that accesses memory and the m_axi controller of its
# bundle, as described in the example above; verify the names against the
# generated top-level RTL.

# A -> slot (x=0, y=0)
DDR_loc_2d_y['A_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_x['A_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_A_m_axi_U'] = 0
DDR_loc_2d_x['kernel0_gmem_A_m_axi_U'] = 0

# B -> slot (x=0, y=1)
DDR_loc_2d_y['B_IO_L3_in_serialize_U0'] = 1
DDR_loc_2d_x['B_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_B_m_axi_U'] = 1
DDR_loc_2d_x['kernel0_gmem_B_m_axi_U'] = 0

# C -> slot (x=0, y=3)
DDR_loc_2d_y['C_drain_IO_L3_out_serialize_U0'] = 3
DDR_loc_2d_x['C_drain_IO_L3_out_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_C_m_axi_U'] = 3
DDR_loc_2d_x['kernel0_gmem_C_m_axi_U'] = 0

# The control s_axi instance is pinned to row 0 only; no column is given for it.
DDR_loc_2d_y['kernel0_control_s_axi_U'] = 0

# (3) specify DDR information
# An instantiated DDR controller consumes a non-trivial amount of resources.
# To improve the floorplanning, specify which DDRs are enabled, one flag per DDR.
# Here DDR 0, 1 and 3 are enabled (used by A, B and C above) and DDR 2 is disabled.
# To use all DDRs, for example, set it to [1, 1, 1, 1].
DDR_enable = [1, 1, 0, 1]

# (4) specify how much resource can be used in each slot
# In this way you could force the design to be placed evenly across the device and avoid local congestion
""" Example:
   -----------
 3 |0.76|0.62|
   |----|----|
 2 |0.74|0.61|
   |----|----|
 1 |0.75|0.6 |
   |----|----|
 0 | 0.7|0.6 |
   -----------
     0    1
"""
max_usage_ratio_2d = [ [0.8, 0.7], [0.85, 0.75], [0.85, 0.85], [0.85, 0.7] ]


##################### DON'T TOUCH THE SECTION BELOW #################################
# Directory, relative to the current working directory, where the results are
# collected. It is deleted and recreated below if it already exists.
target_dir = 'autobridge_prj'

# Bundle the HLS report/schedule/RTL locations and the settings chosen above.
formator = FormatHLS(
  rpt_path = f'{solution_path}/syn/report/',
  hls_sche_path = f'{solution_path}/.autopilot/db/',
  top_hdl_path = f'{solution_path}/syn/verilog/{top_name}_{top_name}.v',
  top_name = top_name,
  DDR_loc_2d_x = DDR_loc_2d_x,
  DDR_loc_2d_y = DDR_loc_2d_y,
  DDR_enable = DDR_enable,
  max_usage_ratio_2d = max_usage_ratio_2d,
  board_name = board_name,
  target_dir = target_dir,
  relay_station_count = lambda x : 2 * x, # how many levels of relay stations to add for x-unit of crossing
  max_search_time = 600,
  NaiveBalance = True)

# run floorplanning
g = graph.Graph(formator)

# move results to target dir
# NOTE(review): none of the subprocess.run calls below pass check=True or
# inspect the return code, so a failed rm/mkdir/cp does not stop the script.
if (os.path.isdir(target_dir)):
  subprocess.run(['rm', '-rf', f'{target_dir}'])
subprocess.run(['mkdir', f'{target_dir}/'])
# Copy the whole HLS project, then keep a copy of this script next to it.
subprocess.run(['cp', '-r', project_path, f'{target_dir}/{project_name}'])
subprocess.run(['cp', os.path.realpath(__file__), f'{target_dir}/archived_source.txt'])
subprocess.run(['chmod', '+w', '-R', f'{target_dir}'])
# These files are read from the current directory -- presumably written there
# by the floorplanning step above; confirm.
subprocess.run(['cp', 'constraint.tcl', target_dir])
subprocess.run(['cp', 'pack_xo.tcl', target_dir])
subprocess.run(['cp', 'autobridge.log', target_dir])
# Overwrite the copied project's top-level Verilog with the one found in the
# current directory.
subprocess.run(['cp', f'{top_name}_{top_name}.v', f'{target_dir}/{project_name}/solution/syn/verilog/'])

# clean up
# os.system is used for the first removal so that the shell expands the *.lp glob.
os.system('rm *.lp')
subprocess.run(['rm', 'parser.out'])
subprocess.run(['rm', 'parsetab.py'])
subprocess.run(['rm', '-rf', '__pycache__'])


================================================
FILE: autosa_tests/large/mm/step3-pack-xo.tcl
================================================
# Step 3 of the flow: re-open the HLS project "kernel0" / solution "solution"
# and export the Verilog RTL in ip_catalog format, writing kernel0.xo.
open_project kernel0
open_solution solution
export_design -rtl verilog -format ip_catalog -xo kernel0.xo

close_project
puts "Pack XO successfully"
exit


================================================
FILE: autosa_tests/large/mm/step4-run-vitis.sh
================================================
OUTPUT_DIR="$(pwd)/vitis_run"

# name of the top function
TOP=kernel0

# choose the target device
PLATFORM=xilinx_u250_xdma_201830_2 
#PLATFORM=xilinx_u280_xdma_201920_3 

XO="$(pwd)/kernel0.xo"

# For different approaches see UG904-vivado-implementation
STRATEGY="Default" 
#STRATEGY="EarlyBlockPlacement" 

# remove the unused '--connectivity.sp' option for v++ if some DDRs are not used 
# Example: if we map p1 to DDR 3 and p2 to DDR 0
#
# void kernel0(ap_uint<512> *p1, ap_uint<512> *p2)
# {
#   #pragma HLS INTERFACE m_axi port=p1 offset=slave bundle=gmem_A
#   #pragma HLS INTERFACE m_axi port=p2 offset=slave bundle=gmem_B
# 
#   load_p1 (p1, ...);
#   load_p2 (p2, ...);
# }
#
# ARG_FOR_DDR_0=p2
# ARG_FOR_DDR_3=p1
# Should remove '--connectivity.sp' for DDR1 and DDR2

ARG_FOR_DDR_1=A
ARG_FOR_DDR_2=B
#ARG_FOR_DDR_3="YOUR_HLS_ARGUMENT_NAME_FOR_DDR_3"
ARG_FOR_DDR_4=C

# the constraint file containing the floorplan results
# WARNING: must be an absolute path
CONSTRAINT="$(pwd)/constraint.tcl"
if [ ! -f "$CONSTRAINT" ]; then
    # Report on stderr and exit non-zero so that callers can detect the
    # failure; a bare `exit` would return the status of echo, i.e. success.
    echo "no constraint file found" >&2
    exit 1
fi

# Link the packed kernel object into an xclbin for the chosen platform.
# Every expansion is quoted: paths may contain spaces, and an unquoted
# `DDR[0]` is a glob pattern that the shell could expand against file names
# in the current directory.
v++ \
  --link \
  --output "${OUTPUT_DIR}/${TOP}_${PLATFORM}.xclbin" \
  --kernel "${TOP}" \
  --platform "${PLATFORM}" \
  --target hw \
  --report_level 2 \
  --temp_dir "${OUTPUT_DIR}/${TOP}_${PLATFORM}.temp" \
  --optimize 3 \
  --connectivity.nk "${TOP}:1:${TOP}_1" \
  --max_memory_ports "${TOP}" \
  --save-temps \
  "${XO}" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_1}:DDR[0]" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_2}:DDR[1]" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_4}:DDR[3]" \
  --kernel_frequency 300 \
  --vivado.prop "run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=${STRATEGY}" \
  --vivado.prop "run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=${CONSTRAINT}"


================================================
FILE: autosa_tests/large/mm_block_sparse/Makefile
================================================
VPP := $(XILINX_VITIS)/bin/v++
EMCONFIGUTIL := $(XILINX_VITIS)/bin/emconfigutil
MODE := hw
#PLATFORM := xilinx_u200_qdma_201920_1
PLATFORM := xilinx_u250_xdma_201830_2

# sources
KERNEL_SRC := src/kernel_kernel.cpp
HOST_SRC := src/kernel_host.cpp

# targets
HOST_EXE := host.exe

XOS := kernel0.$(MODE).xo
XCLBIN := kernel0.$(MODE).xclbin
EMCONFIG_FILE := emconfig.json

# Linker options to map kernel ports to DDR banks
VPP_LINK_OPTS := --config connectivity.cfg

VPP_COMMON_OPTS := -s -t $(MODE) --platform $(PLATFORM) -R2 -O3 --kernel_frequency 250 --vivado.prop=run.impl_1.STRATEGY=Performance_EarlyBlockPlacement
CFLAGS := -g -std=c++11 -I$(XILINX_XRT)/include
LFLAGS := -L$(XILINX_XRT)/lib -lxilinxopencl -lpthread -lrt
NUMDEVICES := 1

# run time args
EXE_OPT := kernel0.$(MODE).xclbin

# primary build targets
# clean and check never produce a file of that name, so they are declared
# phony too: a stray file called "clean" or "check" must not disable them.
.PHONY: xclbin app all clean check

xclbin:  $(XCLBIN)
app: $(HOST_EXE)

all: xclbin app

# Remove build outputs; the leading '-' keeps make going if $(RM) fails.
clean:
	-$(RM) $(EMCONFIG_FILE) $(HOST_EXE) $(XCLBIN) *.xclbin *.xo $(XOS)

# kernel rules
$(XOS): $(KERNEL_SRC)
	$(RM) $@
	$(VPP) $(VPP_COMMON_OPTS) -c -k kernel0 -o $@ $+


$(XCLBIN): $(XOS)
	$(VPP) $(VPP_COMMON_OPTS) -l -o $@ $+ $(VPP_LINK_OPTS)

# host rules
$(HOST_EXE): $(HOST_SRC)
	g++ $(CFLAGS) -o $@ $+ $(LFLAGS)
	@echo 'Compiled Host Executable: $(HOST_EXE)'

$(EMCONFIG_FILE):
	$(EMCONFIGUTIL) --nd $(NUMDEVICES) --od . --platform $(PLATFORM)

check: $(XCLBIN) $(HOST_EXE) $(EMCONFIG_FILE)
	XCL_EMULATION_MODE=${MODE} ./$(HOST_EXE) $(EXE_OPT)


================================================
FILE: autosa_tests/large/mm_block_sparse/README.md
================================================
# Matrix Multiplication with Block Sparsity (Large)

Board        | Software Version
-------------|-----------------
Xilinx Alveo U250 | Xilinx Vitis 2019.2

__Files__:
```
autosa_tests/large/mm_block_sparse/kernel.c
autosa_tests/large/mm_block_sparse/kernel.h
autosa_tests/large/mm_block_sparse/simd_info.json
autosa_tests/large/mm_block_sparse/Makefile
autosa_tests/large/mm_block_sparse/connectivity.cfg
autosa_tests/large/mm_block_sparse/hls_script.tcl
```

__Command__:
To run the HLS flow for C/RTL simulation
```bash
./autosa ./autosa_tests/large/mm_block_sparse/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[256,256,512];kernel[]->latency[32,32];kernel[]->simd[8]}" --simd-info=./autosa_tests/large/mm_block_sparse/simd_info.json --host-serialize --hls --block-sparse --block-sparse-ratio="{kernel[]->A[4,8]}"
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `hls_script.tcl` to the directory `autosa.tmp/output`.

```
cp autosa_tests/large/mm_block_sparse/hls_script.tcl autosa.tmp/output/
```

Run the TCL script to build the HLS project.

```
cd autosa.tmp/output
vivado_hls -f hls_script.tcl
```

Alternatively, if you need to generate the bitstream for on-board testing, simply remove the `--hls` flag from the AutoSA command.
```bash
./autosa ./autosa_tests/large/mm_block_sparse/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[256,256,512];kernel[]->latency[32,32];kernel[]->simd[8]}" --simd-info=./autosa_tests/large/mm_block_sparse/simd_info.json --host-serialize --block-sparse --block-sparse-ratio="{kernel[]->A[4,8]}"
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` and `connectivity.cfg` to the directory `autosa.tmp/output`.

```
cp autosa_tests/large/mm_block_sparse/Makefile autosa.tmp/output/
cp autosa_tests/large/mm_block_sparse/connectivity.cfg autosa.tmp/output/
```

Execute the makefile to build the design.

```
cd autosa.tmp/output
make all
make check
```

================================================
FILE: autosa_tests/large/mm_block_sparse/connectivity.cfg
================================================
[connectivity]
sp=kernel0_1.A:DDR[0]
sp=kernel0_1.B:DDR[1] 
sp=kernel0_1.C:DDR[2]


================================================
FILE: autosa_tests/large/mm_block_sparse/hls_script.tcl
================================================
############################################################
## This file is generated automatically by Vivado HLS.
## Please DO NOT edit it.
## Copyright (C) 1986-2019 Xilinx, Inc. All Rights Reserved.
############################################################
open_project hls_prj
set_top kernel0
add_files src/kernel_kernel.h
add_files src/kernel_kernel.cpp
add_files -tb src/kernel_host.cpp
open_solution "solution1"
set_part {xcu200-fsgd2104-2-e}
create_clock -period 5 -name default
config_compile -name_max_length 50
#source "./prj/solution1/directives.tcl"
csim_design
#csynth_design
#cosim_design
#cosim_design -trace_level all
#cosim_design -setup -trace_level all
#export_design -format ip_catalog
exit


================================================
FILE: autosa_tests/large/mm_block_sparse/kernel.c
================================================
/* This example uses block sparsity to compute a matrix multiplication.
 * C = A * B
 * The matrix A has block sparsity and the matrix B is dense.
 * For matrix A, every VEC_LEN elements are grouped into a vector.
 * Inside each vector, there are NON_ZERO_NUM non-zero elements.
 * The sparsity of the matrix A is computed as 1 - NON_ZERO_NUM / VEC_LEN.
 * To store the sparse matrix A, we use two data structures:
 * A_d for storing the non-zero elements and A_i for storing the offsets of the non-zero elements in each vector.
 * As an example, for matrix A of size I * K, where I = K = 8,
 * suppose that we have VEC_LEN = 4 and NON_ZERO_NUM = 2. We denote the compression ratio
 * COMPRESS_RATIO = VEC_LEN / NON_ZERO_NUM;
 * then we will have A_d[I][K / COMPRESS_RATIO].
 * For A_i, we use an unsigned char to store the mask of non-zero elements:
 * bit p of the mask is set when element p of the vector is non-zero.
 * For example, if the vector is 0 1 0 2, elements 1 and 3 are non-zero, so the mask is 0000_1010.
 * Currently, we assume the vector length is a power of two and is no greater than 8.
 * If it is greater than 8, we could use a larger-width data type to store the offsets accordingly.
 * Based on the analysis above, we will have the index matrix A_i as
 * unsigned char A_i[I][K / VEC_LEN].
 * In summary, we use A_d[I][K / COMPRESS_RATIO] and A_i[I][K / VEC_LEN] to represent the sparse matrix.
 */
#include "kernel.h"

/* Test driver for the block-sparse matrix multiplication C = A * B.
 *
 *  1. Fill A and B with pseudo-random values in [0, 1].
 *  2. For every group of VEC_LEN consecutive elements in a row of A, pick
 *     NON_ZERO_NUM distinct positions at random, store them as a bit mask in
 *     A_i, copy the selected values into A_d, and zero the positions that
 *     were not selected so that A itself is block-sparse.
 *  3. Pack A_d and A_i into A_s: per group, NON_ZERO_NUM values followed by
 *     one slot that carries the mask. A_s is not read again in this file;
 *     presumably the generated host code consumes it -- TODO confirm.
 *  4. Compute C from the dense loop nest inside `#pragma scop`.
 *  5. Compute C_golden from the compressed representation (A_d, A_i).
 *  6. Count the entries that differ by more than 0.001 and print the verdict.
 *
 * Always returns 0.
 */
int main(int argc, char **argv) {
  static data_t A[I][K], B[J][K], C[I][J], C_golden[I][J];
  static data_t A_d[I][K / COMPRESS_RATIO];
  static unsigned char A_i[I][K / VEC_LEN];
  static data_t A_s[I][K / EFF_COMPRESS_RATIO];

  for (int i = 0; i < I; i++) 
    for (int k = 0; k < K; k++) {
      A[i][k] = (data_t)rand() / RAND_MAX;
    }

  for (int j = 0; j < J; j++)
    for (int k = 0; k < K; k++) {
      B[j][k] = (data_t)rand() / RAND_MAX;
    }

  for (int i = 0; i < I; i++)
    for (int k = 0; k < K / VEC_LEN; k++) {
      /* Draw positions until NON_ZERO_NUM distinct bits are set. */
      unsigned char offset = 0;
      int n = 0;
      while (n < NON_ZERO_NUM) {      
        int pos = rand() % VEC_LEN;
        /* Skip positions that are already selected. */
        if (offset & (1 << pos)) {
          continue;
        }
        offset = offset | (1 << pos);
        n++;
      }
      A_i[i][k] = offset;

      /* Compress the group in ascending position order. Positions that were
       * not selected are cleared in A: the dense loop nest in the scop below
       * reads A directly, and without this it would compute a different
       * product than the golden model, which only sees A_d and A_i. */
      int non_zero_pos = 0;
      for (int pos = 0; pos < VEC_LEN; pos++) {
        if (offset & (1 << pos)) {
          A_d[i][k * NON_ZERO_NUM + non_zero_pos] = A[i][k * VEC_LEN + pos];
          non_zero_pos++;
        } else {
          A[i][k * VEC_LEN + pos] = 0;
        }
      }      
    }

  for (int i = 0; i < I; i++)
    for (int k = 0; k < K / VEC_LEN; k++) {
      int n;
      for (n = 0; n < NON_ZERO_NUM; n++) {
        A_s[i][k * (NON_ZERO_NUM + META_DATA_NUM) + n] = A_d[i][k * NON_ZERO_NUM + n];
      }
      /* Store the mask in the first byte of a data_t slot. The union is
       * zero-initialized first so the remaining bytes of the slot are
       * deterministic instead of indeterminate. */
      union {data_t d; unsigned char c;} u = {0};
      u.c = A_i[i][k];
      A_s[i][k * (NON_ZERO_NUM + META_DATA_NUM) + n] = u.d;
    }

  /* For polyhedral analysis */
#pragma scop
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C[i][j] = 0;
      for (int k = 0; k < K; k++) {
        C[i][j] = C[i][j] + A[i][k] * B[j][k];
      }
    }
#pragma endscop

  /* Golden model: multiply using only the compressed representation. */
  for (int i = 0; i < I; i++)  
    for (int j = 0; j < J; j++) {
      C_golden[i][j] = 0;
      for (int k = 0; k < K / VEC_LEN; k++) {
        /* Extract the non zero offset */
        int offset[NON_ZERO_NUM];
        unsigned char mask = A_i[i][k];
        int pos = 0;
        int non_zero_pos = 0;
        while (pos < VEC_LEN) {
          unsigned char cur_mask = mask & (1 << pos);
          if (cur_mask) {
            offset[non_zero_pos] = pos;
            non_zero_pos++;
          }
          pos++;
        }
        for (int n = 0; n < NON_ZERO_NUM; n++) {
          C_golden[i][j] += A_d[i][k * NON_ZERO_NUM + n] * B[j][k * VEC_LEN + offset[n]];
        }
      }
    }  

  /* Count entries whose absolute difference exceeds the tolerance. */
  int err = 0;
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      if (fabs((float)C_golden[i][j] - (float)C[i][j]) > 0.001)
        err++;
    }

  if (err)
    printf("Failed with %d errors!\n", err);
  else
    printf("Passed!\n");

  return 0;
}


================================================
FILE: autosa_tests/large/mm_block_sparse/kernel.h
================================================
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

typedef float data_t;
#define I 1024
#define J 1024
#define K 1024

// Sparsity [3:4]
//#define VEC_LEN 4
//#define NON_ZERO_NUM 3
//#define COMPRESS_RATIO (VEC_LEN/NON_ZERO_NUM)
//#define META_DATA_NUM 1
//#define EFF_COMPRESS_RATIO (VEC_LEN/(NON_ZERO_NUM+META_DATA_NUM))

// Sparsity [2:4]
//#define VEC_LEN 4
//#define NON_ZERO_NUM 2
//#define COMPRESS_RATIO (VEC_LEN/NON_ZERO_NUM)
//#define META_DATA_NUM 2
//#define EFF_COMPRESS_RATIO (VEC_LEN/(NON_ZERO_NUM+META_DATA_NUM))

// Sparsity [1:4]
//#define VEC_LEN 4
//#define NON_ZERO_NUM 1
//#define COMPRESS_RATIO (VEC_LEN/NON_ZERO_NUM)
//#define META_DATA_NUM 1
//#define EFF_COMPRESS_RATIO (VEC_LEN/(NON_ZERO_NUM+META_DATA_NUM))

// Sparsity [4:8]: NON_ZERO_NUM elements are kept in every group of VEC_LEN.
// Both ratios below are evaluated with integer division.
// Number of consecutive elements of a row of A that form one group.
#define VEC_LEN 8
// Number of elements kept per group.
#define NON_ZERO_NUM 4
// Divides K to size the second dimension of A_d in kernel.c.
#define COMPRESS_RATIO (VEC_LEN/NON_ZERO_NUM)
// Extra data_t slots per group in A_s, after the NON_ZERO_NUM values;
// kernel.c writes the position mask into the first of them.
#define META_DATA_NUM 4
// Divides K to size the second dimension of A_s in kernel.c.
#define EFF_COMPRESS_RATIO (VEC_LEN/(NON_ZERO_NUM+META_DATA_NUM))

// Sparsity [3:8]
//#define VEC_LEN 8
//#define NON_ZERO_NUM 3
//#define COMPRESS_RATIO (VEC_LEN/NON_ZERO_NUM)
//#define META_DATA_NUM 1
//#define EFF_COMPRESS_RATIO (VEC_LEN/(NON_ZERO_NUM+META_DATA_NUM))

// Sparsity [2:8]
//#define VEC_LEN 8
//#define NON_ZERO_NUM 2
//#define COMPRESS_RATIO (VEC_LEN/NON_ZERO_NUM)
//#define META_DATA_NUM 2
//#define EFF_COMPRESS_RATIO (VEC_LEN/(NON_ZERO_NUM+META_DATA_NUM))

================================================
FILE: autosa_tests/large/mm_block_sparse/simd_info.json
================================================
{
  "kernel0": {
    "reduction": ["y"]
  },
  "kernel1": {
    "reduction": ["y"]
  },
  "kernel2": {
    "reduction": ["y"]
  }, 
  "kernel3": {
    "reduction": ["y"]
  },
  "kernel4": {
    "reduction": ["y"]
  }
}


================================================
FILE: autosa_tests/large/mm_int16/Makefile
================================================
VPP := $(XILINX_VITIS)/bin/v++
EMCONFIGUTIL := $(XILINX_VITIS)/bin/emconfigutil
MODE := hw
#PLATFORM := xilinx_u200_qdma_201920_1
PLATFORM := xilinx_u250_xdma_201830_2

# sources
KERNEL_SRC := src/kernel_kernel.cpp
HOST_SRC := src/kernel_host.cpp

# targets
HOST_EXE := host.exe

XOS := kernel0.$(MODE).xo
XCLBIN := kernel0.$(MODE).xclbin
EMCONFIG_FILE := emconfig.json

# Linker options to map kernel ports to DDR banks
VPP_LINK_OPTS := --config connectivity.cfg

VPP_COMMON_OPTS := -s -t $(MODE) --platform $(PLATFORM) -R2 -O3 --kernel_frequency 250 --vivado.prop=run.impl_1.STRATEGY=Performance_EarlyBlockPlacement
CFLAGS := -g -std=c++11 -I$(XILINX_XRT)/include
LFLAGS := -L$(XILINX_XRT)/lib -lxilinxopencl -lpthread -lrt
NUMDEVICES := 1

# run time args
EXE_OPT := kernel0.$(MODE).xclbin

# primary build targets
# clean and check never produce a file of that name, so they are declared
# phony too: a stray file called "clean" or "check" must not disable them.
.PHONY: xclbin app all clean check

xclbin:  $(XCLBIN)
app: $(HOST_EXE)

all: xclbin app

# Remove build outputs; the leading '-' keeps make going if $(RM) fails.
clean:
	-$(RM) $(EMCONFIG_FILE) $(HOST_EXE) $(XCLBIN) *.xclbin *.xo $(XOS)

# kernel rules
$(XOS): $(KERNEL_SRC)
	$(RM) $@
	$(VPP) $(VPP_COMMON_OPTS) -c -k kernel0 -o $@ $+


$(XCLBIN): $(XOS)
	$(VPP) $(VPP_COMMON_OPTS) -l -o $@ $+ $(VPP_LINK_OPTS)

# host rules
$(HOST_EXE): $(HOST_SRC)
	g++ $(CFLAGS) -o $@ $+ $(LFLAGS)
	@echo 'Compiled Host Executable: $(HOST_EXE)'

$(EMCONFIG_FILE):
	$(EMCONFIGUTIL) --nd $(NUMDEVICES) --od . --platform $(PLATFORM)

check: $(XCLBIN) $(HOST_EXE) $(EMCONFIG_FILE)
	XCL_EMULATION_MODE=${MODE} ./$(HOST_EXE) $(EXE_OPT)


================================================
FILE: autosa_tests/large/mm_int16/README.md
================================================
# Matrix Multiplication in int16 (Large)

Board        | Software Version
-------------|-----------------
Xilinx Alveo U250 | Xilinx Vitis 2019.2

__Files__:
```
autosa_tests/large/mm_int16/kernel.c
autosa_tests/large/mm_int16/kernel.h
autosa_tests/large/mm_int16/simd_info.json
autosa_tests/large/mm_int16/Makefile
autosa_tests/large/mm_int16/connectivity.cfg
```

__Command__:
```bash
./autosa ./autosa_tests/large/mm_int16/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[256,256,32];kernel[]->latency[16,16];kernel[]->simd[32]}" --simd-info=./autosa_tests/large/mm_int16/simd_info.json --host-serialize --data-pack-sizes="{kernel[]->A[32,32,64];kernel[]->B[32,32,64];kernel[]->C[32,32,64]}"
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` and `connectivity.cfg` to the directory `autosa.tmp/output`.

```
cp autosa_tests/large/mm_int16/Makefile autosa.tmp/output/
cp autosa_tests/large/mm_int16/connectivity.cfg autosa.tmp/output/
```

Execute the makefile to build the design.

```
cd autosa.tmp/output
make all
```

================================================
FILE: autosa_tests/large/mm_int16/code.c
================================================
/* Hand-unrolled dot product of local_A[0][0..31] and local_B[0][0..31],
 * summed as a balanced adder tree and accumulated into local_C[c7][c6].
 * Level 4 (add_4_*) adds adjacent products in pairs; levels 3 down to 0 each
 * halve the number of partial sums until add_0_0 holds the total.
 * Every intermediate is an unsigned short, so each partial result is
 * converted to that type before the next level uses it.
 * This fragment has no enclosing function in this file: local_A, local_B,
 * local_C, c6 and c7 must be provided by the code it is pasted into --
 * TODO confirm the intended destination.
 */
unsigned short mul_4_0_0 = local_A[0][0] * local_B[0][0];
unsigned short add_4_0 = mul_4_0_0 + local_A[0][1] * local_B[0][1];
unsigned short mul_4_1_0 = local_A[0][2] * local_B[0][2];
unsigned short add_4_1 = mul_4_1_0 + local_A[0][3] * local_B[0][3];
unsigned short mul_4_2_0 = local_A[0][4] * local_B[0][4];
unsigned short add_4_2 = mul_4_2_0 + local_A[0][5] * local_B[0][5];
unsigned short mul_4_3_0 = local_A[0][6] * local_B[0][6];
unsigned short add_4_3 = mul_4_3_0 + local_A[0][7] * local_B[0][7];
unsigned short mul_4_4_0 = local_A[0][8] * local_B[0][8];
unsigned short add_4_4 = mul_4_4_0 + local_A[0][9] * local_B[0][9];
unsigned short mul_4_5_0 = local_A[0][10] * local_B[0][10];
unsigned short add_4_5 = mul_4_5_0 + local_A[0][11] * local_B[0][11];
unsigned short mul_4_6_0 = local_A[0][12] * local_B[0][12];
unsigned short add_4_6 = mul_4_6_0 + local_A[0][13] * local_B[0][13];
unsigned short mul_4_7_0 = local_A[0][14] * local_B[0][14];
unsigned short add_4_7 = mul_4_7_0 + local_A[0][15] * local_B[0][15];
unsigned short mul_4_8_0 = local_A[0][16] * local_B[0][16];
unsigned short add_4_8 = mul_4_8_0 + local_A[0][17] * local_B[0][17];
unsigned short mul_4_9_0 = local_A[0][18] * local_B[0][18];
unsigned short add_4_9 = mul_4_9_0 + local_A[0][19] * local_B[0][19];
unsigned short mul_4_10_0 = local_A[0][20] * local_B[0][20];
unsigned short add_4_10 = mul_4_10_0 + local_A[0][21] * local_B[0][21];
unsigned short mul_4_11_0 = local_A[0][22] * local_B[0][22];
unsigned short add_4_11 = mul_4_11_0 + local_A[0][23] * local_B[0][23];
unsigned short mul_4_12_0 = local_A[0][24] * local_B[0][24];
unsigned short add_4_12 = mul_4_12_0 + local_A[0][25] * local_B[0][25];
unsigned short mul_4_13_0 = local_A[0][26] * local_B[0][26];
unsigned short add_4_13 = mul_4_13_0 + local_A[0][27] * local_B[0][27];
unsigned short mul_4_14_0 = local_A[0][28] * local_B[0][28];
unsigned short add_4_14 = mul_4_14_0 + local_A[0][29] * local_B[0][29];
unsigned short mul_4_15_0 = local_A[0][30] * local_B[0][30];
unsigned short add_4_15 = mul_4_15_0 + local_A[0][31] * local_B[0][31];
unsigned short add_3_0 = add_4_0 + add_4_1;
unsigned short add_3_1 = add_4_2 + add_4_3;
unsigned short add_3_2 = add_4_4 + add_4_5;
unsigned short add_3_3 = add_4_6 + add_4_7;
unsigned short add_3_4 = add_4_8 + add_4_9;
unsigned short add_3_5 = add_4_10 + add_4_11;
unsigned short add_3_6 = add_4_12 + add_4_13;
unsigned short add_3_7 = add_4_14 + add_4_15;
unsigned short add_2_0 = add_3_0 + add_3_1;
unsigned short add_2_1 = add_3_2 + add_3_3;
unsigned short add_2_2 = add_3_4 + add_3_5;
unsigned short add_2_3 = add_3_6 + add_3_7;
unsigned short add_1_0 = add_2_0 + add_2_1;
unsigned short add_1_1 = add_2_2 + add_2_3;
unsigned short add_0_0 = add_1_0 + add_1_1;
local_C[c7][c6] += add_0_0;


================================================
FILE: autosa_tests/large/mm_int16/connectivity.cfg
================================================
[connectivity]
sp=kernel0_1.A:DDR[0]
sp=kernel0_1.B:DDR[1] 
sp=kernel0_1.C:DDR[3]


================================================
FILE: autosa_tests/large/mm_int16/hls_script.tcl
================================================
############################################################
## This file is generated automatically by Vivado HLS.
## Please DO NOT edit it.
## Copyright (C) 1986-2019 Xilinx, Inc. All Rights Reserved.
############################################################
open_project hls_prj
set_top kernel0
add_files src/kernel_kernel.h
add_files src/kernel_kernel.cpp
add_files -tb src/kernel_host.cpp
open_solution "solution1"
set_part {xcu200-fsgd2104-2-e}
create_clock -period 5 -name default
config_compile -name_max_length 50
#source "./prj/solution1/directives.tcl"
csim_design
#csynth_design
#cosim_design
#cosim_design -trace_level all
#cosim_design -setup -trace_level all
#export_design -format ip_catalog
exit


================================================
FILE: autosa_tests/large/mm_int16/kernel.c
================================================
#include "kernel.h"

/* Test driver for the integer matrix multiplication
 * C[i][j] = sum over k of A[i][k] * B[j][k], with B stored as B[J][K].
 *
 * Fills A and B with pseudo-random integers in [0, 99], computes C inside the
 * `#pragma scop` region, recomputes the product into C_golden, and prints
 * whether any entry differs. Always returns 0.
 */
int main(int argc, char **argv) {
  static data_t A[I][K], B[J][K], C[I][J], C_golden[I][J];

  /* Operands. A is filled completely before B, row by row. */
  for (int row = 0; row < I; row++) {
    for (int col = 0; col < K; col++)
      A[row][col] = rand() % 100;
  }
  for (int row = 0; row < J; row++) {
    for (int col = 0; col < K; col++)
      B[row][col] = rand() % 100;
  }

  /* Region under test; kept exactly as written. */
#pragma scop
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C[i][j] = 0;
      for (int k = 0; k < K; k++) {
        C[i][j] = C[i][j] + A[i][k] * B[j][k];
      }
    }
#pragma endscop

  /* Golden model: accumulate each entry in a data_t local, so every partial
   * sum is converted to data_t exactly as the in-place update would be. */
  for (int row = 0; row < I; row++) {
    for (int col = 0; col < J; col++) {
      data_t sum = 0;
      for (int red = 0; red < K; red++)
        sum += A[row][red] * B[col][red];
      C_golden[row][col] = sum;
    }
  }

  /* Count the entries that do not match. */
  int mismatches = 0;
  for (int row = 0; row < I; row++) {
    for (int col = 0; col < J; col++)
      mismatches += abs(C_golden[row][col] - C[row][col]) > 0.001;
  }

  if (mismatches == 0)
    printf("Passed!\n");
  else
    printf("Failed with %d errors!\n", mismatches);

  return 0;
}


================================================
FILE: autosa_tests/large/mm_int16/kernel.h
================================================
#include "stdio.h"
#include "stdlib.h"
#include "math.h"

// Element type of the A, B, C and C_golden arrays declared in kernel.c.
typedef unsigned short data_t;
// Problem size: kernel.c computes C[I][J] with a reduction over K.
#define I 1024
#define J 1024
#define K 1024 

================================================
FILE: autosa_tests/large/mm_int16/simd_info.json
================================================
{
  "kernel0": {
    "reduction": ["y"]
  },
  "kernel1": {
    "reduction": ["y"]
  },
  "kernel3": {
    "reduction": ["y"]
  }
}


================================================
FILE: autosa_tests/large/mm_int16/step1-run-hls.tcl
================================================
# Step 1 of the flow: Vivado HLS batch script.
# Creates project "kernel0" with top function kernel0 from
# src/kernel_kernel.cpp and runs C synthesis only; C simulation stays disabled.
open_project kernel0
set_top kernel0
add_files "src/kernel_kernel.cpp"
#add_files -tb PATH_TO_TESTBENCH_FILE

open_solution solution

# Target part: keep exactly one set_part line active.
# u250
set_part xcu250-figd2104-2L-e

# u280
#set_part xcu280-fsvh2892-2L-e

# 300 MHz (3.333 ns clock period)
create_clock -period 3.333

config_dataflow -strict_mode warning
set_clock_uncertainty 27.000000%
config_rtl -enable_maxiConservative=1
config_interface -m_axi_addr64

# to enable integration with Vitis
config_sdx -target xocc

#csim_design
csynth_design
close_project
exit


================================================
FILE: autosa_tests/large/mm_int16/step2-autobridge.py
================================================
#! /usr/bin/python3.6

# add the path to where you place the autobridge source code
import sys
sys.path.append('../src')

import graph
from formator import FormatHLS
import collections
import os
import subprocess

"""
AutoBridge divides the target device as follows and assign each HLS function to one slot
For more details pls refer to the paper

      u250                     u280
   -----------
 3 |    |    |
   |----|----|              |----|----|
 2 |    |    |            2 |    |    |
   |----|----|              |----|----|
 1 |    |    |            1 |    |    |
   |----|----|              |----|----|
 0 |    |    |            0 |    |    |
   -----------              -----------
     0    1                   0    1
"""

################### Modify Accordingly ###############################

# (1) fill basic information
# NOTE: the paths below are absolute paths from the original author's machine;
# replace them with the location of your own HLS project.
project_path = '/home/jaywang/doc_examples/mm_int16_ab/kernel0' # path to your hls project
#project_path = '/home/jaywang/doc_examples/mm_ab/kernel0' # path to your hls project
top_name = 'kernel0' # name of the top function in your hls design
# solution_path already ends with '/'; the paths built from it further down
# prepend another '/', which yields a harmless double slash.
solution_path = f'{project_path}/solution/'
project_name = 'kernel0' # name of the sub-directory the project is copied to inside target_dir
board_name = 'u250' # or 'u280'
# The results are saved to `target_dir`, which is assigned in the
# "DON'T TOUCH" section further down. Your HLS project will be copied there
# and your top RTL will be replaced.
# Note that if that directory already exists, it is removed and recreated.
# (2) specify how your designs connect to the external memory
""" Example:

void kernel0(ap_uint<512> *p1, ap_uint<512> *p2)
{
  #pragma HLS INTERFACE m_axi port=p1 offset=slave bundle=gmem_A
  #pragma HLS INTERFACE m_axi port=p2 offset=slave bundle=gmem_B

  load_p1 (p1, ...);
  load_p2 (p2, ...);
}

--------------------------------------

In this example, the pointers p1 and p2 will become M_AXI controllers that connect to the dedicated DDR IPs.
If you want p1 to connect to DDR 2 in the 2nd SLR, you need to specify that the corresponding RTL controller must be floorplanned in the 2nd SLR.
Meanwhile, your function load_p1() also talks to the M_AXI controller through an AXI interface, which cannot be easily pipelined.
Thus the RTL module corresponding to load_p1() must also be placed in the 2nd SLR in this example.
Since load_p1() communicates with the rest of your design through FIFO interfaces, you don't need to specify the location of the other modules.

(transparent)|                        (user visible)
             |
   Vitis     |                    what your HLS design becomes
             |
             | M_AXI                     AXI                        FIFO
DDR IP  <--- | ----> M_AXI controller <-------> your first module <-------> your other modules
(fixed loc)  |         (p1)                       (load_p1)
             |
             | M_AXI                     AXI                        FIFO
DDR IP  <--- | ----> M_AXI controller <-------> your first module <-------> your other modules
(fixed loc)  |         (p2)                       (load_p2)
             |
             | S_AXI
PCIe    <--- | ----> S_AXI controller
             |
"""

# Column of the slot each pinned module goes to: the left side (0) or the
# right side (1) of an SLR. Keyed by RTL instance name.
# A missing key yields an empty dict because of defaultdict(dict).
DDR_loc_2d_x = collections.defaultdict(dict)

# Row of the slot each pinned module goes to, i.e. on which SLR.
DDR_loc_2d_y = collections.defaultdict(dict)

# use DDR 0, 1, 3
# Each array gets two entries pinned to the same slot: the *_serialize module
# and the m_axi controller of the matching gmem bundle.
# A -> SLR 0, left column
DDR_loc_2d_y['A_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_x['A_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_A_m_axi_U'] = 0
DDR_loc_2d_x['kernel0_gmem_A_m_axi_U'] = 0

# B -> SLR 1, left column
DDR_loc_2d_y['B_IO_L3_in_serialize_U0'] = 1
DDR_loc_2d_x['B_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_B_m_axi_U'] = 1
DDR_loc_2d_x['kernel0_gmem_B_m_axi_U'] = 0

# C -> SLR 3, left column
DDR_loc_2d_y['C_drain_IO_L3_out_serialize_U0'] = 3
DDR_loc_2d_x['C_drain_IO_L3_out_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_C_m_axi_U'] = 3
DDR_loc_2d_x['kernel0_gmem_C_m_axi_U'] = 0

# The s_axi control interface is only given a row (SLR 0); no column is set.
DDR_loc_2d_y['kernel0_control_s_axi_U'] = 0

# (3) specify DDR information
# If you instantiate a DDR controller, it will consume a non-trivial amount of resource.
# To make the floorplanning better, you need to specify which DDRs have been enabled.
# This configuration enables DDR 0, 1 and 3 (used by A, B and C above) and leaves DDR 2 disabled.
# If you want to use all DDRs, for example, you need to set it as [1, 1, 1, 1]
DDR_enable = [1, 1, 0, 1]

# (4) specify how much resource can be used in each slot
# In this way you could force the design to be placed evenly across the device and avoid local congestion
""" Example:
   -----------
 3 |0.76|0.62|
   |----|----|
 2 |0.74|0.61|
   |----|----|
 1 |0.75|0.6 |
   |----|----|
 0 | 0.7|0.6 |
   -----------
     0    1
"""
# Four rows of two ratios each.
# NOTE(review): presumably indexed [row][column] with row 0 first, matching
# the diagram read bottom-up -- confirm against FormatHLS.
max_usage_ratio_2d = [ [0.85, 0.7], [0.85, 0.7], [0.85, 0.85], [0.85, 0.7] ]


##################### DON'T TOUCH THE SECTION BELOW #################################
# Directory that receives the floorplanned copy of the HLS project.
target_dir = '/home/jaywang/doc_examples/mm_int16_ab/autobridge'

# Everything FormatHLS needs: where the HLS reports / schedule database / top
# RTL live, plus the user configuration assembled above.
formator_settings = {
  'rpt_path': f'{solution_path}/syn/report/',
  'hls_sche_path': f'{solution_path}/.autopilot/db/',
  'top_hdl_path': f'{solution_path}/syn/verilog/{top_name}_{top_name}.v',
  'top_name': top_name,
  'DDR_loc_2d_x': DDR_loc_2d_x,
  'DDR_loc_2d_y': DDR_loc_2d_y,
  'DDR_enable': DDR_enable,
  'max_usage_ratio_2d': max_usage_ratio_2d,
  'board_name': board_name,
  'target_dir': target_dir,
  # how many levels of relay stations to add for x-unit of crossing
  'relay_station_count': lambda x : 2 * x,
  'max_search_time': 600,
  'NaiveBalance': True,
}
formator = FormatHLS(**formator_settings)

# run floorplanning
g = graph.Graph(formator)

# move results to target dir: start from an empty directory ...
if os.path.isdir(target_dir):
  subprocess.run(['rm', '-rf', f'{target_dir}'])

# ... then copy the project, an archive of this script and the artifacts found
# in the current working directory. The commands run strictly in this order.
publish_commands = [
  ['mkdir', f'{target_dir}/'],
  ['cp', '-r', project_path, f'{target_dir}/{project_name}'],
  ['cp', os.path.realpath(__file__), f'{target_dir}/archived_source.txt'],
  ['chmod', '+w', '-R', f'{target_dir}'],
  ['cp', 'constraint.tcl', target_dir],
  ['cp', 'pack_xo.tcl', target_dir],
  ['cp', 'autobridge.log', target_dir],
  # overwrite the top-level RTL inside the copied project
  ['cp', f'{top_name}_{top_name}.v', f'{target_dir}/{project_name}/solution/syn/verilog/'],
]
for command in publish_commands:
  subprocess.run(command)

# clean up temporary files left in the current working directory
os.system('rm *.lp')  # goes through the shell so that the glob is expanded
cleanup_commands = (
  ['rm', 'parser.out'],
  ['rm', 'parsetab.py'],
  ['rm', '-rf', '__pycache__'],
)
for command in cleanup_commands:
  subprocess.run(command)


================================================
FILE: autosa_tests/large/mm_int16/step3-pack-xo.tcl
================================================
# Vivado HLS batch script: re-opens the kernel0 project / solution and exports
# the design as kernel0.xo.
open_project kernel0
open_solution solution
# Export the Verilog RTL in ip_catalog format and write it to kernel0.xo.
export_design -rtl verilog -format ip_catalog -xo kernel0.xo

close_project
puts "Pack XO successfully"
exit


================================================
FILE: autosa_tests/large/mm_int16/step4-run-vitis.sh
================================================
#!/bin/bash
# Links the packed kernel (kernel0.xo) into an .xclbin with v++, applying the
# floorplan constraints from constraint.tcl in the current directory.
# Run it from the directory that contains kernel0.xo and constraint.tcl.

OUTPUT_DIR="$(pwd)/vitis_run"

# name of the top function
TOP=kernel0

# choose the target device
PLATFORM=xilinx_u250_xdma_201830_2
#PLATFORM=xilinx_u280_xdma_201920_3

XO="$(pwd)/kernel0.xo"

# For different approaches see UG904-vivado-implementation
#STRATEGY="Default"
STRATEGY="EarlyBlockPlacement"

# The ARG_FOR_DDR_<n> variables are 1-based: ARG_FOR_DDR_<n> names the kernel
# argument that is mapped to DDR[n-1] in the v++ call below.
#
# Remove the unused '--connectivity.sp' option for v++ if some DDRs are not used.
# Example: if we map p1 to DDR[3] and p2 to DDR[0]
#
# void kernel0(ap_uint<512> *p1, ap_uint<512> *p2)
# {
#   #pragma HLS INTERFACE m_axi port=p1 offset=slave bundle=gmem_A
#   #pragma HLS INTERFACE m_axi port=p2 offset=slave bundle=gmem_B
#
#   load_p1 (p1, ...);
#   load_p2 (p2, ...);
# }
#
# ARG_FOR_DDR_1=p2
# ARG_FOR_DDR_4=p1
# and the '--connectivity.sp' lines for DDR[1] and DDR[2] should be removed.

ARG_FOR_DDR_1=A
ARG_FOR_DDR_2=B
#ARG_FOR_DDR_3="YOUR_HLS_ARGUMENT_NAME_FOR_DDR_3"
ARG_FOR_DDR_4=C

# the constraint file containing the floorplan results
# WARNING: must use absolute address
CONSTRAINT="$(pwd)/constraint.tcl"
if [ ! -f "$CONSTRAINT" ]; then
    # Report on stderr and fail with a non-zero status so that callers
    # (scripts, CI) can detect the problem.
    echo "no constraint file found" >&2
    exit 1
fi

# All expansions are quoted: paths may contain spaces, and an unquoted
# 'DDR[0]' is a glob pattern that the shell could expand to a matching file.
v++ \
  --link \
  --output "${OUTPUT_DIR}/${TOP}_${PLATFORM}.xclbin" \
  --kernel "${TOP}" \
  --platform "${PLATFORM}" \
  --target hw \
  --report_level 2 \
  --temp_dir "${OUTPUT_DIR}/${TOP}_${PLATFORM}.temp" \
  --optimize 3 \
  --connectivity.nk "${TOP}:1:${TOP}_1" \
  --max_memory_ports "${TOP}" \
  --save-temps \
  "${XO}" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_1}:DDR[0]" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_2}:DDR[1]" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_4}:DDR[3]" \
  --kernel_frequency 300 \
  --vivado.prop "run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=${STRATEGY}" \
  --vivado.prop "run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=${CONSTRAINT}"


================================================
FILE: autosa_tests/large/mm_int16/unroll.py
================================================
import math

# Modify the parameters here
UNROLL_FACTOR = 32
DATA_T = 'unsigned short'


def generate_reduction_tree(unroll_factor=UNROLL_FACTOR, data_type=DATA_T):
    """Generate the C statements of an unrolled multiply / adder-tree reduction.

    The deepest layer multiplies local_A[0][k] * local_B[0][k] for
    k in [0, unroll_factor) and sums the products pairwise; every following
    layer adds two results of the layer below until a single value, add_0_0,
    remains. That value is accumulated into local_C[c7][c6].

    Args:
        unroll_factor: number of multiplied element pairs. Must be an integer
            power of two and at least 2, otherwise the pairwise tree cannot
            be built.
        data_type: C type used for every generated temporary.

    Returns:
        The generated statements as a list of strings, one statement each.

    Raises:
        ValueError: if unroll_factor is not an integer power of two >= 2.
    """
    if (not isinstance(unroll_factor, int) or unroll_factor < 2
            or unroll_factor & (unroll_factor - 1)):
        raise ValueError(
            f'UNROLL_FACTOR must be an integer power of two >= 2, got {unroll_factor!r}')

    # Exact log2 for a power of two; avoids the float rounding of math.log2.
    level = unroll_factor.bit_length() - 1
    lines = []
    for layer in range(level - 1, -1, -1):
        for i in range(1 << layer):
            if layer == level - 1:
                # Deepest layer: two products per node. The first product gets
                # its own named temporary.
                lines.append(
                    f'{data_type} mul_{layer}_{i}_0 = local_A[0][{i*2}] * local_B[0][{i*2}];')
                lines.append(
                    f'{data_type} add_{layer}_{i} = mul_{layer}_{i}_0 + local_A[0][{i*2+1}] * local_B[0][{i*2+1}];')
            else:
                # Inner layers: add the two children from the layer below.
                lines.append(
                    f'{data_type} add_{layer}_{i} = add_{layer+1}_{i*2} + add_{layer+1}_{i*2+1};')
    lines.append('local_C[c7][c6] += add_0_0;')
    return lines


# Generate the code
if __name__ == '__main__':
    print('\n'.join(generate_reduction_tree()))


================================================
FILE: autosa_tests/large/mm_int8/Makefile
================================================
# Builds the kernel0 FPGA binary (.xo -> .xclbin) with v++ and the host
# executable with g++. Expects XILINX_VITIS and XILINX_XRT to be set in the
# environment.
VPP := $(XILINX_VITIS)/bin/v++
EMCONFIGUTIL := $(XILINX_VITIS)/bin/emconfigutil
# Passed to v++ via -t and used in the names of the generated .xo / .xclbin.
MODE := hw
#PLATFORM := xilinx_u200_qdma_201920_1
PLATFORM := xilinx_u250_xdma_201830_2

# sources
KERNEL_SRC := src/kernel_kernel.cpp
HOST_SRC := src/kernel_host.cpp

# targets
HOST_EXE := host.exe

XOS := kernel0.$(MODE).xo
XCLBIN := kernel0.$(MODE).xclbin
EMCONFIG_FILE := emconfig.json

# Linker options to map kernel ports to DDR banks
VPP_LINK_OPTS := --config connectivity.cfg

VPP_COMMON_OPTS := -s -t $(MODE) --platform $(PLATFORM) -R2 -O3 --kernel_frequency 250 --vivado.prop=run.impl_1.STRATEGY=Performance_EarlyBlockPlacement
CFLAGS := -g -std=c++11 -I$(XILINX_XRT)/include
LFLAGS := -L$(XILINX_XRT)/lib -lxilinxopencl -lpthread -lrt
NUMDEVICES := 1

# run time args
EXE_OPT := kernel0.$(MODE).xclbin

# primary build targets
# clean and check never create a file of their own name, so they are declared
# phony too; otherwise a stray file called "clean" or "check" would silently
# disable them.
.PHONY: xclbin app all clean check

xclbin:  $(XCLBIN)
app: $(HOST_EXE)

all: xclbin app

# The leading '-' lets make continue even if the removal fails.
clean:
	-$(RM) $(EMCONFIG_FILE) $(HOST_EXE) $(XCLBIN) *.xclbin *.xo $(XOS)

# kernel rules
# Compile the kernel source into a Xilinx object file.
$(XOS): $(KERNEL_SRC)
	$(RM) $@
	$(VPP) $(VPP_COMMON_OPTS) -c -k kernel0 -o $@ $+


# Link the object file into the device binary, applying the DDR mapping.
$(XCLBIN): $(XOS)
	$(VPP) $(VPP_COMMON_OPTS) -l -o $@ $+ $(VPP_LINK_OPTS)

# host rules
$(HOST_EXE): $(HOST_SRC)
	g++ $(CFLAGS) -o $@ $+ $(LFLAGS)
	@echo 'Compiled Host Executable: $(HOST_EXE)'

$(EMCONFIG_FILE):
	$(EMCONFIGUTIL) --nd $(NUMDEVICES) --od . --platform $(PLATFORM)

# Run the host program against the built binary.
# NOTE(review): XCL_EMULATION_MODE is also set when MODE is hw -- confirm this
# is intended for runs on real hardware.
check: $(XCLBIN) $(HOST_EXE) $(EMCONFIG_FILE)
	XCL_EMULATION_MODE=${MODE} ./$(HOST_EXE) $(EXE_OPT)


================================================
FILE: autosa_tests/large/mm_int8/README.md
================================================
# Matrix Multiplication in int8 (Large)

Board        | Software Version
-------------|-----------------
Xilinx Alveo U250 | Xilinx Vitis 2019.2

__Files__:
```
autosa_tests/large/mm_int8/kernel.c
autosa_tests/large/mm_int8/kernel.h
autosa_tests/large/mm_int8/simd_info.json
autosa_tests/large/mm_int8/Makefile
autosa_tests/large/mm_int8/connectivity.cfg
```

__Command__:
```bash
./autosa ./autosa_tests/large/mm_int8/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[264,256,64];kernel[]->latency[11,32];kernel[]->simd[64]}" --simd-info=./autosa_tests/large/mm_int8/simd_info.json --host-serialize --data-pack-sizes="{kernel[]->A[32,32,64];kernel[]->B[32,32,64];kernel[]->C[32,32,64]}"
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` and `connectivity.cfg` to the directory `autosa.tmp/output`.

```
cp autosa_tests/large/mm_int8/Makefile autosa.tmp/output/
cp autosa_tests/large/mm_int8/connectivity.cfg autosa.tmp/output/
```

Execute the makefile to build the design.

```
cd autosa.tmp/output
make all
```

================================================
FILE: autosa_tests/large/mm_int8/code.c
================================================
// Code fragment (not a standalone translation unit): local_A, local_B,
// local_C, c6 and c7 are expected to be declared by the enclosing code.
//
// Level 5 of an unrolled 64-element dot product of local_A[0][0..63] and
// local_B[0][0..63]. Node i (0..31) handles elements 2i and 2i+1:
//   mul_5_i_0 = product of element pair 2i (named so a pragma can refer to it)
//   add_5_i   = mul_5_i_0 + product of element pair 2i+1
// Every intermediate is stored in a `char`.
char mul_5_0_0 = local_A[0][0] * local_B[0][0];
char add_5_0 = mul_5_0_0 + local_A[0][1] * local_B[0][1];
char mul_5_1_0 = local_A[0][2] * local_B[0][2];
char add_5_1 = mul_5_1_0 + local_A[0][3] * local_B[0][3];
char mul_5_2_0 = local_A[0][4] * local_B[0][4];
char add_5_2 = mul_5_2_0 + local_A[0][5] * local_B[0][5];
char mul_5_3_0 = local_A[0][6] * local_B[0][6];
char add_5_3 = mul_5_3_0 + local_A[0][7] * local_B[0][7];
char mul_5_4_0 = local_A[0][8] * local_B[0][8];
char add_5_4 = mul_5_4_0 + local_A[0][9] * local_B[0][9];
char mul_5_5_0 = local_A[0][10] * local_B[0][10];
char add_5_5 = mul_5_5_0 + local_A[0][11] * local_B[0][11];
char mul_5_6_0 = local_A[0][12] * local_B[0][12];
char add_5_6 = mul_5_6_0 + local_A[0][13] * local_B[0][13];
char mul_5_7_0 = local_A[0][14] * local_B[0][14];
char add_5_7 = mul_5_7_0 + local_A[0][15] * local_B[0][15];
char mul_5_8_0 = local_A[0][16] * local_B[0][16];
char add_5_8 = mul_5_8_0 + local_A[0][17] * local_B[0][17];
char mul_5_9_0 = local_A[0][18] * local_B[0][18];
char add_5_9 = mul_5_9_0 + local_A[0][19] * local_B[0][19];
char mul_5_10_0 = local_A[0][20] * local_B[0][20];
char add_5_10 = mul_5_10_0 + local_A[0][21] * local_B[0][21];
char mul_5_11_0 = local_A[0][22] * local_B[0][22];
char add_5_11 = mul_5_11_0 + local_A[0][23] * local_B[0][23];
char mul_5_12_0 = local_A[0][24] * local_B[0][24];
char add_5_12 = mul_5_12_0 + local_A[0][25] * local_B[0][25];
char mul_5_13_0 = local_A[0][26] * local_B[0][26];
char add_5_13 = mul_5_13_0 + local_A[0][27] * local_B[0][27];
char mul_5_14_0 = local_A[0][28] * local_B[0][28];
char add_5_14 = mul_5_14_0 + local_A[0][29] * local_B[0][29];
char mul_5_15_0 = local_A[0][30] * local_B[0][30];
char add_5_15 = mul_5_15_0 + local_A[0][31] * local_B[0][31];
char mul_5_16_0 = local_A[0][32] * local_B[0][32];
char add_5_16 = mul_5_16_0 + local_A[0][33] * local_B[0][33];
char mul_5_17_0 = local_A[0][34] * local_B[0][34];
char add_5_17 = mul_5_17_0 + local_A[0][35] * local_B[0][35];
char mul_5_18_0 = local_A[0][36] * local_B[0][36];
char add_5_18 = mul_5_18_0 + local_A[0][37] * local_B[0][37];
char mul_5_19_0 = local_A[0][38] * local_B[0][38];
char add_5_19 = mul_5_19_0 + local_A[0][39] * local_B[0][39];
char mul_5_20_0 = local_A[0][40] * local_B[0][40];
char add_5_20 = mul_5_20_0 + local_A[0][41] * local_B[0][41];
char mul_5_21_0 = local_A[0][42] * local_B[0][42];
char add_5_21 = mul_5_21_0 + local_A[0][43] * local_B[0][43];
char mul_5_22_0 = local_A[0][44] * local_B[0][44];
char add_5_22 = mul_5_22_0 + local_A[0][45] * local_B[0][45];
char mul_5_23_0 = local_A[0][46] * local_B[0][46];
char add_5_23 = mul_5_23_0 + local_A[0][47] * local_B[0][47];
char mul_5_24_0 = local_A[0][48] * local_B[0][48];
char add_5_24 = mul_5_24_0 + local_A[0][49] * local_B[0][49];
char mul_5_25_0 = local_A[0][50] * local_B[0][50];
char add_5_25 = mul_5_25_0 + local_A[0][51] * local_B[0][51];
char mul_5_26_0 = local_A[0][52] * local_B[0][52];
char add_5_26 = mul_5_26_0 + local_A[0][53] * local_B[0][53];
char mul_5_27_0 = local_A[0][54] * local_B[0][54];
char add_5_27 = mul_5_27_0 + local_A[0][55] * local_B[0][55];
char mul_5_28_0 = local_A[0][56] * local_B[0][56];
char add_5_28 = mul_5_28_0 + local_A[0][57] * local_B[0][57];
char mul_5_29_0 = local_A[0][58] * local_B[0][58];
char add_5_29 = mul_5_29_0 + local_A[0][59] * local_B[0][59];
char mul_5_30_0 = local_A[0][60] * local_B[0][60];
char add_5_30 = mul_5_30_0 + local_A[0][61] * local_B[0][61];
char mul_5_31_0 = local_A[0][62] * local_B[0][62];
char add_5_31 = mul_5_31_0 + local_A[0][63] * local_B[0][63];
// Adder tree, levels 4 down to 0: add_L_i = add_(L+1)_(2i) + add_(L+1)_(2i+1).
// The node count halves per level (16, 8, 4, 2, 1); add_0_0 is the total of
// all 32 level-5 results. Each sum is stored in a `char`.
char add_4_0 = add_5_0 + add_5_1;
char add_4_1 = add_5_2 + add_5_3;
char add_4_2 = add_5_4 + add_5_5;
char add_4_3 = add_5_6 + add_5_7;
char add_4_4 = add_5_8 + add_5_9;
char add_4_5 = add_5_10 + add_5_11;
char add_4_6 = add_5_12 + add_5_13;
char add_4_7 = add_5_14 + add_5_15;
char add_4_8 = add_5_16 + add_5_17;
char add_4_9 = add_5_18 + add_5_19;
char add_4_10 = add_5_20 + add_5_21;
char add_4_11 = add_5_22 + add_5_23;
char add_4_12 = add_5_24 + add_5_25;
char add_4_13 = add_5_26 + add_5_27;
char add_4_14 = add_5_28 + add_5_29;
char add_4_15 = add_5_30 + add_5_31;
char add_3_0 = add_4_0 + add_4_1;
char add_3_1 = add_4_2 + add_4_3;
char add_3_2 = add_4_4 + add_4_5;
char add_3_3 = add_4_6 + add_4_7;
char add_3_4 = add_4_8 + add_4_9;
char add_3_5 = add_4_10 + add_4_11;
char add_3_6 = add_4_12 + add_4_13;
char add_3_7 = add_4_14 + add_4_15;
char add_2_0 = add_3_0 + add_3_1;
char add_2_1 = add_3_2 + add_3_3;
char add_2_2 = add_3_4 + add_3_5;
char add_2_3 = add_3_6 + add_3_7;
char add_1_0 = add_2_0 + add_2_1;
char add_1_1 = add_2_2 + add_2_3;
char add_0_0 = add_1_0 + add_1_1;
// Request the Mul_LUT core for each named product mul_5_0_0 .. mul_5_31_0.
// Only the first product of every level-5 node has a name; the second
// product, computed inline in add_5_i, is not covered by any pragma here.
#pragma HLS RESOURCE variable=mul_5_0_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_1_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_2_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_3_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_4_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_5_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_6_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_7_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_8_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_9_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_10_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_11_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_12_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_13_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_14_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_15_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_16_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_17_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_18_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_19_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_20_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_21_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_22_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_23_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_24_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_25_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_26_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_27_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_28_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_29_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_30_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_31_0 core=Mul_LUT
// Request the AddSub core for the adder-tree nodes of levels 4 .. 0.
// The level-5 sums (add_5_*) are not listed here.
#pragma HLS RESOURCE variable=add_4_0 core=AddSub
#pragma HLS RESOURCE variable=add_4_1 core=AddSub
#pragma HLS RESOURCE variable=add_4_2 core=AddSub
#pragma HLS RESOURCE variable=add_4_3 core=AddSub
#pragma HLS RESOURCE variable=add_4_4 core=AddSub
#pragma HLS RESOURCE variable=add_4_5 core=AddSub
#pragma HLS RESOURCE variable=add_4_6 core=AddSub
#pragma HLS RESOURCE variable=add_4_7 core=AddSub
#pragma HLS RESOURCE variable=add_4_8 core=AddSub
#pragma HLS RESOURCE variable=add_4_9 core=AddSub
#pragma HLS RESOURCE variable=add_4_10 core=AddSub
#pragma HLS RESOURCE variable=add_4_11 core=AddSub
#pragma HLS RESOURCE variable=add_4_12 core=AddSub
#pragma HLS RESOURCE variable=add_4_13 core=AddSub
#pragma HLS RESOURCE variable=add_4_14 core=AddSub
#pragma HLS RESOURCE variable=add_4_15 core=AddSub
#pragma HLS RESOURCE variable=add_3_0 core=AddSub
#pragma HLS RESOURCE variable=add_3_1 core=AddSub
#pragma HLS RESOURCE variable=add_3_2 core=AddSub
#pragma HLS RESOURCE variable=add_3_3 core=AddSub
#pragma HLS RESOURCE variable=add_3_4 core=AddSub
#pragma HLS RESOURCE variable=add_3_5 core=AddSub
#pragma HLS RESOURCE variable=add_3_6 core=AddSub
#pragma HLS RESOURCE variable=add_3_7 core=AddSub
#pragma HLS RESOURCE variable=add_2_0 core=AddSub
#pragma HLS RESOURCE variable=add_2_1 core=AddSub
#pragma HLS RESOURCE variable=add_2_2 core=AddSub
#pragma HLS RESOURCE variable=add_2_3 core=AddSub
#pragma HLS RESOURCE variable=add_1_0 core=AddSub
#pragma HLS RESOURCE variable=add_1_1 core=AddSub
#pragma HLS RESOURCE variable=add_0_0 core=AddSub
// Accumulate the 64-element partial dot product into the output element.
local_C[c7][c6] += add_0_0;


================================================
FILE: autosa_tests/large/mm_int8/connectivity.cfg
================================================
[connectivity]
sp=kernel0_1.A:DDR[0]
sp=kernel0_1.B:DDR[1] 
sp=kernel0_1.C:DDR[3]


================================================
FILE: autosa_tests/large/mm_int8/hls_script.tcl
================================================
############################################################
## This file is generated automatically by Vivado HLS.
## Please DO NOT edit it.
## Copyright (C) 1986-2019 Xilinx, Inc. All Rights Reserved.
############################################################
# Runs C simulation (csim_design) of kernel0 with src/kernel_host.cpp as the
# test bench; synthesis, co-simulation and export are left commented out.
open_project hls_prj
set_top kernel0
add_files src/kernel_kernel.h
add_files src/kernel_kernel.cpp
add_files -tb src/kernel_host.cpp
open_solution "solution1"
# NOTE(review): this selects an xcu200 part while the Makefile of this test
# targets a u250 platform -- confirm which device is intended.
set_part {xcu200-fsgd2104-2-e}
# 5 ns period (200 MHz); the Makefile of this test requests --kernel_frequency 250.
create_clock -period 5 -name default
config_compile -name_max_length 50
#source "./prj/solution1/directives.tcl"
csim_design
#csynth_design
#cosim_design
#cosim_design -trace_level all
#cosim_design -setup -trace_level all
#export_design -format ip_catalog
exit


================================================
FILE: autosa_tests/large/mm_int8/kernel.c
================================================
#include "kernel.h"

// Reference program for the int8 matrix multiplication test.
// Fills A and B with ones, computes C = A * B^T twice (once inside the
// `#pragma scop` region and once as a plain golden loop nest), compares the
// two results element by element and prints "Passed!" or the error count.
// Always returns 0; argc and argv are unused.
int main(int argc, char **argv) {
//  Earlier layout, kept for reference: data_t A[I][K], B[K][J], C[I][J], C_golden[I][J];
  // B is stored as B[J][K], so both operands are indexed with k last.
  // `static` keeps these arrays out of main's stack frame.
  static data_t A[I][K], B[J][K], C[I][J], C_golden[I][J];

  // Initialize every element of A to 1.
  for (int i = 0; i < I; i++) 
    for (int k = 0; k < K; k++) {
      A[i][k] = 1;
    }

  // Initialize every element of B to 1.
  for (int j = 0; j < J; j++)
    for (int k = 0; k < K; k++) {
      B[j][k] = 1;
    }

  // NOTE(review): the region between `#pragma scop` and `#pragma endscop` is
  // presumably the part AutoSA extracts and maps to hardware -- keep its
  // statements unchanged.
#pragma scop
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C[i][j] = 0;
      for (int k = 0; k < K; k++) {
        C[i][j] = C[i][j] + A[i][k] * B[j][k];
      }
    }
#pragma endscop

  // Golden result: the same computation written again outside the scop region.
  // NOTE(review): with all-ones inputs and K = 1024 the running sum does not
  // fit in data_t (char); C and C_golden are produced by identical
  // expressions, so the comparison below sees the same stored values.
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C_golden[i][j] = 0;
      for (int k = 0; k < K; k++) {
        C_golden[i][j] = C_golden[i][j] + A[i][k] * B[j][k];
      }
    }

  // Count the elements that differ between the two results.
  int err = 0;
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      if (abs(C_golden[i][j] - C[i][j]) > 0.001)
        err++;
    }

  if (err)
    printf("Failed with %d errors!\n", err);
  else
    printf("Passed!\n");

  return 0;
}


================================================
FILE: autosa_tests/large/mm_int8/kernel.h
================================================
#include "stdio.h"
#include "stdlib.h"
#include "math.h"

// Element type of the A, B and C matrices in kernel.c.
typedef char data_t;
// Alternative square problem size (disabled):
//#define I 1024 
//#define J 1024 
//#define K 1024 

// Test case 1
// kernel3 2D IxJ
// Matrix dimensions used by kernel.c: A is I x K, B is J x K, C is I x J.
// I = 1056 = 4 * 264; the README command of this test uses array_part[264,256,64].
#define I 1056
#define J 1024 
#define K 1024 

================================================
FILE: autosa_tests/large/mm_int8/kernel_kernel_opt.cpp
================================================
#include <ap_int.h>
#include <hls_stream.h>

/*
 * Minimum / maximum of two values.
 * Every macro argument is parenthesized so that operator expressions passed
 * as arguments (e.g. min(a & b, c)) are compared and returned as a whole.
 * Each argument may still be evaluated twice: do not pass expressions with
 * side effects.
 */
#define min(x,y) (((x) < (y)) ? (x) : (y))
#define max(x,y) (((x) > (y)) ? (x) : (y))

/* Data Type */
// Scalar element types of the three arrays.
typedef char A_t1;
typedef char B_t1;
typedef char C_t1;
// Packed words moved through the FIFOs and local buffers.
// NOTE(review): the suffix presumably gives the number of packed elements
// (64 for the 512-bit A/B words, 32 for the 256-bit C words) -- confirm.
typedef ap_uint<512> A_t64;
typedef ap_uint<512> B_t64;
typedef ap_uint<256> C_t32;
/* Data Type */

// Top-level kernel, declared with C linkage.
extern "C" {
void kernel0(A_t64 *A, B_t64 *B, C_t32 *C);
}
// Forward declarations of the intra / inter transfer helpers, the PE wrapper
// and the C drain wrappers that are used before their definitions.
void A_IO_L2_in_intra_trans(int idx, int c0, int c1, int c2, A_t64 local_A[11][1], hls::stream<A_t64> &fifo_A_local_out, bool intra_trans_en);
void A_IO_L2_in_inter_trans(int idx, int c0, int c1, int c2, A_t64 local_A[11][1], hls::stream<A_t64> &fifo_A_in, hls::stream<A_t64> &fifo_A_out, bool inter_trans_en);
void A_IO_L2_in_inter_trans_boundary(int idx, int c0, int c1, int c2, A_t64 local_A[11][1], hls::stream<A_t64> &fifo_A_in, bool inter_trans_en);
void B_IO_L2_in_intra_trans(int idx, int c0, int c1, int c2, B_t64 local_B[32][1], hls::stream<B_t64> &fifo_B_local_out, bool intra_trans_en);
void B_IO_L2_in_inter_trans(int idx, int c0, int c1, int c2, B_t64 local_B[32][1], hls::stream<B_t64> &fifo_B_in, hls::stream<B_t64> &fifo_B_out, bool inter_trans_en);
void B_IO_L2_in_inter_trans_boundary(int idx, int c0, int c1, int c2, B_t64 local_B[32][1], hls::stream<B_t64> &fifo_B_in, bool inter_trans_en);
void PE_wrapper(int idx, int idy, hls::stream<A_t64> &fifo_A_in, hls::stream<A_t64> &fifo_A_out, hls::stream<B_t64> &fifo_B_in, hls::stream<B_t64> &fifo_B_out, hls::stream<char> &fifo_C_drain_out);
void C_drain_IO_L1_out_intra_trans(int idx, int idy, int c0, int c1, C_t32 local_C[11][1], hls::stream<char> &fifo_C_drain_local_in);
void C_drain_IO_L1_out_inter_trans(int idx, int idy, int c0, int c1, C_t32 local_C[11][1], hls::stream<C_t32> &fifo_C_drain_in, hls::stream<C_t32> &fifo_C_drain_out);
void C_drain_IO_L1_out_inter_trans_boundary(int idx, int idy, int c0, int c1, C_t32 local_C[11][1], hls::stream<C_t32> &fifo_C_drain_out);
void C_drain_IO_L1_out_wrapper(int idx, int idy, hls::stream<C_t32> &fifo_C_drain_in, hls::stream<C_t32> &fifo_C_drain_out, hls::stream<char> &fifo_C_drain_local_in);
void C_drain_IO_L1_out_boundary_wrapper(int idx, int idy, hls::stream<C_t32> &fifo_C_drain_out, hls::stream<char> &fifo_C_drain_local_in);

/* Module Definition */
/*
 * A_IO_L3_in: copies words unchanged from fifo_A_serialize to
 * fifo_A_local_out, one word per innermost iteration.
 * The loop nest runs 4 * 4 * 16 * 24 * 11 = 67584 times in total, which is
 * the number of words A_IO_L3_in_serialize writes.
 *
 * fifo_A_serialize: input stream, read once per iteration.
 * fifo_A_local_out: output stream, written once per iteration.
 */
void A_IO_L3_in(hls::stream<A_t64> &fifo_A_serialize, hls::stream<A_t64> &fifo_A_local_out) {
#pragma HLS INLINE OFF
  /* Variable Declaration */
  /* Variable Declaration */

  // c0/c1/c2 only repeat the transfer; they are not used inside the body.
  for (ap_uint<3> c0 = 0; c0 <= 3; c0 += 1)
    for (ap_uint<3> c1 = 0; c1 <= 3; c1 += 1)
      for (ap_uint<5> c2 = 0; c2 <= 15; c2 += 1) {
        // array
        // io_L3
        // 24 groups (c3) of 11 words (c4) each
        for (ap_uint<6> c3 = 0; c3 <= 23; c3 += 1) {
          // io_L2
          for (ap_uint<5> c4 = 0; c4 <= 10; c4 += 1) {
          #pragma HLS PIPELINE II=1
            // access_coalesce
            // access_serialize
            {
              A_t64 in_data;
              A_t64 out_data;
              in_data = fifo_A_serialize.read();
              out_data = in_data;
              fifo_A_local_out.write(out_data);
            }
          }
        }
      }
}
/* Module Definition */

/* Module Definition */
/*
 * A_IO_L3_in_serialize: reads A[0] .. A[67583] in increasing index order and
 * writes each word to fifo_A_local_out.
 *
 * A: source array; the first 67584 A_t64 words are read.
 * fifo_A_local_out: output stream, one write per word.
 */
void A_IO_L3_in_serialize(A_t64 *A, hls::stream<A_t64> &fifo_A_local_out) {
#pragma HLS INLINE OFF
  /* Variable Declaration */
  /* Variable Declaration */

  // 67584 = 4 * 4 * 16 * 24 * 11, the total iteration count of A_IO_L3_in.
  for (ap_uint<18> i = 0; i < 67584; i++) {
  #pragma HLS PIPELINE II=1
    A_t64 fifo_data;
    fifo_data = A[i];
    fifo_A_local_out.write(fifo_data);
  }
}
/* Module Definition */

/* Module Definition */
/*
 * A_IO_L2_in_intra_trans: sends the contents of a local buffer to
 * fifo_A_local_out.
 * When intra_trans_en is false the function returns without any transfer.
 * Otherwise the 11 words local_A[0..10][0] are written in order, and that
 * sequence is repeated 32 times (352 writes in total).
 *
 * idx: module id (copied to p0, which is not used further here).
 * c0, c1, c2: host iterators; not used in the body.
 * local_A: buffer to read from.
 * fifo_A_local_out: output stream.
 * intra_trans_en: enables the transfer.
 */
void A_IO_L2_in_intra_trans(int idx, int c0, int c1, int c2, A_t64 local_A[11][1], hls::stream<A_t64> &fifo_A_local_out, bool intra_trans_en)
 {
#pragma HLS INLINE OFF
  /* Variable Declaration */
  int p0 = idx; // module id
  /* Variable Declaration */

  if (!intra_trans_en) return;


  // io_L2
  // io_L1
  // pe
  // latency
  // c6 repeats the buffer read-out; it does not take part in the addressing.
  for (ap_uint<6> c6 = 0; c6 <= 31; c6 += 1) {
    // latency
    for (ap_uint<5> c7 = 0; c7 <= 10; c7 += 1) {
    #pragma HLS PIPELINE II=1
      // simd
      {
        A_t64 in_data;
        A_t64 out_data;
        in_data = local_A[c7][0];
        out_data = in_data;
        fifo_A_local_out.write(out_data);
      }
    }
  }
}
/* Module Definition */

/* Module Definition */
/*
 * A_IO_L2_in_inter_trans: receives groups of 11 words from fifo_A_in.
 * When inter_trans_en is false the function returns without any transfer.
 * Otherwise, for c3 = idx .. 23:
 *   - the group with c3 == idx is stored into local_A[0..10][0];
 *   - every later group is passed on unchanged to fifo_A_out.
 * The module therefore consumes (24 - idx) groups and forwards (23 - idx).
 *
 * idx: module id; selects which group is kept.
 * c0, c1, c2: host iterators; not used in the body.
 * local_A: buffer that receives this module's group.
 * fifo_A_in: input stream.
 * fifo_A_out: output stream for the groups that are passed on.
 * inter_trans_en: enables the transfer.
 */
void A_IO_L2_in_inter_trans(int idx, int c0, int c1, int c2, A_t64 local_A[11][1], hls::stream<A_t64> &fifo_A_in, hls::stream<A_t64> &fifo_A_out, bool inter_trans_en)
 {
#pragma HLS INLINE OFF
  /* Variable Declaration */
  int p0 = idx; // module id
  /* Variable Declaration */

  if (!inter_trans_en) return;

  for (ap_uint<6> c3 = p0; c3 <= 23; c3 += 1) {
    // io_L2
    if (c3 == p0) {
      // This module's own group: keep it in the local buffer.
      for (ap_uint<5> c4 = 0; c4 <= 10; c4 += 1) {
      #pragma HLS PIPELINE II=1
        // access_coalesce
        {
          A_t64 in_data;
          A_t64 out_data;
          in_data = fifo_A_in.read();
          out_data = in_data;
          local_A[c4][0] = out_data;
        }
      }
    } else {
      // A later group: pass it on to fifo_A_out.
      for (ap_uint<5> c4 = 0; c4 <= 10; c4 += 1) {
      #pragma HLS PIPELINE II=1
        // access_coalesce
        {
          A_t64 in_data;
          A_t64 out_data;
          in_data = fifo_A_in.read();
          out_data = in_data;
          fifo_A_out.write(out_data);
        }
      }
    }
  }
}
/* Module Definition */

/* Module Definition */
/*
 * A_IO_L2_in_inter_trans_boundary: variant of A_IO_L2_in_inter_trans without
 * an output stream.
 * When inter_trans_en is false the function returns without any transfer.
 * Otherwise it reads the 11 words of the group with c3 == idx from fifo_A_in
 * into local_A[0..10][0]. The remaining iterations of the c3 loop (c3 > idx)
 * read nothing.
 *
 * idx: module id; selects the group that is stored.
 * c0, c1, c2: host iterators; not used in the body.
 * local_A: buffer that receives the group.
 * fifo_A_in: input stream.
 * inter_trans_en: enables the transfer.
 */
void A_IO_L2_in_inter_trans_boundary(int idx, int c0, int c1, int c2, A_t64 local_A[11][1], hls::stream<A_t64> &fifo_A_in, bool inter_trans_en)
 {
#pragma HLS INLINE OFF
  /* Variable Declaration */
  int p0 = idx; // module id
  /* Variable Declaration */

  if (!inter_trans_en) return;

  for (ap_uint<6> c3 = p0; c3 <= 23; c3 += 1)
    if (c3 == p0) {
      // io_L2
      for (ap_uint<5> c4 = 0; c4 <= 10; c4 += 1) {
      #pragma HLS PIPELINE II=1
        // access_coalesce
        {
          A_t64 in_data;
          A_t64 out_data;
          in_data = fifo_A_in.read();
          out_data = in_data;
          local_A[c4][0] = out_data;
        }
      }
    }
}
/* Module Definition */

/* Module Definition */
void A_IO_L2_in(int idx, hls::stream<A_t64> &fifo_A_in, hls::stream<A_t64> &fifo_A_out, hls::stream<A_t64> &fifo_A_local_out) {
#pragma HLS INLINE OFF
  /* Variable Declaration */
  int p0 = idx; // module id
  A_t64 local_A_ping[11][1];
  #pragma HLS RESOURCE variable=local_A_ping core=RAM_1P_BRAM
  A_t64 local_A_pong[11][1];
  #pragma HLS RESOURCE variable=local_A_pong core=RAM_1P_BRAM
  bool arb = 0;
  bool inter_trans_en = 1;
  bool intra_trans_en = 0;
  int c0, c0_prev;
  int c1, c1_prev;
  int c2, c2_prev;
  /* Variable Declaration */

  {
    for (ap_uint<3> c0 = 0; c0 <= 3; c0 += 1)
      for (ap_uint<3> c1 = 0; c1 <= 3; c1 += 1)
        for (ap_uint<5> c2 = 0; c2 <= 15; c2 += 1) {
          // array
          // io_L3
          {
            if (arb == 0) {
              A_IO_L2_in_inter_trans(
                /* module id */ idx, 
                /* host iter */ c0, 
                /* host iter */ c1, 
                /* host iter */ c2, 
                /* array */ local_A_pong, 
                /* fifo */ fifo_A_in, 
                /* fifo */ fifo_A_out, 
                /* enable */ inter_trans_en
              );
              A_IO_L2_in_intra_trans(
                /* module id */ idx, 
                /* host iter */ c0_prev, 
                /* host iter */ c1_prev, 
                /* host iter */ c2_prev, 
                /* array */ local_A_ping, 
                /* fifo */ fifo_A_local_out, 
                /* enable */ intra_trans_en
              );
            } else {
              A_IO_L2_in_inter_trans(
                /* module id */ idx, 
                /* host iter */ c0, 
                /* host iter */ c1, 
                /* host iter */ c2, 
                /* array */ local_A_ping, 
                /* fifo */ fifo_A_in, 
                /* fifo */ fifo_A_out, 
                /* enable */ inter_trans_en
              );
              A_IO_L2_in_intra_trans(
                /* module id */ idx, 
                /* host iter */ c0_prev, 
                /* host iter */ c1_prev, 
                /* host iter */ c2_prev, 
                /* array */ local_A_pong, 
                /* fifo */ fifo_A_local_out, 
                /* enable */ intra_trans_en
              );
            }
            intra_trans_en = 1;
            arb = !arb;
            c0_prev = c0;
            c1_prev = c1;
            c2_prev = c2;
          }
        }
    if (arb == 0) {
      A_IO_L2_in_intra_trans(
        /* module id */ idx, 
        /* host iter */ c0_prev, 
        /* host iter */ c1_prev, 
        /* host iter */ c2_prev, 
        /* array */ local_A_ping, 
        /* fifo */ fifo_A_local_out, 
        /* enable */ intra_trans_en
      );
    } else {
      A_IO_L2_in_intra_trans(
        /* module id */ idx, 
        /* host iter */ c0_prev, 
        /* host iter */ c1_prev, 
        /* host iter */ c2_prev, 
        /* array */ local_A_pong, 
        /* fifo */ fifo_A_local_out, 
        /* enable */ intra_trans_en
      );
    }
  }
}
/* Module Definition */

/* Module Definition */
void A_IO_L2_in_boundary(int idx, hls::stream<A_t64> &fifo_A_in, hls::stream<A_t64> &fifo_A_local_out) {
#pragma HLS INLINE OFF
  /* Variable Declaration */
  int p0 = idx; // module id
  A_t64 local_A_ping[11][1];
  #pragma HLS RESOURCE variable=local_A_ping core=RAM_1P_BRAM
  A_t64 local_A_pong[11][1];
  #pragma HLS RESOURCE variable=local_A_pong core=RAM_1P_BRAM
  bool arb = 0;
  bool inter_trans_en = 1;
  bool intra_trans_en = 0;
  int c0, c0_prev;
  int c1, c1_prev;
  int c2, c2_prev;
  /* Variable Declaration */

  {
    for (ap_uint<3> c0 = 0; c0 <= 3; c0 += 1)
      for (ap_uint<3> c1 = 0; c1 <= 3; c1 += 1)
        for (ap_uint<5> c2 = 0; c2 <= 15; c2 += 1) {
          // array
          // io_L3
          {
            if (arb == 0) {
              A_IO_L2_in_inter_trans_boundary(
                /* module id */ idx, 
                /* host iter */ c0, 
                /* host iter */ c1, 
                /* host iter */ c2, 
                /* array */ local_A_pong, 
                /* fifo */ fifo_A_in, 
                /* enable */ inter_trans_en
              );
              A_IO_L2_in_intra_trans(
                /* module id */ idx, 
                /* host iter */ c0_prev, 
                /* host iter */ c1_prev, 
                /* host iter */ c2_prev, 
                /* array */ local_A_ping, 
                /* fifo */ fifo_A_local_out, 
                /* enable */ intra_trans_en
              );
            } else {
              A_IO_L2_in_inter_trans_boundary(
                /* module id */ idx, 
                /* host iter */ c0, 
                /* host iter */ c1, 
                /* host iter */ c2, 
                /* array */ local_A_ping, 
                /* fifo */ fifo_A_in, 
                /* enable */ inter_trans_en
              );
              A_IO_L2_in_intra_trans(
                /* module id */ idx, 
                /* host iter */ c0_prev, 
                /* host iter */ c1_prev, 
                /* host iter */ c2_prev, 
                /* array */ local_A_pong, 
                /* fifo */ fifo_A_local_out, 
                /* enable */ intra_trans_en
              );
            }
            intra_trans_en = 1;
            arb = !arb;
            c0_prev = c0;
            c1_prev = c1;
            c2_prev = c2;
          }
        }
    if (arb == 0) {
      A_IO_L2_in_intra_trans(
        /* module id */ idx, 
        /* host iter */ c0_prev, 
        /* host iter */ c1_prev, 
        /* host iter */ c2_prev, 
        /* array */ local_A_ping, 
        /* fifo */ fifo_A_local_out, 
        /* enable */ intra_trans_en
      );
    } else {
      A_IO_L2_in_intra_trans(
        /* module id */ idx, 
        /* host iter */ c0_prev, 
        /* host iter */ c1_prev, 
        /* host iter */ c2_prev, 
        /* array */ local_A_pong, 
        /* fifo */ fifo_A_local_out, 
        /* enable */ intra_trans_en
      );
    }
  }
}
/* Module Definition */

/* Module Definition */
/*
 * B_IO_L3_in
 *
 * Relays B words one-for-one from fifo_B_serialize to fifo_B_local_out.
 * The loop nest performs 4 * 4 * 16 * 8 * 32 = 65536 read/write pairs; the
 * innermost loop is pipelined with II=1.
 */
void B_IO_L3_in(hls::stream<B_t64> &fifo_B_serialize, hls::stream<B_t64> &fifo_B_local_out) {
#pragma HLS INLINE OFF
  for (ap_uint<3> c0 = 0; c0 < 4; c0++) {
    for (ap_uint<3> c1 = 0; c1 < 4; c1++) {
      for (ap_uint<5> c2 = 0; c2 < 16; c2++) {
        // array / io_L3 level: 8 blocks (c3) of 32 words (c4) per iteration.
        for (ap_uint<4> c3 = 0; c3 < 8; c3++) {
          // io_L2 level
          for (ap_uint<6> c4 = 0; c4 < 32; c4++) {
          #pragma HLS PIPELINE II=1
            // access_coalesce / access_serialize: plain pass-through.
            const B_t64 word = fifo_B_serialize.read();
            fifo_B_local_out.write(word);
          }
        }
      }
    }
  }
}
/* Module Definition */

/* Module Definition */
/*
 * B_IO_L3_in_serialize
 *
 * Streams the first 65536 B_t64 words of the array B, in index order, into
 * fifo_B_local_out. One word is issued per pipelined iteration (II=1).
 */
void B_IO_L3_in_serialize(B_t64 *B, hls::stream<B_t64> &fifo_B_local_out) {
#pragma HLS INLINE OFF
  // 17-bit counter so the bound 65536 itself is representable.
  for (ap_uint<17> word_idx = 0; word_idx < 65536; word_idx++) {
  #pragma HLS PIPELINE II=1
    const B_t64 word = B[word_idx];
    fifo_B_local_out.write(word);
  }
}
/* Module Definition */

/* Module Definition */
void B_IO_L2_in_intra_trans(int idx, int c0, int c1, int c2, B_t64 local_B[32][1], hls::stream<B_t64> &fifo_B_local_out, bool intra_trans_en)
 {
#pragma HLS INLINE OFF
  /* Variable Declaration */
  int p0 = idx; // module id
  /* Variable Declaration */

  if (!intra_trans_en) return;


  // io_L2
  // io_L1
  // pe
  // latency
  for (ap_uint<6> c6 = 0; c6 <= 31; c6 += 1) {
    // latency
    for (ap_uint<5> c7 = 0; c7 <= 10; c7 += 1) {
    #pragma HLS PIPELINE II=1
      // simd
      {
        B_t64 in_data;
        B_t64 out_data;
        in_data = local_B[c6][0];
        out_data = in_data;
        fifo_B_local_out.write(out_data);
      }
    }
  }
}
/* Module Definition */

/* Module Definition */
void B_IO_L2_in_inter_trans(int idx, int c0, int c1, int c2, B_t64 local_B[32][1], hls::stream<B_t64> &fifo_B_in, hls::stream<B_t64> &fifo_B_out, bool inter_trans_en)
 {
#pragma HLS INLINE OFF
  /* Variable Declaration */
  int p0 = idx; // module id
  /* Variable Declaration */

  if (!inter_trans_en) return;

  for (ap_uint<4> c3 = p0; c3 <= 7; c3 += 1) {
    // io_L2
    if (c3 == p0) {
      for (ap_uint<6> c4 = 0; c4 <= 31; c4 += 1) {
      #pragma HLS PIPELINE II=1
        // access_coalesce
        {
          B_t64 in_data;
          B_t64 out_data;
          in_data = fifo_B_in.read();
          out_data = in_data;
          local_B[c4][0] = out_data;
        }
      }
    } else {
      for (ap_uint<6> c4 = 0; c4 <= 31; c4 += 1) {
      #pragma HLS PIPELINE II=1
        // access_coalesce
        {
          B_t64 in_data;
          B_t64 out_data;
          in_data = fifo_B_in.read();
          out_data = in_data;
          fifo_B_out.write(out_data);
        }
      }
    }
  }
}
/* Module Definition */

/* Module Definition */
void B_IO_L2_in_inter_trans_boundary(int idx, int c0, int c1, int c2, B_t64 local_B[32][1], hls::stream<B_t64> &fifo_B_in, bool inter_trans_en)
 {
#pragma HLS INLINE OFF
  /* Variable Declaration */
  int p0 = idx; // module id
  /* Variable Declaration */

  if (!inter_trans_en) return;

  for (ap_uint<4> c3 = p0; c3 <= 7; c3 += 1)
    if (c3 == p0) {
      // io_L2
      for (ap_uint<6> c4 = 0; c4 <= 31; c4 += 1) {
      #pragma HLS PIPELINE II=1
        // access_coalesce
        {
          B_t64 in_data;
          B_t64 out_data;
          in_data = fifo_B_in.read();
          out_data = in_data;
          local_B[c4][0] = out_data;
        }
      }
    }
}
/* Module Definition */

/* Module Definition */
void B_IO_L2_in(int idx, hls::stream<B_t64> &fifo_B_in, hls::stream<B_t64> &fifo_B_out, hls::stream<B_t64> &fifo_B_local_out) {
#pragma HLS INLINE OFF
  /* Variable Declaration */
  int p0 = idx; // module id
  B_t64 local_B_ping[32][1];
  #pragma HLS RESOURCE variable=local_B_ping core=RAM_1P_BRAM
  B_t64 local_B_pong[32][1];
  #pragma HLS RESOURCE variable=local_B_pong core=RAM_1P_BRAM
  bool arb = 0;
  bool inter_trans_en = 1;
  bool intra_trans_en = 0;
  int c0, c0_prev;
  int c1, c1_prev;
  int c2, c2_prev;
  /* Variable Declaration */

  {
    for (ap_uint<3> c0 = 0; c0 <= 3; c0 += 1)
      for (ap_uint<3> c1 = 0; c1 <= 3; c1 += 1)
        for (ap_uint<5> c2 = 0; c2 <= 15; c2 += 1) {
          // array
          // io_L3
          {
            if (arb == 0) {
              B_IO_L2_in_inter_trans(
                /* module id */ idx, 
                /* host iter */ c0, 
                /* host iter */ c1, 
                /* host iter */ c2, 
                /* array */ local_B_pong, 
                /* fifo */ fifo_B_in, 
                /* fifo */ fifo_B_out, 
                /* enable */ inter_trans_en
              );
              B_IO_L2_in_intra_trans(
                /* module id */ idx, 
                /* host iter */ c0_prev, 
                /* host iter */ c1_prev, 
                /* host iter */ c2_prev, 
                /* array */ local_B_ping, 
                /* fifo */ fifo_B_local_out, 
                /* enable */ intra_trans_en
              );
            } else {
              B_IO_L2_in_inter_trans(
                /* module id */ idx, 
                /* host iter */ c0, 
                /* host iter */ c1, 
                /* host iter */ c2, 
                /* array */ local_B_ping, 
                /* fifo */ fifo_B_in, 
                /* fifo */ fifo_B_out, 
                /* enable */ inter_trans_en
              );
              B_IO_L2_in_intra_trans(
                /* module id */ idx, 
                /* host iter */ c0_prev, 
                /* host iter */ c1_prev, 
                /* host iter */ c2_prev, 
                /* array */ local_B_pong, 
                /* fifo */ fifo_B_local_out, 
                /* enable */ intra_trans_en
              );
            }
            intra_trans_en = 1;
            arb = !arb;
            c0_prev = c0;
            c1_prev = c1;
            c2_prev = c2;
          }
        }
    if (arb == 0) {
      B_IO_L2_in_intra_trans(
        /* module id */ idx, 
        /* host iter */ c0_prev, 
        /* host iter */ c1_prev, 
        /* host iter */ c2_prev, 
        /* array */ local_B_ping, 
        /* fifo */ fifo_B_local_out, 
        /* enable */ intra_trans_en
      );
    } else {
      B_IO_L2_in_intra_trans(
        /* module id */ idx, 
        /* host iter */ c0_prev, 
        /* host iter */ c1_prev, 
        /* host iter */ c2_prev, 
        /* array */ local_B_pong, 
        /* fifo */ fifo_B_local_out, 
        /* enable */ intra_trans_en
      );
    }
  }
}
/* Module Definition */

/* Module Definition */
void B_IO_L2_in_boundary(int idx, hls::stream<B_t64> &fifo_B_in, hls::stream<B_t64> &fifo_B_local_out) {
#pragma HLS INLINE OFF
  /* Variable Declaration */
  int p0 = idx; // module id
  B_t64 local_B_ping[32][1];
  #pragma HLS RESOURCE variable=local_B_ping core=RAM_1P_BRAM
  B_t64 local_B_pong[32][1];
  #pragma HLS RESOURCE variable=local_B_pong core=RAM_1P_BRAM
  bool arb = 0;
  bool inter_trans_en = 1;
  bool intra_trans_en = 0;
  int c0, c0_prev;
  int c1, c1_prev;
  int c2, c2_prev;
  /* Variable Declaration */

  {
    for (ap_uint<3> c0 = 0; c0 <= 3; c0 += 1)
      for (ap_uint<3> c1 = 0; c1 <= 3; c1 += 1)
        for (ap_uint<5> c2 = 0; c2 <= 15; c2 += 1) {
          // array
          // io_L3
          {
            if (arb == 0) {
              B_IO_L2_in_inter_trans_boundary(
                /* module id */ idx, 
                /* host iter */ c0, 
                /* host iter */ c1, 
                /* host iter */ c2, 
                /* array */ local_B_pong, 
                /* fifo */ fifo_B_in, 
                /* enable */ inter_trans_en
              );
              B_IO_L2_in_intra_trans(
                /* module id */ idx, 
                /* host iter */ c0_prev, 
                /* host iter */ c1_prev, 
                /* host iter */ c2_prev, 
                /* array */ local_B_ping, 
                /* fifo */ fifo_B_local_out, 
                /* enable */ intra_trans_en
              );
            } else {
              B_IO_L2_in_inter_trans_boundary(
                /* module id */ idx, 
                /* host iter */ c0, 
                /* host iter */ c1, 
                /* host iter */ c2, 
                /* array */ local_B_ping, 
                /* fifo */ fifo_B_in, 
                /* enable */ inter_trans_en
              );
              B_IO_L2_in_intra_trans(
                /* module id */ idx, 
                /* host iter */ c0_prev, 
                /* host iter */ c1_prev, 
                /* host iter */ c2_prev, 
                /* array */ local_B_pong, 
                /* fifo */ fifo_B_local_out, 
                /* enable */ intra_trans_en
              );
            }
            intra_trans_en = 1;
            arb = !arb;
            c0_prev = c0;
            c1_prev = c1;
            c2_prev = c2;
          }
        }
    if (arb == 0) {
      B_IO_L2_in_intra_trans(
        /* module id */ idx, 
        /* host iter */ c0_prev, 
        /* host iter */ c1_prev, 
        /* host iter */ c2_prev, 
        /* array */ local_B_ping, 
        /* fifo */ fifo_B_local_out, 
        /* enable */ intra_trans_en
      );
    } else {
      B_IO_L2_in_intra_trans(
        /* module id */ idx, 
        /* host iter */ c0_prev, 
        /* host iter */ c1_prev, 
        /* host iter */ c2_prev, 
        /* array */ local_B_pong, 
        /* fifo */ fifo_B_local_out, 
        /* enable */ intra_trans_en
      );
    }
  }
}
/* Module Definition */

/* Module Definition */
/*
 * PE: one processing element of the array.
 *
 * For every (c0, c1, c2, c6, c7) iteration of the pipelined loop nest it
 *   1. reads one A_t64 word from fifo_A_in and one B_t64 word from
 *      fifo_B_in and splits each into 64 8-bit lanes
 *      (local_A[0][0..63], local_B[0][0..63]);
 *   2. multiplies the lanes pairwise and reduces the 64 products through a
 *      hand-expanded adder tree whose intermediates are all declared `char`;
 *   3. adds the tree result to local_C[c7][c6], which is cleared when
 *      c2 == 0 and written to fifo_C_drain_out when c2 == 15;
 *   4. re-packs the lanes into 64 x 8-bit words and writes them to
 *      fifo_B_out and fifo_A_out.
 *
 * idx / idy are copied into p0 / p1 but are not otherwise used in this body.
 */
void PE(int idx, int idy, hls::stream<A_t64> &fifo_A_in, hls::stream<A_t64> &fifo_A_out, hls::stream<B_t64> &fifo_B_in, hls::stream<B_t64> &fifo_B_out, hls::stream<char> &fifo_C_drain_out) {
#pragma HLS INLINE OFF
  /* Variable Declaration */
  int p0 = idx, p1 = idy; // module id
  // Per-iteration operand lanes; fully partitioned so all 64 lanes can be
  // accessed in the same cycle.
  A_t1 local_A[1][64];
  #pragma HLS ARRAY_PARTITION variable=local_A dim=0 complete
  B_t1 local_B[1][64];
  #pragma HLS ARRAY_PARTITION variable=local_B dim=0 complete
  // Accumulators, indexed [c7][c6].
  C_t1 local_C[11][32];
  #pragma HLS RESOURCE variable=local_C core=RAM_2P_BRAM
  /* Variable Declaration */

  for (ap_uint<3> c0 = 0; c0 <= 3; c0 += 1)
    for (ap_uint<3> c1 = 0; c1 <= 3; c1 += 1)
      for (ap_uint<5> c2 = 0; c2 <= 15; c2 += 1) {
        // array
        // pe
        // latency
        for (ap_uint<6> c6 = 0; c6 <= 31; c6 += 1) {
          // latency
          for (ap_uint<5> c7 = 0; c7 <= 10; c7 += 1) {
          #pragma HLS PIPELINE II=1
            {
              // Unpack the incoming A word: take bits 7..0 as lane n, then
              // shift right by 8 so the next lane moves into bits 7..0.
              {
                A_t64 fifo_data;
                fifo_data = fifo_A_in.read();
                for (ap_uint<7> n = 0; n < 64; n++) {
                #pragma HLS UNROLL
                  // The byte is stored through the unsigned member and read
                  // back through the char member (union type punning).
                  // NOTE(review): presumably relies on `ut` aliasing the low
                  // byte of `ui` - confirm for the target/simulation host.
                  union {unsigned int ui; char ut;} u;
                  u.ui = (unsigned int)fifo_data(7, 0);
                  local_A[0][n] = u.ut;
                  fifo_data = fifo_data >> 8;
                }
              }
              // Unpack the incoming B word the same way.
              {
                B_t64 fifo_data;
                fifo_data = fifo_B_in.read();
                for (ap_uint<7> n = 0; n < 64; n++) {
                #pragma HLS UNROLL
                  union {unsigned int ui; char ut;} u;
                  u.ui = (unsigned int)fifo_data(7, 0);
                  local_B[0][n] = u.ut;
                  fifo_data = fifo_data >> 8;
                }
              }
              // simd
              {
                // Clear the accumulator on the first c2 step.
                if (c2 == 0) {
                  // hls_unroll
                  local_C[c7][c6] = 0;
                }
                // Rolled form of the reduction that the statements below
                // expand by hand (kept for reference):
                //for (ap_uint<7> c8 = 0; c8 <= 63; c8 += 1) {
                //#pragma HLS UNROLL
                //  local_C[c7][c6] = (local_C[c7][c6] + (local_A[0][c8] * local_B[0][c8]));
                //}
                // Level 5: mul_5_k_0 is the product of lane 2k; add_5_k adds
                // the product of lane 2k+1 to it. Every intermediate is a
                // `char`, so each result is narrowed to char on assignment.
                char mul_5_0_0 = local_A[0][0] * local_B[0][0];
                char add_5_0 = mul_5_0_0 + local_A[0][1] * local_B[0][1];
                char mul_5_1_0 = local_A[0][2] * local_B[0][2];
                char add_5_1 = mul_5_1_0 + local_A[0][3] * local_B[0][3];
                char mul_5_2_0 = local_A[0][4] * local_B[0][4];
                char add_5_2 = mul_5_2_0 + local_A[0][5] * local_B[0][5];
                char mul_5_3_0 = local_A[0][6] * local_B[0][6];
                char add_5_3 = mul_5_3_0 + local_A[0][7] * local_B[0][7];
                char mul_5_4_0 = local_A[0][8] * local_B[0][8];
                char add_5_4 = mul_5_4_0 + local_A[0][9] * local_B[0][9];
                char mul_5_5_0 = local_A[0][10] * local_B[0][10];
                char add_5_5 = mul_5_5_0 + local_A[0][11] * local_B[0][11];
                char mul_5_6_0 = local_A[0][12] * local_B[0][12];
                char add_5_6 = mul_5_6_0 + local_A[0][13] * local_B[0][13];
                char mul_5_7_0 = local_A[0][14] * local_B[0][14];
                char add_5_7 = mul_5_7_0 + local_A[0][15] * local_B[0][15];
                char mul_5_8_0 = local_A[0][16] * local_B[0][16];
                char add_5_8 = mul_5_8_0 + local_A[0][17] * local_B[0][17];
                char mul_5_9_0 = local_A[0][18] * local_B[0][18];
                char add_5_9 = mul_5_9_0 + local_A[0][19] * local_B[0][19];
                char mul_5_10_0 = local_A[0][20] * local_B[0][20];
                char add_5_10 = mul_5_10_0 + local_A[0][21] * local_B[0][21];
                char mul_5_11_0 = local_A[0][22] * local_B[0][22];
                char add_5_11 = mul_5_11_0 + local_A[0][23] * local_B[0][23];
                char mul_5_12_0 = local_A[0][24] * local_B[0][24];
                char add_5_12 = mul_5_12_0 + local_A[0][25] * local_B[0][25];
                char mul_5_13_0 = local_A[0][26] * local_B[0][26];
                char add_5_13 = mul_5_13_0 + local_A[0][27] * local_B[0][27];
                char mul_5_14_0 = local_A[0][28] * local_B[0][28];
                char add_5_14 = mul_5_14_0 + local_A[0][29] * local_B[0][29];
                char mul_5_15_0 = local_A[0][30] * local_B[0][30];
                char add_5_15 = mul_5_15_0 + local_A[0][31] * local_B[0][31];
                char mul_5_16_0 = local_A[0][32] * local_B[0][32];
                char add_5_16 = mul_5_16_0 + local_A[0][33] * local_B[0][33];
                char mul_5_17_0 = local_A[0][34] * local_B[0][34];
                char add_5_17 = mul_5_17_0 + local_A[0][35] * local_B[0][35];
                char mul_5_18_0 = local_A[0][36] * local_B[0][36];
                char add_5_18 = mul_5_18_0 + local_A[0][37] * local_B[0][37];
                char mul_5_19_0 = local_A[0][38] * local_B[0][38];
                char add_5_19 = mul_5_19_0 + local_A[0][39] * local_B[0][39];
                char mul_5_20_0 = local_A[0][40] * local_B[0][40];
                char add_5_20 = mul_5_20_0 + local_A[0][41] * local_B[0][41];
                char mul_5_21_0 = local_A[0][42] * local_B[0][42];
                char add_5_21 = mul_5_21_0 + local_A[0][43] * local_B[0][43];
                char mul_5_22_0 = local_A[0][44] * local_B[0][44];
                char add_5_22 = mul_5_22_0 + local_A[0][45] * local_B[0][45];
                char mul_5_23_0 = local_A[0][46] * local_B[0][46];
                char add_5_23 = mul_5_23_0 + local_A[0][47] * local_B[0][47];
                char mul_5_24_0 = local_A[0][48] * local_B[0][48];
                char add_5_24 = mul_5_24_0 + local_A[0][49] * local_B[0][49];
                char mul_5_25_0 = local_A[0][50] * local_B[0][50];
                char add_5_25 = mul_5_25_0 + local_A[0][51] * local_B[0][51];
                char mul_5_26_0 = local_A[0][52] * local_B[0][52];
                char add_5_26 = mul_5_26_0 + local_A[0][53] * local_B[0][53];
                char mul_5_27_0 = local_A[0][54] * local_B[0][54];
                char add_5_27 = mul_5_27_0 + local_A[0][55] * local_B[0][55];
                char mul_5_28_0 = local_A[0][56] * local_B[0][56];
                char add_5_28 = mul_5_28_0 + local_A[0][57] * local_B[0][57];
                char mul_5_29_0 = local_A[0][58] * local_B[0][58];
                char add_5_29 = mul_5_29_0 + local_A[0][59] * local_B[0][59];
                char mul_5_30_0 = local_A[0][60] * local_B[0][60];
                char add_5_30 = mul_5_30_0 + local_A[0][61] * local_B[0][61];
                char mul_5_31_0 = local_A[0][62] * local_B[0][62];
                char add_5_31 = mul_5_31_0 + local_A[0][63] * local_B[0][63];
                // Level 4: 32 partial sums -> 16.
                char add_4_0 = add_5_0 + add_5_1;
                char add_4_1 = add_5_2 + add_5_3;
                char add_4_2 = add_5_4 + add_5_5;
                char add_4_3 = add_5_6 + add_5_7;
                char add_4_4 = add_5_8 + add_5_9;
                char add_4_5 = add_5_10 + add_5_11;
                char add_4_6 = add_5_12 + add_5_13;
                char add_4_7 = add_5_14 + add_5_15;
                char add_4_8 = add_5_16 + add_5_17;
                char add_4_9 = add_5_18 + add_5_19;
                char add_4_10 = add_5_20 + add_5_21;
                char add_4_11 = add_5_22 + add_5_23;
                char add_4_12 = add_5_24 + add_5_25;
                char add_4_13 = add_5_26 + add_5_27;
                char add_4_14 = add_5_28 + add_5_29;
                char add_4_15 = add_5_30 + add_5_31;
                // Level 3: 16 -> 8.
                char add_3_0 = add_4_0 + add_4_1;
                char add_3_1 = add_4_2 + add_4_3;
                char add_3_2 = add_4_4 + add_4_5;
                char add_3_3 = add_4_6 + add_4_7;
                char add_3_4 = add_4_8 + add_4_9;
                char add_3_5 = add_4_10 + add_4_11;
                char add_3_6 = add_4_12 + add_4_13;
                char add_3_7 = add_4_14 + add_4_15;
                // Level 2: 8 -> 4.
                char add_2_0 = add_3_0 + add_3_1;
                char add_2_1 = add_3_2 + add_3_3;
                char add_2_2 = add_3_4 + add_3_5;
                char add_2_3 = add_3_6 + add_3_7;
                // Level 1: 4 -> 2.
                char add_1_0 = add_2_0 + add_2_1;
                char add_1_1 = add_2_2 + add_2_3;
                // Level 0: final dot-product value for this iteration.
                char add_0_0 = add_1_0 + add_1_1;
// Resource binding: the 32 named products use the Mul_LUT core.
#pragma HLS RESOURCE variable=mul_5_0_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_1_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_2_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_3_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_4_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_5_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_6_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_7_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_8_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_9_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_10_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_11_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_12_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_13_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_14_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_15_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_16_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_17_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_18_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_19_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_20_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_21_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_22_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_23_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_24_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_25_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_26_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_27_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_28_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_29_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_30_0 core=Mul_LUT
#pragma HLS RESOURCE variable=mul_5_31_0 core=Mul_LUT
// Resource binding: tree levels 4..0 use the AddSub core (the add_5_*
// results carry no RESOURCE pragma).
#pragma HLS RESOURCE variable=add_4_0 core=AddSub
#pragma HLS RESOURCE variable=add_4_1 core=AddSub
#pragma HLS RESOURCE variable=add_4_2 core=AddSub
#pragma HLS RESOURCE variable=add_4_3 core=AddSub
#pragma HLS RESOURCE variable=add_4_4 core=AddSub
#pragma HLS RESOURCE variable=add_4_5 core=AddSub
#pragma HLS RESOURCE variable=add_4_6 core=AddSub
#pragma HLS RESOURCE variable=add_4_7 core=AddSub
#pragma HLS RESOURCE variable=add_4_8 core=AddSub
#pragma HLS RESOURCE variable=add_4_9 core=AddSub
#pragma HLS RESOURCE variable=add_4_10 core=AddSub
#pragma HLS RESOURCE variable=add_4_11 core=AddSub
#pragma HLS RESOURCE variable=add_4_12 core=AddSub
#pragma HLS RESOURCE variable=add_4_13 core=AddSub
#pragma HLS RESOURCE variable=add_4_14 core=AddSub
#pragma HLS RESOURCE variable=add_4_15 core=AddSub
#pragma HLS RESOURCE variable=add_3_0 core=AddSub
#pragma HLS RESOURCE variable=add_3_1 core=AddSub
#pragma HLS RESOURCE variable=add_3_2 core=AddSub
#pragma HLS RESOURCE variable=add_3_3 core=AddSub
#pragma HLS RESOURCE variable=add_3_4 core=AddSub
#pragma HLS RESOURCE variable=add_3_5 core=AddSub
#pragma HLS RESOURCE variable=add_3_6 core=AddSub
#pragma HLS RESOURCE variable=add_3_7 core=AddSub
#pragma HLS RESOURCE variable=add_2_0 core=AddSub
#pragma HLS RESOURCE variable=add_2_1 core=AddSub
#pragma HLS RESOURCE variable=add_2_2 core=AddSub
#pragma HLS RESOURCE variable=add_2_3 core=AddSub
#pragma HLS RESOURCE variable=add_1_0 core=AddSub
#pragma HLS RESOURCE variable=add_1_1 core=AddSub
#pragma HLS RESOURCE variable=add_0_0 core=AddSub
             
                // Accumulate this iteration's dot product.
                local_C[c7][c6] += add_0_0;
               
              }
              // On the last c2 step the finished accumulator is drained.
              if (c2 == 15)
                fifo_C_drain_out.write(local_C[c7][c6]);
              // Re-pack the 64 B lanes (lane 63 in the most significant
              // byte, lane 0 in the least) and write the word to fifo_B_out.
              // Only the low 8 bits of each union's `ui` are kept by the
              // ap_uint<8> conversion.
              {
                B_t64 fifo_data;
                union {unsigned int ui; char ut;} u63, u62, u61, u60, u59, u58, u57, u56, u55, u54, u53, u52, u51, u50, u49, u48, u47, u46, u45, u44, u43, u42, u41, u40, u39, u38, u37, u36, u35, u34, u33, u32, u31, u30, u29, u28, u27, u26, u25, u24, u23, u22, u21, u20, u19, u18, u17, u16, u15, u14, u13, u12, u11, u10, u9, u8, u7, u6, u5, u4, u3, u2, u1, u0;
                u63.ut = local_B[0][63];
                u62.ut = local_B[0][62];
                u61.ut = local_B[0][61];
                u60.ut = local_B[0][60];
                u59.ut = local_B[0][59];
                u58.ut = local_B[0][58];
                u57.ut = local_B[0][57];
                u56.ut = local_B[0][56];
                u55.ut = local_B[0][55];
                u54.ut = local_B[0][54];
                u53.ut = local_B[0][53];
                u52.ut = local_B[0][52];
                u51.ut = local_B[0][51];
                u50.ut = local_B[0][50];
                u49.ut = local_B[0][49];
                u48.ut = local_B[0][48];
                u47.ut = local_B[0][47];
                u46.ut = local_B[0][46];
                u45.ut = local_B[0][45];
                u44.ut = local_B[0][44];
                u43.ut = local_B[0][43];
                u42.ut = local_B[0][42];
                u41.ut = local_B[0][41];
                u40.ut = local_B[0][40];
                u39.ut = local_B[0][39];
                u38.ut = local_B[0][38];
                u37.ut = local_B[0][37];
                u36.ut = local_B[0][36];
                u35.ut = local_B[0][35];
                u34.ut = local_B[0][34];
                u33.ut = local_B[0][33];
                u32.ut = local_B[0][32];
                u31.ut = local_B[0][31];
                u30.ut = local_B[0][30];
                u29.ut = local_B[0][29];
                u28.ut = local_B[0][28];
                u27.ut = local_B[0][27];
                u26.ut = local_B[0][26];
                u25.ut = local_B[0][25];
                u24.ut = local_B[0][24];
                u23.ut = local_B[0][23];
                u22.ut = local_B[0][22];
                u21.ut = local_B[0][21];
                u20.ut = local_B[0][20];
                u19.ut = local_B[0][19];
                u18.ut = local_B[0][18];
                u17.ut = local_B[0][17];
                u16.ut = local_B[0][16];
                u15.ut = local_B[0][15];
                u14.ut = local_B[0][14];
                u13.ut = local_B[0][13];
                u12.ut = local_B[0][12];
                u11.ut = local_B[0][11];
                u10.ut = local_B[0][10];
                u9.ut = local_B[0][9];
                u8.ut = local_B[0][8];
                u7.ut = local_B[0][7];
                u6.ut = local_B[0][6];
                u5.ut = local_B[0][5];
                u4.ut = local_B[0][4];
                u3.ut = local_B[0][3];
                u2.ut = local_B[0][2];
                u1.ut = local_B[0][1];
                u0.ut = local_B[0][0];
                fifo_data = (ap_uint<8>(u63.ui), ap_uint<8>(u62.ui), ap_uint<8>(u61.ui), ap_uint<8>(u60.ui), ap_uint<8>(u59.ui), ap_uint<8>(u58.ui), ap_uint<8>(u57.ui), ap_uint<8>(u56.ui), ap_uint<8>(u55.ui), ap_uint<8>(u54.ui), ap_uint<8>(u53.ui), ap_uint<8>(u52.ui), ap_uint<8>(u51.ui), ap_uint<8>(u50.ui), ap_uint<8>(u49.ui), ap_uint<8>(u48.ui), ap_uint<8>(u47.ui), ap_uint<8>(u46.ui), ap_uint<8>(u45.ui), ap_uint<8>(u44.ui), ap_uint<8>(u43.ui), ap_uint<8>(u42.ui), ap_uint<8>(u41.ui), ap_uint<8>(u40.ui), ap_uint<8>(u39.ui), ap_uint<8>(u38.ui), ap_uint<8>(u37.ui), ap_uint<8>(u36.ui), ap_uint<8>(u35.ui), ap_uint<8>(u34.ui), ap_uint<8>(u33.ui), ap_uint<8>(u32.ui), ap_uint<8>(u31.ui), ap_uint<8>(u30.ui), ap_uint<8>(u29.ui), ap_uint<8>(u28.ui), ap_uint<8>(u27.ui), ap_uint<8>(u26.ui), ap_uint<8>(u25.ui), ap_uint<8>(u24.ui), ap_uint<8>(u23.ui), ap_uint<8>(u22.ui), ap_uint<8>(u21.ui), ap_uint<8>(u20.ui), ap_uint<8>(u19.ui), ap_uint<8>(u18.ui), ap_uint<8>(u17.ui), ap_uint<8>(u16.ui), ap_uint<8>(u15.ui), ap_uint<8>(u14.ui), ap_uint<8>(u13.ui), ap_uint<8>(u12.ui), ap_uint<8>(u11.ui), ap_uint<8>(u10.ui), ap_uint<8>(u9.ui), ap_uint<8>(u8.ui), ap_uint<8>(u7.ui), ap_uint<8>(u6.ui), ap_uint<8>(u5.ui), ap_uint<8>(u4.ui), ap_uint<8>(u3.ui), ap_uint<8>(u2.ui), ap_uint<8>(u1.ui), ap_uint<8>(u0.ui));
                fifo_B_out.write(fifo_data);
              }
              // Same re-packing for the 64 A lanes, written to fifo_A_out.
              {
                A_t64 fifo_data;
                union {unsigned int ui; char ut;} u63, u62, u61, u60, u59, u58, u57, u56, u55, u54, u53, u52, u51, u50, u49, u48, u47, u46, u45, u44, u43, u42, u41, u40, u39, u38, u37, u36, u35, u34, u33, u32, u31, u30, u29, u28, u27, u26, u25, u24, u23, u22, u21, u20, u19, u18, u17, u16, u15, u14, u13, u12, u11, u10, u9, u8, u7, u6, u5, u4, u3, u2, u1, u0;
                u63.ut = local_A[0][63];
                u62.ut = local_A[0][62];
                u61.ut = local_A[0][61];
                u60.ut = local_A[0][60];
                u59.ut = local_A[0][59];
                u58.ut = local_A[0][58];
                u57.ut = local_A[0][57];
                u56.ut = local_A[0][56];
                u55.ut = local_A[0][55];
                u54.ut = local_A[0][54];
                u53.ut = local_A[0][53];
                u52.ut = local_A[0][52];
                u51.ut = local_A[0][51];
                u50.ut = local_A[0][50];
                u49.ut = local_A[0][49];
                u48.ut = local_A[0][48];
                u47.ut = local_A[0][47];
                u46.ut = local_A[0][46];
                u45.ut = local_A[0][45];
                u44.ut = local_A[0][44];
                u43.ut = local_A[0][43];
                u42.ut = local_A[0][42];
                u41.ut = local_A[0][41];
                u40.ut = local_A[0][40];
                u39.ut = local_A[0][39];
                u38.ut = local_A[0][38];
                u37.ut = local_A[0][37];
                u36.ut = local_A[0][36];
                u35.ut = local_A[0][35];
                u34.ut = local_A[0][34];
                u33.ut = local_A[0][33];
                u32.ut = local_A[0][32];
                u31.ut = local_A[0][31];
                u30.ut = local_A[0][30];
                u29.ut = local_A[0][29];
                u28.ut = local_A[0][28];
                u27.ut = local_A[0][27];
                u26.ut = local_A[0][26];
                u25.ut = local_A[0][25];
                u24.ut = local_A[0][24];
                u23.ut = local_A[0][23];
                u22.ut = local_A[0][22];
                u21.ut = local_A[0][21];
                u20.ut = local_A[0][20];
                u19.ut = local_A[0][19];
                u18.ut = local_A[0][18];
                u17.ut = local_A[0][17];
                u16.ut = local_A[0][16];
                u15.ut = local_A[0][15];
                u14.ut = local_A[0][14];
                u13.ut = local_A[0][13];
                u12.ut = local_A[0][12];
                u11.ut = local_A[0][11];
                u10.ut = local_A[0][10];
                u9.ut = local_A[0][9];
                u8.ut = local_A[0][8];
                u7.ut = local_A[0][7];
                u6.ut = local_A[0][6];
                u5.ut = local_A[0][5];
                u4.ut = local_A[0][4];
                u3.ut = local_A[0][3];
                u2.ut = local_A[0][2];
                u1.ut = local_A[0][1];
                u0.ut = local_A[0][0];
                fifo_data = (ap_uint<8>(u63.ui), ap_uint<8>(u62.ui), ap_uint<8>(u61.ui), ap_uint<8>(u60.ui), ap_uint<8>(u59.ui), ap_uint<8>(u58.ui), ap_uint<8>(u57.ui), ap_uint<8>(u56.ui), ap_uint<8>(u55.ui), ap_uint<8>(u54.ui), ap_uint<8>(u53.ui), ap_uint<8>(u52.ui), ap_uint<8>(u51.ui), ap_uint<8>(u50.ui), ap_uint<8>(u49.ui), ap_uint<8>(u48.ui), ap_uint<8>(u47.ui), ap_uint<8>(u46.ui), ap_uint<8>(u45.ui), ap_uint<8>(u44.ui), ap_uint<8>(u43.ui), ap_uint<8>(u42.ui), ap_uint<8>(u41.ui), ap_uint<8>(u40.ui), ap_uint<8>(u39.ui), ap_uint<8>(u38.ui), ap_uint<8>(u37.ui), ap_uint<8>(u36.ui), ap_uint<8>(u35.ui), ap_uint<8>(u34.ui), ap_uint<8>(u33.ui), ap_uint<8>(u32.ui), ap_uint<8>(u31.ui), ap_uint<8>(u30.ui), ap_uint<8>(u29.ui), ap_uint<8>(u28.ui), ap_uint<8>(u27.ui), ap_uint<8>(u26.ui), ap_uint<8>(u25.ui), ap_uint<8>(u24.ui), ap_uint<8>(u23.ui), ap_uint<8>(u22.ui), ap_uint<8>(u21.ui), ap_uint<8>(u20.ui), ap_uint<8>(u19.ui), ap_uint<8>(u18.ui), ap_uint<8>(u17.ui), ap_uint<8>(u16.ui), ap_uint<8>(u15.ui), ap_uint<8>(u14.ui), ap_uint<8>(u13.ui), ap_uint<8>(u12.ui), ap_uint<8>(u11.ui), ap_uint<8>(u10.ui), ap_uint<8>(u9.ui), ap_uint<8>(u8.ui), ap_uint<8>(u7.ui), ap_uint<8>(u6.ui), ap_uint<8>(u5.ui), ap_uint<8>(u4.ui), ap_uint<8>(u3.ui), ap_uint<8>(u2.ui), ap_uint<8>(u1.ui), ap_uint<8>(u0.ui));
                fifo_A_out.write(fifo_data);
              }
            }
          }
        }
      }
}
/* Module Definition */

/* Module Definition */
/*
 * PE_wrapper: forwarding shim around PE.
 * Hands the two module ids (idx, idy) and the five FIFO references to PE
 * in the same order they were received; nothing else is done here.
 */
void PE_wrapper(int idx, int idy, hls::stream<A_t64> &fifo_A_in, hls::stream<A_t64> &fifo_A_out, hls::stream<B_t64> &fifo_B_in, hls::stream<B_t64> &fifo_B_out, hls::stream<char> &fifo_C_drain_out)
{
  PE(idx, idy,
     fifo_A_in, fifo_A_out,
     fifo_B_in, fifo_B_out,
     fifo_C_drain_out);
}
/* Module Definition */

/* Module Definition */
/*
 * A_PE_dummy_in: sink for fifo_A_in.
 * Reads one A_t64 word per innermost iteration and throws it away.
 * The loop nest runs 4 * 4 * 16 * 32 * 11 = 90112 times, so exactly that
 * many words are consumed from the stream per call.
 */
void A_PE_dummy_in(int idx, int idy, hls::stream<A_t64> &fifo_A_in) {
  // Module ids; neither value is referenced by the loops below.
  int p0 = idx;
  int p1 = idy;

  for (ap_uint<3> c0 = 0; c0 <= 3; c0 += 1) {
    for (ap_uint<3> c1 = 0; c1 <= 3; c1 += 1) {
      for (ap_uint<5> c2 = 0; c2 <= 15; c2 += 1) {
        // array / pe / latency
        for (ap_uint<6> c6 = 0; c6 <= 31; c6 += 1) {
          // latency
          for (ap_uint<5> c7 = 0; c7 <= 10; c7 += 1) {
          #pragma HLS PIPELINE II=1
            // The word is popped only to drain the FIFO; its value is unused.
            A_t64 discarded = fifo_A_in.read();
          }
        }
      }
    }
  }
}
/* Module Definition */

/* Module Definition */
/*
 * B_PE_dummy_in: sink for fifo_B_in.
 * Reads one B_t64 word per innermost iteration and throws it away.
 * The loop nest runs 4 * 4 * 16 * 32 * 11 = 90112 times, so exactly that
 * many words are consumed from the stream per call.
 */
void B_PE_dummy_in(int idx, int idy, hls::stream<B_t64> &fifo_B_in) {
  // Module ids; neither value is referenced by the loops below.
  int p0 = idx;
  int p1 = idy;

  for (ap_uint<3> c0 = 0; c0 <= 3; c0 += 1) {
    for (ap_uint<3> c1 = 0; c1 <= 3; c1 += 1) {
      for (ap_uint<5> c2 = 0; c2 <= 15; c2 += 1) {
        // array / pe / latency
        for (ap_uint<6> c6 = 0; c6 <= 31; c6 += 1) {
          // latency
          for (ap_uint<5> c7 = 0; c7 <= 10; c7 += 1) {
          #pragma HLS PIPELINE II=1
            // The word is popped only to drain the FIFO; its value is unused.
            B_t64 discarded = fifo_B_in.read();
          }
        }
      }
    }
  }
}
/* Module Definition */

/* Module Definition */
/*
 * C_drain_IO_L1_out_intra_trans: fill local_C from a stream of single chars.
 *
 * Consumes 32 * 11 = 352 chars from fifo_C_drain_local_in. The char read at
 * loop position (c6, c7) replaces 8-bit lane number c6 of local_C[c7][0];
 * the other lanes of that word are written back unchanged.
 *
 * idx, idy, c0 and c1 are accepted but do not influence the body (p0/p1 are
 * assigned and never read).
 */
void C_drain_IO_L1_out_intra_trans(int idx, int idy, int c0, int c1, C_t32 local_C[11][1], hls::stream<char> &fifo_C_drain_local_in)
 {
#pragma HLS INLINE
  /* Variable Declaration */
  int p0 = idx, p1 = idy; // module id
  // Scratch copy of one local_C word, one entry per 8-bit lane.
  ap_uint<8> data_split[32];
  #pragma HLS ARRAY_PARTITION variable=data_split complete
  /* Variable Declaration */


  // io_L1
  // pe
  // latency
  for (ap_uint<6> c6 = 0; c6 <= 31; c6 += 1) {
    // latency
    for (ap_uint<5> c7 = 0; c7 <= 10; c7 += 1) {
    #pragma HLS PIPELINE II=1
      // simd
      {
        C_t1 in_data;
        C_t32 out_data;
        in_data = fifo_C_drain_local_in.read();
        // c6 never exceeds 31, so split_idx == c6 and c6 / 32 == 0 below.
        int split_idx = (c6) % 32;
        out_data = local_C[c7][c6 / 32];
        // Unpack the word: after n shifts of 8, bits (7, 0) hold lane n.
        for (ap_uint<6> n = 0; n < 32; n++) {
        #pragma HLS UNROLL
          data_split[n] = out_data(7, 0);
          out_data = out_data >> 8;
        }
        // NOTE(review): only u.ut is written before u.ui is read; the result
        // is then narrowed to 8 bits. This presumably relies on the char
        // aliasing the low byte of ui (little-endian layout) -- confirm.
        union {unsigned int ui; char ut;} u;
        u.ut = in_data;
        data_split[split_idx] = ap_uint<8>(u.ui);
        // Repack the 32 lanes; data_split[31] is listed first, data_split[0]
        // last (ap_uint concatenation puts the leftmost operand in the high
        // bits -- assumes C_t32 is 32 * 8 bits wide, TODO confirm typedef).
        out_data = (data_split[31], data_split[30], data_split[29], data_split[28], data_split[27], data_split[26], data_split[25], data_split[24], data_split[23], data_split[22], data_split[21], data_split[20], data_split[19], data_split[18], data_split[17], data_split[16], data_split[15], data_split[14], data_split[13], data_split[12], data_split[11], data_split[10], data_split[9], data_split[8], data_split[7], data_split[6], data_split[5], data_split[4], data_split[3], data_split[2], data_split[1], data_split[0]);
        local_C[c7][c6 / 32] = out_data;
      }
    }
  }
}
/* Module Definition */

/* Module Definition */
void C_drain_IO_L1_out_inter_trans(int idx, int idy, int c0, int c1, C_t32 local_C[11][1], hls::stream<C_t32> &fifo_C_drain_in, hls::stream<C_t32> &fifo_C_drain_out)
 {
#pragma HLS INLINE
  /* Variable Declaration */
  int p0 = idx, p1 = idy; // module id
  /* Variable Declaration */

  for (ap_uint<6> c4 = p1; c4 <= 23; c4 += 1) {
    // io_L1
    if (c4 == p1) {
      for (ap_uint<5> c5 = 0; c5 <= 10; c5 += 1) {
      #pragma HLS PIPELINE II=1
        // access_coalesce
        {
          C_t32 in_data;
          C_t32 out_data;
          in_data = local_C[c5][0];
          out_data = in_data;
          fifo_C_drain_out.write(out_data);
        }
      }
    } else {
      for (ap_uint<5> c5 = 0; c5 <= 10; c5 += 1) {
      #pragma HLS PIPELINE II=1
        // access_coalesce
        {
          C_t32 in_data;
          C_t32 out_data;
          in_data = fifo_C_drain_in.read();
          out_data = in_data;
          fifo_C_drain_out.write(out_data);
        }
      }
    }
  }
}
/* Module Definition */

/* Module Definition */
void C_drain_IO_L1_out_inter_trans_boundary(int idx, int idy, int c0, int c1, C_t32 local_C[11][1], hls::stream<C_t32> &fifo_C_drain_out)
 {
#pragma HLS INLINE
  /* Variable Declaration */
  int p0 = idx, p1 = idy; // module id
  /* Variable Declaration */

  for (ap_uint<6> c4 = p1; c4 <= 23; c4 += 1)
    if (c4 == p1) {
      // io_L1
      for (ap_uint<5> c5 = 0; c5 <= 10; c5 += 1) {
      #pragma HLS PIPELINE II=1
        // access_coalesce
        {
          C_t32 in_data;
          C_t32 out_data;
          in_data = local_C[c5][0];
          out_data = in_data;
          fifo_C_drain_out.write(out_data);
        }
      }
    }
}
/* Module Definition */

/* Module Definition */
/*
 * C_drain_IO_L1_out: level-1 drain module with an upstream neighbour.
 *
 * Owns the local_C buffer and, for each of the 4 x 4 (c0, c1) iterations,
 * first runs the intra-transfer helper (fed by fifo_C_drain_local_in) and
 * then the inter-transfer helper (fifo_C_drain_in -> fifo_C_drain_out),
 * both operating on that same buffer.
 */
void C_drain_IO_L1_out(int idx, int idy, hls::stream<C_t32> &fifo_C_drain_in, hls::stream<C_t32> &fifo_C_drain_out, hls::stream<char> &fifo_C_drain_local_in) {
#pragma HLS INLINE OFF
  // Module ids; the helpers receive idx/idy directly.
  int p0 = idx;
  int p1 = idy;
  // Buffer shared by the two helper calls below.
  C_t32 local_C[11][1];
  #pragma HLS RESOURCE variable=local_C core=RAM_2P_BRAM

  for (ap_uint<3> c0 = 0; c0 <= 3; c0 += 1) {
    for (ap_uint<3> c1 = 0; c1 <= 3; c1 += 1) {
      // array / io_L3 / io_L2
      C_drain_IO_L1_out_intra_trans(idx, idy, c0, c1, local_C,
                                    fifo_C_drain_local_in);
      C_drain_IO_L1_out_inter_trans(idx, idy, c0, c1, local_C,
                                    fifo_C_drain_in, fifo_C_drain_out);
    }
  }
}
/* Module Definition */

/* Module Definition */
/*
 * C_drain_IO_L1_out_wrapper: forwarding shim around C_drain_IO_L1_out.
 * All five arguments are handed over unchanged and in the same order.
 */
void C_drain_IO_L1_out_wrapper(int idx, int idy, hls::stream<C_t32> &fifo_C_drain_in, hls::stream<C_t32> &fifo_C_drain_out, hls::stream<char> &fifo_C_drain_local_in)
{
  C_drain_IO_L1_out(idx, idy,
                    fifo_C_drain_in, fifo_C_drain_out,
                    fifo_C_drain_local_in);
}
/* Module Definition */

/* Module Definition */
/*
 * C_drain_IO_L1_out_boundary: level-1 drain module without an upstream FIFO.
 *
 * Owns the local_C buffer and, for each of the 4 x 4 (c0, c1) iterations,
 * first runs the intra-transfer helper (fed by fifo_C_drain_local_in) and
 * then the boundary inter-transfer helper, which writes to
 * fifo_C_drain_out; both operate on that same buffer.
 */
void C_drain_IO_L1_out_boundary(int idx, int idy, hls::stream<C_t32> &fifo_C_drain_out, hls::stream<char> &fifo_C_drain_local_in) {
#pragma HLS INLINE
  // Module ids; the helpers receive idx/idy directly.
  int p0 = idx;
  int p1 = idy;
  // Buffer shared by the two helper calls below.
  C_t32 local_C[11][1];
  #pragma HLS RESOURCE variable=local_C core=RAM_2P_BRAM

  for (ap_uint<3> c0 = 0; c0 <= 3; c0 += 1) {
    for (ap_uint<3> c1 = 0; c1 <= 3; c1 += 1) {
      // array / io_L3 / io_L2
      C_drain_IO_L1_out_intra_trans(idx, idy, c0, c1, local_C,
                                    fifo_C_drain_local_in);
      C_drain_IO_L1_out_inter_trans_boundary(idx, idy, c0, c1, local_C,
                                             fifo_C_drain_out);
    }
  }
}
/* Module Definition */

/* Module Definition */
/*
 * C_drain_IO_L1_out_boundary_wrapper: forwarding shim around
 * C_drain_IO_L1_out_boundary. All four arguments are handed over unchanged
 * and in the same order.
 */
void C_drain_IO_L1_out_boundary_wrapper(int idx, int idy, hls::stream<C_t32> &fifo_C_drain_out, hls::stream<char> &fifo_C_drain_local_in)
{
  C_drain_IO_L1_out_boundary(idx, idy,
                             fifo_C_drain_out,
                             fifo_C_drain_local_in);
}
/* Module Definition */

/* Module Definition */
void C_drain_IO_L2_out(int idx, hls::stream<C_t32> &fifo_C_drain_in, hls::stream<C_t32> &fifo_C_drain_out, hls::stream<C_t32> &fifo_C_drain_local_in) {
#pragma HLS INLINE OFF
  /* Variable Declaration */
  int p0 = idx; // module id
  /* Variable Declaration */

  for (ap_uint<3> c0 = 0; c0 <= 3; c0 += 1)
    for (ap_uint<3> c1 = 0; c1 <= 3; c1 += 1) {
      // array
      // io_L3
      for (ap_uint<4> c3 = p0; c3 <= 7; c3 += 1) {
        // io_L2
        if (c3 == p0) {
          for (ap_uint<6> c4 = 0; c4 <= 23; c4 += 1) {
            // io_L1
            for (ap_uint<5> c5 = 0; c5 <= 10; c5 += 1) {
            #pragma HLS PIPELINE II=1
              // access_coalesce
              {
                C_t32 in_data;
                C_t32 out_data;
                in_data = fifo_C_drain_local_in.read();
                out_data = in_data;
                fifo_C_drain_out.write(out_data);
              }
            }
          }
        } else {
          for (ap_uint<6> c4 = 0; c4 <= 23; c4 += 1) {
            // io_L1
            for (ap_uint<5> c5 = 0; c5 <= 10; c5 += 1) {
            #pragma HLS PIPELINE II=1
              // access_coalesce
              {
                C_t32 in_data;
                C_t32 out_data;
                in_data = fifo_C_drain_in.read();
                out_data = in_data;
                fifo_C_drain_out.write(out_data);
              }
            }
          }
        }
      }
    }
}
/* Module Definition */

/* Module Definition */
void C_drain_IO_L2_out_boundary(int idx, hls::stream<C_t32> &fifo_C_drain_out, hls::stream<C_t32> &fifo_C_drain_local_in) {
#pragma HLS INLINE OFF
  /* Variable Declaration */
  int p0 = idx; // module id
  /* Variable Declaration */

  for (ap_uint<3> c0 = 0; c0 <= 3; c0 += 1)
    for (ap_uint<3> c1 = 0; c1 <= 3; c1 += 1) {
      // array
      // io_L3
      for (ap_uint<4> c3 = p0; c3 <= 7; c3 += 1)
        if (c3 == p0) {
          // io_L2
          for (ap_uint<6> c4 = 0; c4 <= 23; c4 += 1) {
            // io_L1
            for (ap_uint<5> c5 = 0; c5 <= 10; c5 += 1) {
            #pragma HLS PIPELINE II=1
              // access_coalesce
              {
                C_t32 in_data;
                C_t32 out_data;
                in_data = fifo_C_drain_local_in.read();
                out_data = in_data;
                fifo_C_drain_out.write(out_data);
              }
            }
          }
        }
    }
}
/* Module Definition */

/* Module Definition */
/*
 * C_drain_IO_L3_out: level-3 drain pass-through.
 *
 * Moves 4 * 4 * 8 * 24 * 11 = 33792 words, one per innermost iteration,
 * from fifo_C_drain_local_in to fifo_C_drain_serialize without modifying
 * them.
 */
void C_drain_IO_L3_out(hls::stream<C_t32> &fifo_C_drain_serialize, hls::stream<C_t32> &fifo_C_drain_local_in) {
#pragma HLS INLINE OFF
  for (ap_uint<3> c0 = 0; c0 <= 3; c0 += 1) {
    for (ap_uint<3> c1 = 0; c1 <= 3; c1 += 1) {
      // array / io_L3
      for (ap_uint<4> c3 = 0; c3 <= 7; c3 += 1) {
        // io_L2
        for (ap_uint<6> c4 = 0; c4 <= 23; c4 += 1) {
          // io_L1 / pe
          for (ap_uint<5> c5 = 0; c5 <= 10; c5 += 1) {
          #pragma HLS PIPELINE II=1
            // access_coalesce / access_serialize
            C_t32 word = fifo_C_drain_local_in.read();
            fifo_C_drain_serialize.write(word);
          }
        }
      }
    }
  }
}
/* Module Definition */

/* Module Definition */
/*
 * C_drain_IO_L3_out_serialize: stream-to-array writer.
 *
 * Pops 33792 words from fifo_C_drain_local_in and stores them, in arrival
 * order, to C[0] .. C[33791].
 */
void C_drain_IO_L3_out_serialize(C_t32 *C, hls::stream<C_t32> &fifo_C_drain_local_in) {
#pragma HLS INLINE OFF
  for (ap_uint<17> i = 0; i < 33792; i++) {
  #pragma HLS PIPELINE II=1
    C_t32 word = fifo_C_drain_local_in.read();
    C[i] = word;
  }
}
/* Module Definition */

extern "C" {
void kernel0(A_t64 *A, B_t64 *B, C_t32 *C)
{
#pragma HLS INTERFACE m_axi port=A offset=slave bundle=gmem_A
#pragma HLS INTERFACE m_axi port=B offset=slave bundle=gmem_B
#pragma HLS INTERFACE m_axi port=C offset=slave bundle=gmem_C
#pragma HLS INTERFACE s_axilite port=A bundle=control
#pragma HLS INTERFACE s_axilite port=B bundle=control
#pragma HLS INTERFACE s_axilite port=C bundle=control
#pragma HLS INTERFACE s_axilite port=return bundle=control

#pragma HLS DATAFLOW
#pragma HLS dataflow disable_start_propagation

  /* FIFO Declaration */
  /* A_IO_L3_in_serialize fifo */ hls::stream<A_t64> fifo_A_A_IO_L3_in_serialize;
  #pragma HLS STREAM variable=fifo_A_A_IO_L3_in_serialize depth=2
  /* B_IO_L3_in_serialize fifo */ hls::stream<B_t64> fifo_B_B_IO_L3_in_serialize;
  #pragma HLS STREAM variable=fifo_B_B_IO_L3_in_serialize depth=2
  /* C_drain_IO_L3_out_serialize fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L3_out_serialize;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L3_out_serialize depth=2
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_0;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_0 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_1;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_1 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_2;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_2 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_3;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_3 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_4;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_4 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_5;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_5 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_6;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_6 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_7;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_7 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_8;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_8 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_9;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_9 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_9 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_10;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_10 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_10 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_11;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_11 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_11 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_12;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_12 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_12 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_13;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_13 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_13 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_14;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_14 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_14 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_15;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_15 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_15 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_16;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_16 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_16 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_17;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_17 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_17 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_18;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_18 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_18 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_19;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_19 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_19 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_20;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_20 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_20 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_21;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_21 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_21 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_22;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_22 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_22 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_23;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_23 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_23 core=FIFO_SRL
  /* A_IO_L2_in fifo */ hls::stream<A_t64> fifo_A_A_IO_L2_in_24;
  #pragma HLS STREAM variable=fifo_A_A_IO_L2_in_24 depth=2
  #pragma HLS RESOURCE variable=fifo_A_A_IO_L2_in_24 core=FIFO_SRL
  /* B_IO_L2_in fifo */ hls::stream<B_t64> fifo_B_B_IO_L2_in_0;
  #pragma HLS STREAM variable=fifo_B_B_IO_L2_in_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_B_IO_L2_in_0 core=FIFO_SRL
  /* B_IO_L2_in fifo */ hls::stream<B_t64> fifo_B_B_IO_L2_in_1;
  #pragma HLS STREAM variable=fifo_B_B_IO_L2_in_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_B_IO_L2_in_1 core=FIFO_SRL
  /* B_IO_L2_in fifo */ hls::stream<B_t64> fifo_B_B_IO_L2_in_2;
  #pragma HLS STREAM variable=fifo_B_B_IO_L2_in_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_B_IO_L2_in_2 core=FIFO_SRL
  /* B_IO_L2_in fifo */ hls::stream<B_t64> fifo_B_B_IO_L2_in_3;
  #pragma HLS STREAM variable=fifo_B_B_IO_L2_in_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_B_IO_L2_in_3 core=FIFO_SRL
  /* B_IO_L2_in fifo */ hls::stream<B_t64> fifo_B_B_IO_L2_in_4;
  #pragma HLS STREAM variable=fifo_B_B_IO_L2_in_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_B_IO_L2_in_4 core=FIFO_SRL
  /* B_IO_L2_in fifo */ hls::stream<B_t64> fifo_B_B_IO_L2_in_5;
  #pragma HLS STREAM variable=fifo_B_B_IO_L2_in_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_B_IO_L2_in_5 core=FIFO_SRL
  /* B_IO_L2_in fifo */ hls::stream<B_t64> fifo_B_B_IO_L2_in_6;
  #pragma HLS STREAM variable=fifo_B_B_IO_L2_in_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_B_IO_L2_in_6 core=FIFO_SRL
  /* B_IO_L2_in fifo */ hls::stream<B_t64> fifo_B_B_IO_L2_in_7;
  #pragma HLS STREAM variable=fifo_B_B_IO_L2_in_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_B_IO_L2_in_7 core=FIFO_SRL
  /* B_IO_L2_in fifo */ hls::stream<B_t64> fifo_B_B_IO_L2_in_8;
  #pragma HLS STREAM variable=fifo_B_B_IO_L2_in_8 depth=2
  #pragma HLS RESOURCE variable=fifo_B_B_IO_L2_in_8 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_0_0;
  #pragma HLS STREAM variable=fifo_A_PE_0_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_0_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_0_1;
  #pragma HLS STREAM variable=fifo_A_PE_0_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_0_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_0_2;
  #pragma HLS STREAM variable=fifo_A_PE_0_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_0_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_0_3;
  #pragma HLS STREAM variable=fifo_A_PE_0_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_0_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_0_4;
  #pragma HLS STREAM variable=fifo_A_PE_0_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_0_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_0_5;
  #pragma HLS STREAM variable=fifo_A_PE_0_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_0_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_0_6;
  #pragma HLS STREAM variable=fifo_A_PE_0_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_0_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_0_7;
  #pragma HLS STREAM variable=fifo_A_PE_0_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_0_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_0_8;
  #pragma HLS STREAM variable=fifo_A_PE_0_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_0_8 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_1_0;
  #pragma HLS STREAM variable=fifo_A_PE_1_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_1_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_1_1;
  #pragma HLS STREAM variable=fifo_A_PE_1_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_1_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_1_2;
  #pragma HLS STREAM variable=fifo_A_PE_1_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_1_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_1_3;
  #pragma HLS STREAM variable=fifo_A_PE_1_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_1_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_1_4;
  #pragma HLS STREAM variable=fifo_A_PE_1_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_1_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_1_5;
  #pragma HLS STREAM variable=fifo_A_PE_1_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_1_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_1_6;
  #pragma HLS STREAM variable=fifo_A_PE_1_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_1_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_1_7;
  #pragma HLS STREAM variable=fifo_A_PE_1_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_1_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_1_8;
  #pragma HLS STREAM variable=fifo_A_PE_1_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_1_8 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_2_0;
  #pragma HLS STREAM variable=fifo_A_PE_2_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_2_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_2_1;
  #pragma HLS STREAM variable=fifo_A_PE_2_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_2_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_2_2;
  #pragma HLS STREAM variable=fifo_A_PE_2_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_2_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_2_3;
  #pragma HLS STREAM variable=fifo_A_PE_2_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_2_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_2_4;
  #pragma HLS STREAM variable=fifo_A_PE_2_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_2_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_2_5;
  #pragma HLS STREAM variable=fifo_A_PE_2_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_2_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_2_6;
  #pragma HLS STREAM variable=fifo_A_PE_2_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_2_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_2_7;
  #pragma HLS STREAM variable=fifo_A_PE_2_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_2_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_2_8;
  #pragma HLS STREAM variable=fifo_A_PE_2_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_2_8 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_3_0;
  #pragma HLS STREAM variable=fifo_A_PE_3_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_3_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_3_1;
  #pragma HLS STREAM variable=fifo_A_PE_3_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_3_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_3_2;
  #pragma HLS STREAM variable=fifo_A_PE_3_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_3_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_3_3;
  #pragma HLS STREAM variable=fifo_A_PE_3_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_3_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_3_4;
  #pragma HLS STREAM variable=fifo_A_PE_3_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_3_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_3_5;
  #pragma HLS STREAM variable=fifo_A_PE_3_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_3_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_3_6;
  #pragma HLS STREAM variable=fifo_A_PE_3_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_3_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_3_7;
  #pragma HLS STREAM variable=fifo_A_PE_3_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_3_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_3_8;
  #pragma HLS STREAM variable=fifo_A_PE_3_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_3_8 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_4_0;
  #pragma HLS STREAM variable=fifo_A_PE_4_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_4_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_4_1;
  #pragma HLS STREAM variable=fifo_A_PE_4_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_4_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_4_2;
  #pragma HLS STREAM variable=fifo_A_PE_4_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_4_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_4_3;
  #pragma HLS STREAM variable=fifo_A_PE_4_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_4_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_4_4;
  #pragma HLS STREAM variable=fifo_A_PE_4_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_4_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_4_5;
  #pragma HLS STREAM variable=fifo_A_PE_4_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_4_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_4_6;
  #pragma HLS STREAM variable=fifo_A_PE_4_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_4_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_4_7;
  #pragma HLS STREAM variable=fifo_A_PE_4_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_4_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_4_8;
  #pragma HLS STREAM variable=fifo_A_PE_4_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_4_8 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_5_0;
  #pragma HLS STREAM variable=fifo_A_PE_5_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_5_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_5_1;
  #pragma HLS STREAM variable=fifo_A_PE_5_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_5_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_5_2;
  #pragma HLS STREAM variable=fifo_A_PE_5_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_5_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_5_3;
  #pragma HLS STREAM variable=fifo_A_PE_5_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_5_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_5_4;
  #pragma HLS STREAM variable=fifo_A_PE_5_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_5_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_5_5;
  #pragma HLS STREAM variable=fifo_A_PE_5_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_5_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_5_6;
  #pragma HLS STREAM variable=fifo_A_PE_5_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_5_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_5_7;
  #pragma HLS STREAM variable=fifo_A_PE_5_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_5_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_5_8;
  #pragma HLS STREAM variable=fifo_A_PE_5_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_5_8 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_6_0;
  #pragma HLS STREAM variable=fifo_A_PE_6_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_6_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_6_1;
  #pragma HLS STREAM variable=fifo_A_PE_6_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_6_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_6_2;
  #pragma HLS STREAM variable=fifo_A_PE_6_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_6_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_6_3;
  #pragma HLS STREAM variable=fifo_A_PE_6_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_6_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_6_4;
  #pragma HLS STREAM variable=fifo_A_PE_6_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_6_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_6_5;
  #pragma HLS STREAM variable=fifo_A_PE_6_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_6_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_6_6;
  #pragma HLS STREAM variable=fifo_A_PE_6_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_6_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_6_7;
  #pragma HLS STREAM variable=fifo_A_PE_6_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_6_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_6_8;
  #pragma HLS STREAM variable=fifo_A_PE_6_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_6_8 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_7_0;
  #pragma HLS STREAM variable=fifo_A_PE_7_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_7_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_7_1;
  #pragma HLS STREAM variable=fifo_A_PE_7_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_7_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_7_2;
  #pragma HLS STREAM variable=fifo_A_PE_7_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_7_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_7_3;
  #pragma HLS STREAM variable=fifo_A_PE_7_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_7_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_7_4;
  #pragma HLS STREAM variable=fifo_A_PE_7_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_7_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_7_5;
  #pragma HLS STREAM variable=fifo_A_PE_7_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_7_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_7_6;
  #pragma HLS STREAM variable=fifo_A_PE_7_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_7_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_7_7;
  #pragma HLS STREAM variable=fifo_A_PE_7_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_7_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_7_8;
  #pragma HLS STREAM variable=fifo_A_PE_7_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_7_8 core=FIFO_SRL
  /* fifo_A_PE streams with first index 8: fifo_A_PE_8_0 .. fifo_A_PE_8_8.
   * Each carries A_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_8_0;
  #pragma HLS STREAM variable=fifo_A_PE_8_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_8_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_8_1;
  #pragma HLS STREAM variable=fifo_A_PE_8_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_8_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_8_2;
  #pragma HLS STREAM variable=fifo_A_PE_8_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_8_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_8_3;
  #pragma HLS STREAM variable=fifo_A_PE_8_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_8_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_8_4;
  #pragma HLS STREAM variable=fifo_A_PE_8_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_8_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_8_5;
  #pragma HLS STREAM variable=fifo_A_PE_8_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_8_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_8_6;
  #pragma HLS STREAM variable=fifo_A_PE_8_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_8_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_8_7;
  #pragma HLS STREAM variable=fifo_A_PE_8_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_8_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_8_8;
  #pragma HLS STREAM variable=fifo_A_PE_8_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_8_8 core=FIFO_SRL
  /* fifo_A_PE streams with first index 9: fifo_A_PE_9_0 .. fifo_A_PE_9_8.
   * Each carries A_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_9_0;
  #pragma HLS STREAM variable=fifo_A_PE_9_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_9_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_9_1;
  #pragma HLS STREAM variable=fifo_A_PE_9_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_9_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_9_2;
  #pragma HLS STREAM variable=fifo_A_PE_9_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_9_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_9_3;
  #pragma HLS STREAM variable=fifo_A_PE_9_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_9_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_9_4;
  #pragma HLS STREAM variable=fifo_A_PE_9_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_9_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_9_5;
  #pragma HLS STREAM variable=fifo_A_PE_9_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_9_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_9_6;
  #pragma HLS STREAM variable=fifo_A_PE_9_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_9_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_9_7;
  #pragma HLS STREAM variable=fifo_A_PE_9_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_9_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_9_8;
  #pragma HLS STREAM variable=fifo_A_PE_9_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_9_8 core=FIFO_SRL
  /* fifo_A_PE streams with first index 10: fifo_A_PE_10_0 .. fifo_A_PE_10_8.
   * Each carries A_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_10_0;
  #pragma HLS STREAM variable=fifo_A_PE_10_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_10_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_10_1;
  #pragma HLS STREAM variable=fifo_A_PE_10_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_10_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_10_2;
  #pragma HLS STREAM variable=fifo_A_PE_10_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_10_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_10_3;
  #pragma HLS STREAM variable=fifo_A_PE_10_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_10_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_10_4;
  #pragma HLS STREAM variable=fifo_A_PE_10_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_10_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_10_5;
  #pragma HLS STREAM variable=fifo_A_PE_10_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_10_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_10_6;
  #pragma HLS STREAM variable=fifo_A_PE_10_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_10_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_10_7;
  #pragma HLS STREAM variable=fifo_A_PE_10_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_10_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_10_8;
  #pragma HLS STREAM variable=fifo_A_PE_10_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_10_8 core=FIFO_SRL
  /* fifo_A_PE streams with first index 11: fifo_A_PE_11_0 .. fifo_A_PE_11_8.
   * Each carries A_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_11_0;
  #pragma HLS STREAM variable=fifo_A_PE_11_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_11_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_11_1;
  #pragma HLS STREAM variable=fifo_A_PE_11_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_11_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_11_2;
  #pragma HLS STREAM variable=fifo_A_PE_11_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_11_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_11_3;
  #pragma HLS STREAM variable=fifo_A_PE_11_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_11_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_11_4;
  #pragma HLS STREAM variable=fifo_A_PE_11_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_11_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_11_5;
  #pragma HLS STREAM variable=fifo_A_PE_11_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_11_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_11_6;
  #pragma HLS STREAM variable=fifo_A_PE_11_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_11_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_11_7;
  #pragma HLS STREAM variable=fifo_A_PE_11_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_11_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_11_8;
  #pragma HLS STREAM variable=fifo_A_PE_11_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_11_8 core=FIFO_SRL
  /* fifo_A_PE streams with first index 12: fifo_A_PE_12_0 .. fifo_A_PE_12_8.
   * Each carries A_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_12_0;
  #pragma HLS STREAM variable=fifo_A_PE_12_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_12_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_12_1;
  #pragma HLS STREAM variable=fifo_A_PE_12_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_12_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_12_2;
  #pragma HLS STREAM variable=fifo_A_PE_12_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_12_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_12_3;
  #pragma HLS STREAM variable=fifo_A_PE_12_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_12_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_12_4;
  #pragma HLS STREAM variable=fifo_A_PE_12_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_12_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_12_5;
  #pragma HLS STREAM variable=fifo_A_PE_12_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_12_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_12_6;
  #pragma HLS STREAM variable=fifo_A_PE_12_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_12_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_12_7;
  #pragma HLS STREAM variable=fifo_A_PE_12_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_12_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_12_8;
  #pragma HLS STREAM variable=fifo_A_PE_12_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_12_8 core=FIFO_SRL
  /* fifo_A_PE streams with first index 13: fifo_A_PE_13_0 .. fifo_A_PE_13_8.
   * Each carries A_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_13_0;
  #pragma HLS STREAM variable=fifo_A_PE_13_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_13_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_13_1;
  #pragma HLS STREAM variable=fifo_A_PE_13_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_13_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_13_2;
  #pragma HLS STREAM variable=fifo_A_PE_13_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_13_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_13_3;
  #pragma HLS STREAM variable=fifo_A_PE_13_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_13_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_13_4;
  #pragma HLS STREAM variable=fifo_A_PE_13_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_13_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_13_5;
  #pragma HLS STREAM variable=fifo_A_PE_13_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_13_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_13_6;
  #pragma HLS STREAM variable=fifo_A_PE_13_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_13_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_13_7;
  #pragma HLS STREAM variable=fifo_A_PE_13_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_13_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_13_8;
  #pragma HLS STREAM variable=fifo_A_PE_13_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_13_8 core=FIFO_SRL
  /* fifo_A_PE streams with first index 14: fifo_A_PE_14_0 .. fifo_A_PE_14_8.
   * Each carries A_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_14_0;
  #pragma HLS STREAM variable=fifo_A_PE_14_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_14_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_14_1;
  #pragma HLS STREAM variable=fifo_A_PE_14_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_14_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_14_2;
  #pragma HLS STREAM variable=fifo_A_PE_14_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_14_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_14_3;
  #pragma HLS STREAM variable=fifo_A_PE_14_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_14_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_14_4;
  #pragma HLS STREAM variable=fifo_A_PE_14_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_14_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_14_5;
  #pragma HLS STREAM variable=fifo_A_PE_14_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_14_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_14_6;
  #pragma HLS STREAM variable=fifo_A_PE_14_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_14_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_14_7;
  #pragma HLS STREAM variable=fifo_A_PE_14_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_14_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_14_8;
  #pragma HLS STREAM variable=fifo_A_PE_14_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_14_8 core=FIFO_SRL
  /* fifo_A_PE streams with first index 15: fifo_A_PE_15_0 .. fifo_A_PE_15_8.
   * Each carries A_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_15_0;
  #pragma HLS STREAM variable=fifo_A_PE_15_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_15_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_15_1;
  #pragma HLS STREAM variable=fifo_A_PE_15_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_15_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_15_2;
  #pragma HLS STREAM variable=fifo_A_PE_15_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_15_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_15_3;
  #pragma HLS STREAM variable=fifo_A_PE_15_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_15_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_15_4;
  #pragma HLS STREAM variable=fifo_A_PE_15_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_15_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_15_5;
  #pragma HLS STREAM variable=fifo_A_PE_15_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_15_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_15_6;
  #pragma HLS STREAM variable=fifo_A_PE_15_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_15_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_15_7;
  #pragma HLS STREAM variable=fifo_A_PE_15_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_15_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_15_8;
  #pragma HLS STREAM variable=fifo_A_PE_15_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_15_8 core=FIFO_SRL
  /* fifo_A_PE streams with first index 16: fifo_A_PE_16_0 .. fifo_A_PE_16_8.
   * Each carries A_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_16_0;
  #pragma HLS STREAM variable=fifo_A_PE_16_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_16_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_16_1;
  #pragma HLS STREAM variable=fifo_A_PE_16_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_16_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_16_2;
  #pragma HLS STREAM variable=fifo_A_PE_16_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_16_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_16_3;
  #pragma HLS STREAM variable=fifo_A_PE_16_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_16_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_16_4;
  #pragma HLS STREAM variable=fifo_A_PE_16_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_16_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_16_5;
  #pragma HLS STREAM variable=fifo_A_PE_16_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_16_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_16_6;
  #pragma HLS STREAM variable=fifo_A_PE_16_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_16_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_16_7;
  #pragma HLS STREAM variable=fifo_A_PE_16_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_16_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_16_8;
  #pragma HLS STREAM variable=fifo_A_PE_16_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_16_8 core=FIFO_SRL
  /* fifo_A_PE streams with first index 17: fifo_A_PE_17_0 .. fifo_A_PE_17_8.
   * Each carries A_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_17_0;
  #pragma HLS STREAM variable=fifo_A_PE_17_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_17_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_17_1;
  #pragma HLS STREAM variable=fifo_A_PE_17_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_17_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_17_2;
  #pragma HLS STREAM variable=fifo_A_PE_17_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_17_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_17_3;
  #pragma HLS STREAM variable=fifo_A_PE_17_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_17_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_17_4;
  #pragma HLS STREAM variable=fifo_A_PE_17_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_17_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_17_5;
  #pragma HLS STREAM variable=fifo_A_PE_17_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_17_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_17_6;
  #pragma HLS STREAM variable=fifo_A_PE_17_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_17_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_17_7;
  #pragma HLS STREAM variable=fifo_A_PE_17_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_17_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_17_8;
  #pragma HLS STREAM variable=fifo_A_PE_17_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_17_8 core=FIFO_SRL
  /* fifo_A_PE streams with first index 18: fifo_A_PE_18_0 .. fifo_A_PE_18_8.
   * Each carries A_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_18_0;
  #pragma HLS STREAM variable=fifo_A_PE_18_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_18_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_18_1;
  #pragma HLS STREAM variable=fifo_A_PE_18_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_18_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_18_2;
  #pragma HLS STREAM variable=fifo_A_PE_18_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_18_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_18_3;
  #pragma HLS STREAM variable=fifo_A_PE_18_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_18_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_18_4;
  #pragma HLS STREAM variable=fifo_A_PE_18_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_18_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_18_5;
  #pragma HLS STREAM variable=fifo_A_PE_18_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_18_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_18_6;
  #pragma HLS STREAM variable=fifo_A_PE_18_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_18_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_18_7;
  #pragma HLS STREAM variable=fifo_A_PE_18_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_18_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_18_8;
  #pragma HLS STREAM variable=fifo_A_PE_18_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_18_8 core=FIFO_SRL
  /* fifo_A_PE streams with first index 19: fifo_A_PE_19_0 .. fifo_A_PE_19_8.
   * Each carries A_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_19_0;
  #pragma HLS STREAM variable=fifo_A_PE_19_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_19_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_19_1;
  #pragma HLS STREAM variable=fifo_A_PE_19_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_19_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_19_2;
  #pragma HLS STREAM variable=fifo_A_PE_19_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_19_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_19_3;
  #pragma HLS STREAM variable=fifo_A_PE_19_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_19_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_19_4;
  #pragma HLS STREAM variable=fifo_A_PE_19_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_19_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_19_5;
  #pragma HLS STREAM variable=fifo_A_PE_19_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_19_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_19_6;
  #pragma HLS STREAM variable=fifo_A_PE_19_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_19_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_19_7;
  #pragma HLS STREAM variable=fifo_A_PE_19_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_19_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_19_8;
  #pragma HLS STREAM variable=fifo_A_PE_19_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_19_8 core=FIFO_SRL
  /* fifo_A_PE streams with first index 20: fifo_A_PE_20_0 .. fifo_A_PE_20_8.
   * Each carries A_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_20_0;
  #pragma HLS STREAM variable=fifo_A_PE_20_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_20_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_20_1;
  #pragma HLS STREAM variable=fifo_A_PE_20_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_20_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_20_2;
  #pragma HLS STREAM variable=fifo_A_PE_20_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_20_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_20_3;
  #pragma HLS STREAM variable=fifo_A_PE_20_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_20_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_20_4;
  #pragma HLS STREAM variable=fifo_A_PE_20_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_20_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_20_5;
  #pragma HLS STREAM variable=fifo_A_PE_20_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_20_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_20_6;
  #pragma HLS STREAM variable=fifo_A_PE_20_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_20_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_20_7;
  #pragma HLS STREAM variable=fifo_A_PE_20_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_20_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_20_8;
  #pragma HLS STREAM variable=fifo_A_PE_20_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_20_8 core=FIFO_SRL
  /* fifo_A_PE streams with first index 21: fifo_A_PE_21_0 .. fifo_A_PE_21_8.
   * Each carries A_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_21_0;
  #pragma HLS STREAM variable=fifo_A_PE_21_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_21_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_21_1;
  #pragma HLS STREAM variable=fifo_A_PE_21_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_21_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_21_2;
  #pragma HLS STREAM variable=fifo_A_PE_21_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_21_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_21_3;
  #pragma HLS STREAM variable=fifo_A_PE_21_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_21_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_21_4;
  #pragma HLS STREAM variable=fifo_A_PE_21_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_21_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_21_5;
  #pragma HLS STREAM variable=fifo_A_PE_21_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_21_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_21_6;
  #pragma HLS STREAM variable=fifo_A_PE_21_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_21_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_21_7;
  #pragma HLS STREAM variable=fifo_A_PE_21_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_21_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_21_8;
  #pragma HLS STREAM variable=fifo_A_PE_21_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_21_8 core=FIFO_SRL
  /* fifo_A_PE streams with first index 22: fifo_A_PE_22_0 .. fifo_A_PE_22_8.
   * Each carries A_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_22_0;
  #pragma HLS STREAM variable=fifo_A_PE_22_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_22_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_22_1;
  #pragma HLS STREAM variable=fifo_A_PE_22_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_22_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_22_2;
  #pragma HLS STREAM variable=fifo_A_PE_22_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_22_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_22_3;
  #pragma HLS STREAM variable=fifo_A_PE_22_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_22_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_22_4;
  #pragma HLS STREAM variable=fifo_A_PE_22_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_22_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_22_5;
  #pragma HLS STREAM variable=fifo_A_PE_22_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_22_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_22_6;
  #pragma HLS STREAM variable=fifo_A_PE_22_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_22_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_22_7;
  #pragma HLS STREAM variable=fifo_A_PE_22_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_22_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_22_8;
  #pragma HLS STREAM variable=fifo_A_PE_22_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_22_8 core=FIFO_SRL
  /* fifo_A_PE streams with first index 23: fifo_A_PE_23_0 .. fifo_A_PE_23_8.
   * Each carries A_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_23_0;
  #pragma HLS STREAM variable=fifo_A_PE_23_0 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_23_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_23_1;
  #pragma HLS STREAM variable=fifo_A_PE_23_1 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_23_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_23_2;
  #pragma HLS STREAM variable=fifo_A_PE_23_2 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_23_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_23_3;
  #pragma HLS STREAM variable=fifo_A_PE_23_3 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_23_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_23_4;
  #pragma HLS STREAM variable=fifo_A_PE_23_4 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_23_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_23_5;
  #pragma HLS STREAM variable=fifo_A_PE_23_5 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_23_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_23_6;
  #pragma HLS STREAM variable=fifo_A_PE_23_6 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_23_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_23_7;
  #pragma HLS STREAM variable=fifo_A_PE_23_7 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_23_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<A_t64> fifo_A_PE_23_8;
  #pragma HLS STREAM variable=fifo_A_PE_23_8 depth=2
  #pragma HLS RESOURCE variable=fifo_A_PE_23_8 core=FIFO_SRL
  /* fifo_B_PE streams with second index 0: fifo_B_PE_0_0 .. fifo_B_PE_24_0.
   * Each carries B_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma. Within this group the
   * first index advances from 0 to 24 while the second stays fixed.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_0_0;
  #pragma HLS STREAM variable=fifo_B_PE_0_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_0_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_1_0;
  #pragma HLS STREAM variable=fifo_B_PE_1_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_1_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_2_0;
  #pragma HLS STREAM variable=fifo_B_PE_2_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_2_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_3_0;
  #pragma HLS STREAM variable=fifo_B_PE_3_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_3_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_4_0;
  #pragma HLS STREAM variable=fifo_B_PE_4_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_4_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_5_0;
  #pragma HLS STREAM variable=fifo_B_PE_5_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_5_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_6_0;
  #pragma HLS STREAM variable=fifo_B_PE_6_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_6_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_7_0;
  #pragma HLS STREAM variable=fifo_B_PE_7_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_7_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_8_0;
  #pragma HLS STREAM variable=fifo_B_PE_8_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_8_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_9_0;
  #pragma HLS STREAM variable=fifo_B_PE_9_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_9_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_10_0;
  #pragma HLS STREAM variable=fifo_B_PE_10_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_10_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_11_0;
  #pragma HLS STREAM variable=fifo_B_PE_11_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_11_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_12_0;
  #pragma HLS STREAM variable=fifo_B_PE_12_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_12_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_13_0;
  #pragma HLS STREAM variable=fifo_B_PE_13_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_13_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_14_0;
  #pragma HLS STREAM variable=fifo_B_PE_14_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_14_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_15_0;
  #pragma HLS STREAM variable=fifo_B_PE_15_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_15_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_16_0;
  #pragma HLS STREAM variable=fifo_B_PE_16_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_16_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_17_0;
  #pragma HLS STREAM variable=fifo_B_PE_17_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_17_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_18_0;
  #pragma HLS STREAM variable=fifo_B_PE_18_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_18_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_19_0;
  #pragma HLS STREAM variable=fifo_B_PE_19_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_19_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_20_0;
  #pragma HLS STREAM variable=fifo_B_PE_20_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_20_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_21_0;
  #pragma HLS STREAM variable=fifo_B_PE_21_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_21_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_22_0;
  #pragma HLS STREAM variable=fifo_B_PE_22_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_22_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_23_0;
  #pragma HLS STREAM variable=fifo_B_PE_23_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_23_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_24_0;
  #pragma HLS STREAM variable=fifo_B_PE_24_0 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_24_0 core=FIFO_SRL
  /* fifo_B_PE streams with second index 1: fifo_B_PE_0_1 .. fifo_B_PE_24_1.
   * Each carries B_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma. Within this group the
   * first index advances from 0 to 24 while the second stays fixed.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_0_1;
  #pragma HLS STREAM variable=fifo_B_PE_0_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_0_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_1_1;
  #pragma HLS STREAM variable=fifo_B_PE_1_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_1_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_2_1;
  #pragma HLS STREAM variable=fifo_B_PE_2_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_2_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_3_1;
  #pragma HLS STREAM variable=fifo_B_PE_3_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_3_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_4_1;
  #pragma HLS STREAM variable=fifo_B_PE_4_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_4_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_5_1;
  #pragma HLS STREAM variable=fifo_B_PE_5_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_5_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_6_1;
  #pragma HLS STREAM variable=fifo_B_PE_6_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_6_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_7_1;
  #pragma HLS STREAM variable=fifo_B_PE_7_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_7_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_8_1;
  #pragma HLS STREAM variable=fifo_B_PE_8_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_8_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_9_1;
  #pragma HLS STREAM variable=fifo_B_PE_9_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_9_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_10_1;
  #pragma HLS STREAM variable=fifo_B_PE_10_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_10_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_11_1;
  #pragma HLS STREAM variable=fifo_B_PE_11_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_11_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_12_1;
  #pragma HLS STREAM variable=fifo_B_PE_12_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_12_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_13_1;
  #pragma HLS STREAM variable=fifo_B_PE_13_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_13_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_14_1;
  #pragma HLS STREAM variable=fifo_B_PE_14_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_14_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_15_1;
  #pragma HLS STREAM variable=fifo_B_PE_15_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_15_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_16_1;
  #pragma HLS STREAM variable=fifo_B_PE_16_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_16_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_17_1;
  #pragma HLS STREAM variable=fifo_B_PE_17_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_17_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_18_1;
  #pragma HLS STREAM variable=fifo_B_PE_18_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_18_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_19_1;
  #pragma HLS STREAM variable=fifo_B_PE_19_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_19_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_20_1;
  #pragma HLS STREAM variable=fifo_B_PE_20_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_20_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_21_1;
  #pragma HLS STREAM variable=fifo_B_PE_21_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_21_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_22_1;
  #pragma HLS STREAM variable=fifo_B_PE_22_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_22_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_23_1;
  #pragma HLS STREAM variable=fifo_B_PE_23_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_23_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_24_1;
  #pragma HLS STREAM variable=fifo_B_PE_24_1 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_24_1 core=FIFO_SRL
  /* fifo_B_PE streams fifo_B_PE_0_2 .. fifo_B_PE_19_2 (second index 2).
   * Each carries B_t64 words, is given depth 2 by the STREAM pragma, and is
   * bound to the FIFO_SRL core by the RESOURCE pragma. The first index
   * advances while the second stays fixed.
   * NOTE(review): the two indices presumably address a PE grid position --
   * confirm against the PE instantiations that consume these streams. */
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_0_2;
  #pragma HLS STREAM variable=fifo_B_PE_0_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_0_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_1_2;
  #pragma HLS STREAM variable=fifo_B_PE_1_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_1_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_2_2;
  #pragma HLS STREAM variable=fifo_B_PE_2_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_2_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_3_2;
  #pragma HLS STREAM variable=fifo_B_PE_3_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_3_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_4_2;
  #pragma HLS STREAM variable=fifo_B_PE_4_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_4_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_5_2;
  #pragma HLS STREAM variable=fifo_B_PE_5_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_5_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_6_2;
  #pragma HLS STREAM variable=fifo_B_PE_6_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_6_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_7_2;
  #pragma HLS STREAM variable=fifo_B_PE_7_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_7_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_8_2;
  #pragma HLS STREAM variable=fifo_B_PE_8_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_8_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_9_2;
  #pragma HLS STREAM variable=fifo_B_PE_9_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_9_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_10_2;
  #pragma HLS STREAM variable=fifo_B_PE_10_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_10_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_11_2;
  #pragma HLS STREAM variable=fifo_B_PE_11_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_11_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_12_2;
  #pragma HLS STREAM variable=fifo_B_PE_12_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_12_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_13_2;
  #pragma HLS STREAM variable=fifo_B_PE_13_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_13_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_14_2;
  #pragma HLS STREAM variable=fifo_B_PE_14_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_14_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_15_2;
  #pragma HLS STREAM variable=fifo_B_PE_15_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_15_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_16_2;
  #pragma HLS STREAM variable=fifo_B_PE_16_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_16_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_17_2;
  #pragma HLS STREAM variable=fifo_B_PE_17_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_17_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_18_2;
  #pragma HLS STREAM variable=fifo_B_PE_18_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_18_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_19_2;
  #pragma HLS STREAM variable=fifo_B_PE_19_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_19_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_20_2;
  #pragma HLS STREAM variable=fifo_B_PE_20_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_20_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_21_2;
  #pragma HLS STREAM variable=fifo_B_PE_21_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_21_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_22_2;
  #pragma HLS STREAM variable=fifo_B_PE_22_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_22_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_23_2;
  #pragma HLS STREAM variable=fifo_B_PE_23_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_23_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_24_2;
  #pragma HLS STREAM variable=fifo_B_PE_24_2 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_24_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_0_3;
  #pragma HLS STREAM variable=fifo_B_PE_0_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_0_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_1_3;
  #pragma HLS STREAM variable=fifo_B_PE_1_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_1_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_2_3;
  #pragma HLS STREAM variable=fifo_B_PE_2_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_2_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_3_3;
  #pragma HLS STREAM variable=fifo_B_PE_3_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_3_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_4_3;
  #pragma HLS STREAM variable=fifo_B_PE_4_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_4_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_5_3;
  #pragma HLS STREAM variable=fifo_B_PE_5_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_5_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_6_3;
  #pragma HLS STREAM variable=fifo_B_PE_6_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_6_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_7_3;
  #pragma HLS STREAM variable=fifo_B_PE_7_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_7_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_8_3;
  #pragma HLS STREAM variable=fifo_B_PE_8_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_8_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_9_3;
  #pragma HLS STREAM variable=fifo_B_PE_9_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_9_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_10_3;
  #pragma HLS STREAM variable=fifo_B_PE_10_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_10_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_11_3;
  #pragma HLS STREAM variable=fifo_B_PE_11_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_11_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_12_3;
  #pragma HLS STREAM variable=fifo_B_PE_12_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_12_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_13_3;
  #pragma HLS STREAM variable=fifo_B_PE_13_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_13_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_14_3;
  #pragma HLS STREAM variable=fifo_B_PE_14_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_14_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_15_3;
  #pragma HLS STREAM variable=fifo_B_PE_15_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_15_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_16_3;
  #pragma HLS STREAM variable=fifo_B_PE_16_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_16_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_17_3;
  #pragma HLS STREAM variable=fifo_B_PE_17_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_17_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_18_3;
  #pragma HLS STREAM variable=fifo_B_PE_18_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_18_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_19_3;
  #pragma HLS STREAM variable=fifo_B_PE_19_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_19_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_20_3;
  #pragma HLS STREAM variable=fifo_B_PE_20_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_20_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_21_3;
  #pragma HLS STREAM variable=fifo_B_PE_21_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_21_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_22_3;
  #pragma HLS STREAM variable=fifo_B_PE_22_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_22_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_23_3;
  #pragma HLS STREAM variable=fifo_B_PE_23_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_23_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_24_3;
  #pragma HLS STREAM variable=fifo_B_PE_24_3 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_24_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_0_4;
  #pragma HLS STREAM variable=fifo_B_PE_0_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_0_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_1_4;
  #pragma HLS STREAM variable=fifo_B_PE_1_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_1_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_2_4;
  #pragma HLS STREAM variable=fifo_B_PE_2_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_2_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_3_4;
  #pragma HLS STREAM variable=fifo_B_PE_3_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_3_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_4_4;
  #pragma HLS STREAM variable=fifo_B_PE_4_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_4_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_5_4;
  #pragma HLS STREAM variable=fifo_B_PE_5_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_5_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_6_4;
  #pragma HLS STREAM variable=fifo_B_PE_6_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_6_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_7_4;
  #pragma HLS STREAM variable=fifo_B_PE_7_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_7_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_8_4;
  #pragma HLS STREAM variable=fifo_B_PE_8_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_8_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_9_4;
  #pragma HLS STREAM variable=fifo_B_PE_9_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_9_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_10_4;
  #pragma HLS STREAM variable=fifo_B_PE_10_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_10_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_11_4;
  #pragma HLS STREAM variable=fifo_B_PE_11_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_11_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_12_4;
  #pragma HLS STREAM variable=fifo_B_PE_12_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_12_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_13_4;
  #pragma HLS STREAM variable=fifo_B_PE_13_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_13_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_14_4;
  #pragma HLS STREAM variable=fifo_B_PE_14_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_14_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_15_4;
  #pragma HLS STREAM variable=fifo_B_PE_15_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_15_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_16_4;
  #pragma HLS STREAM variable=fifo_B_PE_16_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_16_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_17_4;
  #pragma HLS STREAM variable=fifo_B_PE_17_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_17_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_18_4;
  #pragma HLS STREAM variable=fifo_B_PE_18_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_18_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_19_4;
  #pragma HLS STREAM variable=fifo_B_PE_19_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_19_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_20_4;
  #pragma HLS STREAM variable=fifo_B_PE_20_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_20_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_21_4;
  #pragma HLS STREAM variable=fifo_B_PE_21_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_21_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_22_4;
  #pragma HLS STREAM variable=fifo_B_PE_22_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_22_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_23_4;
  #pragma HLS STREAM variable=fifo_B_PE_23_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_23_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_24_4;
  #pragma HLS STREAM variable=fifo_B_PE_24_4 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_24_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_0_5;
  #pragma HLS STREAM variable=fifo_B_PE_0_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_0_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_1_5;
  #pragma HLS STREAM variable=fifo_B_PE_1_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_1_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_2_5;
  #pragma HLS STREAM variable=fifo_B_PE_2_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_2_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_3_5;
  #pragma HLS STREAM variable=fifo_B_PE_3_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_3_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_4_5;
  #pragma HLS STREAM variable=fifo_B_PE_4_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_4_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_5_5;
  #pragma HLS STREAM variable=fifo_B_PE_5_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_5_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_6_5;
  #pragma HLS STREAM variable=fifo_B_PE_6_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_6_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_7_5;
  #pragma HLS STREAM variable=fifo_B_PE_7_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_7_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_8_5;
  #pragma HLS STREAM variable=fifo_B_PE_8_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_8_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_9_5;
  #pragma HLS STREAM variable=fifo_B_PE_9_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_9_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_10_5;
  #pragma HLS STREAM variable=fifo_B_PE_10_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_10_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_11_5;
  #pragma HLS STREAM variable=fifo_B_PE_11_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_11_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_12_5;
  #pragma HLS STREAM variable=fifo_B_PE_12_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_12_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_13_5;
  #pragma HLS STREAM variable=fifo_B_PE_13_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_13_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_14_5;
  #pragma HLS STREAM variable=fifo_B_PE_14_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_14_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_15_5;
  #pragma HLS STREAM variable=fifo_B_PE_15_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_15_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_16_5;
  #pragma HLS STREAM variable=fifo_B_PE_16_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_16_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_17_5;
  #pragma HLS STREAM variable=fifo_B_PE_17_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_17_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_18_5;
  #pragma HLS STREAM variable=fifo_B_PE_18_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_18_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_19_5;
  #pragma HLS STREAM variable=fifo_B_PE_19_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_19_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_20_5;
  #pragma HLS STREAM variable=fifo_B_PE_20_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_20_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_21_5;
  #pragma HLS STREAM variable=fifo_B_PE_21_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_21_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_22_5;
  #pragma HLS STREAM variable=fifo_B_PE_22_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_22_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_23_5;
  #pragma HLS STREAM variable=fifo_B_PE_23_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_23_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_24_5;
  #pragma HLS STREAM variable=fifo_B_PE_24_5 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_24_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_0_6;
  #pragma HLS STREAM variable=fifo_B_PE_0_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_0_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_1_6;
  #pragma HLS STREAM variable=fifo_B_PE_1_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_1_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_2_6;
  #pragma HLS STREAM variable=fifo_B_PE_2_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_2_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_3_6;
  #pragma HLS STREAM variable=fifo_B_PE_3_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_3_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_4_6;
  #pragma HLS STREAM variable=fifo_B_PE_4_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_4_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_5_6;
  #pragma HLS STREAM variable=fifo_B_PE_5_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_5_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_6_6;
  #pragma HLS STREAM variable=fifo_B_PE_6_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_6_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_7_6;
  #pragma HLS STREAM variable=fifo_B_PE_7_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_7_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_8_6;
  #pragma HLS STREAM variable=fifo_B_PE_8_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_8_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_9_6;
  #pragma HLS STREAM variable=fifo_B_PE_9_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_9_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_10_6;
  #pragma HLS STREAM variable=fifo_B_PE_10_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_10_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_11_6;
  #pragma HLS STREAM variable=fifo_B_PE_11_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_11_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_12_6;
  #pragma HLS STREAM variable=fifo_B_PE_12_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_12_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_13_6;
  #pragma HLS STREAM variable=fifo_B_PE_13_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_13_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_14_6;
  #pragma HLS STREAM variable=fifo_B_PE_14_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_14_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_15_6;
  #pragma HLS STREAM variable=fifo_B_PE_15_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_15_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_16_6;
  #pragma HLS STREAM variable=fifo_B_PE_16_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_16_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_17_6;
  #pragma HLS STREAM variable=fifo_B_PE_17_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_17_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_18_6;
  #pragma HLS STREAM variable=fifo_B_PE_18_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_18_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_19_6;
  #pragma HLS STREAM variable=fifo_B_PE_19_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_19_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_20_6;
  #pragma HLS STREAM variable=fifo_B_PE_20_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_20_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_21_6;
  #pragma HLS STREAM variable=fifo_B_PE_21_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_21_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_22_6;
  #pragma HLS STREAM variable=fifo_B_PE_22_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_22_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_23_6;
  #pragma HLS STREAM variable=fifo_B_PE_23_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_23_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_24_6;
  #pragma HLS STREAM variable=fifo_B_PE_24_6 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_24_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_0_7;
  #pragma HLS STREAM variable=fifo_B_PE_0_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_0_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_1_7;
  #pragma HLS STREAM variable=fifo_B_PE_1_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_1_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_2_7;
  #pragma HLS STREAM variable=fifo_B_PE_2_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_2_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_3_7;
  #pragma HLS STREAM variable=fifo_B_PE_3_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_3_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_4_7;
  #pragma HLS STREAM variable=fifo_B_PE_4_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_4_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_5_7;
  #pragma HLS STREAM variable=fifo_B_PE_5_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_5_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_6_7;
  #pragma HLS STREAM variable=fifo_B_PE_6_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_6_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_7_7;
  #pragma HLS STREAM variable=fifo_B_PE_7_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_7_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_8_7;
  #pragma HLS STREAM variable=fifo_B_PE_8_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_8_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_9_7;
  #pragma HLS STREAM variable=fifo_B_PE_9_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_9_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_10_7;
  #pragma HLS STREAM variable=fifo_B_PE_10_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_10_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_11_7;
  #pragma HLS STREAM variable=fifo_B_PE_11_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_11_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_12_7;
  #pragma HLS STREAM variable=fifo_B_PE_12_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_12_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_13_7;
  #pragma HLS STREAM variable=fifo_B_PE_13_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_13_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_14_7;
  #pragma HLS STREAM variable=fifo_B_PE_14_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_14_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_15_7;
  #pragma HLS STREAM variable=fifo_B_PE_15_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_15_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_16_7;
  #pragma HLS STREAM variable=fifo_B_PE_16_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_16_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_17_7;
  #pragma HLS STREAM variable=fifo_B_PE_17_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_17_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_18_7;
  #pragma HLS STREAM variable=fifo_B_PE_18_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_18_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_19_7;
  #pragma HLS STREAM variable=fifo_B_PE_19_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_19_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_20_7;
  #pragma HLS STREAM variable=fifo_B_PE_20_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_20_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_21_7;
  #pragma HLS STREAM variable=fifo_B_PE_21_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_21_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_22_7;
  #pragma HLS STREAM variable=fifo_B_PE_22_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_22_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_23_7;
  #pragma HLS STREAM variable=fifo_B_PE_23_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_23_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<B_t64> fifo_B_PE_24_7;
  #pragma HLS STREAM variable=fifo_B_PE_24_7 depth=2
  #pragma HLS RESOURCE variable=fifo_B_PE_24_7 core=FIFO_SRL
  /* End of the fifo_B_PE_* streams; the fifo_C_drain_PE_<i>_<j> streams
   * start here. They carry `char` elements rather than B_t64, but use the
   * same configuration as the B streams: depth 2, FIFO_SRL core.
   * The first index in each run goes 0..23, one fewer than the 0..24 used
   * by fifo_B_PE_*.
   * NOTE(review): presumably the extra fifo_B_PE_24_* stream is the tail of
   * the B forwarding chain, which the drain path does not need -- confirm
   * against the PE instantiations. */
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_0_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_0_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_0_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_1_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_1_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_1_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_2_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_2_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_2_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_3_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_3_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_3_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_4_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_4_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_4_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_5_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_5_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_5_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_6_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_6_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_6_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_7_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_7_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_7_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_8_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_8_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_8_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_9_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_9_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_9_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_10_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_10_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_10_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_11_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_11_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_11_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_12_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_12_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_12_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_13_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_13_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_13_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_14_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_14_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_14_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_15_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_15_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_15_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_16_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_16_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_16_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_17_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_17_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_17_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_18_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_18_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_18_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_19_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_19_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_19_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_20_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_20_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_20_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_21_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_21_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_21_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_22_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_22_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_22_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_23_0;
  #pragma HLS STREAM variable=fifo_C_drain_PE_23_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_23_0 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_0_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_0_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_0_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_1_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_1_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_1_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_2_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_2_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_2_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_3_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_3_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_3_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_4_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_4_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_4_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_5_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_5_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_5_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_6_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_6_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_6_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_7_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_7_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_7_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_8_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_8_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_8_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_9_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_9_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_9_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_10_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_10_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_10_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_11_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_11_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_11_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_12_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_12_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_12_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_13_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_13_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_13_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_14_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_14_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_14_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_15_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_15_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_15_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_16_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_16_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_16_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_17_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_17_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_17_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_18_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_18_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_18_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_19_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_19_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_19_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_20_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_20_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_20_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_21_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_21_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_21_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_22_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_22_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_22_1 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_23_1;
  #pragma HLS STREAM variable=fifo_C_drain_PE_23_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_23_1 core=FIFO_SRL
  /*
   * C-drain PE FIFOs fifo_C_drain_PE_<i>_2 for i = 0..23 (24 streams).
   * Each stream carries `char` elements; its STREAM pragma sets depth=2 and
   * its RESOURCE pragma selects the FIFO_SRL core.
   * NOTE(review): whether the first or the second index is the PE row is not
   * visible here -- confirm against the module calls that use these streams.
   */
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_0_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_0_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_0_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_1_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_1_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_1_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_2_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_2_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_2_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_3_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_3_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_3_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_4_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_4_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_4_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_5_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_5_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_5_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_6_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_6_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_6_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_7_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_7_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_7_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_8_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_8_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_8_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_9_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_9_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_9_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_10_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_10_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_10_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_11_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_11_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_11_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_12_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_12_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_12_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_13_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_13_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_13_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_14_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_14_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_14_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_15_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_15_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_15_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_16_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_16_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_16_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_17_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_17_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_17_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_18_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_18_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_18_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_19_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_19_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_19_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_20_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_20_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_20_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_21_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_21_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_21_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_22_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_22_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_22_2 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_23_2;
  #pragma HLS STREAM variable=fifo_C_drain_PE_23_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_23_2 core=FIFO_SRL
  /*
   * C-drain PE FIFOs fifo_C_drain_PE_<i>_3 for i = 0..23 (24 streams).
   * Each stream carries `char` elements; its STREAM pragma sets depth=2 and
   * its RESOURCE pragma selects the FIFO_SRL core.
   * NOTE(review): whether the first or the second index is the PE row is not
   * visible here -- confirm against the module calls that use these streams.
   */
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_0_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_0_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_0_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_1_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_1_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_1_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_2_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_2_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_2_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_3_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_3_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_3_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_4_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_4_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_4_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_5_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_5_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_5_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_6_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_6_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_6_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_7_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_7_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_7_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_8_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_8_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_8_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_9_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_9_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_9_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_10_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_10_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_10_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_11_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_11_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_11_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_12_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_12_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_12_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_13_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_13_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_13_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_14_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_14_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_14_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_15_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_15_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_15_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_16_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_16_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_16_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_17_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_17_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_17_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_18_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_18_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_18_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_19_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_19_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_19_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_20_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_20_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_20_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_21_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_21_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_21_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_22_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_22_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_22_3 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_23_3;
  #pragma HLS STREAM variable=fifo_C_drain_PE_23_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_23_3 core=FIFO_SRL
  /*
   * C-drain PE FIFOs fifo_C_drain_PE_<i>_4 for i = 0..23 (24 streams).
   * Each stream carries `char` elements; its STREAM pragma sets depth=2 and
   * its RESOURCE pragma selects the FIFO_SRL core.
   * NOTE(review): whether the first or the second index is the PE row is not
   * visible here -- confirm against the module calls that use these streams.
   */
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_0_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_0_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_0_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_1_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_1_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_1_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_2_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_2_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_2_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_3_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_3_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_3_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_4_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_4_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_4_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_5_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_5_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_5_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_6_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_6_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_6_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_7_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_7_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_7_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_8_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_8_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_8_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_9_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_9_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_9_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_10_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_10_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_10_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_11_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_11_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_11_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_12_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_12_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_12_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_13_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_13_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_13_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_14_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_14_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_14_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_15_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_15_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_15_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_16_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_16_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_16_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_17_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_17_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_17_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_18_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_18_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_18_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_19_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_19_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_19_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_20_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_20_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_20_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_21_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_21_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_21_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_22_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_22_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_22_4 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_23_4;
  #pragma HLS STREAM variable=fifo_C_drain_PE_23_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_23_4 core=FIFO_SRL
  /*
   * C-drain PE FIFOs fifo_C_drain_PE_<i>_5 for i = 0..23 (24 streams).
   * Each stream carries `char` elements; its STREAM pragma sets depth=2 and
   * its RESOURCE pragma selects the FIFO_SRL core.
   * NOTE(review): whether the first or the second index is the PE row is not
   * visible here -- confirm against the module calls that use these streams.
   */
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_0_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_0_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_0_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_1_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_1_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_1_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_2_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_2_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_2_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_3_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_3_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_3_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_4_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_4_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_4_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_5_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_5_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_5_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_6_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_6_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_6_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_7_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_7_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_7_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_8_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_8_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_8_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_9_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_9_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_9_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_10_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_10_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_10_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_11_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_11_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_11_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_12_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_12_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_12_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_13_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_13_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_13_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_14_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_14_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_14_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_15_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_15_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_15_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_16_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_16_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_16_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_17_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_17_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_17_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_18_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_18_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_18_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_19_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_19_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_19_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_20_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_20_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_20_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_21_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_21_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_21_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_22_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_22_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_22_5 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_23_5;
  #pragma HLS STREAM variable=fifo_C_drain_PE_23_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_23_5 core=FIFO_SRL
  /*
   * C-drain PE FIFOs fifo_C_drain_PE_<i>_6 for i = 0..23 (24 streams).
   * Each stream carries `char` elements; its STREAM pragma sets depth=2 and
   * its RESOURCE pragma selects the FIFO_SRL core.
   * NOTE(review): whether the first or the second index is the PE row is not
   * visible here -- confirm against the module calls that use these streams.
   */
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_0_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_0_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_0_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_1_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_1_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_1_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_2_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_2_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_2_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_3_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_3_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_3_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_4_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_4_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_4_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_5_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_5_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_5_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_6_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_6_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_6_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_7_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_7_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_7_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_8_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_8_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_8_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_9_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_9_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_9_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_10_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_10_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_10_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_11_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_11_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_11_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_12_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_12_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_12_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_13_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_13_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_13_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_14_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_14_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_14_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_15_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_15_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_15_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_16_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_16_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_16_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_17_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_17_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_17_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_18_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_18_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_18_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_19_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_19_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_19_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_20_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_20_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_20_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_21_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_21_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_21_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_22_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_22_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_22_6 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_23_6;
  #pragma HLS STREAM variable=fifo_C_drain_PE_23_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_23_6 core=FIFO_SRL
  /*
   * C-drain PE FIFOs fifo_C_drain_PE_<i>_7 for i = 0..23 (24 streams).
   * Each stream carries `char` elements; its STREAM pragma sets depth=2 and
   * its RESOURCE pragma selects the FIFO_SRL core.
   * NOTE(review): whether the first or the second index is the PE row is not
   * visible here -- confirm against the module calls that use these streams.
   */
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_0_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_0_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_0_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_1_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_1_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_1_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_2_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_2_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_2_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_3_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_3_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_3_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_4_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_4_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_4_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_5_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_5_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_5_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_6_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_6_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_6_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_7_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_7_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_7_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_8_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_8_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_8_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_9_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_9_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_9_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_10_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_10_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_10_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_11_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_11_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_11_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_12_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_12_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_12_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_13_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_13_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_13_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_14_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_14_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_14_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_15_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_15_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_15_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_16_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_16_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_16_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_17_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_17_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_17_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_18_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_18_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_18_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_19_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_19_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_19_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_20_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_20_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_20_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_21_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_21_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_21_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_22_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_22_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_22_7 core=FIFO_SRL
  /* PE fifo */ hls::stream<char> fifo_C_drain_PE_23_7;
  #pragma HLS STREAM variable=fifo_C_drain_PE_23_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_PE_23_7 core=FIFO_SRL
  /*
   * C_drain_IO_L1_out FIFOs fifo_C_drain_C_drain_IO_L1_out_0_<k> for
   * k = 0..24 (25 streams). Each stream carries C_t32 elements; its STREAM
   * pragma sets depth=2 and its RESOURCE pragma selects the FIFO_SRL core.
   * NOTE(review): C_t32 is declared outside this part of the file, and the
   * reason there are 25 streams here versus 24 values of the first PE FIFO
   * index is not visible -- presumably one extra stream terminates the
   * chain; confirm against the C_drain_IO_L1_out module calls.
   */
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_0;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_0 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_1;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_1 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_2;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_2 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_3;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_3 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_4;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_4 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_5;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_5 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_6;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_6 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_7;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_7 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_8;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_8 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_8 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_9;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_9 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_9 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_10;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_10 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_10 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_11;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_11 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_11 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_12;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_12 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_12 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_13;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_13 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_13 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_14;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_14 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_14 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_15;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_15 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_15 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_16;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_16 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_16 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_17;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_17 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_17 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_18;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_18 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_18 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_19;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_19 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_19 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_20;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_20 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_20 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_21;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_21 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_21 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_22;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_22 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_22 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_23;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_23 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_23 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_0_24;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_0_24 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_0_24 core=FIFO_SRL
  /* FIFO channels fifo_C_drain_C_drain_IO_L1_out_1_0 .. _1_24.
   * Every channel is an hls::stream<C_t32> given depth 2 by its STREAM pragma
   * and tied to the FIFO_SRL core by its RESOURCE pragma.
   * NOTE(review): the two trailing indices presumably encode the position of
   * the C_drain_IO_L1_out module in the array -- confirm against the generator. */
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_0;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_0 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_1;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_1 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_2;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_2 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_3;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_3 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_4;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_4 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_5;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_5 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_6;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_6 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_7;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_7 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_8;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_8 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_8 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_9;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_9 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_9 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_10;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_10 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_10 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_11;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_11 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_11 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_12;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_12 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_12 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_13;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_13 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_13 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_14;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_14 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_14 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_15;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_15 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_15 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_16;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_16 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_16 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_17;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_17 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_17 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_18;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_18 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_18 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_19;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_19 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_19 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_20;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_20 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_20 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_21;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_21 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_21 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_22;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_22 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_22 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_23;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_23 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_23 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_1_24;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_1_24 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_1_24 core=FIFO_SRL
  /* FIFO channels fifo_C_drain_C_drain_IO_L1_out_2_0 .. _2_24.
   * Every channel is an hls::stream<C_t32> given depth 2 by its STREAM pragma
   * and tied to the FIFO_SRL core by its RESOURCE pragma.
   * NOTE(review): the two trailing indices presumably encode the position of
   * the C_drain_IO_L1_out module in the array -- confirm against the generator. */
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_0;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_0 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_1;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_1 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_2;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_2 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_3;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_3 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_4;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_4 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_5;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_5 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_6;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_6 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_7;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_7 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_8;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_8 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_8 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_9;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_9 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_9 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_10;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_10 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_10 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_11;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_11 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_11 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_12;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_12 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_12 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_13;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_13 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_13 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_14;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_14 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_14 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_15;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_15 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_15 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_16;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_16 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_16 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_17;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_17 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_17 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_18;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_18 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_18 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_19;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_19 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_19 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_20;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_20 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_20 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_21;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_21 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_21 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_22;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_22 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_22 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_23;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_23 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_23 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_2_24;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_2_24 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_2_24 core=FIFO_SRL
  /* FIFO channels fifo_C_drain_C_drain_IO_L1_out_3_0 .. _3_24.
   * Every channel is an hls::stream<C_t32> given depth 2 by its STREAM pragma
   * and tied to the FIFO_SRL core by its RESOURCE pragma.
   * NOTE(review): the two trailing indices presumably encode the position of
   * the C_drain_IO_L1_out module in the array -- confirm against the generator. */
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_0;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_0 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_1;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_1 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_2;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_2 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_3;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_3 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_4;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_4 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_5;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_5 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_6;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_6 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_7;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_7 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_8;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_8 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_8 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_9;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_9 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_9 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_10;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_10 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_10 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_11;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_11 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_11 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_12;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_12 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_12 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_13;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_13 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_13 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_14;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_14 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_14 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_15;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_15 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_15 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_16;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_16 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_16 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_17;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_17 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_17 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_18;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_18 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_18 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_19;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_19 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_19 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_20;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_20 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_20 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_21;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_21 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_21 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_22;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_22 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_22 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_23;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_23 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_23 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_3_24;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_3_24 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_3_24 core=FIFO_SRL
  /* FIFO channels fifo_C_drain_C_drain_IO_L1_out_4_0 .. _4_24.
   * Every channel is an hls::stream<C_t32> given depth 2 by its STREAM pragma
   * and tied to the FIFO_SRL core by its RESOURCE pragma.
   * NOTE(review): the two trailing indices presumably encode the position of
   * the C_drain_IO_L1_out module in the array -- confirm against the generator. */
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_0;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_0 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_1;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_1 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_2;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_2 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_3;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_3 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_4;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_4 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_5;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_5 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_6;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_6 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_7;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_7 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_8;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_8 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_8 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_9;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_9 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_9 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_10;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_10 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_10 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_11;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_11 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_11 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_12;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_12 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_12 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_13;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_13 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_13 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_14;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_14 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_14 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_15;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_15 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_15 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_16;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_16 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_16 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_17;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_17 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_17 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_18;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_18 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_18 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_19;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_19 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_19 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_20;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_20 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_20 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_21;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_21 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_21 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_22;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_22 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_22 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_23;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_23 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_23 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_4_24;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_4_24 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_4_24 core=FIFO_SRL
  /* FIFO channels fifo_C_drain_C_drain_IO_L1_out_5_0 .. _5_24.
   * Every channel is an hls::stream<C_t32> given depth 2 by its STREAM pragma
   * and tied to the FIFO_SRL core by its RESOURCE pragma.
   * NOTE(review): the two trailing indices presumably encode the position of
   * the C_drain_IO_L1_out module in the array -- confirm against the generator. */
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_0;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_0 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_1;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_1 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_2;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_2 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_3;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_3 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_4;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_4 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_5;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_5 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_6;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_6 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_7;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_7 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_8;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_8 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_8 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_9;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_9 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_9 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_10;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_10 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_10 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_11;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_11 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_11 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_12;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_12 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_12 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_13;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_13 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_13 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_14;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_14 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_14 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_15;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_15 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_15 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_16;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_16 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_16 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_17;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_17 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_17 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_18;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_18 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_18 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_19;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_19 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_19 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_20;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_20 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_20 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_21;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_21 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_21 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_22;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_22 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_22 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_23;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_23 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_23 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_5_24;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_5_24 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_5_24 core=FIFO_SRL
  /*
   * Streams fifo_C_drain_C_drain_IO_L1_out_6_0 .. _6_24: 25 hls::stream<C_t32>
   * objects. Each one is given a STREAM depth of 2 and a RESOURCE binding to
   * the FIFO_SRL core.
   * NOTE(review): the first index (6) presumably selects one drain chain and
   * the second index the position along it; confirm against the
   * C_drain_IO_L1_out module calls, which are not in this part of the file.
   */
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_0;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_0 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_1;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_1 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_2;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_2 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_3;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_3 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_4;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_4 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_5;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_5 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_6;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_6 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_7;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_7 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_8;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_8 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_8 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_9;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_9 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_9 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_10;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_10 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_10 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_11;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_11 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_11 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_12;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_12 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_12 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_13;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_13 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_13 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_14;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_14 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_14 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_15;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_15 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_15 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_16;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_16 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_16 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_17;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_17 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_17 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_18;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_18 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_18 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_19;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_19 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_19 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_20;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_20 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_20 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_21;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_21 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_21 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_22;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_22 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_22 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_23;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_23 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_23 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_6_24;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_6_24 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_6_24 core=FIFO_SRL
  /*
   * Streams fifo_C_drain_C_drain_IO_L1_out_7_0 .. _7_24: 25 hls::stream<C_t32>
   * objects. Each one is given a STREAM depth of 2 and a RESOURCE binding to
   * the FIFO_SRL core.
   * NOTE(review): the first index (7) presumably selects one drain chain and
   * the second index the position along it; confirm against the
   * C_drain_IO_L1_out module calls, which are not in this part of the file.
   */
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_0;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_0 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_1;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_1 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_2;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_2 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_3;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_3 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_4;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_4 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_5;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_5 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_6;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_6 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_7;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_7 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_8;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_8 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_8 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_9;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_9 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_9 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_10;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_10 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_10 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_11;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_11 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_11 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_12;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_12 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_12 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_13;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_13 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_13 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_14;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_14 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_14 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_15;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_15 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_15 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_16;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_16 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_16 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_17;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_17 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_17 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_18;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_18 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_18 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_19;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_19 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_19 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_20;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_20 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_20 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_21;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_21 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_21 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_22;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_22 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_22 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_23;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_23 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_23 core=FIFO_SRL
  /* C_drain_IO_L1_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L1_out_7_24;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L1_out_7_24 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L1_out_7_24 core=FIFO_SRL
  /*
   * Streams fifo_C_drain_C_drain_IO_L2_out_0 .. _8: nine hls::stream<C_t32>
   * objects, one more than the eight first-index values (0..7) used by the
   * C_drain_IO_L1_out streams declared above. Each one is given a STREAM depth
   * of 2 and a RESOURCE binding to the FIFO_SRL core.
   */
  /* C_drain_IO_L2_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L2_out_0;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L2_out_0 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L2_out_0 core=FIFO_SRL
  /* C_drain_IO_L2_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L2_out_1;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L2_out_1 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L2_out_1 core=FIFO_SRL
  /* C_drain_IO_L2_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L2_out_2;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L2_out_2 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L2_out_2 core=FIFO_SRL
  /* C_drain_IO_L2_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L2_out_3;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L2_out_3 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L2_out_3 core=FIFO_SRL
  /* C_drain_IO_L2_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L2_out_4;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L2_out_4 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L2_out_4 core=FIFO_SRL
  /* C_drain_IO_L2_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L2_out_5;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L2_out_5 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L2_out_5 core=FIFO_SRL
  /* C_drain_IO_L2_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L2_out_6;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L2_out_6 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L2_out_6 core=FIFO_SRL
  /* C_drain_IO_L2_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L2_out_7;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L2_out_7 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L2_out_7 core=FIFO_SRL
  /* C_drain_IO_L2_out fifo */ hls::stream<C_t32> fifo_C_drain_C_drain_IO_L2_out_8;
  #pragma HLS STREAM variable=fifo_C_drain_C_drain_IO_L2_out_8 depth=2
  #pragma HLS RESOURCE variable=fifo_C_drain_C_drain_IO_L2_out_8 core=FIFO_SRL
  /* FIFO Declaration */

  /*
   * Head of the A path: A_IO_L3_in_serialize is passed the array A and
   * fifo_A_A_IO_L3_in_serialize; A_IO_L3_in is then passed that same stream
   * together with fifo_A_A_IO_L2_in_0, the first stream of the A_IO_L2_in
   * chain instantiated below.
   * NOTE(review): read/write direction of each stream is fixed by the module
   * definitions, which are not in this part of the file.
   */
  /* Module Call */
  A_IO_L3_in_serialize(
    /* array */ A,
    /* fifo */ fifo_A_A_IO_L3_in_serialize
  );
  /* Module Call */

  /* Module Call */
  A_IO_L3_in(
    /* fifo */ fifo_A_A_IO_L3_in_serialize,
    /* fifo */ fifo_A_A_IO_L2_in_0
  );
  /* Module Call */

  /*
   * A_IO_L2_in chain: 23 A_IO_L2_in calls (module ids 0..22) followed by one
   * A_IO_L2_in_boundary call (module id 23).
   * The call with module id k is passed, in order, fifo_A_A_IO_L2_in_k,
   * fifo_A_A_IO_L2_in_(k+1) and fifo_A_PE_k_0. The boundary call is passed only
   * fifo_A_A_IO_L2_in_23 and fifo_A_PE_23_0, i.e. it has no onward L2 stream.
   * NOTE(review): the argument order suggests (stream from previous module,
   * stream to next module, stream to the PE array); confirm against the
   * A_IO_L2_in definition.
   */
  /* Module Call */
  A_IO_L2_in(
    /* module id */ 0,
    /* fifo */ fifo_A_A_IO_L2_in_0,
    /* fifo */ fifo_A_A_IO_L2_in_1,
    /* fifo */ fifo_A_PE_0_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 1,
    /* fifo */ fifo_A_A_IO_L2_in_1,
    /* fifo */ fifo_A_A_IO_L2_in_2,
    /* fifo */ fifo_A_PE_1_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 2,
    /* fifo */ fifo_A_A_IO_L2_in_2,
    /* fifo */ fifo_A_A_IO_L2_in_3,
    /* fifo */ fifo_A_PE_2_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 3,
    /* fifo */ fifo_A_A_IO_L2_in_3,
    /* fifo */ fifo_A_A_IO_L2_in_4,
    /* fifo */ fifo_A_PE_3_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 4,
    /* fifo */ fifo_A_A_IO_L2_in_4,
    /* fifo */ fifo_A_A_IO_L2_in_5,
    /* fifo */ fifo_A_PE_4_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 5,
    /* fifo */ fifo_A_A_IO_L2_in_5,
    /* fifo */ fifo_A_A_IO_L2_in_6,
    /* fifo */ fifo_A_PE_5_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 6,
    /* fifo */ fifo_A_A_IO_L2_in_6,
    /* fifo */ fifo_A_A_IO_L2_in_7,
    /* fifo */ fifo_A_PE_6_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 7,
    /* fifo */ fifo_A_A_IO_L2_in_7,
    /* fifo */ fifo_A_A_IO_L2_in_8,
    /* fifo */ fifo_A_PE_7_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 8,
    /* fifo */ fifo_A_A_IO_L2_in_8,
    /* fifo */ fifo_A_A_IO_L2_in_9,
    /* fifo */ fifo_A_PE_8_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 9,
    /* fifo */ fifo_A_A_IO_L2_in_9,
    /* fifo */ fifo_A_A_IO_L2_in_10,
    /* fifo */ fifo_A_PE_9_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 10,
    /* fifo */ fifo_A_A_IO_L2_in_10,
    /* fifo */ fifo_A_A_IO_L2_in_11,
    /* fifo */ fifo_A_PE_10_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 11,
    /* fifo */ fifo_A_A_IO_L2_in_11,
    /* fifo */ fifo_A_A_IO_L2_in_12,
    /* fifo */ fifo_A_PE_11_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 12,
    /* fifo */ fifo_A_A_IO_L2_in_12,
    /* fifo */ fifo_A_A_IO_L2_in_13,
    /* fifo */ fifo_A_PE_12_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 13,
    /* fifo */ fifo_A_A_IO_L2_in_13,
    /* fifo */ fifo_A_A_IO_L2_in_14,
    /* fifo */ fifo_A_PE_13_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 14,
    /* fifo */ fifo_A_A_IO_L2_in_14,
    /* fifo */ fifo_A_A_IO_L2_in_15,
    /* fifo */ fifo_A_PE_14_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 15,
    /* fifo */ fifo_A_A_IO_L2_in_15,
    /* fifo */ fifo_A_A_IO_L2_in_16,
    /* fifo */ fifo_A_PE_15_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 16,
    /* fifo */ fifo_A_A_IO_L2_in_16,
    /* fifo */ fifo_A_A_IO_L2_in_17,
    /* fifo */ fifo_A_PE_16_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 17,
    /* fifo */ fifo_A_A_IO_L2_in_17,
    /* fifo */ fifo_A_A_IO_L2_in_18,
    /* fifo */ fifo_A_PE_17_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 18,
    /* fifo */ fifo_A_A_IO_L2_in_18,
    /* fifo */ fifo_A_A_IO_L2_in_19,
    /* fifo */ fifo_A_PE_18_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 19,
    /* fifo */ fifo_A_A_IO_L2_in_19,
    /* fifo */ fifo_A_A_IO_L2_in_20,
    /* fifo */ fifo_A_PE_19_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 20,
    /* fifo */ fifo_A_A_IO_L2_in_20,
    /* fifo */ fifo_A_A_IO_L2_in_21,
    /* fifo */ fifo_A_PE_20_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 21,
    /* fifo */ fifo_A_A_IO_L2_in_21,
    /* fifo */ fifo_A_A_IO_L2_in_22,
    /* fifo */ fifo_A_PE_21_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in(
    /* module id */ 22,
    /* fifo */ fifo_A_A_IO_L2_in_22,
    /* fifo */ fifo_A_A_IO_L2_in_23,
    /* fifo */ fifo_A_PE_22_0
  );
  /* Module Call */

  /* Module Call */
  A_IO_L2_in_boundary(
    /* module id */ 23,
    /* fifo */ fifo_A_A_IO_L2_in_23,
    /* fifo */ fifo_A_PE_23_0
  );
  /* Module Call */

  /*
   * Head of the B path: B_IO_L3_in_serialize is passed the array B and
   * fifo_B_B_IO_L3_in_serialize; B_IO_L3_in is then passed that same stream
   * together with fifo_B_B_IO_L2_in_0, the first stream of the B_IO_L2_in
   * chain instantiated below.
   * NOTE(review): read/write direction of each stream is fixed by the module
   * definitions, which are not in this part of the file.
   */
  /* Module Call */
  B_IO_L3_in_serialize(
    /* array */ B,
    /* fifo */ fifo_B_B_IO_L3_in_serialize
  );
  /* Module Call */

  /* Module Call */
  B_IO_L3_in(
    /* fifo */ fifo_B_B_IO_L3_in_serialize,
    /* fifo */ fifo_B_B_IO_L2_in_0
  );
  /* Module Call */

  /*
   * B_IO_L2_in chain: seven B_IO_L2_in calls (module ids 0..6) followed by one
   * B_IO_L2_in_boundary call (module id 7).
   * The call with module id k is passed, in order, fifo_B_B_IO_L2_in_k,
   * fifo_B_B_IO_L2_in_(k+1) and fifo_B_PE_0_k. The boundary call is passed only
   * fifo_B_B_IO_L2_in_7 and fifo_B_PE_0_7, i.e. it has no onward L2 stream.
   * Note the PE stream index layout differs from the A chain: here the module
   * id is the second index (fifo_B_PE_0_k), whereas A uses fifo_A_PE_k_0.
   * NOTE(review): the argument order suggests (stream from previous module,
   * stream to next module, stream to the PE array); confirm against the
   * B_IO_L2_in definition.
   */
  /* Module Call */
  B_IO_L2_in(
    /* module id */ 0,
    /* fifo */ fifo_B_B_IO_L2_in_0,
    /* fifo */ fifo_B_B_IO_L2_in_1,
    /* fifo */ fifo_B_PE_0_0
  );
  /* Module Call */

  /* Module Call */
  B_IO_L2_in(
    /* module id */ 1,
    /* fifo */ fifo_B_B_IO_L2_in_1,
    /* fifo */ fifo_B_B_IO_L2_in_2,
    /* fifo */ fifo_B_PE_0_1
  );
  /* Module Call */

  /* Module Call */
  B_IO_L2_in(
    /* module id */ 2,
    /* fifo */ fifo_B_B_IO_L2_in_2,
    /* fifo */ fifo_B_B_IO_L2_in_3,
    /* fifo */ fifo_B_PE_0_2
  );
  /* Module Call */

  /* Module Call */
  B_IO_L2_in(
    /* module id */ 3,
    /* fifo */ fifo_B_B_IO_L2_in_3,
    /* fifo */ fifo_B_B_IO_L2_in_4,
    /* fifo */ fifo_B_PE_0_3
  );
  /* Module Call */

  /* Module Call */
  B_IO_L2_in(
    /* module id */ 4,
    /* fifo */ fifo_B_B_IO_L2_in_4,
    /* fifo */ fifo_B_B_IO_L2_in_5,
    /* fifo */ fifo_B_PE_0_4
  );
  /* Module Call */

  /* Module Call */
  B_IO_L2_in(
    /* module id */ 5,
    /* fifo */ fifo_B_B_IO_L2_in_5,
    /* fifo */ fifo_B_B_IO_L2_in_6,
    /* fifo */ fifo_B_PE_0_5
  );
  /* Module Call */

  /* Module Call */
  B_IO_L2_in(
    /* module id */ 6,
    /* fifo */ fifo_B_B_IO_L2_in_6,
    /* fifo */ fifo_B_B_IO_L2_in_7,
    /* fifo */ fifo_B_PE_0_6
  );
  /* Module Call */

  /* Module Call */
  B_IO_L2_in_boundary(
    /* module id */ 7,
    /* fifo */ fifo_B_B_IO_L2_in_7,
    /* fifo */ fifo_B_PE_0_7
  );
  /* Module Call */

  /*
   * PE_wrapper calls whose first module id is 0; the second module id c runs
   * 0..7. The call (0, c) is passed, in order: fifo_A_PE_0_c, fifo_A_PE_0_(c+1),
   * fifo_B_PE_0_c, fifo_B_PE_1_c and fifo_C_drain_PE_0_c.
   * fifo_A_PE_0_0 is also passed to A_IO_L2_in module 0, and fifo_B_PE_0_c to
   * the B_IO_L2_in module with id c.
   * NOTE(review): the naming suggests A is handed on along the second index and
   * B along the first; which stream of each pair is read and which is written
   * is fixed by PE_wrapper's definition, not visible here.
   */
  /* Module Call */
  PE_wrapper(
    /* module id */ 0,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_0_0,
    /* fifo */ fifo_A_PE_0_1,
    /* fifo */ fifo_B_PE_0_0,
    /* fifo */ fifo_B_PE_1_0,
    /* fifo */ fifo_C_drain_PE_0_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 0,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_0_1,
    /* fifo */ fifo_A_PE_0_2,
    /* fifo */ fifo_B_PE_0_1,
    /* fifo */ fifo_B_PE_1_1,
    /* fifo */ fifo_C_drain_PE_0_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 0,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_0_2,
    /* fifo */ fifo_A_PE_0_3,
    /* fifo */ fifo_B_PE_0_2,
    /* fifo */ fifo_B_PE_1_2,
    /* fifo */ fifo_C_drain_PE_0_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 0,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_0_3,
    /* fifo */ fifo_A_PE_0_4,
    /* fifo */ fifo_B_PE_0_3,
    /* fifo */ fifo_B_PE_1_3,
    /* fifo */ fifo_C_drain_PE_0_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 0,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_0_4,
    /* fifo */ fifo_A_PE_0_5,
    /* fifo */ fifo_B_PE_0_4,
    /* fifo */ fifo_B_PE_1_4,
    /* fifo */ fifo_C_drain_PE_0_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 0,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_0_5,
    /* fifo */ fifo_A_PE_0_6,
    /* fifo */ fifo_B_PE_0_5,
    /* fifo */ fifo_B_PE_1_5,
    /* fifo */ fifo_C_drain_PE_0_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 0,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_0_6,
    /* fifo */ fifo_A_PE_0_7,
    /* fifo */ fifo_B_PE_0_6,
    /* fifo */ fifo_B_PE_1_6,
    /* fifo */ fifo_C_drain_PE_0_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 0,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_0_7,
    /* fifo */ fifo_A_PE_0_8,
    /* fifo */ fifo_B_PE_0_7,
    /* fifo */ fifo_B_PE_1_7,
    /* fifo */ fifo_C_drain_PE_0_7
  );
  /* Module Call */

  /*
   * PE_wrapper calls whose first module id is 1; the second module id c runs
   * 0..7. The call (1, c) is passed, in order: fifo_A_PE_1_c, fifo_A_PE_1_(c+1),
   * fifo_B_PE_1_c, fifo_B_PE_2_c and fifo_C_drain_PE_1_c.
   * fifo_A_PE_1_0 is also passed to A_IO_L2_in module 1; fifo_B_PE_1_c is also
   * the fourth stream argument of the PE_wrapper call (0, c).
   * NOTE(review): stream directions are fixed by PE_wrapper's definition, not
   * visible here.
   */
  /* Module Call */
  PE_wrapper(
    /* module id */ 1,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_1_0,
    /* fifo */ fifo_A_PE_1_1,
    /* fifo */ fifo_B_PE_1_0,
    /* fifo */ fifo_B_PE_2_0,
    /* fifo */ fifo_C_drain_PE_1_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 1,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_1_1,
    /* fifo */ fifo_A_PE_1_2,
    /* fifo */ fifo_B_PE_1_1,
    /* fifo */ fifo_B_PE_2_1,
    /* fifo */ fifo_C_drain_PE_1_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 1,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_1_2,
    /* fifo */ fifo_A_PE_1_3,
    /* fifo */ fifo_B_PE_1_2,
    /* fifo */ fifo_B_PE_2_2,
    /* fifo */ fifo_C_drain_PE_1_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 1,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_1_3,
    /* fifo */ fifo_A_PE_1_4,
    /* fifo */ fifo_B_PE_1_3,
    /* fifo */ fifo_B_PE_2_3,
    /* fifo */ fifo_C_drain_PE_1_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 1,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_1_4,
    /* fifo */ fifo_A_PE_1_5,
    /* fifo */ fifo_B_PE_1_4,
    /* fifo */ fifo_B_PE_2_4,
    /* fifo */ fifo_C_drain_PE_1_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 1,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_1_5,
    /* fifo */ fifo_A_PE_1_6,
    /* fifo */ fifo_B_PE_1_5,
    /* fifo */ fifo_B_PE_2_5,
    /* fifo */ fifo_C_drain_PE_1_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 1,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_1_6,
    /* fifo */ fifo_A_PE_1_7,
    /* fifo */ fifo_B_PE_1_6,
    /* fifo */ fifo_B_PE_2_6,
    /* fifo */ fifo_C_drain_PE_1_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 1,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_1_7,
    /* fifo */ fifo_A_PE_1_8,
    /* fifo */ fifo_B_PE_1_7,
    /* fifo */ fifo_B_PE_2_7,
    /* fifo */ fifo_C_drain_PE_1_7
  );
  /* Module Call */

  /*
   * PE_wrapper calls whose first module id is 2; the second module id c runs
   * 0..7. The call (2, c) is passed, in order: fifo_A_PE_2_c, fifo_A_PE_2_(c+1),
   * fifo_B_PE_2_c, fifo_B_PE_3_c and fifo_C_drain_PE_2_c.
   * fifo_A_PE_2_0 is also passed to A_IO_L2_in module 2; fifo_B_PE_2_c is also
   * the fourth stream argument of the PE_wrapper call (1, c).
   * NOTE(review): stream directions are fixed by PE_wrapper's definition, not
   * visible here.
   */
  /* Module Call */
  PE_wrapper(
    /* module id */ 2,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_2_0,
    /* fifo */ fifo_A_PE_2_1,
    /* fifo */ fifo_B_PE_2_0,
    /* fifo */ fifo_B_PE_3_0,
    /* fifo */ fifo_C_drain_PE_2_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 2,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_2_1,
    /* fifo */ fifo_A_PE_2_2,
    /* fifo */ fifo_B_PE_2_1,
    /* fifo */ fifo_B_PE_3_1,
    /* fifo */ fifo_C_drain_PE_2_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 2,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_2_2,
    /* fifo */ fifo_A_PE_2_3,
    /* fifo */ fifo_B_PE_2_2,
    /* fifo */ fifo_B_PE_3_2,
    /* fifo */ fifo_C_drain_PE_2_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 2,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_2_3,
    /* fifo */ fifo_A_PE_2_4,
    /* fifo */ fifo_B_PE_2_3,
    /* fifo */ fifo_B_PE_3_3,
    /* fifo */ fifo_C_drain_PE_2_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 2,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_2_4,
    /* fifo */ fifo_A_PE_2_5,
    /* fifo */ fifo_B_PE_2_4,
    /* fifo */ fifo_B_PE_3_4,
    /* fifo */ fifo_C_drain_PE_2_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 2,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_2_5,
    /* fifo */ fifo_A_PE_2_6,
    /* fifo */ fifo_B_PE_2_5,
    /* fifo */ fifo_B_PE_3_5,
    /* fifo */ fifo_C_drain_PE_2_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 2,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_2_6,
    /* fifo */ fifo_A_PE_2_7,
    /* fifo */ fifo_B_PE_2_6,
    /* fifo */ fifo_B_PE_3_6,
    /* fifo */ fifo_C_drain_PE_2_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 2,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_2_7,
    /* fifo */ fifo_A_PE_2_8,
    /* fifo */ fifo_B_PE_2_7,
    /* fifo */ fifo_B_PE_3_7,
    /* fifo */ fifo_C_drain_PE_2_7
  );
  /* Module Call */

  /*
   * PE_wrapper calls whose first module id is 3; the second module id c runs
   * 0..7. The call (3, c) is passed, in order: fifo_A_PE_3_c, fifo_A_PE_3_(c+1),
   * fifo_B_PE_3_c, fifo_B_PE_4_c and fifo_C_drain_PE_3_c.
   * fifo_A_PE_3_0 is also passed to A_IO_L2_in module 3; fifo_B_PE_3_c is also
   * the fourth stream argument of the PE_wrapper call (2, c).
   * NOTE(review): stream directions are fixed by PE_wrapper's definition, not
   * visible here.
   */
  /* Module Call */
  PE_wrapper(
    /* module id */ 3,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_3_0,
    /* fifo */ fifo_A_PE_3_1,
    /* fifo */ fifo_B_PE_3_0,
    /* fifo */ fifo_B_PE_4_0,
    /* fifo */ fifo_C_drain_PE_3_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 3,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_3_1,
    /* fifo */ fifo_A_PE_3_2,
    /* fifo */ fifo_B_PE_3_1,
    /* fifo */ fifo_B_PE_4_1,
    /* fifo */ fifo_C_drain_PE_3_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 3,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_3_2,
    /* fifo */ fifo_A_PE_3_3,
    /* fifo */ fifo_B_PE_3_2,
    /* fifo */ fifo_B_PE_4_2,
    /* fifo */ fifo_C_drain_PE_3_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 3,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_3_3,
    /* fifo */ fifo_A_PE_3_4,
    /* fifo */ fifo_B_PE_3_3,
    /* fifo */ fifo_B_PE_4_3,
    /* fifo */ fifo_C_drain_PE_3_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 3,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_3_4,
    /* fifo */ fifo_A_PE_3_5,
    /* fifo */ fifo_B_PE_3_4,
    /* fifo */ fifo_B_PE_4_4,
    /* fifo */ fifo_C_drain_PE_3_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 3,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_3_5,
    /* fifo */ fifo_A_PE_3_6,
    /* fifo */ fifo_B_PE_3_5,
    /* fifo */ fifo_B_PE_4_5,
    /* fifo */ fifo_C_drain_PE_3_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 3,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_3_6,
    /* fifo */ fifo_A_PE_3_7,
    /* fifo */ fifo_B_PE_3_6,
    /* fifo */ fifo_B_PE_4_6,
    /* fifo */ fifo_C_drain_PE_3_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 3,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_3_7,
    /* fifo */ fifo_A_PE_3_8,
    /* fifo */ fifo_B_PE_3_7,
    /* fifo */ fifo_B_PE_4_7,
    /* fifo */ fifo_C_drain_PE_3_7
  );
  /* Module Call */

  /*
   * PE_wrapper calls whose first module id is 4; the second module id c runs
   * 0..7. The call (4, c) is passed, in order: fifo_A_PE_4_c, fifo_A_PE_4_(c+1),
   * fifo_B_PE_4_c, fifo_B_PE_5_c and fifo_C_drain_PE_4_c.
   * fifo_A_PE_4_0 is also passed to A_IO_L2_in module 4; fifo_B_PE_4_c is also
   * the fourth stream argument of the PE_wrapper call (3, c).
   * NOTE(review): stream directions are fixed by PE_wrapper's definition, not
   * visible here.
   */
  /* Module Call */
  PE_wrapper(
    /* module id */ 4,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_4_0,
    /* fifo */ fifo_A_PE_4_1,
    /* fifo */ fifo_B_PE_4_0,
    /* fifo */ fifo_B_PE_5_0,
    /* fifo */ fifo_C_drain_PE_4_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 4,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_4_1,
    /* fifo */ fifo_A_PE_4_2,
    /* fifo */ fifo_B_PE_4_1,
    /* fifo */ fifo_B_PE_5_1,
    /* fifo */ fifo_C_drain_PE_4_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 4,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_4_2,
    /* fifo */ fifo_A_PE_4_3,
    /* fifo */ fifo_B_PE_4_2,
    /* fifo */ fifo_B_PE_5_2,
    /* fifo */ fifo_C_drain_PE_4_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 4,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_4_3,
    /* fifo */ fifo_A_PE_4_4,
    /* fifo */ fifo_B_PE_4_3,
    /* fifo */ fifo_B_PE_5_3,
    /* fifo */ fifo_C_drain_PE_4_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 4,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_4_4,
    /* fifo */ fifo_A_PE_4_5,
    /* fifo */ fifo_B_PE_4_4,
    /* fifo */ fifo_B_PE_5_4,
    /* fifo */ fifo_C_drain_PE_4_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 4,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_4_5,
    /* fifo */ fifo_A_PE_4_6,
    /* fifo */ fifo_B_PE_4_5,
    /* fifo */ fifo_B_PE_5_5,
    /* fifo */ fifo_C_drain_PE_4_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 4,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_4_6,
    /* fifo */ fifo_A_PE_4_7,
    /* fifo */ fifo_B_PE_4_6,
    /* fifo */ fifo_B_PE_5_6,
    /* fifo */ fifo_C_drain_PE_4_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 4,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_4_7,
    /* fifo */ fifo_A_PE_4_8,
    /* fifo */ fifo_B_PE_4_7,
    /* fifo */ fifo_B_PE_5_7,
    /* fifo */ fifo_C_drain_PE_4_7
  );
  /* Module Call */

  /*
   * PE_wrapper calls whose first module id is 5; the second module id c runs
   * 0..7. The call (5, c) is passed, in order: fifo_A_PE_5_c, fifo_A_PE_5_(c+1),
   * fifo_B_PE_5_c, fifo_B_PE_6_c and fifo_C_drain_PE_5_c.
   * fifo_A_PE_5_0 is also passed to A_IO_L2_in module 5; fifo_B_PE_5_c is also
   * the fourth stream argument of the PE_wrapper call (4, c).
   * NOTE(review): stream directions are fixed by PE_wrapper's definition, not
   * visible here.
   */
  /* Module Call */
  PE_wrapper(
    /* module id */ 5,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_5_0,
    /* fifo */ fifo_A_PE_5_1,
    /* fifo */ fifo_B_PE_5_0,
    /* fifo */ fifo_B_PE_6_0,
    /* fifo */ fifo_C_drain_PE_5_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 5,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_5_1,
    /* fifo */ fifo_A_PE_5_2,
    /* fifo */ fifo_B_PE_5_1,
    /* fifo */ fifo_B_PE_6_1,
    /* fifo */ fifo_C_drain_PE_5_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 5,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_5_2,
    /* fifo */ fifo_A_PE_5_3,
    /* fifo */ fifo_B_PE_5_2,
    /* fifo */ fifo_B_PE_6_2,
    /* fifo */ fifo_C_drain_PE_5_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 5,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_5_3,
    /* fifo */ fifo_A_PE_5_4,
    /* fifo */ fifo_B_PE_5_3,
    /* fifo */ fifo_B_PE_6_3,
    /* fifo */ fifo_C_drain_PE_5_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 5,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_5_4,
    /* fifo */ fifo_A_PE_5_5,
    /* fifo */ fifo_B_PE_5_4,
    /* fifo */ fifo_B_PE_6_4,
    /* fifo */ fifo_C_drain_PE_5_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 5,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_5_5,
    /* fifo */ fifo_A_PE_5_6,
    /* fifo */ fifo_B_PE_5_5,
    /* fifo */ fifo_B_PE_6_5,
    /* fifo */ fifo_C_drain_PE_5_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 5,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_5_6,
    /* fifo */ fifo_A_PE_5_7,
    /* fifo */ fifo_B_PE_5_6,
    /* fifo */ fifo_B_PE_6_6,
    /* fifo */ fifo_C_drain_PE_5_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 5,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_5_7,
    /* fifo */ fifo_A_PE_5_8,
    /* fifo */ fifo_B_PE_5_7,
    /* fifo */ fifo_B_PE_6_7,
    /* fifo */ fifo_C_drain_PE_5_7
  );
  /* Module Call */

  /*
   * PE_wrapper calls whose first module id is 6; the second module id c runs
   * 0..7. The call (6, c) is passed, in order: fifo_A_PE_6_c, fifo_A_PE_6_(c+1),
   * fifo_B_PE_6_c, fifo_B_PE_7_c and fifo_C_drain_PE_6_c.
   * fifo_A_PE_6_0 is also passed to A_IO_L2_in module 6; fifo_B_PE_6_c is also
   * the fourth stream argument of the PE_wrapper call (5, c).
   * NOTE(review): stream directions are fixed by PE_wrapper's definition, not
   * visible here.
   */
  /* Module Call */
  PE_wrapper(
    /* module id */ 6,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_6_0,
    /* fifo */ fifo_A_PE_6_1,
    /* fifo */ fifo_B_PE_6_0,
    /* fifo */ fifo_B_PE_7_0,
    /* fifo */ fifo_C_drain_PE_6_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 6,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_6_1,
    /* fifo */ fifo_A_PE_6_2,
    /* fifo */ fifo_B_PE_6_1,
    /* fifo */ fifo_B_PE_7_1,
    /* fifo */ fifo_C_drain_PE_6_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 6,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_6_2,
    /* fifo */ fifo_A_PE_6_3,
    /* fifo */ fifo_B_PE_6_2,
    /* fifo */ fifo_B_PE_7_2,
    /* fifo */ fifo_C_drain_PE_6_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 6,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_6_3,
    /* fifo */ fifo_A_PE_6_4,
    /* fifo */ fifo_B_PE_6_3,
    /* fifo */ fifo_B_PE_7_3,
    /* fifo */ fifo_C_drain_PE_6_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 6,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_6_4,
    /* fifo */ fifo_A_PE_6_5,
    /* fifo */ fifo_B_PE_6_4,
    /* fifo */ fifo_B_PE_7_4,
    /* fifo */ fifo_C_drain_PE_6_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 6,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_6_5,
    /* fifo */ fifo_A_PE_6_6,
    /* fifo */ fifo_B_PE_6_5,
    /* fifo */ fifo_B_PE_7_5,
    /* fifo */ fifo_C_drain_PE_6_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 6,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_6_6,
    /* fifo */ fifo_A_PE_6_7,
    /* fifo */ fifo_B_PE_6_6,
    /* fifo */ fifo_B_PE_7_6,
    /* fifo */ fifo_C_drain_PE_6_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 6,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_6_7,
    /* fifo */ fifo_A_PE_6_8,
    /* fifo */ fifo_B_PE_6_7,
    /* fifo */ fifo_B_PE_7_7,
    /* fifo */ fifo_C_drain_PE_6_7
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 7,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_7_0,
    /* fifo */ fifo_A_PE_7_1,
    /* fifo */ fifo_B_PE_7_0,
    /* fifo */ fifo_B_PE_8_0,
    /* fifo */ fifo_C_drain_PE_7_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 7,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_7_1,
    /* fifo */ fifo_A_PE_7_2,
    /* fifo */ fifo_B_PE_7_1,
    /* fifo */ fifo_B_PE_8_1,
    /* fifo */ fifo_C_drain_PE_7_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 7,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_7_2,
    /* fifo */ fifo_A_PE_7_3,
    /* fifo */ fifo_B_PE_7_2,
    /* fifo */ fifo_B_PE_8_2,
    /* fifo */ fifo_C_drain_PE_7_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 7,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_7_3,
    /* fifo */ fifo_A_PE_7_4,
    /* fifo */ fifo_B_PE_7_3,
    /* fifo */ fifo_B_PE_8_3,
    /* fifo */ fifo_C_drain_PE_7_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 7,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_7_4,
    /* fifo */ fifo_A_PE_7_5,
    /* fifo */ fifo_B_PE_7_4,
    /* fifo */ fifo_B_PE_8_4,
    /* fifo */ fifo_C_drain_PE_7_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 7,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_7_5,
    /* fifo */ fifo_A_PE_7_6,
    /* fifo */ fifo_B_PE_7_5,
    /* fifo */ fifo_B_PE_8_5,
    /* fifo */ fifo_C_drain_PE_7_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 7,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_7_6,
    /* fifo */ fifo_A_PE_7_7,
    /* fifo */ fifo_B_PE_7_6,
    /* fifo */ fifo_B_PE_8_6,
    /* fifo */ fifo_C_drain_PE_7_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 7,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_7_7,
    /* fifo */ fifo_A_PE_7_8,
    /* fifo */ fifo_B_PE_7_7,
    /* fifo */ fifo_B_PE_8_7,
    /* fifo */ fifo_C_drain_PE_7_7
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 8,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_8_0,
    /* fifo */ fifo_A_PE_8_1,
    /* fifo */ fifo_B_PE_8_0,
    /* fifo */ fifo_B_PE_9_0,
    /* fifo */ fifo_C_drain_PE_8_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 8,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_8_1,
    /* fifo */ fifo_A_PE_8_2,
    /* fifo */ fifo_B_PE_8_1,
    /* fifo */ fifo_B_PE_9_1,
    /* fifo */ fifo_C_drain_PE_8_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 8,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_8_2,
    /* fifo */ fifo_A_PE_8_3,
    /* fifo */ fifo_B_PE_8_2,
    /* fifo */ fifo_B_PE_9_2,
    /* fifo */ fifo_C_drain_PE_8_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 8,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_8_3,
    /* fifo */ fifo_A_PE_8_4,
    /* fifo */ fifo_B_PE_8_3,
    /* fifo */ fifo_B_PE_9_3,
    /* fifo */ fifo_C_drain_PE_8_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 8,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_8_4,
    /* fifo */ fifo_A_PE_8_5,
    /* fifo */ fifo_B_PE_8_4,
    /* fifo */ fifo_B_PE_9_4,
    /* fifo */ fifo_C_drain_PE_8_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 8,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_8_5,
    /* fifo */ fifo_A_PE_8_6,
    /* fifo */ fifo_B_PE_8_5,
    /* fifo */ fifo_B_PE_9_5,
    /* fifo */ fifo_C_drain_PE_8_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 8,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_8_6,
    /* fifo */ fifo_A_PE_8_7,
    /* fifo */ fifo_B_PE_8_6,
    /* fifo */ fifo_B_PE_9_6,
    /* fifo */ fifo_C_drain_PE_8_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 8,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_8_7,
    /* fifo */ fifo_A_PE_8_8,
    /* fifo */ fifo_B_PE_8_7,
    /* fifo */ fifo_B_PE_9_7,
    /* fifo */ fifo_C_drain_PE_8_7
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 9,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_9_0,
    /* fifo */ fifo_A_PE_9_1,
    /* fifo */ fifo_B_PE_9_0,
    /* fifo */ fifo_B_PE_10_0,
    /* fifo */ fifo_C_drain_PE_9_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 9,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_9_1,
    /* fifo */ fifo_A_PE_9_2,
    /* fifo */ fifo_B_PE_9_1,
    /* fifo */ fifo_B_PE_10_1,
    /* fifo */ fifo_C_drain_PE_9_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 9,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_9_2,
    /* fifo */ fifo_A_PE_9_3,
    /* fifo */ fifo_B_PE_9_2,
    /* fifo */ fifo_B_PE_10_2,
    /* fifo */ fifo_C_drain_PE_9_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 9,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_9_3,
    /* fifo */ fifo_A_PE_9_4,
    /* fifo */ fifo_B_PE_9_3,
    /* fifo */ fifo_B_PE_10_3,
    /* fifo */ fifo_C_drain_PE_9_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 9,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_9_4,
    /* fifo */ fifo_A_PE_9_5,
    /* fifo */ fifo_B_PE_9_4,
    /* fifo */ fifo_B_PE_10_4,
    /* fifo */ fifo_C_drain_PE_9_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 9,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_9_5,
    /* fifo */ fifo_A_PE_9_6,
    /* fifo */ fifo_B_PE_9_5,
    /* fifo */ fifo_B_PE_10_5,
    /* fifo */ fifo_C_drain_PE_9_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 9,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_9_6,
    /* fifo */ fifo_A_PE_9_7,
    /* fifo */ fifo_B_PE_9_6,
    /* fifo */ fifo_B_PE_10_6,
    /* fifo */ fifo_C_drain_PE_9_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 9,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_9_7,
    /* fifo */ fifo_A_PE_9_8,
    /* fifo */ fifo_B_PE_9_7,
    /* fifo */ fifo_B_PE_10_7,
    /* fifo */ fifo_C_drain_PE_9_7
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 10,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_10_0,
    /* fifo */ fifo_A_PE_10_1,
    /* fifo */ fifo_B_PE_10_0,
    /* fifo */ fifo_B_PE_11_0,
    /* fifo */ fifo_C_drain_PE_10_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 10,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_10_1,
    /* fifo */ fifo_A_PE_10_2,
    /* fifo */ fifo_B_PE_10_1,
    /* fifo */ fifo_B_PE_11_1,
    /* fifo */ fifo_C_drain_PE_10_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 10,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_10_2,
    /* fifo */ fifo_A_PE_10_3,
    /* fifo */ fifo_B_PE_10_2,
    /* fifo */ fifo_B_PE_11_2,
    /* fifo */ fifo_C_drain_PE_10_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 10,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_10_3,
    /* fifo */ fifo_A_PE_10_4,
    /* fifo */ fifo_B_PE_10_3,
    /* fifo */ fifo_B_PE_11_3,
    /* fifo */ fifo_C_drain_PE_10_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 10,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_10_4,
    /* fifo */ fifo_A_PE_10_5,
    /* fifo */ fifo_B_PE_10_4,
    /* fifo */ fifo_B_PE_11_4,
    /* fifo */ fifo_C_drain_PE_10_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 10,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_10_5,
    /* fifo */ fifo_A_PE_10_6,
    /* fifo */ fifo_B_PE_10_5,
    /* fifo */ fifo_B_PE_11_5,
    /* fifo */ fifo_C_drain_PE_10_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 10,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_10_6,
    /* fifo */ fifo_A_PE_10_7,
    /* fifo */ fifo_B_PE_10_6,
    /* fifo */ fifo_B_PE_11_6,
    /* fifo */ fifo_C_drain_PE_10_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 10,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_10_7,
    /* fifo */ fifo_A_PE_10_8,
    /* fifo */ fifo_B_PE_10_7,
    /* fifo */ fifo_B_PE_11_7,
    /* fifo */ fifo_C_drain_PE_10_7
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 11,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_11_0,
    /* fifo */ fifo_A_PE_11_1,
    /* fifo */ fifo_B_PE_11_0,
    /* fifo */ fifo_B_PE_12_0,
    /* fifo */ fifo_C_drain_PE_11_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 11,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_11_1,
    /* fifo */ fifo_A_PE_11_2,
    /* fifo */ fifo_B_PE_11_1,
    /* fifo */ fifo_B_PE_12_1,
    /* fifo */ fifo_C_drain_PE_11_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 11,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_11_2,
    /* fifo */ fifo_A_PE_11_3,
    /* fifo */ fifo_B_PE_11_2,
    /* fifo */ fifo_B_PE_12_2,
    /* fifo */ fifo_C_drain_PE_11_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 11,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_11_3,
    /* fifo */ fifo_A_PE_11_4,
    /* fifo */ fifo_B_PE_11_3,
    /* fifo */ fifo_B_PE_12_3,
    /* fifo */ fifo_C_drain_PE_11_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 11,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_11_4,
    /* fifo */ fifo_A_PE_11_5,
    /* fifo */ fifo_B_PE_11_4,
    /* fifo */ fifo_B_PE_12_4,
    /* fifo */ fifo_C_drain_PE_11_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 11,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_11_5,
    /* fifo */ fifo_A_PE_11_6,
    /* fifo */ fifo_B_PE_11_5,
    /* fifo */ fifo_B_PE_12_5,
    /* fifo */ fifo_C_drain_PE_11_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 11,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_11_6,
    /* fifo */ fifo_A_PE_11_7,
    /* fifo */ fifo_B_PE_11_6,
    /* fifo */ fifo_B_PE_12_6,
    /* fifo */ fifo_C_drain_PE_11_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 11,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_11_7,
    /* fifo */ fifo_A_PE_11_8,
    /* fifo */ fifo_B_PE_11_7,
    /* fifo */ fifo_B_PE_12_7,
    /* fifo */ fifo_C_drain_PE_11_7
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 12,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_12_0,
    /* fifo */ fifo_A_PE_12_1,
    /* fifo */ fifo_B_PE_12_0,
    /* fifo */ fifo_B_PE_13_0,
    /* fifo */ fifo_C_drain_PE_12_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 12,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_12_1,
    /* fifo */ fifo_A_PE_12_2,
    /* fifo */ fifo_B_PE_12_1,
    /* fifo */ fifo_B_PE_13_1,
    /* fifo */ fifo_C_drain_PE_12_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 12,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_12_2,
    /* fifo */ fifo_A_PE_12_3,
    /* fifo */ fifo_B_PE_12_2,
    /* fifo */ fifo_B_PE_13_2,
    /* fifo */ fifo_C_drain_PE_12_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 12,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_12_3,
    /* fifo */ fifo_A_PE_12_4,
    /* fifo */ fifo_B_PE_12_3,
    /* fifo */ fifo_B_PE_13_3,
    /* fifo */ fifo_C_drain_PE_12_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 12,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_12_4,
    /* fifo */ fifo_A_PE_12_5,
    /* fifo */ fifo_B_PE_12_4,
    /* fifo */ fifo_B_PE_13_4,
    /* fifo */ fifo_C_drain_PE_12_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 12,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_12_5,
    /* fifo */ fifo_A_PE_12_6,
    /* fifo */ fifo_B_PE_12_5,
    /* fifo */ fifo_B_PE_13_5,
    /* fifo */ fifo_C_drain_PE_12_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 12,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_12_6,
    /* fifo */ fifo_A_PE_12_7,
    /* fifo */ fifo_B_PE_12_6,
    /* fifo */ fifo_B_PE_13_6,
    /* fifo */ fifo_C_drain_PE_12_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 12,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_12_7,
    /* fifo */ fifo_A_PE_12_8,
    /* fifo */ fifo_B_PE_12_7,
    /* fifo */ fifo_B_PE_13_7,
    /* fifo */ fifo_C_drain_PE_12_7
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 13,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_13_0,
    /* fifo */ fifo_A_PE_13_1,
    /* fifo */ fifo_B_PE_13_0,
    /* fifo */ fifo_B_PE_14_0,
    /* fifo */ fifo_C_drain_PE_13_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 13,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_13_1,
    /* fifo */ fifo_A_PE_13_2,
    /* fifo */ fifo_B_PE_13_1,
    /* fifo */ fifo_B_PE_14_1,
    /* fifo */ fifo_C_drain_PE_13_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 13,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_13_2,
    /* fifo */ fifo_A_PE_13_3,
    /* fifo */ fifo_B_PE_13_2,
    /* fifo */ fifo_B_PE_14_2,
    /* fifo */ fifo_C_drain_PE_13_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 13,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_13_3,
    /* fifo */ fifo_A_PE_13_4,
    /* fifo */ fifo_B_PE_13_3,
    /* fifo */ fifo_B_PE_14_3,
    /* fifo */ fifo_C_drain_PE_13_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 13,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_13_4,
    /* fifo */ fifo_A_PE_13_5,
    /* fifo */ fifo_B_PE_13_4,
    /* fifo */ fifo_B_PE_14_4,
    /* fifo */ fifo_C_drain_PE_13_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 13,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_13_5,
    /* fifo */ fifo_A_PE_13_6,
    /* fifo */ fifo_B_PE_13_5,
    /* fifo */ fifo_B_PE_14_5,
    /* fifo */ fifo_C_drain_PE_13_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 13,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_13_6,
    /* fifo */ fifo_A_PE_13_7,
    /* fifo */ fifo_B_PE_13_6,
    /* fifo */ fifo_B_PE_14_6,
    /* fifo */ fifo_C_drain_PE_13_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 13,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_13_7,
    /* fifo */ fifo_A_PE_13_8,
    /* fifo */ fifo_B_PE_13_7,
    /* fifo */ fifo_B_PE_14_7,
    /* fifo */ fifo_C_drain_PE_13_7
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 14,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_14_0,
    /* fifo */ fifo_A_PE_14_1,
    /* fifo */ fifo_B_PE_14_0,
    /* fifo */ fifo_B_PE_15_0,
    /* fifo */ fifo_C_drain_PE_14_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 14,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_14_1,
    /* fifo */ fifo_A_PE_14_2,
    /* fifo */ fifo_B_PE_14_1,
    /* fifo */ fifo_B_PE_15_1,
    /* fifo */ fifo_C_drain_PE_14_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 14,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_14_2,
    /* fifo */ fifo_A_PE_14_3,
    /* fifo */ fifo_B_PE_14_2,
    /* fifo */ fifo_B_PE_15_2,
    /* fifo */ fifo_C_drain_PE_14_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 14,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_14_3,
    /* fifo */ fifo_A_PE_14_4,
    /* fifo */ fifo_B_PE_14_3,
    /* fifo */ fifo_B_PE_15_3,
    /* fifo */ fifo_C_drain_PE_14_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 14,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_14_4,
    /* fifo */ fifo_A_PE_14_5,
    /* fifo */ fifo_B_PE_14_4,
    /* fifo */ fifo_B_PE_15_4,
    /* fifo */ fifo_C_drain_PE_14_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 14,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_14_5,
    /* fifo */ fifo_A_PE_14_6,
    /* fifo */ fifo_B_PE_14_5,
    /* fifo */ fifo_B_PE_15_5,
    /* fifo */ fifo_C_drain_PE_14_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 14,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_14_6,
    /* fifo */ fifo_A_PE_14_7,
    /* fifo */ fifo_B_PE_14_6,
    /* fifo */ fifo_B_PE_15_6,
    /* fifo */ fifo_C_drain_PE_14_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 14,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_14_7,
    /* fifo */ fifo_A_PE_14_8,
    /* fifo */ fifo_B_PE_14_7,
    /* fifo */ fifo_B_PE_15_7,
    /* fifo */ fifo_C_drain_PE_14_7
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 15,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_15_0,
    /* fifo */ fifo_A_PE_15_1,
    /* fifo */ fifo_B_PE_15_0,
    /* fifo */ fifo_B_PE_16_0,
    /* fifo */ fifo_C_drain_PE_15_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 15,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_15_1,
    /* fifo */ fifo_A_PE_15_2,
    /* fifo */ fifo_B_PE_15_1,
    /* fifo */ fifo_B_PE_16_1,
    /* fifo */ fifo_C_drain_PE_15_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 15,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_15_2,
    /* fifo */ fifo_A_PE_15_3,
    /* fifo */ fifo_B_PE_15_2,
    /* fifo */ fifo_B_PE_16_2,
    /* fifo */ fifo_C_drain_PE_15_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 15,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_15_3,
    /* fifo */ fifo_A_PE_15_4,
    /* fifo */ fifo_B_PE_15_3,
    /* fifo */ fifo_B_PE_16_3,
    /* fifo */ fifo_C_drain_PE_15_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 15,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_15_4,
    /* fifo */ fifo_A_PE_15_5,
    /* fifo */ fifo_B_PE_15_4,
    /* fifo */ fifo_B_PE_16_4,
    /* fifo */ fifo_C_drain_PE_15_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 15,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_15_5,
    /* fifo */ fifo_A_PE_15_6,
    /* fifo */ fifo_B_PE_15_5,
    /* fifo */ fifo_B_PE_16_5,
    /* fifo */ fifo_C_drain_PE_15_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 15,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_15_6,
    /* fifo */ fifo_A_PE_15_7,
    /* fifo */ fifo_B_PE_15_6,
    /* fifo */ fifo_B_PE_16_6,
    /* fifo */ fifo_C_drain_PE_15_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 15,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_15_7,
    /* fifo */ fifo_A_PE_15_8,
    /* fifo */ fifo_B_PE_15_7,
    /* fifo */ fifo_B_PE_16_7,
    /* fifo */ fifo_C_drain_PE_15_7
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 16,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_16_0,
    /* fifo */ fifo_A_PE_16_1,
    /* fifo */ fifo_B_PE_16_0,
    /* fifo */ fifo_B_PE_17_0,
    /* fifo */ fifo_C_drain_PE_16_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 16,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_16_1,
    /* fifo */ fifo_A_PE_16_2,
    /* fifo */ fifo_B_PE_16_1,
    /* fifo */ fifo_B_PE_17_1,
    /* fifo */ fifo_C_drain_PE_16_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 16,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_16_2,
    /* fifo */ fifo_A_PE_16_3,
    /* fifo */ fifo_B_PE_16_2,
    /* fifo */ fifo_B_PE_17_2,
    /* fifo */ fifo_C_drain_PE_16_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 16,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_16_3,
    /* fifo */ fifo_A_PE_16_4,
    /* fifo */ fifo_B_PE_16_3,
    /* fifo */ fifo_B_PE_17_3,
    /* fifo */ fifo_C_drain_PE_16_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 16,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_16_4,
    /* fifo */ fifo_A_PE_16_5,
    /* fifo */ fifo_B_PE_16_4,
    /* fifo */ fifo_B_PE_17_4,
    /* fifo */ fifo_C_drain_PE_16_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 16,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_16_5,
    /* fifo */ fifo_A_PE_16_6,
    /* fifo */ fifo_B_PE_16_5,
    /* fifo */ fifo_B_PE_17_5,
    /* fifo */ fifo_C_drain_PE_16_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 16,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_16_6,
    /* fifo */ fifo_A_PE_16_7,
    /* fifo */ fifo_B_PE_16_6,
    /* fifo */ fifo_B_PE_17_6,
    /* fifo */ fifo_C_drain_PE_16_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 16,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_16_7,
    /* fifo */ fifo_A_PE_16_8,
    /* fifo */ fifo_B_PE_16_7,
    /* fifo */ fifo_B_PE_17_7,
    /* fifo */ fifo_C_drain_PE_16_7
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 17,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_17_0,
    /* fifo */ fifo_A_PE_17_1,
    /* fifo */ fifo_B_PE_17_0,
    /* fifo */ fifo_B_PE_18_0,
    /* fifo */ fifo_C_drain_PE_17_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 17,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_17_1,
    /* fifo */ fifo_A_PE_17_2,
    /* fifo */ fifo_B_PE_17_1,
    /* fifo */ fifo_B_PE_18_1,
    /* fifo */ fifo_C_drain_PE_17_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 17,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_17_2,
    /* fifo */ fifo_A_PE_17_3,
    /* fifo */ fifo_B_PE_17_2,
    /* fifo */ fifo_B_PE_18_2,
    /* fifo */ fifo_C_drain_PE_17_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 17,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_17_3,
    /* fifo */ fifo_A_PE_17_4,
    /* fifo */ fifo_B_PE_17_3,
    /* fifo */ fifo_B_PE_18_3,
    /* fifo */ fifo_C_drain_PE_17_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 17,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_17_4,
    /* fifo */ fifo_A_PE_17_5,
    /* fifo */ fifo_B_PE_17_4,
    /* fifo */ fifo_B_PE_18_4,
    /* fifo */ fifo_C_drain_PE_17_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 17,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_17_5,
    /* fifo */ fifo_A_PE_17_6,
    /* fifo */ fifo_B_PE_17_5,
    /* fifo */ fifo_B_PE_18_5,
    /* fifo */ fifo_C_drain_PE_17_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 17,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_17_6,
    /* fifo */ fifo_A_PE_17_7,
    /* fifo */ fifo_B_PE_17_6,
    /* fifo */ fifo_B_PE_18_6,
    /* fifo */ fifo_C_drain_PE_17_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 17,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_17_7,
    /* fifo */ fifo_A_PE_17_8,
    /* fifo */ fifo_B_PE_17_7,
    /* fifo */ fifo_B_PE_18_7,
    /* fifo */ fifo_C_drain_PE_17_7
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 18,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_18_0,
    /* fifo */ fifo_A_PE_18_1,
    /* fifo */ fifo_B_PE_18_0,
    /* fifo */ fifo_B_PE_19_0,
    /* fifo */ fifo_C_drain_PE_18_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 18,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_18_1,
    /* fifo */ fifo_A_PE_18_2,
    /* fifo */ fifo_B_PE_18_1,
    /* fifo */ fifo_B_PE_19_1,
    /* fifo */ fifo_C_drain_PE_18_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 18,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_18_2,
    /* fifo */ fifo_A_PE_18_3,
    /* fifo */ fifo_B_PE_18_2,
    /* fifo */ fifo_B_PE_19_2,
    /* fifo */ fifo_C_drain_PE_18_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 18,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_18_3,
    /* fifo */ fifo_A_PE_18_4,
    /* fifo */ fifo_B_PE_18_3,
    /* fifo */ fifo_B_PE_19_3,
    /* fifo */ fifo_C_drain_PE_18_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 18,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_18_4,
    /* fifo */ fifo_A_PE_18_5,
    /* fifo */ fifo_B_PE_18_4,
    /* fifo */ fifo_B_PE_19_4,
    /* fifo */ fifo_C_drain_PE_18_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 18,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_18_5,
    /* fifo */ fifo_A_PE_18_6,
    /* fifo */ fifo_B_PE_18_5,
    /* fifo */ fifo_B_PE_19_5,
    /* fifo */ fifo_C_drain_PE_18_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 18,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_18_6,
    /* fifo */ fifo_A_PE_18_7,
    /* fifo */ fifo_B_PE_18_6,
    /* fifo */ fifo_B_PE_19_6,
    /* fifo */ fifo_C_drain_PE_18_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 18,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_18_7,
    /* fifo */ fifo_A_PE_18_8,
    /* fifo */ fifo_B_PE_18_7,
    /* fifo */ fifo_B_PE_19_7,
    /* fifo */ fifo_C_drain_PE_18_7
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 19,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_19_0,
    /* fifo */ fifo_A_PE_19_1,
    /* fifo */ fifo_B_PE_19_0,
    /* fifo */ fifo_B_PE_20_0,
    /* fifo */ fifo_C_drain_PE_19_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 19,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_19_1,
    /* fifo */ fifo_A_PE_19_2,
    /* fifo */ fifo_B_PE_19_1,
    /* fifo */ fifo_B_PE_20_1,
    /* fifo */ fifo_C_drain_PE_19_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 19,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_19_2,
    /* fifo */ fifo_A_PE_19_3,
    /* fifo */ fifo_B_PE_19_2,
    /* fifo */ fifo_B_PE_20_2,
    /* fifo */ fifo_C_drain_PE_19_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 19,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_19_3,
    /* fifo */ fifo_A_PE_19_4,
    /* fifo */ fifo_B_PE_19_3,
    /* fifo */ fifo_B_PE_20_3,
    /* fifo */ fifo_C_drain_PE_19_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 19,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_19_4,
    /* fifo */ fifo_A_PE_19_5,
    /* fifo */ fifo_B_PE_19_4,
    /* fifo */ fifo_B_PE_20_4,
    /* fifo */ fifo_C_drain_PE_19_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 19,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_19_5,
    /* fifo */ fifo_A_PE_19_6,
    /* fifo */ fifo_B_PE_19_5,
    /* fifo */ fifo_B_PE_20_5,
    /* fifo */ fifo_C_drain_PE_19_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 19,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_19_6,
    /* fifo */ fifo_A_PE_19_7,
    /* fifo */ fifo_B_PE_19_6,
    /* fifo */ fifo_B_PE_20_6,
    /* fifo */ fifo_C_drain_PE_19_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 19,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_19_7,
    /* fifo */ fifo_A_PE_19_8,
    /* fifo */ fifo_B_PE_19_7,
    /* fifo */ fifo_B_PE_20_7,
    /* fifo */ fifo_C_drain_PE_19_7
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 20,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_20_0,
    /* fifo */ fifo_A_PE_20_1,
    /* fifo */ fifo_B_PE_20_0,
    /* fifo */ fifo_B_PE_21_0,
    /* fifo */ fifo_C_drain_PE_20_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 20,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_20_1,
    /* fifo */ fifo_A_PE_20_2,
    /* fifo */ fifo_B_PE_20_1,
    /* fifo */ fifo_B_PE_21_1,
    /* fifo */ fifo_C_drain_PE_20_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 20,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_20_2,
    /* fifo */ fifo_A_PE_20_3,
    /* fifo */ fifo_B_PE_20_2,
    /* fifo */ fifo_B_PE_21_2,
    /* fifo */ fifo_C_drain_PE_20_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 20,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_20_3,
    /* fifo */ fifo_A_PE_20_4,
    /* fifo */ fifo_B_PE_20_3,
    /* fifo */ fifo_B_PE_21_3,
    /* fifo */ fifo_C_drain_PE_20_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 20,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_20_4,
    /* fifo */ fifo_A_PE_20_5,
    /* fifo */ fifo_B_PE_20_4,
    /* fifo */ fifo_B_PE_21_4,
    /* fifo */ fifo_C_drain_PE_20_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 20,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_20_5,
    /* fifo */ fifo_A_PE_20_6,
    /* fifo */ fifo_B_PE_20_5,
    /* fifo */ fifo_B_PE_21_5,
    /* fifo */ fifo_C_drain_PE_20_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 20,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_20_6,
    /* fifo */ fifo_A_PE_20_7,
    /* fifo */ fifo_B_PE_20_6,
    /* fifo */ fifo_B_PE_21_6,
    /* fifo */ fifo_C_drain_PE_20_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 20,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_20_7,
    /* fifo */ fifo_A_PE_20_8,
    /* fifo */ fifo_B_PE_20_7,
    /* fifo */ fifo_B_PE_21_7,
    /* fifo */ fifo_C_drain_PE_20_7
  );
  /* Module Call */

  /* PE_wrapper instances with first module id 21 and second module id 0..7.
   * Instance (21, c) is given fifo_A_PE_21_c and fifo_A_PE_21_(c+1),
   * fifo_B_PE_21_c and fifo_B_PE_22_c, and fifo_C_drain_PE_21_c. The
   * fifo_A_PE_21_8 passed to the last instance is not used by any other
   * PE_wrapper call in this group. Read/write direction of each fifo is set
   * by the PE_wrapper definition (not shown here) -- TODO confirm. */
  /* Module Call */
  PE_wrapper(
    /* module id */ 21,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_21_0,
    /* fifo */ fifo_A_PE_21_1,
    /* fifo */ fifo_B_PE_21_0,
    /* fifo */ fifo_B_PE_22_0,
    /* fifo */ fifo_C_drain_PE_21_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 21,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_21_1,
    /* fifo */ fifo_A_PE_21_2,
    /* fifo */ fifo_B_PE_21_1,
    /* fifo */ fifo_B_PE_22_1,
    /* fifo */ fifo_C_drain_PE_21_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 21,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_21_2,
    /* fifo */ fifo_A_PE_21_3,
    /* fifo */ fifo_B_PE_21_2,
    /* fifo */ fifo_B_PE_22_2,
    /* fifo */ fifo_C_drain_PE_21_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 21,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_21_3,
    /* fifo */ fifo_A_PE_21_4,
    /* fifo */ fifo_B_PE_21_3,
    /* fifo */ fifo_B_PE_22_3,
    /* fifo */ fifo_C_drain_PE_21_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 21,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_21_4,
    /* fifo */ fifo_A_PE_21_5,
    /* fifo */ fifo_B_PE_21_4,
    /* fifo */ fifo_B_PE_22_4,
    /* fifo */ fifo_C_drain_PE_21_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 21,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_21_5,
    /* fifo */ fifo_A_PE_21_6,
    /* fifo */ fifo_B_PE_21_5,
    /* fifo */ fifo_B_PE_22_5,
    /* fifo */ fifo_C_drain_PE_21_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 21,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_21_6,
    /* fifo */ fifo_A_PE_21_7,
    /* fifo */ fifo_B_PE_21_6,
    /* fifo */ fifo_B_PE_22_6,
    /* fifo */ fifo_C_drain_PE_21_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 21,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_21_7,
    /* fifo */ fifo_A_PE_21_8,
    /* fifo */ fifo_B_PE_21_7,
    /* fifo */ fifo_B_PE_22_7,
    /* fifo */ fifo_C_drain_PE_21_7
  );
  /* Module Call */

  /* PE_wrapper instances with first module id 22 and second module id 0..7.
   * Instance (22, c) is given fifo_A_PE_22_c and fifo_A_PE_22_(c+1),
   * fifo_B_PE_22_c and fifo_B_PE_23_c, and fifo_C_drain_PE_22_c. The
   * fifo_A_PE_22_8 passed to the last instance is not used by any other
   * PE_wrapper call in this group. Read/write direction of each fifo is set
   * by the PE_wrapper definition (not shown here) -- TODO confirm. */
  /* Module Call */
  PE_wrapper(
    /* module id */ 22,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_22_0,
    /* fifo */ fifo_A_PE_22_1,
    /* fifo */ fifo_B_PE_22_0,
    /* fifo */ fifo_B_PE_23_0,
    /* fifo */ fifo_C_drain_PE_22_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 22,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_22_1,
    /* fifo */ fifo_A_PE_22_2,
    /* fifo */ fifo_B_PE_22_1,
    /* fifo */ fifo_B_PE_23_1,
    /* fifo */ fifo_C_drain_PE_22_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 22,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_22_2,
    /* fifo */ fifo_A_PE_22_3,
    /* fifo */ fifo_B_PE_22_2,
    /* fifo */ fifo_B_PE_23_2,
    /* fifo */ fifo_C_drain_PE_22_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 22,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_22_3,
    /* fifo */ fifo_A_PE_22_4,
    /* fifo */ fifo_B_PE_22_3,
    /* fifo */ fifo_B_PE_23_3,
    /* fifo */ fifo_C_drain_PE_22_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 22,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_22_4,
    /* fifo */ fifo_A_PE_22_5,
    /* fifo */ fifo_B_PE_22_4,
    /* fifo */ fifo_B_PE_23_4,
    /* fifo */ fifo_C_drain_PE_22_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 22,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_22_5,
    /* fifo */ fifo_A_PE_22_6,
    /* fifo */ fifo_B_PE_22_5,
    /* fifo */ fifo_B_PE_23_5,
    /* fifo */ fifo_C_drain_PE_22_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 22,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_22_6,
    /* fifo */ fifo_A_PE_22_7,
    /* fifo */ fifo_B_PE_22_6,
    /* fifo */ fifo_B_PE_23_6,
    /* fifo */ fifo_C_drain_PE_22_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 22,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_22_7,
    /* fifo */ fifo_A_PE_22_8,
    /* fifo */ fifo_B_PE_22_7,
    /* fifo */ fifo_B_PE_23_7,
    /* fifo */ fifo_C_drain_PE_22_7
  );
  /* Module Call */

  /* PE_wrapper instances with first module id 23 and second module id 0..7.
   * Instance (23, c) is given fifo_A_PE_23_c and fifo_A_PE_23_(c+1),
   * fifo_B_PE_23_c and fifo_B_PE_24_c, and fifo_C_drain_PE_23_c. No
   * PE_wrapper call in this group takes a fifo_B_PE_24_* or fifo_A_PE_23_8 as
   * its first fifo of a pair; those names are passed to the *_PE_dummy_in
   * calls elsewhere in this function. Read/write direction of each fifo is
   * set by the PE_wrapper definition (not shown here) -- TODO confirm. */
  /* Module Call */
  PE_wrapper(
    /* module id */ 23,
    /* module id */ 0,
    /* fifo */ fifo_A_PE_23_0,
    /* fifo */ fifo_A_PE_23_1,
    /* fifo */ fifo_B_PE_23_0,
    /* fifo */ fifo_B_PE_24_0,
    /* fifo */ fifo_C_drain_PE_23_0
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 23,
    /* module id */ 1,
    /* fifo */ fifo_A_PE_23_1,
    /* fifo */ fifo_A_PE_23_2,
    /* fifo */ fifo_B_PE_23_1,
    /* fifo */ fifo_B_PE_24_1,
    /* fifo */ fifo_C_drain_PE_23_1
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 23,
    /* module id */ 2,
    /* fifo */ fifo_A_PE_23_2,
    /* fifo */ fifo_A_PE_23_3,
    /* fifo */ fifo_B_PE_23_2,
    /* fifo */ fifo_B_PE_24_2,
    /* fifo */ fifo_C_drain_PE_23_2
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 23,
    /* module id */ 3,
    /* fifo */ fifo_A_PE_23_3,
    /* fifo */ fifo_A_PE_23_4,
    /* fifo */ fifo_B_PE_23_3,
    /* fifo */ fifo_B_PE_24_3,
    /* fifo */ fifo_C_drain_PE_23_3
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 23,
    /* module id */ 4,
    /* fifo */ fifo_A_PE_23_4,
    /* fifo */ fifo_A_PE_23_5,
    /* fifo */ fifo_B_PE_23_4,
    /* fifo */ fifo_B_PE_24_4,
    /* fifo */ fifo_C_drain_PE_23_4
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 23,
    /* module id */ 5,
    /* fifo */ fifo_A_PE_23_5,
    /* fifo */ fifo_A_PE_23_6,
    /* fifo */ fifo_B_PE_23_5,
    /* fifo */ fifo_B_PE_24_5,
    /* fifo */ fifo_C_drain_PE_23_5
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 23,
    /* module id */ 6,
    /* fifo */ fifo_A_PE_23_6,
    /* fifo */ fifo_A_PE_23_7,
    /* fifo */ fifo_B_PE_23_6,
    /* fifo */ fifo_B_PE_24_6,
    /* fifo */ fifo_C_drain_PE_23_6
  );
  /* Module Call */

  /* Module Call */
  PE_wrapper(
    /* module id */ 23,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_23_7,
    /* fifo */ fifo_A_PE_23_8,
    /* fifo */ fifo_B_PE_23_7,
    /* fifo */ fifo_B_PE_24_7,
    /* fifo */ fifo_C_drain_PE_23_7
  );
  /* Module Call */

  /* A_PE_dummy_in instances: one per first module id 0..23, all with second
   * module id 7. Instance i is handed fifo_A_PE_<i>_8, the same fifo name the
   * PE_wrapper call with ids (i, 7) receives as its second A fifo.
   * Presumably each instance consumes the A data leaving the last PE of its
   * row so that fifo never fills up -- TODO confirm against the
   * A_PE_dummy_in definition, which is not shown here. */
  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 0,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_0_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 1,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_1_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 2,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_2_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 3,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_3_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 4,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_4_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 5,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_5_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 6,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_6_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 7,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_7_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 8,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_8_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 9,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_9_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 10,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_10_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 11,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_11_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 12,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_12_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 13,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_13_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 14,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_14_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 15,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_15_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 16,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_16_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 17,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_17_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 18,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_18_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 19,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_19_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 20,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_20_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 21,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_21_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 22,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_22_8
  );
  /* Module Call */

  /* Module Call */
  A_PE_dummy_in(
    /* module id */ 23,
    /* module id */ 7,
    /* fifo */ fifo_A_PE_23_8
  );
  /* Module Call */

  /* B_PE_dummy_in instances: one per second module id 0..7, all with first
   * module id 23. Instance j is handed fifo_B_PE_24_<j>, the same fifo name
   * the PE_wrapper call with ids (23, j) receives as its second B fifo.
   * Presumably each instance consumes the B data leaving the last PE of its
   * column so that fifo never fills up -- TODO confirm against the
   * B_PE_dummy_in definition, which is not shown here. */
  /* Module Call */
  B_PE_dummy_in(
    /* module id */ 23,
    /* module id */ 0,
    /* fifo */ fifo_B_PE_24_0
  );
  /* Module Call */

  /* Module Call */
  B_PE_dummy_in(
    /* module id */ 23,
    /* module id */ 1,
    /* fifo */ fifo_B_PE_24_1
  );
  /* Module Call */

  /* Module Call */
  B_PE_dummy_in(
    /* module id */ 23,
    /* module id */ 2,
    /* fifo */ fifo_B_PE_24_2
  );
  /* Module Call */

  /* Module Call */
  B_PE_dummy_in(
    /* module id */ 23,
    /* module id */ 3,
    /* fifo */ fifo_B_PE_24_3
  );
  /* Module Call */

  /* Module Call */
  B_PE_dummy_in(
    /* module id */ 23,
    /* module id */ 4,
    /* fifo */ fifo_B_PE_24_4
  );
  /* Module Call */

  /* Module Call */
  B_PE_dummy_in(
    /* module id */ 23,
    /* module id */ 5,
    /* fifo */ fifo_B_PE_24_5
  );
  /* Module Call */

  /* Module Call */
  B_PE_dummy_in(
    /* module id */ 23,
    /* module id */ 6,
    /* fifo */ fifo_B_PE_24_6
  );
  /* Module Call */

  /* Module Call */
  B_PE_dummy_in(
    /* module id */ 23,
    /* module id */ 7,
    /* fifo */ fifo_B_PE_24_7
  );
  /* Module Call */

  /* C_drain_IO_L1_out chain for first module id 0. The boundary wrapper
   * (second module id 23) takes one chain fifo,
   * fifo_C_drain_C_drain_IO_L1_out_0_23, plus fifo_C_drain_PE_23_0. Every
   * following wrapper (second module id k = 22 down to 0) takes the previous
   * instance's chain fifo fifo_C_drain_C_drain_IO_L1_out_0_(k+1), its own
   * chain fifo fifo_C_drain_C_drain_IO_L1_out_0_k, and fifo_C_drain_PE_k_0.
   * fifo_C_drain_C_drain_IO_L1_out_0_0 is named only by the last call of this
   * group; its other endpoint is not in this group. Data presumably moves
   * along the chain from instance 23 towards instance 0 -- TODO confirm
   * against the wrapper definitions, which are not shown here. */
  /* Module Call */
  C_drain_IO_L1_out_boundary_wrapper(
    /* module id */ 0,
    /* module id */ 23,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_23,
    /* fifo */ fifo_C_drain_PE_23_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 22,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_23,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_22,
    /* fifo */ fifo_C_drain_PE_22_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 21,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_22,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_21,
    /* fifo */ fifo_C_drain_PE_21_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 20,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_21,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_20,
    /* fifo */ fifo_C_drain_PE_20_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 19,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_20,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_19,
    /* fifo */ fifo_C_drain_PE_19_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 18,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_19,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_18,
    /* fifo */ fifo_C_drain_PE_18_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 17,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_18,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_17,
    /* fifo */ fifo_C_drain_PE_17_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 16,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_17,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_16,
    /* fifo */ fifo_C_drain_PE_16_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 15,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_16,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_15,
    /* fifo */ fifo_C_drain_PE_15_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 14,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_15,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_14,
    /* fifo */ fifo_C_drain_PE_14_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 13,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_14,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_13,
    /* fifo */ fifo_C_drain_PE_13_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 12,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_13,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_12,
    /* fifo */ fifo_C_drain_PE_12_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 11,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_12,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_11,
    /* fifo */ fifo_C_drain_PE_11_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 10,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_11,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_10,
    /* fifo */ fifo_C_drain_PE_10_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 9,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_10,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_9,
    /* fifo */ fifo_C_drain_PE_9_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 8,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_9,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_8,
    /* fifo */ fifo_C_drain_PE_8_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_8,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_7,
    /* fifo */ fifo_C_drain_PE_7_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_6,
    /* fifo */ fifo_C_drain_PE_6_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_5,
    /* fifo */ fifo_C_drain_PE_5_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_4,
    /* fifo */ fifo_C_drain_PE_4_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_3,
    /* fifo */ fifo_C_drain_PE_3_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_2,
    /* fifo */ fifo_C_drain_PE_2_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_1,
    /* fifo */ fifo_C_drain_PE_1_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 0,
    /* module id */ 0,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_0,
    /* fifo */ fifo_C_drain_PE_0_0
  );
  /* Module Call */

  /* C_drain_IO_L1_out chain for first module id 1. The boundary wrapper
   * (second module id 23) takes one chain fifo,
   * fifo_C_drain_C_drain_IO_L1_out_1_23, plus fifo_C_drain_PE_23_1. Every
   * following wrapper (second module id k = 22 down to 0) takes the previous
   * instance's chain fifo fifo_C_drain_C_drain_IO_L1_out_1_(k+1), its own
   * chain fifo fifo_C_drain_C_drain_IO_L1_out_1_k, and fifo_C_drain_PE_k_1.
   * fifo_C_drain_C_drain_IO_L1_out_1_0 is named only by the last call of this
   * group; its other endpoint is not in this group. Data presumably moves
   * along the chain from instance 23 towards instance 0 -- TODO confirm
   * against the wrapper definitions, which are not shown here. */
  /* Module Call */
  C_drain_IO_L1_out_boundary_wrapper(
    /* module id */ 1,
    /* module id */ 23,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_23,
    /* fifo */ fifo_C_drain_PE_23_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 22,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_23,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_22,
    /* fifo */ fifo_C_drain_PE_22_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 21,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_22,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_21,
    /* fifo */ fifo_C_drain_PE_21_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 20,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_21,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_20,
    /* fifo */ fifo_C_drain_PE_20_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 19,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_20,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_19,
    /* fifo */ fifo_C_drain_PE_19_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 18,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_19,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_18,
    /* fifo */ fifo_C_drain_PE_18_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 17,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_18,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_17,
    /* fifo */ fifo_C_drain_PE_17_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 16,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_17,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_16,
    /* fifo */ fifo_C_drain_PE_16_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 15,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_16,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_15,
    /* fifo */ fifo_C_drain_PE_15_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 14,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_15,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_14,
    /* fifo */ fifo_C_drain_PE_14_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 13,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_14,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_13,
    /* fifo */ fifo_C_drain_PE_13_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 12,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_13,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_12,
    /* fifo */ fifo_C_drain_PE_12_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 11,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_12,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_11,
    /* fifo */ fifo_C_drain_PE_11_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 10,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_11,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_10,
    /* fifo */ fifo_C_drain_PE_10_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 9,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_10,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_9,
    /* fifo */ fifo_C_drain_PE_9_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 8,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_9,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_8,
    /* fifo */ fifo_C_drain_PE_8_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_8,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_7,
    /* fifo */ fifo_C_drain_PE_7_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_6,
    /* fifo */ fifo_C_drain_PE_6_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_5,
    /* fifo */ fifo_C_drain_PE_5_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_4,
    /* fifo */ fifo_C_drain_PE_4_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_3,
    /* fifo */ fifo_C_drain_PE_3_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_2,
    /* fifo */ fifo_C_drain_PE_2_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_1,
    /* fifo */ fifo_C_drain_PE_1_1
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 1,
    /* module id */ 0,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_0,
    /* fifo */ fifo_C_drain_PE_0_1
  );
  /* Module Call */

  /* C_drain_IO_L1_out chain for first module id 2. The boundary wrapper
   * (second module id 23) takes one chain fifo,
   * fifo_C_drain_C_drain_IO_L1_out_2_23, plus fifo_C_drain_PE_23_2. Every
   * following wrapper (second module id k = 22 down to 0) takes the previous
   * instance's chain fifo fifo_C_drain_C_drain_IO_L1_out_2_(k+1), its own
   * chain fifo fifo_C_drain_C_drain_IO_L1_out_2_k, and fifo_C_drain_PE_k_2.
   * fifo_C_drain_C_drain_IO_L1_out_2_0 is named only by the last call of this
   * group; its other endpoint is not in this group. Data presumably moves
   * along the chain from instance 23 towards instance 0 -- TODO confirm
   * against the wrapper definitions, which are not shown here. */
  /* Module Call */
  C_drain_IO_L1_out_boundary_wrapper(
    /* module id */ 2,
    /* module id */ 23,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_23,
    /* fifo */ fifo_C_drain_PE_23_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 22,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_23,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_22,
    /* fifo */ fifo_C_drain_PE_22_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 21,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_22,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_21,
    /* fifo */ fifo_C_drain_PE_21_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 20,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_21,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_20,
    /* fifo */ fifo_C_drain_PE_20_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 19,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_20,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_19,
    /* fifo */ fifo_C_drain_PE_19_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 18,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_19,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_18,
    /* fifo */ fifo_C_drain_PE_18_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 17,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_18,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_17,
    /* fifo */ fifo_C_drain_PE_17_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 16,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_17,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_16,
    /* fifo */ fifo_C_drain_PE_16_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 15,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_16,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_15,
    /* fifo */ fifo_C_drain_PE_15_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 14,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_15,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_14,
    /* fifo */ fifo_C_drain_PE_14_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 13,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_14,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_13,
    /* fifo */ fifo_C_drain_PE_13_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 12,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_13,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_12,
    /* fifo */ fifo_C_drain_PE_12_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 11,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_12,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_11,
    /* fifo */ fifo_C_drain_PE_11_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 10,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_11,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_10,
    /* fifo */ fifo_C_drain_PE_10_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 9,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_10,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_9,
    /* fifo */ fifo_C_drain_PE_9_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 8,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_9,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_8,
    /* fifo */ fifo_C_drain_PE_8_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_8,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_7,
    /* fifo */ fifo_C_drain_PE_7_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_6,
    /* fifo */ fifo_C_drain_PE_6_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_5,
    /* fifo */ fifo_C_drain_PE_5_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_4,
    /* fifo */ fifo_C_drain_PE_4_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_3,
    /* fifo */ fifo_C_drain_PE_3_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_2,
    /* fifo */ fifo_C_drain_PE_2_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_1,
    /* fifo */ fifo_C_drain_PE_1_2
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 2,
    /* module id */ 0,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_0,
    /* fifo */ fifo_C_drain_PE_0_2
  );
  /* Module Call */

  /* C_drain_IO_L1_out chain for first module id 3. The boundary wrapper
   * (second module id 23) takes one chain fifo,
   * fifo_C_drain_C_drain_IO_L1_out_3_23, plus fifo_C_drain_PE_23_3. Every
   * following wrapper (second module id k = 22 and below) takes the previous
   * instance's chain fifo fifo_C_drain_C_drain_IO_L1_out_3_(k+1), its own
   * chain fifo fifo_C_drain_C_drain_IO_L1_out_3_k, and fifo_C_drain_PE_k_3.
   * Data presumably moves along the chain from instance 23 towards the lower
   * ids -- TODO confirm against the wrapper definitions, which are not shown
   * here. */
  /* Module Call */
  C_drain_IO_L1_out_boundary_wrapper(
    /* module id */ 3,
    /* module id */ 23,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_23,
    /* fifo */ fifo_C_drain_PE_23_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 22,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_23,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_22,
    /* fifo */ fifo_C_drain_PE_22_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 21,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_22,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_21,
    /* fifo */ fifo_C_drain_PE_21_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 20,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_21,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_20,
    /* fifo */ fifo_C_drain_PE_20_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 19,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_20,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_19,
    /* fifo */ fifo_C_drain_PE_19_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 18,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_19,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_18,
    /* fifo */ fifo_C_drain_PE_18_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 17,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_18,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_17,
    /* fifo */ fifo_C_drain_PE_17_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 16,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_17,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_16,
    /* fifo */ fifo_C_drain_PE_16_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 15,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_16,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_15,
    /* fifo */ fifo_C_drain_PE_15_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 14,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_15,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_14,
    /* fifo */ fifo_C_drain_PE_14_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 13,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_14,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_13,
    /* fifo */ fifo_C_drain_PE_13_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 12,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_13,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_12,
    /* fifo */ fifo_C_drain_PE_12_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 11,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_12,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_11,
    /* fifo */ fifo_C_drain_PE_11_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 10,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_11,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_10,
    /* fifo */ fifo_C_drain_PE_10_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 9,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_10,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_9,
    /* fifo */ fifo_C_drain_PE_9_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 8,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_9,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_8,
    /* fifo */ fifo_C_drain_PE_8_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_8,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_7,
    /* fifo */ fifo_C_drain_PE_7_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_6,
    /* fifo */ fifo_C_drain_PE_6_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_5,
    /* fifo */ fifo_C_drain_PE_5_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_4,
    /* fifo */ fifo_C_drain_PE_4_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_3,
    /* fifo */ fifo_C_drain_PE_3_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_2,
    /* fifo */ fifo_C_drain_PE_2_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_1,
    /* fifo */ fifo_C_drain_PE_1_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 3,
    /* module id */ 0,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_0,
    /* fifo */ fifo_C_drain_PE_0_3
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_boundary_wrapper(
    /* module id */ 4,
    /* module id */ 23,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_23,
    /* fifo */ fifo_C_drain_PE_23_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 22,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_23,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_22,
    /* fifo */ fifo_C_drain_PE_22_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 21,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_22,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_21,
    /* fifo */ fifo_C_drain_PE_21_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 20,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_21,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_20,
    /* fifo */ fifo_C_drain_PE_20_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 19,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_20,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_19,
    /* fifo */ fifo_C_drain_PE_19_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 18,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_19,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_18,
    /* fifo */ fifo_C_drain_PE_18_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 17,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_18,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_17,
    /* fifo */ fifo_C_drain_PE_17_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 16,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_17,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_16,
    /* fifo */ fifo_C_drain_PE_16_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 15,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_16,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_15,
    /* fifo */ fifo_C_drain_PE_15_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 14,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_15,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_14,
    /* fifo */ fifo_C_drain_PE_14_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 13,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_14,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_13,
    /* fifo */ fifo_C_drain_PE_13_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 12,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_13,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_12,
    /* fifo */ fifo_C_drain_PE_12_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 11,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_12,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_11,
    /* fifo */ fifo_C_drain_PE_11_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 10,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_11,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_10,
    /* fifo */ fifo_C_drain_PE_10_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 9,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_10,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_9,
    /* fifo */ fifo_C_drain_PE_9_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 8,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_9,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_8,
    /* fifo */ fifo_C_drain_PE_8_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_8,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_7,
    /* fifo */ fifo_C_drain_PE_7_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_6,
    /* fifo */ fifo_C_drain_PE_6_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_5,
    /* fifo */ fifo_C_drain_PE_5_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_4,
    /* fifo */ fifo_C_drain_PE_4_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_3,
    /* fifo */ fifo_C_drain_PE_3_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_2,
    /* fifo */ fifo_C_drain_PE_2_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_1,
    /* fifo */ fifo_C_drain_PE_1_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 4,
    /* module id */ 0,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_0,
    /* fifo */ fifo_C_drain_PE_0_4
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_boundary_wrapper(
    /* module id */ 5,
    /* module id */ 23,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_23,
    /* fifo */ fifo_C_drain_PE_23_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 22,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_23,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_22,
    /* fifo */ fifo_C_drain_PE_22_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 21,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_22,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_21,
    /* fifo */ fifo_C_drain_PE_21_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 20,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_21,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_20,
    /* fifo */ fifo_C_drain_PE_20_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 19,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_20,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_19,
    /* fifo */ fifo_C_drain_PE_19_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 18,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_19,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_18,
    /* fifo */ fifo_C_drain_PE_18_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 17,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_18,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_17,
    /* fifo */ fifo_C_drain_PE_17_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 16,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_17,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_16,
    /* fifo */ fifo_C_drain_PE_16_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 15,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_16,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_15,
    /* fifo */ fifo_C_drain_PE_15_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 14,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_15,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_14,
    /* fifo */ fifo_C_drain_PE_14_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 13,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_14,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_13,
    /* fifo */ fifo_C_drain_PE_13_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 12,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_13,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_12,
    /* fifo */ fifo_C_drain_PE_12_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 11,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_12,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_11,
    /* fifo */ fifo_C_drain_PE_11_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 10,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_11,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_10,
    /* fifo */ fifo_C_drain_PE_10_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 9,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_10,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_9,
    /* fifo */ fifo_C_drain_PE_9_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 8,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_9,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_8,
    /* fifo */ fifo_C_drain_PE_8_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_8,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_7,
    /* fifo */ fifo_C_drain_PE_7_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_6,
    /* fifo */ fifo_C_drain_PE_6_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_5,
    /* fifo */ fifo_C_drain_PE_5_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_4,
    /* fifo */ fifo_C_drain_PE_4_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_3,
    /* fifo */ fifo_C_drain_PE_3_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_2,
    /* fifo */ fifo_C_drain_PE_2_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_1,
    /* fifo */ fifo_C_drain_PE_1_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 5,
    /* module id */ 0,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_0,
    /* fifo */ fifo_C_drain_PE_0_5
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_boundary_wrapper(
    /* module id */ 6,
    /* module id */ 23,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_23,
    /* fifo */ fifo_C_drain_PE_23_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 22,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_23,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_22,
    /* fifo */ fifo_C_drain_PE_22_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 21,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_22,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_21,
    /* fifo */ fifo_C_drain_PE_21_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 20,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_21,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_20,
    /* fifo */ fifo_C_drain_PE_20_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 19,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_20,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_19,
    /* fifo */ fifo_C_drain_PE_19_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 18,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_19,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_18,
    /* fifo */ fifo_C_drain_PE_18_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 17,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_18,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_17,
    /* fifo */ fifo_C_drain_PE_17_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 16,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_17,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_16,
    /* fifo */ fifo_C_drain_PE_16_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 15,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_16,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_15,
    /* fifo */ fifo_C_drain_PE_15_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 14,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_15,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_14,
    /* fifo */ fifo_C_drain_PE_14_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 13,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_14,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_13,
    /* fifo */ fifo_C_drain_PE_13_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 12,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_13,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_12,
    /* fifo */ fifo_C_drain_PE_12_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 11,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_12,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_11,
    /* fifo */ fifo_C_drain_PE_11_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 10,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_11,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_10,
    /* fifo */ fifo_C_drain_PE_10_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 9,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_10,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_9,
    /* fifo */ fifo_C_drain_PE_9_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 8,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_9,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_8,
    /* fifo */ fifo_C_drain_PE_8_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_8,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_7,
    /* fifo */ fifo_C_drain_PE_7_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_6,
    /* fifo */ fifo_C_drain_PE_6_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_5,
    /* fifo */ fifo_C_drain_PE_5_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_4,
    /* fifo */ fifo_C_drain_PE_4_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_3,
    /* fifo */ fifo_C_drain_PE_3_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_2,
    /* fifo */ fifo_C_drain_PE_2_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_1,
    /* fifo */ fifo_C_drain_PE_1_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 6,
    /* module id */ 0,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_0,
    /* fifo */ fifo_C_drain_PE_0_6
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_boundary_wrapper(
    /* module id */ 7,
    /* module id */ 23,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_23,
    /* fifo */ fifo_C_drain_PE_23_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 22,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_23,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_22,
    /* fifo */ fifo_C_drain_PE_22_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 21,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_22,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_21,
    /* fifo */ fifo_C_drain_PE_21_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 20,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_21,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_20,
    /* fifo */ fifo_C_drain_PE_20_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 19,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_20,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_19,
    /* fifo */ fifo_C_drain_PE_19_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 18,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_19,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_18,
    /* fifo */ fifo_C_drain_PE_18_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 17,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_18,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_17,
    /* fifo */ fifo_C_drain_PE_17_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 16,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_17,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_16,
    /* fifo */ fifo_C_drain_PE_16_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 15,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_16,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_15,
    /* fifo */ fifo_C_drain_PE_15_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 14,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_15,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_14,
    /* fifo */ fifo_C_drain_PE_14_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 13,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_14,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_13,
    /* fifo */ fifo_C_drain_PE_13_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 12,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_13,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_12,
    /* fifo */ fifo_C_drain_PE_12_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 11,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_12,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_11,
    /* fifo */ fifo_C_drain_PE_11_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 10,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_11,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_10,
    /* fifo */ fifo_C_drain_PE_10_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 9,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_10,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_9,
    /* fifo */ fifo_C_drain_PE_9_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 8,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_9,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_8,
    /* fifo */ fifo_C_drain_PE_8_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_8,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_7,
    /* fifo */ fifo_C_drain_PE_7_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_6,
    /* fifo */ fifo_C_drain_PE_6_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_5,
    /* fifo */ fifo_C_drain_PE_5_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_4,
    /* fifo */ fifo_C_drain_PE_4_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_3,
    /* fifo */ fifo_C_drain_PE_3_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_2,
    /* fifo */ fifo_C_drain_PE_2_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_1,
    /* fifo */ fifo_C_drain_PE_1_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L1_out_wrapper(
    /* module id */ 7,
    /* module id */ 0,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_0,
    /* fifo */ fifo_C_drain_PE_0_7
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L2_out_boundary(
    /* module id */ 7,
    /* fifo */ fifo_C_drain_C_drain_IO_L2_out_7,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_7_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L2_out(
    /* module id */ 6,
    /* fifo */ fifo_C_drain_C_drain_IO_L2_out_7,
    /* fifo */ fifo_C_drain_C_drain_IO_L2_out_6,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_6_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L2_out(
    /* module id */ 5,
    /* fifo */ fifo_C_drain_C_drain_IO_L2_out_6,
    /* fifo */ fifo_C_drain_C_drain_IO_L2_out_5,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_5_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L2_out(
    /* module id */ 4,
    /* fifo */ fifo_C_drain_C_drain_IO_L2_out_5,
    /* fifo */ fifo_C_drain_C_drain_IO_L2_out_4,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_4_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L2_out(
    /* module id */ 3,
    /* fifo */ fifo_C_drain_C_drain_IO_L2_out_4,
    /* fifo */ fifo_C_drain_C_drain_IO_L2_out_3,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_3_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L2_out(
    /* module id */ 2,
    /* fifo */ fifo_C_drain_C_drain_IO_L2_out_3,
    /* fifo */ fifo_C_drain_C_drain_IO_L2_out_2,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_2_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L2_out(
    /* module id */ 1,
    /* fifo */ fifo_C_drain_C_drain_IO_L2_out_2,
    /* fifo */ fifo_C_drain_C_drain_IO_L2_out_1,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L2_out(
    /* module id */ 0,
    /* fifo */ fifo_C_drain_C_drain_IO_L2_out_1,
    /* fifo */ fifo_C_drain_C_drain_IO_L2_out_0,
    /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L3_out(
    /* fifo */ fifo_C_drain_C_drain_IO_L3_out_serialize,
    /* fifo */ fifo_C_drain_C_drain_IO_L2_out_0
  );
  /* Module Call */

  /* Module Call */
  C_drain_IO_L3_out_serialize(
    /* array */ C,
    /* fifo */ fifo_C_drain_C_drain_IO_L3_out_serialize
  );
  /* Module Call */

}
}


================================================
FILE: autosa_tests/large/mm_int8/simd_info.json
================================================
{
  "kernel0": {
    "reduction": ["y"]
  },
  "kernel1": {
    "reduction": ["y"]
  },
  "kernel3": {
    "reduction": ["y"]
  }
}


================================================
FILE: autosa_tests/large/mm_int8/step1-run-hls.tcl
================================================
# Step 1 of the flow: run HLS C synthesis on the generated kernel.
# The project name ("kernel0"), top function ("kernel0") and solution name
# ("solution") are the same names that step2-autobridge.py uses for
# project_name, top_name and solution_path, so keep them in sync if renamed.
open_project kernel0
# Top-level function to synthesize.
set_top kernel0
# Design source to synthesize.
add_files "src/kernel_kernel.cpp"
# Optional testbench; uncomment and fill in the path if C simulation is needed.
#add_files -tb PATH_TO_TESTBENCH_FILE

open_solution solution

# Target device: exactly one set_part line should be active.
# u250
set_part xcu250-figd2104-2L-e

# u280
#set_part xcu280-fsvh2892-2L-e

# 300 MHz (3.333 ns clock period)
create_clock -period 3.333

# Report dataflow checking violations as warnings.
config_dataflow -strict_mode warning
# Clock uncertainty given as a percentage of the clock period.
set_clock_uncertainty 27.000000%
# NOTE(review): presumably selects a more conservative m_axi adapter
# implementation; this option is tool-version specific -- confirm it is
# accepted by the HLS release in use.
config_rtl -enable_maxiConservative=1
# Use 64-bit addresses on the m_axi interfaces.
config_interface -m_axi_addr64

# to enable integration with Vitis
config_sdx -target xocc

# C simulation is disabled; only C synthesis is run.
#csim_design
csynth_design
close_project
exit


================================================
FILE: autosa_tests/large/mm_int8/step2-autobridge.py
================================================
#! /usr/bin/python3.6

# add the path to where you place the autobridge source code
import sys
sys.path.append('../src')

import graph
from formator import FormatHLS
import collections
import os
import subprocess

"""
AutoBridge divides the target device as follows and assigns each HLS function to one slot.
For more details, please refer to the paper.

      u250                     u280
   -----------
 3 |    |    |
   |----|----|              |----|----|
 2 |    |    |            2 |    |    |
   |----|----|              |----|----|
 1 |    |    |            1 |    |    |
   |----|----|              |----|----|
 0 |    |    |            0 |    |    |
   -----------              -----------
     0    1                   0    1
"""

################### Modify Accordingly ###############################

# (1) fill basic information
# NOTE: project_path is a machine-specific absolute path; change it to the
# location of your own HLS project before running this script.
project_path = '/home/jaywang/doc_examples/mm_int8_ab_pe/kernel0' # path to your hls project
#project_path = '/home/jaywang/doc_examples/mm_ab/kernel0' # path to your hls project
top_name = 'kernel0' # name of the top function in your hls design
# solution directory inside the HLS project (already ends with '/')
solution_path = f'{project_path}/solution/'
# name of the directory the HLS project is copied to under target_dir
project_name = 'kernel0'
board_name = 'u250' # or 'u280'
# target_dir (assigned further below, just before FormatHLS is called) is where the
# results will be saved. Your HLS project will be copied there and your top RTL will be replaced.
# Note that if the directory already exists, it is deleted and re-created.

# (2) specify how your designs connect to the external memory
""" Example:

void kernel0(ap_uint<512> *p1, ap_uint<512> *p2)
{
  #pragma HLS INTERFACE m_axi port=p1 offset=slave bundle=gmem_A
  #pragma HLS INTERFACE m_axi port=p2 offset=slave bundle=gmem_B

  load_p1 (p1, ...);
  load_p2 (p2, ...);
}

--------------------------------------

In this example, the pointer p1 and p2 will become M_AXI controllers to connect to the dedicated DDR IP.
If you want p1 to connect to DDR 2 in the 2-nd SLR, then you need to specify that the corresponding RTL controller must be floorplanned at the 2-nd SLR
Meanwhile, your function load_p1() will talk to the M_AXI controller also through AXI interface which cannot be easily pipelined.
Thus the RTL module corresponds to load_p1() must also be in the 2-nd SLR in this example.
Since load_p1() will communicate with the rest of your design using FIFO interface, you don't need to specify the location of other modules

(transparent)|                        (user visible)
             |
   Vitis     |                    what your HLS design becomes
             |
             | M_AXI                     AXI                        FIFO
DDR IP  <--- | ----> M_AXI controller <-------> your first module <-------> your other modules
(fixed loc)  |         (p1)                       (load_p1)
             |
             | M_AXI                     AXI                        FIFO
DDR IP  <--- | ----> M_AXI controller <-------> your first module <-------> your other modules
(fixed loc)  |         (p2)                       (load_p2)
             |
             | S_AXI
PCIe    <--- | ----> S_AXI controller
             |
"""

# Placement constraints, keyed by RTL instance name.
# x coordinate: on the left side (0) or the right side (1) of an SLR
DDR_loc_2d_x = collections.defaultdict(dict)

# y coordinate: on which SLR (0 = bottom row of the diagram at the top of this file)
DDR_loc_2d_y = collections.defaultdict(dict)

# This design uses DDR 0, 1, 3.
# For every external array both the serialize module and the matching
# m_axi controller are pinned to the same slot.
# Array A -> slot (x=0, y=0)
DDR_loc_2d_y['A_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_x['A_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_A_m_axi_U'] = 0
DDR_loc_2d_x['kernel0_gmem_A_m_axi_U'] = 0

# Array B -> slot (x=0, y=1)
DDR_loc_2d_y['B_IO_L3_in_serialize_U0'] = 1
DDR_loc_2d_x['B_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_B_m_axi_U'] = 1
DDR_loc_2d_x['kernel0_gmem_B_m_axi_U'] = 0

# Array C (drained output) -> slot (x=0, y=3)
DDR_loc_2d_y['C_drain_IO_L3_out_serialize_U0'] = 3
DDR_loc_2d_x['C_drain_IO_L3_out_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_C_m_axi_U'] = 3
DDR_loc_2d_x['kernel0_gmem_C_m_axi_U'] = 0

# Control (s_axi) interface and the kernel entry module -> slot (x=1, y=1).
# NOTE(review): the instance name 'kernel0_entry12_U0' looks tool-generated;
# verify it against the generated top-level RTL of your design.
DDR_loc_2d_y['kernel0_control_s_axi_U'] = 1
DDR_loc_2d_x['kernel0_control_s_axi_U'] = 1
DDR_loc_2d_y['kernel0_entry12_U0'] = 1
DDR_loc_2d_x['kernel0_entry12_U0'] = 1

# (3) specify DDR information
# If you instantiate a DDR controller, it will consume non-trivial amount of resource
# to make the floorplanning better, you need to specify which DDRs have been enabled
# One flag per DDR 0..3. Here DDR 0, 1 and 3 are enabled (matching the placements
# above) and DDR 2 is left disabled.
# If you want to use all DDRs, for example, you need to set it as [1, 1, 1, 1]
DDR_enable = [1, 1, 0, 1]

# (4) specify how much resource can be used in each slot
# In this way you could force the design to be placed evenly across the device and avoid local congestion
""" Example:
   -----------
 3 |0.76|0.62|
   |----|----|
 2 |0.74|0.61|
   |----|----|
 1 |0.75|0.6 |
   |----|----|
 0 | 0.7|0.6 |
   -----------
     0    1
"""
# Four rows of two ratios each, one ratio per slot of the 4x2 grid.
# NOTE(review): presumably indexed [y][x] with row y = 0 first, matching the
# diagram above - confirm against the AutoBridge documentation.
max_usage_ratio_2d = [ [0.8, 0.6], [0.8, 0.6], [0.8, 0.8], [0.8, 0.6] ]


##################### DON'T TOUCH THE SECTION BELOW #################################
# EXCEPTION: target_dir is a machine-specific absolute path and must be edited.
# It is the directory the results are written to; an existing directory at
# this path is removed and re-created by the copy step further below.
target_dir = '/home/jaywang/doc_examples/mm_int8_ab_pe/autobridge'

# Bundle the HLS outputs (reports, schedule database, top-level Verilog) with
# the user settings defined above and hand them to AutoBridge.
# NOTE(review): max_search_time is presumably in seconds - confirm in FormatHLS.
formator = FormatHLS(
  rpt_path = f'{solution_path}/syn/report/',
  hls_sche_path = f'{solution_path}/.autopilot/db/',
  top_hdl_path = f'{solution_path}/syn/verilog/{top_name}_{top_name}.v',
  top_name = top_name,
  DDR_loc_2d_x = DDR_loc_2d_x,
  DDR_loc_2d_y = DDR_loc_2d_y,
  DDR_enable = DDR_enable,
  max_usage_ratio_2d = max_usage_ratio_2d,
  board_name = board_name,
  target_dir = target_dir,
  relay_station_count = lambda x : 2 * x, # how many levels of relay stations to add for x-unit of crossing
  max_search_time = 600,
  NaiveBalance = True)

# run floorplanning (constructing the Graph performs the work; `g` is not used afterwards)
g = graph.Graph(formator)

# copy results to target dir
def _run(cmd):
  """Run `cmd` (an argv list) and warn on stderr if it exits non-zero.

  A failure is reported but does not abort the script, so the remaining
  artifacts are still copied. Returns the exit status of the command.
  """
  status = subprocess.run(cmd).returncode
  if status != 0:
    joined = ' '.join(cmd)
    print(f'WARNING: command failed with status {status}: {joined}', file=sys.stderr)
  return status

# Start from an empty target directory. `mkdir -p` also creates missing parent
# directories; without it every copy below would fail when the parent is absent.
if os.path.isdir(target_dir):
  _run(['rm', '-rf', f'{target_dir}'])
_run(['mkdir', '-p', f'{target_dir}/'])
_run(['cp', '-r', project_path, f'{target_dir}/{project_name}'])
# keep a copy of this script next to the results for reproducibility
_run(['cp', os.path.realpath(__file__), f'{target_dir}/archived_source.txt'])
# make the copied project writable so the top RTL can be replaced below
_run(['chmod', '+w', '-R', f'{target_dir}'])
for artifact in ('constraint.tcl', 'pack_xo.tcl', 'autobridge.log'):
  _run(['cp', artifact, target_dir])
# overwrite the top-level RTL of the copied project with the generated one
_run(['cp', f'{top_name}_{top_name}.v', f'{target_dir}/{project_name}/solution/syn/verilog/'])

# clean up intermediate files in the working directory; `-f` keeps rm quiet
# when a file was never produced
os.system('rm -f *.lp')
_run(['rm', '-f', 'parser.out'])
_run(['rm', '-f', 'parsetab.py'])
_run(['rm', '-rf', '__pycache__'])


================================================
FILE: autosa_tests/large/mm_int8/step3-pack-xo.tcl
================================================
# Vivado HLS batch script: opens the HLS project `kernel0` (the project name
# used by step1-run-hls.tcl) and packages its RTL as a kernel object file.
open_project kernel0
open_solution solution
# Export the Verilog RTL in IP-catalog format and write the result to kernel0.xo.
export_design -rtl verilog -format ip_catalog -xo kernel0.xo

close_project
puts "Pack XO successfully"
exit


================================================
FILE: autosa_tests/large/mm_int8/step4-run-vitis.sh
================================================
#!/bin/bash
# Link the packed kernel object (kernel0.xo) into an .xclbin with v++,
# applying the floorplan constraints from constraint.tcl.
# Run this script from the directory that contains kernel0.xo and constraint.tcl.

OUTPUT_DIR="$(pwd)/vitis_run"

# name of the top function
TOP=kernel0

# choose the target device
PLATFORM=xilinx_u250_xdma_201830_2
#PLATFORM=xilinx_u280_xdma_201920_3

XO="$(pwd)/kernel0.xo"

# For different approaches see UG904-vivado-implementation
#STRATEGY="Default"
STRATEGY="EarlyBlockPlacement"

# remove the unused '--connectivity.sp' option for v++ if some DDRs are not used
# Example: if we map p1 to DDR 3 and p2 to DDR 0
#
# void kernel0(ap_uint<512> *p1, ap_uint<512> *p2)
# {
#   #pragma HLS INTERFACE m_axi port=p1 offset=slave bundle=gmem_A
#   #pragma HLS INTERFACE m_axi port=p2 offset=slave bundle=gmem_B
#
#   load_p1 (p1, ...);
#   load_p2 (p2, ...);
# }
#
# ARG_FOR_DDR_0=p2
# ARG_FOR_DDR_3=p1
# Should remove '--connectivity.sp' for DDR1 and DDR2

# NOTE: in this script ARG_FOR_DDR_<n> is bound to DDR[n-1] in the v++ call
# below (ARG_FOR_DDR_1 -> DDR[0], ..., ARG_FOR_DDR_4 -> DDR[3]).
ARG_FOR_DDR_1=A
ARG_FOR_DDR_2=B
#ARG_FOR_DDR_3="YOUR_HLS_ARGUMENT_NAME_FOR_DDR_3"
ARG_FOR_DDR_4=C

# the constraint file containing the floorplan results
# WARNING: must use an absolute path
CONSTRAINT="$(pwd)/constraint.tcl"
if [ ! -f "$CONSTRAINT" ]; then
    # report on stderr and exit non-zero so callers can detect the failure
    echo "no constraint file found" >&2
    exit 1
fi

# All expansions are quoted: paths may contain spaces, and an unquoted
# DDR[0] is a shell glob pattern that could match a file in the current directory.
v++ \
  --link \
  --output "${OUTPUT_DIR}/${TOP}_${PLATFORM}.xclbin" \
  --kernel "${TOP}" \
  --platform "${PLATFORM}" \
  --target hw \
  --report_level 2 \
  --temp_dir "${OUTPUT_DIR}/${TOP}_${PLATFORM}.temp" \
  --optimize 3 \
  --connectivity.nk "${TOP}:1:${TOP}_1" \
  --max_memory_ports "${TOP}" \
  --save-temps \
  "${XO}" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_1}:DDR[0]" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_2}:DDR[1]" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_4}:DDR[3]" \
  --kernel_frequency 300 \
  --vivado.prop "run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=${STRATEGY}" \
  --vivado.prop "run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=${CONSTRAINT}"


================================================
FILE: autosa_tests/large/mm_int8/unroll.py
================================================
import math

# Modify the parameters here
UNROLL_FACTOR = 64
DATA_T = 'char'


def generate_reduction_tree(unroll_factor, data_type):
    """Return the HLS source lines of a fully unrolled multiply/add tree.

    The tree multiplies `unroll_factor` element pairs of local_A[0][*] and
    local_B[0][*] and sums the products pairwise, layer by layer, into
    `add_0_0`, which is finally accumulated into local_C[c7][c6].

    Args:
        unroll_factor: number of element pairs to reduce; must be a power of
            two and at least 2, otherwise the pairwise tree would not cover
            every element (or would not define add_0_0 at all).
        data_type: C type name used for the intermediate variables.

    Returns:
        A list of strings, in order: the variable declarations, the
        `#pragma HLS RESOURCE` bindings, and the final accumulation statement.

    Raises:
        ValueError: if unroll_factor is not a power of two >= 2.
    """
    factor = int(unroll_factor)
    if factor != unroll_factor or factor < 2 or factor & (factor - 1):
        raise ValueError(
            f'UNROLL_FACTOR must be a power of two >= 2, got {unroll_factor!r}')

    level = factor.bit_length() - 1  # log2(factor), exact for powers of two
    leaf = level - 1                 # deepest layer: holds the multiplications
    lines = []

    # Declarations, from the leaf layer down to the root (layer 0).
    for layer in range(leaf, -1, -1):
        for i in range(1 << layer):
            if layer == leaf:
                # Leaf node: two products, the second one fused into the addition.
                lines.append(f'{data_type} mul_{layer}_{i}_0 = local_A[0][{i*2}] * local_B[0][{i*2}];')
                lines.append(f'{data_type} add_{layer}_{i} = mul_{layer}_{i}_0 + local_A[0][{i*2+1}] * local_B[0][{i*2+1}];')
            else:
                # Inner node: sum of its two children in the layer below.
                lines.append(f'{data_type} add_{layer}_{i} = add_{layer+1}_{i*2} + add_{layer+1}_{i*2+1};')

    # Resource bindings: leaf multipliers use Mul_LUT, inner adders use AddSub.
    for layer in range(leaf, -1, -1):
        for i in range(1 << layer):
            if layer == leaf:
                lines.append(f'#pragma HLS RESOURCE variable=mul_{layer}_{i}_0 core=Mul_LUT')
            else:
                lines.append(f'#pragma HLS RESOURCE variable=add_{layer}_{i} core=AddSub')

    lines.append('local_C[c7][c6] += add_0_0;')
    return lines


# Generate the code
for line in generate_reduction_tree(UNROLL_FACTOR, DATA_T):
    print(line)


================================================
FILE: autosa_tests/large/mm_intel/Makefile
================================================
# Build flow for the Intel FPGA OpenCL design: host executable plus the
# emulation / simulation / hardware kernel images built with aoc.
APP ?= kernel
AOCL_BOARD ?= s10mx_hbm_es
SW_EMU_AOCX ?= $(APP)_sw_emu.aocx
HW_EMU_AOCX ?= $(APP)_hw_emu.aocx
HW_AOCX ?= $(APP)_hw.aocx
AOCO ?= $(APP).aoco
AOCR ?= $(APP).aocr

# Compiler
AOC ?= aoc
CXX ?= g++
AOC_FLAGS ?= -board=$(AOCL_BOARD) -fp-relaxed -report -hyper-optimized-handshaking=off -I $(INTELFPGAOCLSDKROOT)/include/kernel_headers

# Host executables: TARGET is the regular build, SW_EMU_TARGET is the same
# sources compiled with -DEMULATE.
TARGET ?= host
SW_EMU_TARGET ?= host_sw_emu
TARGET_DIR ?= bin
AOCL_UTILS ?= $(INTELFPGAOCLSDKROOT)/examples_aoc/common

# Directories
INC_DIRS := src $(AOCL_UTILS)/inc
LIB_DIRS :=

# Files
INCS := $(wildcard src/*.h)
HOST_SRCS := $(wildcard src/$(APP)_host.cpp $(AOCL_UTILS)/src/AOCLUtils/*.cpp)
KERNEL_SRCS := src/$(APP)_kernel.cl

# Recipes are silenced unless VERBOSE=1 is given.
ifeq ($(VERBOSE),1)
ECHO :=
else
ECHO := @
endif

# Where is the Intel(R) FPGA SDK for OpenCL(TM) software?
ifeq ($(wildcard $(INTELFPGAOCLSDKROOT)),)
$(error Set INTELFPGAOCLSDKROOT to the root directory of the Intel(R) FPGA SDK for OpenCL(TM) software installation)
endif
ifeq ($(wildcard $(INTELFPGAOCLSDKROOT)/host/include/CL/opencl.h),)
$(error Set INTELFPGAOCLSDKROOT to the root directory of the Intel(R) FPGA SDK for OpenCL(TM) software installation.)
endif

# OpenCL compile and link flags.
AOCL_COMPILE_CONFIG := $(shell aocl compile-config )
AOCL_LINK_LIBS := $(shell aocl ldlibs )
AOCL_LINK_FLAGS := $(shell aocl ldflags )
# Linking with defences enabled
AOCL_LINK_FLAGS += -z noexecstack
AOCL_LINK_FLAGS += -Wl,-z,relro,-z,now
AOCL_LINK_FLAGS += -Wl,-Bsymbolic
AOCL_LINK_FLAGS += -pie
AOCL_LINK_CONFIG := $(AOCL_LINK_FLAGS) $(AOCL_LINK_LIBS)

# Compilation flags
ifeq ($(DEBUG),1)
CXXFLAGS += -g
else
CXXFLAGS += -O2
endif
CXXFLAGS += -std=gnu++0x

# Compiling with defences enabled
CXXFLAGS += -fstack-protector
CXXFLAGS += -D_FORTIFY_SOURCE=2
CXXFLAGS += -Wformat -Wformat-security
CXXFLAGS += -fPIE

# We must force GCC to never assume that it can shove in its own
# sse2/sse3 versions of strlen and strcmp because they will CRASH.
# Very hard to debug!
CXXFLAGS += -fPIC

LIBS := rt pthread

## Make it all!
#all : $(TARGET_DIR)/$(TARGET)

# Convenience targets (the first one, sw_emu, is the default goal).
sw_emu : $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(SW_EMU_AOCX)

hls: $(TARGET_DIR)/$(AOCR)

hw : $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(HW_AOCX)

hw_emu: $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(HW_EMU_AOCX)

hw_emu_check: $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(HW_EMU_AOCX)
	CL_CONTEXT_MPSIM_DEVICE_INTELFPGA=1 $(TARGET_DIR)/$(TARGET) $(HW_EMU_AOCX)

# Runs the -DEMULATE host build. It is a separate binary so that it never
# replaces the regular host executable used by the hw / hw_emu targets.
sw_emu_check : $(TARGET_DIR)/$(SW_EMU_TARGET) $(TARGET_DIR)/$(SW_EMU_AOCX)
	CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 $(TARGET_DIR)/$(SW_EMU_TARGET) $(SW_EMU_AOCX)

hw_check : $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(HW_AOCX)
	$(TARGET_DIR)/$(TARGET) $(HW_AOCX)

# Host executable target.
# $(TARGET_DIR) is an order-only prerequisite: the directory must exist, but
# its timestamp (which changes whenever a file is added) must not force a rebuild.
$(TARGET_DIR)/$(TARGET) : Makefile $(HOST_SRCS) $(INCS) | $(TARGET_DIR)
	$(ECHO)$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(EXTRACXXFLAGS) -fPIC $(foreach D,$(INC_DIRS),-I$D) \
			$(AOCL_COMPILE_CONFIG) $(HOST_SRCS) $(AOCL_LINK_CONFIG) \
			$(foreach D,$(LIB_DIRS),-L$D) \
			$(foreach L,$(LIBS),-l$L) \
			-o $@

# Host executable for software emulation (compiled with -DEMULATE).
# The output is written to $@ so the rule produces the file it is named after.
$(TARGET_DIR)/$(SW_EMU_TARGET) : Makefile $(HOST_SRCS) $(INCS) | $(TARGET_DIR)
	$(ECHO)$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(EXTRACXXFLAGS) -fPIC $(foreach D,$(INC_DIRS),-I$D) \
			$(AOCL_COMPILE_CONFIG) $(HOST_SRCS) $(AOCL_LINK_CONFIG) \
			$(foreach D,$(LIB_DIRS),-L$D) \
			$(foreach L,$(LIBS),-l$L) \
			-o $@ -DEMULATE

$(TARGET_DIR) :
	$(ECHO)mkdir -p $(TARGET_DIR)

# Kernel images. The output directory is an order-only prerequisite so these
# rules also work on a clean tree (e.g. `make hls`); $^ does not include it.
$(TARGET_DIR)/$(SW_EMU_AOCX) : $(KERNEL_SRCS) | $(TARGET_DIR)
	$(AOC) $(AOC_FLAGS) -march=emulator -legacy-emulator -o $@ $^

$(TARGET_DIR)/$(HW_EMU_AOCX) : $(KERNEL_SRCS) | $(TARGET_DIR)
	$(AOC) $(AOC_FLAGS) -march=simulator -ghdl -o $@ $^

$(TARGET_DIR)/$(HW_AOCX) : $(KERNEL_SRCS) | $(TARGET_DIR)
	$(AOC) $(AOC_FLAGS) -o $@ $^

$(TARGET_DIR)/$(AOCO) : $(KERNEL_SRCS) | $(TARGET_DIR)
	$(AOC) $(AOC_FLAGS) -c -o $@ $^

$(TARGET_DIR)/$(AOCR) : $(TARGET_DIR)/$(AOCO)
	$(AOC) $(AOC_FLAGS) -rtl -o $@ $^

# Standard make targets
clean :
	$(ECHO)rm -rf $(TARGET_DIR)/*

.PHONY : all sw_emu hls hw hw_emu hw_emu_check sw_emu_check hw_check clean


================================================
FILE: autosa_tests/large/mm_intel/README.md
================================================
# Matrix Multiplication (Large)

Board        | Software Version
-------------|-----------------
Stratix 10 | Intel FPGA SDK for OpenCL 19.4

__Files__:
```
autosa_tests/large/mm_intel/kernel.c
autosa_tests/large/mm_intel/kernel.h
autosa_tests/large/mm_intel/simd_info.json
autosa_tests/large/mm_intel/Makefile
```

__Command__:
```c
./autosa ./autosa_tests/large/mm_intel/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_opencl --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[260,256,512];kernel[]->latency[20,16];kernel[]->simd[8]}" --simd-info=./autosa_tests/large/mm_intel/simd_info.json --host-serialize --loop-infinitize --double-buffer-style=0 --mem-port-map="{kernel[]->A[0];kernel[]->B[1];kernel[]->C[2]}"
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` to the directory `autosa.tmp/output`.

```
cp autosa_tests/large/mm_intel/Makefile autosa.tmp/output/
```

Execute the makefile to perform software emulation
```
make sw_emu_check
```
or synthesize the design to RTL
```
make hls
```
or generate the bitstream
```
make hw
```

================================================
FILE: autosa_tests/large/mm_intel/kernel.c
================================================
#include "kernel.h"

//#define LAYOUT1
#define LAYOUT2
//#define LAYOUT3

/*
 * Test driver for the matrix multiplication kernel.
 *
 * Fills A and B, computes C = A * B inside the `#pragma scop` region,
 * recomputes the same product into C_golden with plain loops and compares
 * the two element by element.
 *
 * The operand layout is selected by the LAYOUT2 / LAYOUT3 macros defined
 * above this function (LAYOUT2: A[I][K], B[J][K]; LAYOUT3: A[K][I], B[K][J]).
 *
 * Returns 0 when all elements match and -1 otherwise, so that scripts
 * running the test can detect a failure from the exit status.
 */
int main(int argc, char **argv) {
//  data_t A[I][K], B[K][J], C[I][J], C_golden[I][J];
#ifdef LAYOUT2
  static data_t A[I][K], B[J][K], C[I][J], C_golden[I][J]; // gemm0,3
#endif
#ifdef LAYOUT3
  static data_t A[K][I], B[K][J], C[I][J], C_golden[I][J]; // gemm4
#endif

  /* Input initialization: every element holds its index along the k dimension. */
  for (int i = 0; i < I; i++)
    for (int k = 0; k < K; k++) {
#ifdef LAYOUT2
      A[i][k] = k;
#endif
#ifdef LAYOUT3
      A[k][i] = k;
#endif
    }

  for (int j = 0; j < J; j++)
    for (int k = 0; k < K; k++) {
#ifdef LAYOUT2
      B[j][k] = k;
#endif
#ifdef LAYOUT3
      B[k][j] = k;
#endif
    }

  /* Kernel region: keep the loop structure exactly as is. */
#pragma scop
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C[i][j] = 0;
      for (int k = 0; k < K; k++) {
#ifdef LAYOUT2
        C[i][j] = C[i][j] + A[i][k] * B[j][k];
#endif
#ifdef LAYOUT3
        C[i][j] = C[i][j] + A[k][i] * B[k][j];
#endif
      }
    }
#pragma endscop

  /* Reference result computed with the same loop nest outside the scop. */
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C_golden[i][j] = 0;
      for (int k = 0; k < K; k++) {
#ifdef LAYOUT2
        C_golden[i][j] = C_golden[i][j] + A[i][k] * B[j][k];
#endif
#ifdef LAYOUT3
        C_golden[i][j] = C_golden[i][j] + A[k][i] * B[k][j];
#endif
      }
    }

  /* Count the elements that differ by more than the absolute tolerance. */
  int err = 0;
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      if (fabs((float)C_golden[i][j] - (float)C[i][j]) > 0.001)
        err++;
    }

  if (err) {
    printf("Failed with %d errors!\n", err);
    return -1;
  } else {
    printf("Passed!\n");
    return 0;
  }
}


================================================
FILE: autosa_tests/large/mm_intel/kernel.h
================================================
#ifndef AUTOSA_TESTS_LARGE_MM_INTEL_KERNEL_H
#define AUTOSA_TESTS_LARGE_MM_INTEL_KERNEL_H

/* Shared definitions for the large matrix multiplication test (kernel.c). */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Element type of all matrices. */
typedef float data_t;

/* Problem size: C[I][J] is the product of an I x K and a K x J operand. */
#define I 1040
#define J 1024
#define K 1024

#endif /* AUTOSA_TESTS_LARGE_MM_INTEL_KERNEL_H */


================================================
FILE: autosa_tests/large/mm_intel/simd_info.json
================================================
{
  "kernel0": {
    "reduction": ["y"]
  },
  "kernel1": {
    "reduction": ["y"]
  },
  "kernel2": {
    "reduction": ["y"]
  }, 
  "kernel3": {
    "reduction": ["y"]
  },
  "kernel4": {
    "reduction": ["y"]
  }
}


================================================
FILE: autosa_tests/large/mttkrp/Makefile
================================================
# Build flow for the Xilinx Vitis design: compiles the kernel to an .xo,
# links it into an .xclbin and builds the host executable.
VPP := $(XILINX_VITIS)/bin/v++
EMCONFIGUTIL := $(XILINX_VITIS)/bin/emconfigutil
# Build target passed to v++ -t (hw by default; override on the command line,
# e.g. `make MODE=sw_emu`).
MODE := hw
#PLATFORM := xilinx_u200_qdma_201920_1
PLATFORM := xilinx_u250_xdma_201830_2

# sources
KERNEL_SRC := src/kernel_kernel.cpp
HOST_SRC := src/kernel_host.cpp

# targets
HOST_EXE := host.exe

XOS := kernel0.$(MODE).xo
XCLBIN := kernel0.$(MODE).xclbin
EMCONFIG_FILE := emconfig.json

# Linker options to map kernel ports to DDR banks
VPP_LINK_OPTS := --config connectivity.cfg

VPP_COMMON_OPTS := -s -t $(MODE) --platform $(PLATFORM) -R2 -O3 --kernel_frequency 250 --vivado.prop=run.impl_1.STRATEGY=Performance_EarlyBlockPlacement
CFLAGS := -g -std=c++11 -I$(XILINX_XRT)/include
LFLAGS := -L$(XILINX_XRT)/lib -lxilinxopencl -lpthread -lrt
NUMDEVICES := 1

# run time args
EXE_OPT := kernel0.$(MODE).xclbin

# XCL_EMULATION_MODE selects an emulation flow (sw_emu / hw_emu). It is only
# exported for emulation builds and left unset when running on hardware.
ifeq ($(MODE),hw)
RUN_ENV :=
else
RUN_ENV := XCL_EMULATION_MODE=$(MODE)
endif

# primary build targets
.PHONY: xclbin app all clean check

xclbin:  $(XCLBIN)
app: $(HOST_EXE)

all: xclbin app

clean:
	-$(RM) $(EMCONFIG_FILE) $(HOST_EXE) $(XCLBIN) *.xclbin *.xo $(XOS)

# kernel rules
$(XOS): $(KERNEL_SRC)
	$(RM) $@
	$(VPP) $(VPP_COMMON_OPTS) -c -k kernel0 -o $@ $+


$(XCLBIN): $(XOS)
	$(VPP) $(VPP_COMMON_OPTS) -l -o $@ $+ $(VPP_LINK_OPTS)

# host rules
$(HOST_EXE): $(HOST_SRC)
	g++ $(CFLAGS) -o $@ $+ $(LFLAGS)
	@echo 'Compiled Host Executable: $(HOST_EXE)'

$(EMCONFIG_FILE):
	$(EMCONFIGUTIL) --nd $(NUMDEVICES) --od . --platform $(PLATFORM)

# Run the host program against the built xclbin.
check: $(XCLBIN) $(HOST_EXE) $(EMCONFIG_FILE)
	$(RUN_ENV) ./$(HOST_EXE) $(EXE_OPT)


================================================
FILE: autosa_tests/large/mttkrp/README.md
================================================
# Matricized Tensor Times Khatri-Rao Product (MTTKRP)

Board        | Software Version
-------------|-----------------
Xilinx Alveo U250 | Xilinx Vitis 2019.2

__Files__:
```
autosa_tests/large/mttkrp/kernel.c
autosa_tests/large/mttkrp/kernel.h
autosa_tests/large/mttkrp/simd_info.json
autosa_tests/large/mttkrp/Makefile
autosa_tests/large/mttkrp/connectivity.cfg
```

__Command__:
```c
./autosa ./autosa_tests/large/mttkrp/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[128,128,2];kernel[]->latency[16,8];kernel[]->simd[8,1]}" --simd-info=./autosa_tests/large/mttkrp/simd_info.json --host-serialize
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` and `connectivity.cfg` to the directory `autosa.tmp/output`.

```
cp autosa_tests/large/mttkrp/Makefile autosa.tmp/output/
cp autosa_tests/large/mttkrp/connectivity.cfg autosa.tmp/output/
```

Execute the makefile to build the design.

```
cd autosa.tmp/output
make all
```

================================================
FILE: autosa_tests/large/mttkrp/connectivity.cfg
================================================
[connectivity]
sp=kernel0_1.A:DDR[0]
sp=kernel0_1.B:DDR[1] 
sp=kernel0_1.C:DDR[2]
sp=kernel0_1.D:DDR[3]

================================================
FILE: autosa_tests/large/mttkrp/kernel.c
================================================
/*
 * This code implements the Matricized Tensor Times Khatri-Rao Product (MTTKRP), which performs:
 * D(i,j) += A(i,k,l) * B(k,j) * C(l,j)
 * Input: A[I][K][L], B[K][J], C[L][J]
 *        (in this file C is stored transposed as C[J][L], so C(l,j) is read as C[j][l])
 * Output: D[I][J]
 */

#include "kernel.h"

/*
 * Test driver for the MTTKRP kernel.
 *
 * Fills A, B and C with a constant, computes D inside the `#pragma scop`
 * region, recomputes the result into D_golden with plain loops and compares
 * the two element by element.
 *
 * C is stored transposed (C[J][L]); the original C[L][J] accesses are kept
 * as comments next to the transposed ones.
 *
 * Returns 0 when all elements match within the tolerance and -1 otherwise.
 */
int main(int argc, char **argv){
  // declarations
  static data_t A[I][K][L];
  static data_t B[K][J];
//  static data_t C[L][J];
  static data_t C[J][L];
  static data_t D[I][J];
  static data_t D_golden[I][J];

  // data initialization
  for (int i = 0; i < I; i++)
    for (int k = 0; k < K; k++)
      for (int l = 0; l < L; l++) {
        A[i][k][l] = 2.5;
      }
  for (int k = 0; k < K; k++)
    for (int j = 0; j < J; j++) {
      B[k][j] = 2.5;
    }
  for (int l = 0; l < L; l++)
    for (int j = 0; j < J; j++) {
//      C[l][j] = 2.5;
      C[j][l] = 2.5;
    }

  // computation (kernel region: keep the loop structure exactly as is)
#pragma scop
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      D[i][j] = 0;
      for (int k = 0; k < K; k++) {
        for (int l = 0; l < L; l++) {
//          D[i][j] += A[i][k][l] * B[k][j] * C[l][j];
          D[i][j] = D[i][j] + A[i][k][l] * B[k][j] * C[j][l];
        }
      }
    }
#pragma endscop

  // reference result: the sum over l is accumulated first and then scaled
  // by B[k][j], instead of multiplying all three factors per element
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      D_golden[i][j] = 0;
      for (int k = 0; k < K; k++) {
//        for (int l = 0; l < L; l++) {
//          D_golden[i][j] += A[i][k][l] * B[k][j] * C[l][j];
//        }
        data_t tmp = 0;
        for (int l = 0; l < L; l++) {
//          tmp += A[i][k][l] * C[l][j];
          tmp += A[i][k][l] * C[j][l];
        }
        D_golden[i][j] += B[k][j] * tmp;
      }
    }

  // comparison: count the elements that differ by more than an absolute tolerance
  int err = 0;
  float thres = 0.01;
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      if (fabs((float)D_golden[i][j] - (float)D[i][j]) > thres) {
        err++;
      }
    }

  if (err) {
    printf("Test failed with %d errors!\n", err);
    return -1;
  } else {
    printf("Test passed!\n");
    return 0;
  }
}


================================================
FILE: autosa_tests/large/mttkrp/kernel.h
================================================
#ifndef AUTOSA_TESTS_LARGE_MTTKRP_KERNEL_H
#define AUTOSA_TESTS_LARGE_MTTKRP_KERNEL_H

/* Shared definitions for the large MTTKRP test (kernel.c). */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Element type of all tensors and matrices. */
typedef float data_t;

/* Problem size: A[I][K][L], B[K][J], C[J][L], D[I][J]. */
#define I 256
//#define J 256
#define J 336
#define K 256
#define L 256

#endif /* AUTOSA_TESTS_LARGE_MTTKRP_KERNEL_H */


================================================
FILE: autosa_tests/large/mttkrp/simd_info.json
================================================
{
  "kernel3": {
    "reduction": ["y", "y"]
  }
}


================================================
FILE: autosa_tests/large/mttkrp/step1-run-hls.tcl
================================================
# Vivado HLS batch script: C-synthesizes the top function `kernel0` from
# src/kernel_kernel.cpp into the HLS project `kernel0`, solution `solution`.
open_project kernel0
set_top kernel0
add_files "src/kernel_kernel.cpp"
# Uncomment and fill in to attach a C testbench (needed for csim_design below).
#add_files -tb PATH_TO_TESTBENCH_FILE

open_solution solution

# Target device: Alveo U250 (active)
set_part xcu250-figd2104-2L-e

# Alternative target device: Alveo U280
#set_part xcu280-fsvh2892-2L-e

# Clock period in ns: 3.333 ns corresponds to 300 MHz
create_clock -period 3.333

# Tool configuration. NOTE(review): these options follow the AutoBridge flow
# this script belongs to; confirm their exact effect against the HLS tool
# documentation for your tool version before changing them.
config_dataflow -strict_mode warning
set_clock_uncertainty 27.000000%
config_rtl -enable_maxiConservative=1
config_interface -m_axi_addr64

# to enable integration with Vitis
config_sdx -target xocc

# C simulation is disabled; only C synthesis is run.
#csim_design
csynth_design
close_project
exit


================================================
FILE: autosa_tests/large/mttkrp/step2-autobridge.py
================================================
#! /usr/bin/python3.6

# add the path to where you place the autobridge source code
import sys
sys.path.append('../src')

import graph
from formator import FormatHLS
import collections
import os
import subprocess

"""
AutoBridge divides the target device as follows and assign each HLS function to one slot
For more details pls refer to the paper

      u250                     u280
   -----------
 3 |    |    |
   |----|----|              |----|----|
 2 |    |    |            2 |    |    |
   |----|----|              |----|----|
 1 |    |    |            1 |    |    |
   |----|----|              |----|----|
 0 |    |    |            0 |    |    |
   -----------              -----------
     0    1                   0    1
"""

################### Modify Accordingly ###############################

# (1) fill basic information
# NOTE: project_path is a machine-specific absolute path; change it to the
# location of your own HLS project before running this script.
project_path = '/home/jaywang/doc_examples/mttkrp_ab/kernel0' # path to your hls project
top_name = 'kernel0' # name of the top function in your hls design
# solution directory inside the HLS project (already ends with '/')
solution_path = f'{project_path}/solution/'
# name of the directory the HLS project is copied to under target_dir
project_name = 'kernel0'
board_name = 'u250' # or 'u280'
# target_dir (assigned further below, just before FormatHLS is called) is where the
# results will be saved. Your HLS project will be copied there and your top RTL will be replaced.
# Note that if the directory already exists, it is deleted and re-created.

# (2) specify how your designs connect to the external memory
""" Example:

void kernel0(ap_uint<512> *p1, ap_uint<512> *p2)
{
  #pragma HLS INTERFACE m_axi port=p1 offset=slave bundle=gmem_A
  #pragma HLS INTERFACE m_axi port=p2 offset=slave bundle=gmem_B

  load_p1 (p1, ...);
  load_p2 (p2, ...);
}

--------------------------------------

In this example, the pointer p1 and p2 will become M_AXI controllers to connect to the dedicated DDR IP.
If you want p1 to connect to DDR 2 in the 2-nd SLR, then you need to specify that the corresponding RTL controller must be floorplanned at the 2-nd SLR
Meanwhile, your function load_p1() will talk to the M_AXI controller also through AXI interface which cannot be easily pipelined.
Thus the RTL module corresponds to load_p1() must also be in the 2-nd SLR in this example.
Since load_p1() will communicate with the rest of your design using FIFO interface, you don't need to specify the location of other modules

(transparent)|                        (user visible)
             |
   Vitis     |                    what your HLS design becomes
             |
             | M_AXI                     AXI                        FIFO
DDR IP  <--- | ----> M_AXI controller <-------> your first module <-------> your other modules
(fixed loc)  |         (p1)                       (load_p1)
             |
             | M_AXI                     AXI                        FIFO
DDR IP  <--- | ----> M_AXI controller <-------> your first module <-------> your other modules
(fixed loc)  |         (p2)                       (load_p2)
             |
             | S_AXI
PCIe    <--- | ----> S_AXI controller
             |
"""

# Placement constraints, keyed by RTL instance name.
# x coordinate: on the left side (0) or the right side (1) of an SLR
DDR_loc_2d_x = collections.defaultdict(dict)

# y coordinate: on which SLR (0 = bottom row of the diagram at the top of this file)
DDR_loc_2d_y = collections.defaultdict(dict)

# This design uses all four DDRs: 0, 1, 2, 3.
# For every external array both the serialize module and the matching
# m_axi controller are pinned to the same slot.
# Array A -> slot (x=0, y=0)
DDR_loc_2d_y['A_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_x['A_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_A_m_axi_U'] = 0
DDR_loc_2d_x['kernel0_gmem_A_m_axi_U'] = 0

# Array B -> slot (x=0, y=1)
DDR_loc_2d_y['B_IO_L3_in_serialize_U0'] = 1
DDR_loc_2d_x['B_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_B_m_axi_U'] = 1
DDR_loc_2d_x['kernel0_gmem_B_m_axi_U'] = 0

# Array C -> slot (x=0, y=2)
DDR_loc_2d_y['C_IO_L3_in_serialize_U0'] = 2
DDR_loc_2d_x['C_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_C_m_axi_U'] = 2
DDR_loc_2d_x['kernel0_gmem_C_m_axi_U'] = 0

# Array D (drained output) -> slot (x=0, y=3)
DDR_loc_2d_y['D_drain_IO_L3_out_serialize_U0'] = 3
DDR_loc_2d_x['D_drain_IO_L3_out_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_D_m_axi_U'] = 3
DDR_loc_2d_x['kernel0_gmem_D_m_axi_U'] = 0

# Control (s_axi) interface and the kernel entry module -> slot (x=1, y=1).
# NOTE(review): the instance name 'kernel0_entry16_U0' looks tool-generated;
# verify it against the generated top-level RTL of your design.
DDR_loc_2d_y['kernel0_control_s_axi_U'] = 1
DDR_loc_2d_y['kernel0_entry16_U0'] = 1
DDR_loc_2d_x['kernel0_control_s_axi_U'] = 1
DDR_loc_2d_x['kernel0_entry16_U0'] = 1

# (3) specify DDR information
# If you instantiate a DDR controller, it will consume non-trivial amount of resource
# to make the floorplanning better, you need to specify which DDRs have been enabled
# One flag per DDR 0..3. All four DDRs are enabled here, matching the placements above.
# To leave a DDR unused set its flag to 0, e.g. [1, 1, 0, 1] disables DDR 2.
DDR_enable = [1, 1, 1, 1]

# (4) specify how much resource can be used in each slot
# In this way you could force the design to be placed evenly across the device and avoid local congestion
""" Example:
   -----------
 3 |0.76|0.62|
   |----|----|
 2 |0.74|0.61|
   |----|----|
 1 |0.75|0.6 |
   |----|----|
 0 | 0.7|0.6 |
   -----------
     0    1
"""
# Four rows of two ratios each, one ratio per slot of the 4x2 grid.
# NOTE(review): presumably indexed [y][x] with row y = 0 first, matching the
# diagram above - confirm against the AutoBridge documentation.
max_usage_ratio_2d = [ [0.8, 0.75], [0.8, 0.75], [0.8, 0.75], [0.8, 0.75] ]


##################### DON'T TOUCH THE SECTION BELOW #################################
# EXCEPTION: target_dir is a machine-specific absolute path and must be edited.
# It is the directory the results are written to; an existing directory at
# this path is removed and re-created by the copy step further below.
target_dir = '/home/jaywang/doc_examples/mttkrp_ab/autobridge_v2'

# Bundle the HLS outputs (reports, schedule database, top-level Verilog) with
# the user settings defined above and hand them to AutoBridge.
# NOTE(review): max_search_time is presumably in seconds - confirm in FormatHLS.
formator = FormatHLS(
  rpt_path = f'{solution_path}/syn/report/',
  hls_sche_path = f'{solution_path}/.autopilot/db/',
  top_hdl_path = f'{solution_path}/syn/verilog/{top_name}_{top_name}.v',
  top_name = top_name,
  DDR_loc_2d_x = DDR_loc_2d_x,
  DDR_loc_2d_y = DDR_loc_2d_y,
  DDR_enable = DDR_enable,
  max_usage_ratio_2d = max_usage_ratio_2d,
  board_name = board_name,
  target_dir = target_dir,
  relay_station_count = lambda x : 2 * x, # how many levels of relay stations to add for x-unit of crossing
  max_search_time = 600,
  NaiveBalance = True)

# run floorplanning (constructing the Graph performs the work; `g` is not used afterwards)
g = graph.Graph(formator)

# copy results to target dir
def _run(cmd):
  """Run `cmd` (an argv list) and warn on stderr if it exits non-zero.

  A failure is reported but does not abort the script, so the remaining
  artifacts are still copied. Returns the exit status of the command.
  """
  status = subprocess.run(cmd).returncode
  if status != 0:
    joined = ' '.join(cmd)
    print(f'WARNING: command failed with status {status}: {joined}', file=sys.stderr)
  return status

# Start from an empty target directory. `mkdir -p` also creates missing parent
# directories; without it every copy below would fail when the parent is absent.
if os.path.isdir(target_dir):
  _run(['rm', '-rf', f'{target_dir}'])
_run(['mkdir', '-p', f'{target_dir}/'])
_run(['cp', '-r', project_path, f'{target_dir}/{project_name}'])
# keep a copy of this script next to the results for reproducibility
_run(['cp', os.path.realpath(__file__), f'{target_dir}/archived_source.txt'])
# make the copied project writable so the top RTL can be replaced below
_run(['chmod', '+w', '-R', f'{target_dir}'])
for artifact in ('constraint.tcl', 'pack_xo.tcl', 'autobridge.log'):
  _run(['cp', artifact, target_dir])
# overwrite the top-level RTL of the copied project with the generated one
_run(['cp', f'{top_name}_{top_name}.v', f'{target_dir}/{project_name}/solution/syn/verilog/'])

# clean up intermediate files in the working directory; `-f` keeps rm quiet
# when a file was never produced
os.system('rm -f *.lp')
_run(['rm', '-f', 'parser.out'])
_run(['rm', '-f', 'parsetab.py'])
_run(['rm', '-rf', '__pycache__'])


================================================
FILE: autosa_tests/large/mttkrp/step3-pack-xo.tcl
================================================
# Vivado HLS batch script: opens the HLS project `kernel0` (the project name
# used by step1-run-hls.tcl) and packages its RTL as a kernel object file.
open_project kernel0
open_solution solution
# Export the Verilog RTL in IP-catalog format and write the result to kernel0.xo.
export_design -rtl verilog -format ip_catalog -xo kernel0.xo

close_project
puts "Pack XO successfully"
exit


================================================
FILE: autosa_tests/large/mttkrp/step4-run-vitis.sh
================================================
#!/bin/bash
# Link the packed kernel object (kernel0.xo) into an .xclbin with v++,
# applying the floorplan constraints from constraint.tcl.
# Run this script from the directory that contains kernel0.xo and constraint.tcl.

OUTPUT_DIR="$(pwd)/vitis_run"

# name of the top function
TOP=kernel0

# choose the target device
PLATFORM=xilinx_u250_xdma_201830_2
#PLATFORM=xilinx_u280_xdma_201920_3

XO="$(pwd)/kernel0.xo"

# For different approaches see UG904-vivado-implementation
#STRATEGY="Default"
STRATEGY="EarlyBlockPlacement"

# remove the unused '--connectivity.sp' option for v++ if some DDRs are not used
# Example: if we map p1 to DDR 3 and p2 to DDR 0
#
# void kernel0(ap_uint<512> *p1, ap_uint<512> *p2)
# {
#   #pragma HLS INTERFACE m_axi port=p1 offset=slave bundle=gmem_A
#   #pragma HLS INTERFACE m_axi port=p2 offset=slave bundle=gmem_B
#
#   load_p1 (p1, ...);
#   load_p2 (p2, ...);
# }
#
# ARG_FOR_DDR_0=p2
# ARG_FOR_DDR_3=p1
# Should remove '--connectivity.sp' for DDR1 and DDR2

# NOTE: in this script ARG_FOR_DDR_<n> is bound to DDR[n-1] in the v++ call
# below (ARG_FOR_DDR_1 -> DDR[0], ..., ARG_FOR_DDR_4 -> DDR[3]).
ARG_FOR_DDR_1=A
ARG_FOR_DDR_2=B
ARG_FOR_DDR_3=C
ARG_FOR_DDR_4=D

# the constraint file containing the floorplan results
# WARNING: must use an absolute path
CONSTRAINT="$(pwd)/constraint.tcl"
if [ ! -f "$CONSTRAINT" ]; then
    # report on stderr and exit non-zero so callers can detect the failure
    echo "no constraint file found" >&2
    exit 1
fi

# All expansions are quoted: paths may contain spaces, and an unquoted
# DDR[0] is a shell glob pattern that could match a file in the current directory.
v++ \
  --link \
  --output "${OUTPUT_DIR}/${TOP}_${PLATFORM}.xclbin" \
  --kernel "${TOP}" \
  --platform "${PLATFORM}" \
  --target hw \
  --report_level 2 \
  --temp_dir "${OUTPUT_DIR}/${TOP}_${PLATFORM}.temp" \
  --optimize 3 \
  --connectivity.nk "${TOP}:1:${TOP}_1" \
  --max_memory_ports "${TOP}" \
  --save-temps \
  "${XO}" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_1}:DDR[0]" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_2}:DDR[1]" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_3}:DDR[2]" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_4}:DDR[3]" \
  --kernel_frequency 300 \
  --vivado.prop "run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=${STRATEGY}" \
  --vivado.prop "run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=${CONSTRAINT}"


================================================
FILE: autosa_tests/large/ttm/Makefile
================================================
VPP := $(XILINX_VITIS)/bin/v++
EMCONFIGUTIL := $(XILINX_VITIS)/bin/emconfigutil
MODE := hw
#PLATFORM := xilinx_u200_qdma_201920_1
PLATFORM := xilinx_u250_xdma_201830_2

# sources
KERNEL_SRC := src/kernel_kernel.cpp
HOST_SRC := src/kernel_host.cpp

# targets
HOST_EXE := host.exe

XOS := kernel0.$(MODE).xo
XCLBIN := kernel0.$(MODE).xclbin
EMCONFIG_FILE := emconfig.json

# Linker options to map kernel ports to DDR banks
VPP_LINK_OPTS := --config connectivity.cfg

VPP_COMMON_OPTS := -s -t $(MODE) --platform $(PLATFORM) -R2 -O3 --kernel_frequency 250 --vivado.prop=run.impl_1.STRATEGY=Performance_EarlyBlockPlacement
CFLAGS := -g -std=c++11 -I$(XILINX_XRT)/include
LFLAGS := -L$(XILINX_XRT)/lib -lxilinxopencl -lpthread -lrt
NUMDEVICES := 1

# run time args
EXE_OPT := kernel0.$(MODE).xclbin

# primary build targets
# clean and check never create files with those names, so they are declared
# phony as well; otherwise a stray file called "clean" or "check" in this
# directory could make the target look up to date and skip its recipe.
.PHONY: xclbin app all clean check

xclbin:  $(XCLBIN)
app: $(HOST_EXE)

all: xclbin app

clean:
	-$(RM) $(EMCONFIG_FILE) $(HOST_EXE) $(XCLBIN) *.xclbin *.xo $(XOS)

# kernel rules
$(XOS): $(KERNEL_SRC)
	$(RM) $@
	$(VPP) $(VPP_COMMON_OPTS) -c -k kernel0 -o $@ $+


$(XCLBIN): $(XOS)
	$(VPP) $(VPP_COMMON_OPTS) -l -o $@ $+ $(VPP_LINK_OPTS)

# host rules
$(HOST_EXE): $(HOST_SRC)
	g++ $(CFLAGS) -o $@ $+ $(LFLAGS)
	@echo 'Compiled Host Executable: $(HOST_EXE)'

$(EMCONFIG_FILE):
	$(EMCONFIGUTIL) --nd $(NUMDEVICES) --od . --platform $(PLATFORM)

check: $(XCLBIN) $(HOST_EXE) $(EMCONFIG_FILE)
	XCL_EMULATION_MODE=${MODE} ./$(HOST_EXE) $(EXE_OPT)


================================================
FILE: autosa_tests/large/ttm/README.md
================================================
# Tensor Times Matrix (TTM)

Board        | Software Version
-------------|-----------------
Xilinx Alveo U250 | Xilinx Vitis 2019.2

__Files__:
```
autosa_tests/large/ttm/kernel.c
autosa_tests/large/ttm/kernel.h
autosa_tests/large/ttm/simd_info.json
autosa_tests/large/ttm/Makefile
autosa_tests/large/ttm/connectivity.cfg
```

__Command__:
```c
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` and `connectivity.cfg` to the directory `autosa.tmp/output`.

```
cp autosa_tests/large/ttm/Makefile autosa.tmp/output/
cp autosa_tests/large/ttm/connectivity.cfg autosa.tmp/output/
```

Execute the makefile to build the design.

```
cd autosa.tmp/output
make all
```

================================================
FILE: autosa_tests/large/ttm/connectivity.cfg
================================================
[connectivity]
sp=kernel0_1.A:DDR[0]
sp=kernel0_1.B:DDR[1] 
sp=kernel0_1.C:DDR[2]


================================================
FILE: autosa_tests/large/ttm/kernel.c
================================================
/*
 * This code implements the Tensor Times Matrix (TTM), which performs:
 * C(i,j,k) += A(i,j,l) * B(l,k)
 * Input: A[I][J][L], B[L][K]
 *        (B is stored transposed in this file as B[K][L], so the element
 *        B(l,k) is accessed as B[k][l])
 * Output: C[I][J][K]
 */

#include "kernel.h"

/*
 * Test driver for the TTM kernel.
 * Fills A and B with a constant, runs the computation delimited by
 * #pragma scop / #pragma endscop, recomputes the same result into C_golden
 * with plain loops, and compares the two element by element.
 * Returns 0 when every element matches within the threshold, -1 otherwise.
 */
int main(int argc, char **argv){
  // declarations
  // All arrays are static, so they start zero-initialized; the scop below
  // relies on this for C because it only accumulates into it.
  static data_t A[I][J][L];
//  static data_t B[L][K];
  // B is kept transposed: B[k][l] holds the element B(l,k) of the formula.
  static data_t B[K][L];
  static data_t C[I][J][K];
  static data_t C_golden[I][J][K];

  // data initialization
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) 
      for (int l = 0; l < L; l++) {
        A[i][j][l] = 2.5;
      }
  for (int l = 0; l < L; l++)
    for (int k = 0; k < K; k++) {
//      B[l][k] = 2.5;
      B[k][l] = 2.5;
    }

  // computation
  // NOTE(review): the scop region is presumably what the AutoSA command
  // extracts from this file; its statements are left exactly as written.
#pragma scop
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) 
      for (int k = 0; k < K; k++) {
//        C[i][j][k] = 0;
        for (int l = 0; l < L; l++) {
//          C[i][j][k] = C[i][j][k] + A[i][j][l] * B[l][k];
          C[i][j][k] = C[i][j][k] + A[i][j][l] * B[k][l];
        }
      }
#pragma endscop

  // reference result computed with the same loop nest and operand order
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) 
      for (int k = 0; k < K; k++) {
        C_golden[i][j][k] = 0;
        for (int l = 0; l < L; l++) {
//          C_golden[i][j][k] += A[i][j][l] * B[l][k];
          C_golden[i][j][k] += A[i][j][l] * B[k][l];
        }
      }

  // comparison
  // An element counts as an error when it differs from the reference by
  // more than thres in absolute value.
  int err = 0;
  float thres = 0.001;
  for (int i = 0; i < I; i++) 
    for (int j = 0; j < J; j++) 
      for (int k = 0; k < K; k++) {
        if (fabs(C_golden[i][j][k] - C[i][j][k]) > thres) {
          err++;
        }
      }

  if (err) {
    printf("Test failed with %d errors!\n", err);
    return -1;
  } else {
    printf("Test passed!\n");
    return 0;
  }
}


================================================
FILE: autosa_tests/large/ttm/kernel.h
================================================
#include "stdio.h"
#include "stdlib.h"
#include "math.h"

typedef float data_t;
//#define I 256
//#define J 256
//#define K 256
//#define L 256

#define I 264
#define J 256 
#define K 256 
#define L 256 


================================================
FILE: autosa_tests/large/ttm/simd_info.json
================================================
{
  "kernel4": {
    "reduction": ["y"]
  },
  "kernel5": {
    "reduction": ["y"]
  }
}


================================================
FILE: autosa_tests/large/ttmc/Makefile
================================================
VPP := $(XILINX_VITIS)/bin/v++
EMCONFIGUTIL := $(XILINX_VITIS)/bin/emconfigutil
MODE := hw
#PLATFORM := xilinx_u200_qdma_201920_1
PLATFORM := xilinx_u250_xdma_201830_2

# sources
KERNEL_SRC := src/kernel_kernel.cpp
HOST_SRC := src/kernel_host.cpp

# targets
HOST_EXE := host.exe

XOS := kernel0.$(MODE).xo
XCLBIN := kernel0.$(MODE).xclbin
EMCONFIG_FILE := emconfig.json

# Linker options to map kernel ports to DDR banks
VPP_LINK_OPTS := --config connectivity.cfg

VPP_COMMON_OPTS := -s -t $(MODE) --platform $(PLATFORM) -R2 -O3 --kernel_frequency 250 --vivado.prop=run.impl_1.STRATEGY=Performance_EarlyBlockPlacement
CFLAGS := -g -std=c++11 -I$(XILINX_XRT)/include
LFLAGS := -L$(XILINX_XRT)/lib -lxilinxopencl -lpthread -lrt
NUMDEVICES := 1

# run time args
EXE_OPT := kernel0.$(MODE).xclbin

# primary build targets
# clean and check never create files with those names, so they are declared
# phony as well; otherwise a stray file called "clean" or "check" in this
# directory could make the target look up to date and skip its recipe.
.PHONY: xclbin app all clean check

xclbin:  $(XCLBIN)
app: $(HOST_EXE)

all: xclbin app

clean:
	-$(RM) $(EMCONFIG_FILE) $(HOST_EXE) $(XCLBIN) *.xclbin *.xo $(XOS)

# kernel rules
$(XOS): $(KERNEL_SRC)
	$(RM) $@
	$(VPP) $(VPP_COMMON_OPTS) -c -k kernel0 -o $@ $+


$(XCLBIN): $(XOS)
	$(VPP) $(VPP_COMMON_OPTS) -l -o $@ $+ $(VPP_LINK_OPTS)

# host rules
$(HOST_EXE): $(HOST_SRC)
	g++ $(CFLAGS) -o $@ $+ $(LFLAGS)
	@echo 'Compiled Host Executable: $(HOST_EXE)'

$(EMCONFIG_FILE):
	$(EMCONFIGUTIL) --nd $(NUMDEVICES) --od . --platform $(PLATFORM)

check: $(XCLBIN) $(HOST_EXE) $(EMCONFIG_FILE)
	XCL_EMULATION_MODE=${MODE} ./$(HOST_EXE) $(EXE_OPT)


================================================
FILE: autosa_tests/large/ttmc/README.md
================================================
# Chain of Tensor-matrix multiplications (TTMc)

Board        | Software Version
-------------|-----------------
Xilinx Alveo U250 | Xilinx Vitis 2019.2

__Files__:
```
autosa_tests/large/ttmc/kernel.c
autosa_tests/large/ttmc/kernel.h
autosa_tests/large/ttmc/simd_info.json
autosa_tests/large/ttmc/Makefile
autosa_tests/large/ttmc/connectivity.cfg
```

__Command__:
```bash
./autosa ./autosa_tests/large/ttmc/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[4];kernel[]->array_part[16,64,16,32];kernel[]->latency[1,8,8];kernel[]->simd[8,1]}" --simd-info=./autosa_tests/large/ttmc/simd_info.json --host-serialize
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` and `connectivity.cfg` to the directory `autosa.tmp/output`.

```
cp autosa_tests/large/ttmc/Makefile autosa.tmp/output/
cp autosa_tests/large/ttmc/connectivity.cfg autosa.tmp/output/
```

Execute the makefile to build the design.

```
cd autosa.tmp/output
make all
```

================================================
FILE: autosa_tests/large/ttmc/connectivity.cfg
================================================
[connectivity]
sp=kernel0_1.A:DDR[0]
sp=kernel0_1.B:DDR[1] 
sp=kernel0_1.C:DDR[2]
sp=kernel0_1.D:DDR[3]


================================================
FILE: autosa_tests/large/ttmc/kernel.c
================================================
/*
 * This code implements the Chain of Tensor-matrix multiplications (TTMc), which performs:
 * D(i,j,k) += A(i,l,m) * B(l,j) * C(m,k)
 * Input: A[I][L][M], B[L][J], C[M][K]
 *        (C is stored transposed in this file as C[K][M], so the element
 *        C(m,k) is accessed as C[k][m])
 * Output: D[I][J][K]
 */

#include "kernel.h"

/*
 * Test driver for the TTMc kernel.
 * Fills A, B and C with a constant, runs the computation delimited by
 * #pragma scop / #pragma endscop, recomputes the same result into D_golden
 * with plain loops, and compares the two element by element.
 * Returns 0 when every element matches within the threshold, -1 otherwise.
 */
int main(int argc, char **argv){
  // declarations
  static data_t A[I][L][M];
  static data_t B[L][J];
//  static data_t C[M][K];
  // C is kept transposed: C[k][m] holds the element C(m,k) of the formula.
  static data_t C[K][M];
  static data_t D[I][J][K];
  static data_t D_golden[I][J][K];

  // data initialization
  for (int i = 0; i < I; i++)
    for (int l = 0; l < L; l++) 
      for (int m = 0; m < M; m++) {
        A[i][l][m] = 2.5;
      }
  for (int l = 0; l < L; l++)
    for (int j = 0; j < J; j++) {
      B[l][j] = 2.5;
    }
  for (int m = 0; m < M; m++)
    for (int k = 0; k < K; k++) {
//      C[m][k] = 2.5;
      C[k][m] = 2.5;
    }
  
  // computation
  // NOTE(review): the scop region is presumably what the AutoSA command
  // extracts from this file; its statements are left exactly as written.
#pragma scop
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) 
      for (int k = 0; k < K; k++) {
        D[i][j][k] = 0;        
        for (int l = 0; l < L; l++) 
          for (int m = 0; m < M; m++) {
//            D[i][j][k] = D[i][j][k] + A[i][l][m] * B[l][j] * C[m][k];
            D[i][j][k] = D[i][j][k] + A[i][l][m] * B[l][j] * C[k][m];
          }
      }    
#pragma endscop

  // reference result computed with the same loop nest and operand order
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) 
      for (int k = 0; k < K; k++) {
        D_golden[i][j][k] = 0;        
        for (int l = 0; l < L; l++) 
          for (int m = 0; m < M; m++) {
//            D_golden[i][j][k] += A[i][l][m] * B[l][j] * C[m][k];
            D_golden[i][j][k] += A[i][l][m] * B[l][j] * C[k][m];
          }
      }    

  // comparison
  // An element counts as an error when it differs from the reference by
  // more than thres in absolute value.
  int err = 0;
  float thres = 0.001;
  for (int i = 0; i < I; i++) 
    for (int j = 0; j < J; j++) 
      for (int k = 0; k < K; k++) {
        if (fabs(D_golden[i][j][k] - D[i][j][k]) > thres) {
          err++;
        }
      }

  if (err) {
    printf("Test failed with %d errors!\n", err);
    return -1;
  } else {
    printf("Test passed!\n");
    return 0;
  }
}


================================================
FILE: autosa_tests/large/ttmc/kernel.h
================================================
#include "stdio.h"
#include "stdlib.h"
#include "math.h"

typedef float data_t;
#define I 128 
#define J 128 
#define K 128 
#define L 128 
#define M 128 


================================================
FILE: autosa_tests/large/ttmc/simd_info.json
================================================
{
  "kernel4": {
    "reduction": ["y", "y"]
  }
}


================================================
FILE: autosa_tests/large/ttmc/step1-run-hls.tcl
================================================
# Step 1: create the HLS project "kernel0" with top function kernel0 from
# src/kernel_kernel.cpp and run C synthesis (csynth_design) on it.
open_project kernel0
set_top kernel0
add_files "src/kernel_kernel.cpp"
#add_files -tb PATH_TO_TESTBENCH_FILE

# the solution name "solution" is the one reopened by the pack-xo step
open_solution solution

#u250
set_part xcu250-figd2104-2L-e

# u280
#set_part xcu280-fsvh2892-2L-e

# 300 MHz
create_clock -period 3.333

# dataflow strict-mode violations are reported as warnings
config_dataflow -strict_mode warning
set_clock_uncertainty 27.000000%
config_rtl -enable_maxiConservative=1
# use 64-bit addresses on the m_axi interfaces
config_interface -m_axi_addr64

# to enable integration with Vitis
config_sdx -target xocc

# C simulation is disabled; only synthesis is run
#csim_design
csynth_design
close_project
exit


================================================
FILE: autosa_tests/large/ttmc/step2-autobridge.py
================================================
#! /usr/bin/python3.6

# add the path to where you place the autobridge source code
import sys
sys.path.append('../src')

import graph
from formator import FormatHLS
import collections
import os
import subprocess

"""
AutoBridge divides the target device as follows and assign each HLS function to one slot
For more details pls refer to the paper

      u250                     u280
   -----------
 3 |    |    |
   |----|----|              |----|----|
 2 |    |    |            2 |    |    |
   |----|----|              |----|----|
 1 |    |    |            1 |    |    |
   |----|----|              |----|----|
 0 |    |    |            0 |    |    |
   -----------              -----------
     0    1                   0    1
"""

################### Modify Accordingly ###############################

# (1) fill basic information
project_path = '/home/jaywang/doc_examples/ttmc_ab/kernel0' # path to your hls project
top_name = 'kernel0' # name of the top function in your hls design
solution_path = f'{project_path}/solution/'
project_name = 'kernel0'
board_name = 'u250' # or 'u280'
# where the results will be saved. Your HLS project will be copied there and your top RTL will be replaced.
# Note that if the directory already exists, we will try to reset the contents

# (2) specify how your designs connect to the external memory
""" Example:

void kernel0(ap_uint<512> *p1, ap_uint<512> *p2)
{
  #pragma HLS INTERFACE m_axi port=p1 offset=slave bundle=gmem_A
  #pragma HLS INTERFACE m_axi port=p2 offset=slave bundle=gmem_B

  load_p1 (p1, ...);
  load_p2 (p2, ...);
}

--------------------------------------

In this example, the pointer p1 and p2 will become M_AXI controllers to connect to the dedicated DDR IP.
If you want p1 to connect to DDR 2 in the 2-nd SLR, then you need to specify that the corresponding RTL controller must be floorplanned at the 2-nd SLR
Meanwhile, your function load_p1() will talk to the M_AXI controller also through AXI interface which cannot be easily pipelined.
Thus the RTL module corresponds to load_p1() must also be in the 2-nd SLR in this example.
Since load_p1() will communicate with the rest of your design using FIFO interface, you don't need to specify the location of other modules

(transparent)|                        (user visible)
             |
   Vitis     |                    what your HLS design becomes
             |
             | M_AXI                     AXI                        FIFO
DDR IP  <--- | ----> M_AXI controller <-------> your first module <-------> your other modules
(fixed loc)  |         (p1)                       (load_p1)
             |
             | M_AXI                     AXI                        FIFO
DDR IP  <--- | ----> M_AXI controller <-------> your first module <-------> your other modules
(fixed loc)  |         (p2)                       (load_p2)
             |
             | S_AXI
PCIe    <--- | ----> S_AXI controller
             |
"""

# on the left side or the right side of an SLR
DDR_loc_2d_x = collections.defaultdict(dict)

# on which SLR
DDR_loc_2d_y = collections.defaultdict(dict)

# use DDR 0, 1, 2, 3
DDR_loc_2d_y['A_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_x['A_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_A_m_axi_U'] = 0
DDR_loc_2d_x['kernel0_gmem_A_m_axi_U'] = 0

DDR_loc_2d_y['B_IO_L3_in_serialize_U0'] = 1
DDR_loc_2d_x['B_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_B_m_axi_U'] = 1
DDR_loc_2d_x['kernel0_gmem_B_m_axi_U'] = 0

DDR_loc_2d_y['C_IO_L3_in_serialize_U0'] = 2
DDR_loc_2d_x['C_IO_L3_in_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_C_m_axi_U'] = 2
DDR_loc_2d_x['kernel0_gmem_C_m_axi_U'] = 0

DDR_loc_2d_y['D_drain_IO_L3_out_serialize_U0'] = 3
DDR_loc_2d_x['D_drain_IO_L3_out_serialize_U0'] = 0
DDR_loc_2d_y['kernel0_gmem_D_m_axi_U'] = 3
DDR_loc_2d_x['kernel0_gmem_D_m_axi_U'] = 0

DDR_loc_2d_y['kernel0_control_s_axi_U'] = 0

# (3) specify DDR information
# If you instantiate a DDR controller, it will consume a non-trivial amount of resources.
# To make the floorplanning better, you need to specify which DDRs have been enabled.
# Generic example: if p1 were connected to DDR-2 in SLR-2 and p2 to DDR-1 in SLR-1,
# presumably only those two entries would be set to 1 (TODO confirm the index order
# expected by FormatHLS).
# If you want to use all DDRs, you need to set it as [1, 1, 1, 1].
# This script enables all four entries; section (2) above places the
# A, B, C and D interfaces at y = 0, 1, 2 and 3 respectively.
DDR_enable = [1, 1, 1, 1]

# (4) specify how much resource can be used in each slot
# In this way you could force the design to be placed evenly across the device and avoid local congestion
""" Example:
   -----------
 3 |0.76|0.62|
   |----|----|
 2 |0.74|0.61|
   |----|----|
 1 |0.75|0.6 |
   |----|----|
 0 | 0.7|0.6 |
   -----------
     0    1
"""
max_usage_ratio_2d = [ [0.85, 0.7], [0.85, 0.7], [0.85, 0.7], [0.85, 0.7] ]


##################### DON'T TOUCH THE SECTION BELOW #################################
target_dir = '/home/jaywang/doc_examples/ttmc_ab/autobridge'

formator = FormatHLS(
  rpt_path = f'{solution_path}/syn/report/',
  hls_sche_path = f'{solution_path}/.autopilot/db/',
  top_hdl_path = f'{solution_path}/syn/verilog/{top_name}_{top_name}.v',
  top_name = top_name,
  DDR_loc_2d_x = DDR_loc_2d_x,
  DDR_loc_2d_y = DDR_loc_2d_y,
  DDR_enable = DDR_enable,
  max_usage_ratio_2d = max_usage_ratio_2d,
  board_name = board_name,
  target_dir = target_dir,
  relay_station_count = lambda x : 2 * x, # how many levels of relay stations to add for x-unit of crossing
  max_search_time = 600,
  NaiveBalance = True)

# run floorplanning
g = graph.Graph(formator)

# move results to target dir
if (os.path.isdir(target_dir)):
  subprocess.run(['rm', '-rf', f'{target_dir}'])
subprocess.run(['mkdir', f'{target_dir}/'])
subprocess.run(['cp', '-r', project_path, f'{target_dir}/{project_name}'])
subprocess.run(['cp', os.path.realpath(__file__), f'{target_dir}/archived_source.txt'])
subprocess.run(['chmod', '+w', '-R', f'{target_dir}'])
subprocess.run(['cp', 'constraint.tcl', target_dir])
subprocess.run(['cp', 'pack_xo.tcl', target_dir])
subprocess.run(['cp', 'autobridge.log', target_dir])
subprocess.run(['cp', f'{top_name}_{top_name}.v', f'{target_dir}/{project_name}/solution/syn/verilog/'])

# clean up
os.system('rm *.lp')
subprocess.run(['rm', 'parser.out'])
subprocess.run(['rm', 'parsetab.py'])
subprocess.run(['rm', '-rf', '__pycache__'])


================================================
FILE: autosa_tests/large/ttmc/step3-pack-xo.tcl
================================================
# Step 3: reopen the existing HLS project/solution and export its RTL
# (Verilog) as the kernel object file kernel0.xo.
open_project kernel0
open_solution solution
export_design -rtl verilog -format ip_catalog -xo kernel0.xo

close_project
puts "Pack XO successfully"
exit


================================================
FILE: autosa_tests/large/ttmc/step4-run-vitis.sh
================================================
OUTPUT_DIR="$(pwd)/vitis_run"

# name of the top function
TOP=kernel0

# choose the target device
PLATFORM=xilinx_u250_xdma_201830_2 
#PLATFORM=xilinx_u280_xdma_201920_3 

XO="$(pwd)/kernel0.xo"

# For different approaches see UG904-vivado-implementation
STRATEGY="Default" 
#STRATEGY="EarlyBlockPlacement" 

# remove the unused '--connectivity.sp' option for v++ if some DDRs are not used 
# Example: if we map p1 to DDR 3 and p2 to DDR 0
#
# void kernel0(ap_uint<512> *p1, ap_uint<512> *p2)
# {
#   #pragma HLS INTERFACE m_axi port=p1 offset=slave bundle=gmem_A
#   #pragma HLS INTERFACE m_axi port=p2 offset=slave bundle=gmem_B
# 
#   load_p1 (p1, ...);
#   load_p2 (p2, ...);
# }
#
# ARG_FOR_DDR_0=p2
# ARG_FOR_DDR_3=p1
# Should remove '--connectivity.sp' for DDR1 and DDR2

# Kernel argument names bound to the DDR banks.
# NOTE: the numeric suffix here is 1-based, while the bank index used in the
# '--connectivity.sp' options of the v++ call below is 0-based:
# ARG_FOR_DDR_1 -> DDR[0], ARG_FOR_DDR_2 -> DDR[1],
# ARG_FOR_DDR_3 -> DDR[2], ARG_FOR_DDR_4 -> DDR[3].
ARG_FOR_DDR_1=A
ARG_FOR_DDR_2=B
ARG_FOR_DDR_3=C
ARG_FOR_DDR_4=D

# the constraint file containing the floorplan results
# WARNING: must use an absolute path
CONSTRAINT="$(pwd)/constraint.tcl"
if [ ! -f "$CONSTRAINT" ]; then
    # Report on stderr and exit with a non-zero status so that callers can
    # detect the failure; a bare `exit` would propagate echo's status (0).
    echo "no constraint file found: $CONSTRAINT" >&2
    exit 1
fi

# Link the packed kernel object (XO) into an xclbin for the chosen platform.
# Every expansion is quoted so that paths containing spaces are passed as a
# single word, and so that the literal brackets in 'DDR[n]' are never treated
# as a shell glob pattern.
v++ \
  --link \
  --output "${OUTPUT_DIR}/${TOP}_${PLATFORM}.xclbin" \
  --kernel "${TOP}" \
  --platform "${PLATFORM}" \
  --target hw \
  --report_level 2 \
  --temp_dir "${OUTPUT_DIR}/${TOP}_${PLATFORM}.temp" \
  --optimize 3 \
  --connectivity.nk "${TOP}:1:${TOP}_1" \
  --max_memory_ports "${TOP}" \
  --save-temps \
  "${XO}" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_1}:DDR[0]" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_2}:DDR[1]" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_3}:DDR[2]" \
  --connectivity.sp "${TOP}_1.${ARG_FOR_DDR_4}:DDR[3]" \
  --kernel_frequency 300 \
  --vivado.prop "run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=${STRATEGY}" \
  --vivado.prop "run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=${CONSTRAINT}"


================================================
FILE: autosa_tests/lu/Makefile
================================================
VPP := $(XILINX_VITIS)/bin/v++
EMCONFIGUTIL := $(XILINX_VITIS)/bin/emconfigutil
MODE := hw
#PLATFORM := xilinx_u200_qdma_201920_1
PLATFORM := xilinx_u250_xdma_201830_2

# sources
KERNEL_SRC := src/kernel_kernel.cpp
HOST_SRC := src/kernel_host.cpp

# targets
HOST_EXE := host.exe

XOS := kernel0.$(MODE).xo
XCLBIN := kernel0.$(MODE).xclbin
EMCONFIG_FILE := emconfig.json

# Linker options to map kernel ports to DDR banks
VPP_LINK_OPTS := --config connectivity.cfg

VPP_COMMON_OPTS := -s -t $(MODE) --platform $(PLATFORM) -R2 -O3 --kernel_frequency 250 --vivado.prop=run.impl_1.STRATEGY=Performance_EarlyBlockPlacement
CFLAGS := -g -std=c++11 -I$(XILINX_XRT)/include
LFLAGS := -L$(XILINX_XRT)/lib -lxilinxopencl -lpthread -lrt
NUMDEVICES := 1

# run time args
EXE_OPT := kernel0.$(MODE).xclbin

# primary build targets
# clean and check never create files with those names, so they are declared
# phony as well; otherwise a stray file called "clean" or "check" in this
# directory could make the target look up to date and skip its recipe.
.PHONY: xclbin app all clean check

xclbin:  $(XCLBIN)
app: $(HOST_EXE)

all: xclbin app

clean:
	-$(RM) $(EMCONFIG_FILE) $(HOST_EXE) $(XCLBIN) *.xclbin *.xo $(XOS)

# kernel rules
$(XOS): $(KERNEL_SRC)
	$(RM) $@
	$(VPP) $(VPP_COMMON_OPTS) -c -k kernel0 -o $@ $+


$(XCLBIN): $(XOS)
	$(VPP) $(VPP_COMMON_OPTS) -l -o $@ $+ $(VPP_LINK_OPTS)

# host rules
$(HOST_EXE): $(HOST_SRC)
	g++ $(CFLAGS) -o $@ $+ $(LFLAGS)
	@echo 'Compiled Host Executable: $(HOST_EXE)'

$(EMCONFIG_FILE):
	$(EMCONFIGUTIL) --nd $(NUMDEVICES) --od . --platform $(PLATFORM)

check: $(XCLBIN) $(HOST_EXE) $(EMCONFIG_FILE)
	XCL_EMULATION_MODE=${MODE} ./$(HOST_EXE) $(EXE_OPT)


================================================
FILE: autosa_tests/lu/README.md
================================================
# LU Decomposition (Small)

Board        | Software Version
-------------|-----------------
Xilinx Alveo U250 | Xilinx Vitis 2019.2

__Files__:
```
autosa_tests/lu/kernel.c
autosa_tests/lu/kernel.h
autosa_tests/lu/simd_info.json
autosa_tests/lu/Makefile
autosa_tests/lu/connectivity.cfg
```

__Command__:
```bash
./autosa ./autosa_tests/lu/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[-1,-1,-1];kernel[]->latency[]}" --simd-info=./autosa_tests/lu/simd_info.json --use-cplusplus-template --no-reschedule --live-range-reordering
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` and `connectivity.cfg` to the directory `autosa.tmp/output`.

```
cp autosa_tests/lu/Makefile autosa.tmp/output/
cp autosa_tests/lu/connectivity.cfg autosa.tmp/output/
```

Execute the makefile to build the design.

```
cd autosa.tmp/output
make all
```

================================================
FILE: autosa_tests/lu/add_batch.py
================================================
import argparse

def run(input_f, output_f, batch):
    """Copy a kernel source file, wrapping each module body in a batch loop.

    The input is scanned line by line. Lines containing 'Module Definition'
    toggle whether the scanner is inside a module. Inside a module that has
    not been flagged as an inner module (one whose text mentions
    'intra_trans(', 'inter_trans(' or 'inter_trans_boundary('), a
    ``for (int bn = 0; bn < batch; bn++) {`` line is inserted right after the
    second line containing 'Variable Declaration', and a matching ``  }``
    line is inserted before the next line that is exactly ``}``.

    Args:
        input_f: path of the kernel file to read.
        output_f: path of the file to write the transformed text to.
        batch: loop bound, interpolated verbatim into the inserted loop.
    """
    with open(input_f, 'r') as src:
        source_lines = src.readlines()

    inner_markers = ('intra_trans(', 'inter_trans(', 'inter_trans_boundary(')
    result = []
    in_module = False
    in_inner = False
    decl_seen = 0
    loop_open = False

    for text in source_lines:
        # Close the batch loop just before the closing brace of the module.
        if loop_open and in_module and not in_inner and text == '}\n':
            result.append('  }\n')
        result.append(text)

        # Every marker line flips the state; leaving a module resets it.
        if 'Module Definition' in text:
            in_module = not in_module
            if not in_module:
                in_inner = False
                decl_seen = 0
                loop_open = False

        if not in_module:
            continue
        if any(marker in text for marker in inner_markers):
            in_inner = True
        if in_inner:
            continue

        if 'Variable Declaration' in text:
            decl_seen += 1
        if decl_seen == 2:
            # Insert the batch loop right after the second declaration marker.
            result.append('  for (int bn = 0; bn < {}; bn++) {{\n'.format(batch))
            loop_open = True
            decl_seen = 0

    with open(output_f, 'w') as dst:
        dst.writelines(result)

if __name__ == "__main__":
    # Command-line entry point: all three options are mandatory and are
    # forwarded unchanged to run(); the batch count stays a string because it
    # is only interpolated into the generated loop header.
    parser = argparse.ArgumentParser(description="Add batch loops into the code")
    parser.add_argument('-i', required=True, help='input kernel file')
    parser.add_argument('-b', required=True, help='batch num')
    parser.add_argument('-o', required=True, help='output kernel file')

    args = parser.parse_args()
    run(args.i, args.o, args.b)

================================================
FILE: autosa_tests/lu/hls_script.tcl
================================================
############################################################
## This file is generated automatically by Vivado HLS.
## Please DO NOT edit it.
## Copyright (C) 1986-2019 Xilinx, Inc. All Rights Reserved.
############################################################
open_project hls_prj
set_top kernel0
add_files src/kernel_kernel.h
add_files src/kernel_kernel.cpp
add_files -tb src/kernel_host.cpp
open_solution "solution1"
set_part {xcu200-fsgd2104-2-e}
create_clock -period 5 -name default
config_compile -name_max_length 50
#source "./prj/solution1/directives.tcl"
csim_design
#csynth_design
#cosim_design
#cosim_design -trace_level all
#cosim_design -setup -trace_level all
#export_design -format ip_catalog
exit


================================================
FILE: autosa_tests/lu/kernel.c
================================================
#include "kernel.h"

/*
 * Fill A with the test input matrix.
 * A is first set to a lower-triangular matrix with ones on the diagonal,
 * then overwritten with the product of that matrix and its transpose.
 */
void init_array(data_t A[N][N])
{
  /* Lower-triangular seed: formula below the diagonal, 1 on it, 0 above. */
  for (int row = 0; row < N; row++) {
    for (int col = 0; col < N; col++) {
      if (col < row)
        A[row][col] = (data_t)(-col % N) / N + 1;
      else if (col == row)
        A[row][col] = 1;
      else
        A[row][col] = 0;
    }
  }

  /* Make the matrix positive semi-definite. */
  /* not necessary for LU, but using same code as cholesky */
  data_t product[N][N];
  for (int r = 0; r < N; r++) {
    for (int s = 0; s < N; s++) {
      /* product = A * A^T, accumulated over t in increasing order */
      product[r][s] = 0;
      for (int t = 0; t < N; t++)
        product[r][s] += A[r][t] * A[s][t];
    }
  }

  /* Copy back only after every product entry has been computed from A. */
  for (int r = 0; r < N; r++) {
    for (int s = 0; s < N; s++)
      A[r][s] = product[r][s];
  }
}

/*
 * Reference LU factorization used to check lu_device.
 * Every intermediate value is kept in a three-dimensional array indexed by
 * [i][j][k], where k is the elimination step.
 * U receives its entries at U[j][i] (written when j == k) and L receives
 * its entries at L[i][j] (written when i == k and j > k); other elements of
 * L and U are left untouched.
 */
void lu_cpu(data_t A[N][N], data_t L[N][N], data_t U[N][N]) {
  data_t prev_V[N][N][N];
  data_t V_tmp[N][N][N];
  data_t U_tmp[N][N][N];
  data_t L_tmp[N][N][N];

  for (int step = 0; step < N; step++) {
    for (int col = step; col < N; col++) {
      for (int row = step; row < N; row++) {
        /* Input of this step: A itself at step 0, else the previous update. */
        prev_V[row][col][step] =
            (step == 0) ? A[row][col] : V_tmp[row][col][step - 1];

        if (col == step) {
          /* First column of the step: these values become entries of U. */
          U_tmp[row][col][step] = prev_V[row][col][step];
          U[col][row] = U_tmp[row][col][step];
          continue;
        }

        /* Forward the U value from the neighbouring column. */
        U_tmp[row][col][step] = U_tmp[row][col - 1][step];

        if (row != step) {
          /* Forward the L value from the neighbouring row. */
          L_tmp[row][col][step] = L_tmp[row - 1][col][step];
        } else {
          L_tmp[row][col][step] =
              prev_V[row][col][step] / U_tmp[row][col - 1][step]; // final
          L[row][col] = L_tmp[row][col][step];
        }

        V_tmp[row][col][step] =
            prev_V[row][col][step] -
            L_tmp[row][col][step] * U_tmp[row][col - 1][step];
      }
    }
  }
}

/*
 * LU factorization written as the #pragma scop region of this test.
 * It performs the same recurrence as lu_cpu but keeps the intermediate
 * values in two-dimensional arrays that are reused across the k steps.
 * U is written at U[j][i] (when j == k) and L at L[i][j] (when i == k and
 * j > k); other elements of L and U are left untouched.
 * NOTE(review): the scop region is presumably what the AutoSA command
 * extracts from this file, so its statements are left exactly as written.
 */
void lu_device(data_t A[N][N], data_t L[N][N], data_t U[N][N])
{
#pragma scop
  {
    data_t prev_V[N][N];  
    data_t V[N][N];
    data_t U_tmp[N][N];
    data_t L_tmp[N][N];

    for (int k = 0; k < N; k++) {    
      for (int j = k; j < N; j++)
        for (int i = k; i < N; i++) {
          if (k == 0)
            prev_V[i][j] = A[i][j];
          else
            prev_V[i][j] = V[i][j];          

          if (j == k) {          
            U_tmp[i][j] = prev_V[i][j]; 
            U[j][i] = U_tmp[i][j]; // final
          } else {          
            U_tmp[i][j] = U_tmp[i][j - 1];        

            if (i == k) {
              L_tmp[i][j] = prev_V[i][j] / U_tmp[i][j]; 
              L[i][j] = L_tmp[i][j]; // final
            } else {            
              L_tmp[i][j] = L_tmp[i - 1][j];
            }          
          
            V[i][j] = prev_V[i][j] - L_tmp[i][j] * U_tmp[i][j];
          }

        }
    }
  }
#pragma endscop
}

/*
 * Test driver for the LU kernel.
 * Builds the input matrix, runs lu_device and the lu_cpu reference on it,
 * counts the elements of L and U that differ by more than 0.001, prints the
 * verdict and then dumps A and both pairs of factors.
 * Always returns 0; the verdict is reported on stdout only.
 */
int main(int argc, char **argv) {
  data_t A[N][N], L[N][N], U[N][N], L_golden[N][N], U_golden[N][N];

  init_array(A);
  /* Neither kernel writes every element, so clear all outputs first. */
  for (int i = 0; i < N; i++)
    for (int j = 0; j < N; j++) {
      L[i][j] = 0;
      U[i][j] = 0;
      L_golden[i][j] = 0;
      U_golden[i][j] = 0;
    }
    
  lu_device(A, L, U);
  lu_cpu(A, L_golden, U_golden);

  int err = 0;
  /*
   * Both kernels store the L factors at [i][j] with j > i (they are written
   * when i == k and j > k), which is also why the dumps below read L[j][i]
   * for j < i. Compare exactly those entries; the entries with j <= i are
   * never written by either kernel and would always compare equal.
   */
  for (int i = 0; i < N; i++)
    for (int j = i + 1; j < N; j++) {
      if (fabs((float)L_golden[i][j] - (float)L[i][j]) > 0.001)
        err++;
    }
  /* U is stored at [i][j] with j >= i (written as U[k][i] for i >= k). */
  for (int i = 0; i < N; i++)
    for (int j = i; j < N; j++) {
      if (fabs((float)U_golden[i][j] - (float)U[i][j]) > 0.001)
        err++;
    }

  if (err)
    printf("Failed with %d errors!\n", err);
  else
    printf("Passed!\n");

  printf("A:\n");
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) 
      printf("%f ", A[i][j]);
    printf("\n");
  }

  /* L is printed transposed with an explicit unit diagonal. */
  printf("L_golden:\n");
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {      
      printf("%f ", (i == j)? 1.0 : L_golden[j][i]);      
    }
    printf("\n");
  }

  printf("U_golden:\n");
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      printf("%f ", U_golden[i][j]);
    }
    printf("\n");
  }

  printf("L:\n");
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {      
      printf("%f ", (i == j)? 1.0 : (j < i)? L[j][i] : 0.0);      
    }
    printf("\n");
  }

  printf("U:\n");
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      printf("%f ", (j < i)? 0.0 : U[i][j]);
    }
    printf("\n");
  }

  return 0;    
}


================================================
FILE: autosa_tests/lu/kernel.h
================================================
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

typedef float data_t;
//#define N 3
#define N 32


================================================
FILE: autosa_tests/lu/simd_info.json
================================================
{
  "kernel3": {
    "reduction": ["n"]
  }
}


================================================
FILE: autosa_tests/mm/Makefile
================================================
VPP := $(XILINX_VITIS)/bin/v++
EMCONFIGUTIL := $(XILINX_VITIS)/bin/emconfigutil
MODE := hw
#PLATFORM := xilinx_u200_qdma_201920_1
PLATFORM := xilinx_u250_xdma_201830_2

# sources
KERNEL_SRC := src/kernel_kernel.cpp
HOST_SRC := src/kernel_host.cpp

# targets
HOST_EXE := host.exe

XOS := kernel0.$(MODE).xo
XCLBIN := kernel0.$(MODE).xclbin
EMCONFIG_FILE := emconfig.json

# Linker options to map kernel ports to DDR banks
VPP_LINK_OPTS := --config connectivity.cfg

VPP_COMMON_OPTS := -s -t $(MODE) --platform $(PLATFORM) -R2 -O3 --kernel_frequency 250 --vivado.prop=run.impl_1.STRATEGY=Performance_EarlyBlockPlacement
CFLAGS := -g -std=c++11 -I$(XILINX_XRT)/include
LFLAGS := -L$(XILINX_XRT)/lib -lxilinxopencl -lpthread -lrt
NUMDEVICES := 1

# run time args
EXE_OPT := kernel0.$(MODE).xclbin

# primary build targets
# clean and check never create files with those names, so they are declared
# phony as well; otherwise a stray file called "clean" or "check" in this
# directory could make the target look up to date and skip its recipe.
.PHONY: xclbin app all clean check

xclbin:  $(XCLBIN)
app: $(HOST_EXE)

all: xclbin app

clean:
	-$(RM) $(EMCONFIG_FILE) $(HOST_EXE) $(XCLBIN) *.xclbin *.xo $(XOS)

# kernel rules
$(XOS): $(KERNEL_SRC)
	$(RM) $@
	$(VPP) $(VPP_COMMON_OPTS) -c -k kernel0 -o $@ $+


$(XCLBIN): $(XOS)
	$(VPP) $(VPP_COMMON_OPTS) -l -o $@ $+ $(VPP_LINK_OPTS)

# host rules
$(HOST_EXE): $(HOST_SRC)
	g++ $(CFLAGS) -o $@ $+ $(LFLAGS)
	@echo 'Compiled Host Executable: $(HOST_EXE)'

$(EMCONFIG_FILE):
	$(EMCONFIGUTIL) --nd $(NUMDEVICES) --od . --platform $(PLATFORM)

check: $(XCLBIN) $(HOST_EXE) $(EMCONFIG_FILE)
	XCL_EMULATION_MODE=${MODE} ./$(HOST_EXE) $(EXE_OPT)


================================================
FILE: autosa_tests/mm/README.md
================================================
# Matrix Multiplication (Small)

Board        | Software Version
-------------|-----------------
Xilinx Alveo U250 | Xilinx Vitis 2019.2

__Files__:
```
autosa_tests/mm/kernel.c
autosa_tests/mm/kernel.h
autosa_tests/mm/simd_info.json
autosa_tests/mm/Makefile
autosa_tests/mm/connectivity.cfg
autosa_tests/mm/hls_script.tcl
```

__Command__:
To run the HLS flow for C/RTL simulation:
```bash
./autosa ./autosa_tests/mm/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" --simd-info=./autosa_tests/mm/simd_info.json --host-serialize --hls
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `hls_script.tcl` to the directory `autosa.tmp/output`.

```
cp autosa_tests/mm/hls_script.tcl autosa.tmp/output/
```

Run the TCL script to build the HLS project.

```
cd autosa.tmp/output
vivado_hls -f hls_script.tcl
```

Alternatively, if you need to generate the bitstream for on-board testing, simply remove the `--hls` flag from the AutoSA command.
```bash
./autosa ./autosa_tests/mm/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" --simd-info=./autosa_tests/mm/simd_info.json --host-serialize
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` and `connectivity.cfg` to the directory `autosa.tmp/output`.

```
cp autosa_tests/mm/Makefile autosa.tmp/output/
cp autosa_tests/mm/connectivity.cfg autosa.tmp/output/
```

Execute the makefile to build the design.

```
cd autosa.tmp/output
make all
make check
```

__Other Test Cases__:
Below we provide some other test cases for you to try out.
1. 1D systolic array
```bash
./autosa ./autosa_tests/mm/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[0];kernel[]->array_part[32,32,32];kernel[]->latency[8,8];kernel[]->simd[2]}" --simd-info=./autosa_tests/mm/simd_info.json --host-serialize --hls
```

2. 2D systolic array
```bash
./autosa ./autosa_tests/mm/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[4];kernel[]->array_part[32,4,32];kernel[]->latency[16,16];kernel[]->simd[2]}" --simd-info=./autosa_tests/mm/simd_info.json --host-serialize --hls --local-reduce --reduce-op="+" --simd-touch-space
```


================================================
FILE: autosa_tests/mm/connectivity.cfg
================================================
[connectivity]
sp=kernel0_1.A:DDR[0]
sp=kernel0_1.B:DDR[1] 
sp=kernel0_1.C:DDR[2]


================================================
FILE: autosa_tests/mm/hls_script.tcl
================================================
############################################################
## This file is generated automatically by Vivado HLS.
## Please DO NOT edit it.
## Copyright (C) 1986-2019 Xilinx, Inc. All Rights Reserved.
############################################################
open_project hls_prj
set_top kernel0
add_files src/kernel_kernel.h
add_files src/kernel_kernel.cpp
add_files -tb src/kernel_host.cpp
open_solution "solution1"
set_part {xcu200-fsgd2104-2-e}
create_clock -period 5 -name default
config_compile -name_max_length 50
#source "./prj/solution1/directives.tcl"
csim_design
#csynth_design
#cosim_design
#cosim_design -trace_level all
#cosim_design -setup -trace_level all
#export_design -format ip_catalog
exit


================================================
FILE: autosa_tests/mm/kernel.c
================================================
#include "kernel.h"

/*
 * Test driver for the small matrix-multiplication kernel.
 *
 * Fills A[I][K] and B[J][K] with pseudo-random values in [0, 1], accumulates
 * C[i][j] += A[i][k] * B[j][k] inside the scop region, recomputes the same
 * product into C_golden, and reports how many entries differ by more than
 * 0.001.
 *
 * argc/argv are unused. Always returns 0; the outcome is only printed.
 */
int main(int argc, char **argv) {
  data_t A[I][K], B[J][K], C[I][J], C_golden[I][J];

  for (int i = 0; i < I; i++)
    for (int k = 0; k < K; k++) {
      A[i][k] = (data_t)rand() / RAND_MAX;
    }

  for (int j = 0; j < J; j++)
    for (int k = 0; k < K; k++) {
      B[j][k] = (data_t)rand() / RAND_MAX;
    }

  /* The scop below only accumulates into C (its own reset is commented out),
   * so C has to start from zero. The reset is done here, outside the scop
   * pragmas, so that the statements inside the region stay exactly as they
   * were. Without it C would be read while still uninitialized.
   */
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C[i][j] = 0;
    }

#pragma scop
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      //C[i][j] = 0;
      for (int k = 0; k < K; k++) {        
        C[i][j] = C[i][j] + A[i][k] * B[j][k];
      }
    }
#pragma endscop

  /* Reference result computed with the same loop nest and an explicit reset. */
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C_golden[i][j] = 0;
      for (int k = 0; k < K; k++) {
        C_golden[i][j] = C_golden[i][j] + A[i][k] * B[j][k];
      }
    }

  /* Count entries whose absolute difference exceeds the 0.001 tolerance. */
  int err = 0;
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      if (fabs((float)C_golden[i][j] - (float)C[i][j]) > 0.001)
        err++;
    }

  if (err)
    printf("Failed with %d errors!\n", err);
  else
    printf("Passed!\n");

  return 0;
}


================================================
FILE: autosa_tests/mm/kernel.h
================================================
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

typedef float data_t;
//#define I 256 
//#define J 264 
//#define K 256

//#define I 128 
//#define J 128 
//#define K 128

#define I 64
#define J 64
#define K 64


================================================
FILE: autosa_tests/mm/param_names.json
================================================
{
  "kernel0": ["i", "j", "k"],
  "kernel1": ["i", "j", "k"],
  "kernel2": ["i", "j", "k"],
  "kernel3": ["i", "j", "k"],
  "kernel4": ["i", "j", "k"],
  "kernel5": ["i", "j", "k"]
}


================================================
FILE: autosa_tests/mm/simd_info.json
================================================
{
  "kernel0": {
    "reduction": ["y"]
  },
  "kernel1": {
    "reduction": ["y"]
  },
  "kernel2": {
    "reduction": ["y"]
  }, 
  "kernel3": {
    "reduction": ["y"]
  },
  "kernel4": {
    "reduction": ["y"]
  },
  "kernel5": {
    "reduction": ["y"]
  }
}


================================================
FILE: autosa_tests/mm_block_sparse/Makefile
================================================
VPP := $(XILINX_VITIS)/bin/v++
EMCONFIGUTIL := $(XILINX_VITIS)/bin/emconfigutil
MODE := hw
#PLATFORM := xilinx_u200_qdma_201920_1
PLATFORM := xilinx_u250_xdma_201830_2

# sources
KERNEL_SRC := src/kernel_kernel.cpp
HOST_SRC := src/kernel_host.cpp

# targets
HOST_EXE := host.exe

XOS := kernel0.$(MODE).xo
XCLBIN := kernel0.$(MODE).xclbin
EMCONFIG_FILE := emconfig.json

# Linker options to map kernel ports to DDR banks
VPP_LINK_OPTS := --config connectivity.cfg

VPP_COMMON_OPTS := -s -t $(MODE) --platform $(PLATFORM) -R2 -O3 --kernel_frequency 250 --vivado.prop=run.impl_1.STRATEGY=Performance_EarlyBlockPlacement
CFLAGS := -g -std=c++11 -I$(XILINX_XRT)/include
LFLAGS := -L$(XILINX_XRT)/lib -lxilinxopencl -lpthread -lrt
NUMDEVICES := 1

# run time args
EXE_OPT := kernel0.$(MODE).xclbin

# primary build targets
# clean and check create no file of their own, so they are declared phony as
# well; otherwise a stray file named "clean" or "check" would mask them.
.PHONY: xclbin app all clean check

xclbin:  $(XCLBIN)
app: $(HOST_EXE)

all: xclbin app

clean:
	-$(RM) $(EMCONFIG_FILE) $(HOST_EXE) $(XCLBIN) *.xclbin *.xo $(XOS)

# kernel rules
$(XOS): $(KERNEL_SRC)
	$(RM) $@
	$(VPP) $(VPP_COMMON_OPTS) -c -k kernel0 -o $@ $+


$(XCLBIN): $(XOS)
	$(VPP) $(VPP_COMMON_OPTS) -l -o $@ $+ $(VPP_LINK_OPTS)

# host rules
$(HOST_EXE): $(HOST_SRC)
	g++ $(CFLAGS) -o $@ $+ $(LFLAGS)
	@echo 'Compiled Host Executable: $(HOST_EXE)'

$(EMCONFIG_FILE):
	$(EMCONFIGUTIL) --nd $(NUMDEVICES) --od . --platform $(PLATFORM)

check: $(XCLBIN) $(HOST_EXE) $(EMCONFIG_FILE)
	XCL_EMULATION_MODE=${MODE} ./$(HOST_EXE) $(EXE_OPT)


================================================
FILE: autosa_tests/mm_block_sparse/README.md
================================================
# Matrix Multiplication with Block Sparsity (Small)

Board        | Software Version
-------------|-----------------
Xilinx Alveo U250 | Xilinx Vitis 2019.2

__Files__:
```
autosa_tests/mm_block_sparse/kernel.c
autosa_tests/mm_block_sparse/kernel.h
autosa_tests/mm_block_sparse/simd_info.json
autosa_tests/mm_block_sparse/Makefile
autosa_tests/mm_block_sparse/connectivity.cfg
autosa_tests/mm_block_sparse/hls_script.tcl
```

__Command__:
To run the HLS flow for C/RTL simulation
```bash
./autosa ./autosa_tests/mm_block_sparse/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[8]}" --simd-info=./autosa_tests/mm_block_sparse/simd_info.json --host-serialize --hls --block-sparse --block-sparse-ratio="{kernel[]->A[2,4]}"
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `hls_script.tcl` to the directory `autosa.tmp/output`.

```
cp autosa_tests/mm_block_sparse/hls_script.tcl autosa.tmp/output/
```

Run the TCL script to build the HLS project.

```
cd autosa.tmp/output
vivado_hls -f hls_script.tcl
```

Alternatively, if you need to generate the bitstream for on-board testing, simply remove the `--hls` flag from the AutoSA command.
```bash
./autosa ./autosa_tests/mm_block_sparse/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[8]}" --simd-info=./autosa_tests/mm_block_sparse/simd_info.json --host-serialize --block-sparse --block-sparse-ratio="{kernel[]->block_sparse[2,4]}"
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` and `connectivity.cfg` to the directory `autosa.tmp/output`.

```
cp autosa_tests/mm_block_sparse/Makefile autosa.tmp/output/
cp autosa_tests/mm_block_sparse/connectivity.cfg autosa.tmp/output/
```

Execute the makefile to build the design.

```
cd autosa.tmp/output
make all
make check
```

__Tuning__(Alpha):

__Other Test Cases__:
Below we provide some other test cases for you to try out.
1. Sparsity ratio `[3,8]` (select the matching `VEC_LEN 8` / `NON_ZERO_NUM 3` settings in `kernel.h`)
```bash
./autosa ./autosa_tests/mm_block_sparse/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[8]}" --simd-info=./autosa_tests/mm_block_sparse/simd_info.json --host-serialize --block-sparse --block-sparse-ratio="{kernel[]->block_sparse[3,8]}"
```

================================================
FILE: autosa_tests/mm_block_sparse/connectivity.cfg
================================================
[connectivity]
sp=kernel0_1.A:DDR[0]
sp=kernel0_1.B:DDR[1] 
sp=kernel0_1.C:DDR[2]


================================================
FILE: autosa_tests/mm_block_sparse/hls_script.tcl
================================================
############################################################
## This file is generated automatically by Vivado HLS.
## Please DO NOT edit it.
## Copyright (C) 1986-2019 Xilinx, Inc. All Rights Reserved.
############################################################
open_project hls_prj
set_top kernel0
add_files src/kernel_kernel.h
add_files src/kernel_kernel.cpp
add_files -tb src/kernel_host.cpp
open_solution "solution1"
set_part {xcu200-fsgd2104-2-e}
create_clock -period 5 -name default
config_compile -name_max_length 50
#source "./prj/solution1/directives.tcl"
csim_design
#csynth_design
#cosim_design
#cosim_design -trace_level all
#cosim_design -setup -trace_level all
#export_design -format ip_catalog
exit


================================================
FILE: autosa_tests/mm_block_sparse/kernel.c
================================================
/* This example uses the block sparsity to compute the matrix multiplication C = A * B.
 * The matrix A is with block sparsity and the matrix B is dense.
 * For matrix A, every VEC_LEN elements are grouped into a vector.
 * Inside each vector, there are NON_ZERO_NUM non-zero elements.
 * The sparsity of the matrix A is computed as 1 - NON_ZERO_NUM / VEC_LEN.
 * We use the matrix A_s to store both the data and index of the sparse matrix A.
 * 
 * For each vector group, we use an unsigned char to record the relative position
 * of the non-zero element in the group.
 * At present, we assume the vector group size is a power of two and no greater than 8.
 * Then every NON_ZERO_NUM non-zero elements and their index are grouped together and
 * stored in A_s.
 * However, to make the data structure aligned, we will also pad this group if necessary.
 * For example, if the group size VEC_LEN is 8, and NON_ZERO_NUM is 4, we will concatenate the 
 * index right after the first 4 data elements, resulting in 5 elements. 
 * Furthermore, we will pad this group and extend it to 8 elements. 
 * In this case, the effective storage for matrix A is the same with the unsparsified one.
 * If the group size VEC_LEN is 8, and NON_ZERO_NUM is 3, we will concatenate the 
 * index after the first 3 elements, resulting in 4 elements. No further padding is needed.
 * The effective storage compression ratio for matrix A is 8/4 = 2x for this example.
 * In summary, we denote the number of elements other than the data elements as META_DATA_NUM.
 * And it can be computed as:
 * META_DATA_NUM = 2^{ceil(log2(NON_ZERO_NUM + 1))} - NON_ZERO_NUM
 */
#include "kernel.h"

/*
 * Test driver for the block-sparse matrix multiplication example.
 *
 * Steps performed:
 *   1. Fill the dense matrices A[I][K] and B[J][K] with pseudo-random values.
 *   2. For every group of VEC_LEN consecutive elements in a row of A, pick
 *      NON_ZERO_NUM distinct positions; record them as a bitmask in A_i and
 *      copy the selected values into A_d.
 *   3. Pack the selected values followed by the bitmask into A_s.
 *   4. Run the dense loop nest between the scop pragmas.
 *   5. Compute C_golden from A_d/A_i and B, then count entries of C that
 *      differ from C_golden by more than 0.001.
 *
 * argc/argv are unused. Always returns 0; the outcome is only printed.
 */
int main(int argc, char **argv) {
  data_t A[I][K], B[J][K], C[I][J], C_golden[I][J];

  /* A_d: selected (non-zero) values, NON_ZERO_NUM per group.
   * A_i: one bitmask per group; bit p set means position p was selected.
   */
  data_t A_d[I][K / COMPRESS_RATIO];
  unsigned char A_i[I][K / VEC_LEN];

  /* A_s: values and bitmask packed together, NON_ZERO_NUM + META_DATA_NUM
   * slots per group.
   */
  data_t A_s[I][K / EFF_COMPRESS_RATIO];

  /* Initialize the matrix */
  for (int i = 0; i < I; i++) 
    for (int k = 0; k < K; k++) {
      A[i][k] = (data_t)rand() / RAND_MAX;
    }

  for (int j = 0; j < J; j++)
    for (int k = 0; k < K; k++) {
      B[j][k] = (data_t)rand() / RAND_MAX;
    }

  /* Generate the random sparse matrix */
  for (int i = 0; i < I; i++)
    for (int k = 0; k < K / VEC_LEN; k++) {
      unsigned char offset = 0;
      int n = 0;
      /* Draw random positions until NON_ZERO_NUM distinct ones are marked;
       * a position that is already marked is simply redrawn.
       */
      while (n < NON_ZERO_NUM) {      
        int pos = rand() % VEC_LEN;
        /* Check if this position is already inserted */        
        unsigned char cur_mask = offset & (1 << pos);
        if (cur_mask) {
          continue;
        }
        offset = offset | (1 << pos);
        n++;
      }
      A_i[i][k] = offset;

      /* Copy the selected elements of this group into A_d, in increasing
       * position order.
       */
      int pos = 0;
      int non_zero_pos = 0;
      while (pos < VEC_LEN) {
        unsigned char cur_mask = offset & (1 << pos);
        if (cur_mask) {
          A_d[i][k * NON_ZERO_NUM + non_zero_pos] = A[i][k * VEC_LEN + pos];
          non_zero_pos++;
        }
        pos++;
      }      
    }

  /* Generate the matrix to store both the sparse data and index */
  for (int i = 0; i < I; i++)
    for (int k = 0; k < K / VEC_LEN; k++) {
      int n;
      for (n = 0; n < NON_ZERO_NUM; n++) {
        A_s[i][k * (NON_ZERO_NUM + META_DATA_NUM) + n] = A_d[i][k * NON_ZERO_NUM + n];
      }
      /* The bitmask goes into the slot right after the data values (n equals
       * NON_ZERO_NUM here), stored in the first byte of a data_t via the union.
       * NOTE(review): only u.c is written before u.d is read, so the other
       * bytes of that slot are indeterminate; and when META_DATA_NUM > 1 the
       * remaining padding slots of the group are never written. Confirm that
       * consumers of A_s ignore those bytes.
       */
      unsigned char offset = A_i[i][k];
      union {data_t d; unsigned char c;} u;
      u.c = offset;
      A_s[i][k * (NON_ZERO_NUM + META_DATA_NUM) + n] = u.d;
    }

  /* For polyhedral analysis */
  /* NOTE(review): this loop nest multiplies the dense A, whose unselected
   * entries are never zeroed, whereas C_golden below uses only the selected
   * entries. Executed as plain C the two results differ; presumably the
   * generated design consumes A_s instead -- confirm against the tool flow.
   */
#pragma scop
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C[i][j] = 0;
      for (int k = 0; k < K; k++) {
        C[i][j] = C[i][j] + A[i][k] * B[j][k];
      }
    }
#pragma endscop

  /* The actual computation */
//  for (int i = 0; i < I; i++)  
//    for (int j = 0; j < J; j++) {
//      C[i][j] = 0;
//      for (int k = 0; k < K / VEC_LEN; k++) {
//        /* Extract the non zero offset */
//        int offset[NON_ZERO_NUM];
//        unsigned char mask = A_i[i][k];
//        int pos = 0;
//        int non_zero_pos = 0;
//        while (pos < VEC_LEN) {
//          unsigned char cur_mask = mask & (1 << pos);
//          if (cur_mask) {
//            offset[non_zero_pos] = pos;
//            non_zero_pos++;
//          }
//          pos++;
//        }
//        for (int n = 0; n < NON_ZERO_NUM; n++) {
//          C[i][j] += A_d[i][k * NON_ZERO_NUM + n] * B[j][k * VEC_LEN + offset[n]];
//        }
//      }
//    }

  /* Compute the golden reference */
  for (int i = 0; i < I; i++)  
    for (int j = 0; j < J; j++) {
      C_golden[i][j] = 0;
      for (int k = 0; k < K / VEC_LEN; k++) {
        /* Extract the non zero offset */
        /* Decode the bitmask of this group back into the list of selected
         * positions, in increasing order (the same order used to fill A_d).
         */
        int offset[NON_ZERO_NUM];
        unsigned char mask = A_i[i][k];
        int pos = 0;
        int non_zero_pos = 0;
        while (pos < VEC_LEN) {
          unsigned char cur_mask = mask & (1 << pos);
          if (cur_mask) {
            offset[non_zero_pos] = pos;
            non_zero_pos++;
          }
          pos++;
        }
        /* Multiply each stored value with the element of B at its original
         * position inside the group.
         */
        for (int n = 0; n < NON_ZERO_NUM; n++) {
          C_golden[i][j] += A_d[i][k * NON_ZERO_NUM + n] * B[j][k * VEC_LEN + offset[n]];
        }
      }
    }  

  /* Compare the results */
  int err = 0;
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      if (fabs((float)C_golden[i][j] - (float)C[i][j]) > 0.001)
        err++;
    }

  if (err)
    printf("Failed with %d errors!\n", err);
  else
    printf("Passed!\n");

  return 0;
}


================================================
FILE: autosa_tests/mm_block_sparse/kernel.h
================================================
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

typedef float data_t;
#define I 64
#define J 64
#define K 64

//#define VEC_LEN 4
//#define NON_ZERO_NUM 3
//#define COMPRESS_RATIO (VEC_LEN/NON_ZERO_NUM)
//#define META_DATA_NUM 1
//#define EFF_COMPRESS_RATIO (VEC_LEN/(NON_ZERO_NUM+META_DATA_NUM))

#define VEC_LEN 4
#define NON_ZERO_NUM 2
#define COMPRESS_RATIO (VEC_LEN/NON_ZERO_NUM)
#define META_DATA_NUM 2
#define EFF_COMPRESS_RATIO (VEC_LEN/(NON_ZERO_NUM+META_DATA_NUM))

//#define VEC_LEN 4
//#define NON_ZERO_NUM 1
//#define COMPRESS_RATIO (VEC_LEN/NON_ZERO_NUM)
//#define META_DATA_NUM 1
//#define EFF_COMPRESS_RATIO (VEC_LEN/(NON_ZERO_NUM+META_DATA_NUM))

//#define VEC_LEN 8
//#define NON_ZERO_NUM 4
//#define COMPRESS_RATIO (VEC_LEN/NON_ZERO_NUM)
//#define META_DATA_NUM 4
//#define EFF_COMPRESS_RATIO (VEC_LEN/(NON_ZERO_NUM+META_DATA_NUM))

//#define VEC_LEN 8
//#define NON_ZERO_NUM 3
//#define COMPRESS_RATIO (VEC_LEN/NON_ZERO_NUM)
//#define META_DATA_NUM 1
//#define EFF_COMPRESS_RATIO (VEC_LEN/(NON_ZERO_NUM+META_DATA_NUM))

//#define VEC_LEN 8
//#define NON_ZERO_NUM 2
//#define COMPRESS_RATIO (VEC_LEN/NON_ZERO_NUM)
//#define META_DATA_NUM 2
//#define EFF_COMPRESS_RATIO (VEC_LEN/(NON_ZERO_NUM+META_DATA_NUM))


================================================
FILE: autosa_tests/mm_block_sparse/simd_info.json
================================================
{
  "kernel0": {
    "reduction": ["y"]
  },
  "kernel1": {
    "reduction": ["y"]
  },
  "kernel2": {
    "reduction": ["y"]
  }, 
  "kernel3": {
    "reduction": ["y"]
  },
  "kernel4": {
    "reduction": ["y"]
  }
}


================================================
FILE: autosa_tests/mm_catapult/README.md
================================================
# Matrix Multiplication (Small)

Board        | Software Version
-------------|-----------------
N/A | Mentor Graphics Catapult Ultra 10.5c

__Files__:
```
autosa_tests/mm_catapult/kernel.c
autosa_tests/mm_catapult/kernel.h
autosa_tests/mm_catapult/simd_info.json
```

__Command__:
This project shows the example of using Catapult HLS to generate FPGA designs.

To generate the input code for Catapult HLS, use the command below.
```bash
./autosa ./autosa_tests/mm_catapult/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_catapult_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" --simd-info=./autosa_tests/mm/simd_info.json --host-serialize
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`.
Catapult HLS requires the GUI or TCL to perform the hardware optimization. AutoSA generates an example TCL flow named `kernel_directives.tcl` that can be found in the directory `autosa.tmp/output`.

There are several limitations for the current Catapult HLS flow.
1. Floating point is not supported. Only unsigned short and unsigned int are currently supported.
2. In order to achieve II=1, programmers need to provide additional dependence information in the TCL file.
3. To successfully pass the C simulation, Catapult HLS requires the use of guards for input fifos. At present, programmers are required to add the guards manually.

Catapult HLS will generate RTL which can be synthesized on the target FPGAs.

================================================
FILE: autosa_tests/mm_catapult/directives.tcl
================================================
solution new -state initial
solution options defaults
solution options set /Input/CppStandard c++11
solution options set /Output/GenerateCycleNetlist false
solution options set /Flows/SCVerify/USE_CCS_BLOCK true
solution file add ../../research/autosa/AutoSA/autosa.tmp/output/src/kernel_kernel_hw.h -type CHEADER
solution file add ../../research/autosa/AutoSA/autosa.tmp/output/src/kernel_kernel.h -type CHEADER
solution file add ../../research/autosa/AutoSA/autosa.tmp/output/src/kernel.h -type CHEADER
solution file add ../../research/autosa/AutoSA/autosa.tmp/output/src/kernel_host.cpp -type C++
directive set -PIPELINE_RAMP_UP true
directive set -PROTOTYPING_ENGINE oasys
directive set -CLUSTER_TYPE combinational
directive set -CLUSTER_FAST_MODE false
directive set -CLUSTER_RTL_SYN false
directive set -CLUSTER_OPT_CONSTANT_INPUTS true
directive set -CLUSTER_ADDTREE_IN_COUNT_THRESHOLD 0
directive set -CLUSTER_ADDTREE_IN_WIDTH_THRESHOLD 0
directive set -ROM_THRESHOLD 64
directive set -PROTOTYPE_ROM true
directive set -CHARACTERIZE_ROM false
directive set -OPT_CONST_MULTS use_library
directive set -CLOCK_OVERHEAD 20.000000
directive set -RESET_CLEARS_ALL_REGS use_library
directive set -START_FLAG {}
directive set -READY_FLAG {}
directive set -DONE_FLAG {}
directive set -TRANSACTION_DONE_SIGNAL true
directive set -STALL_FLAG false
directive set -IDLE_SIGNAL {}
directive set -REGISTER_IDLE_SIGNAL false
directive set -ARRAY_SIZE 1024
directive set -CHAN_IO_PROTOCOL use_library
directive set -IO_MODE super
directive set -UNROLL no
directive set -REALLOC true
directive set -MUXPATH true
directive set -TIMING_CHECKS true
directive set -ASSIGN_OVERHEAD 0
directive set -REGISTER_SHARING_LIMIT 0
directive set -REGISTER_SHARING_MAX_WIDTH_DIFFERENCE 8
directive set -SAFE_FSM false
directive set -NO_X_ASSIGNMENTS true
directive set -REG_MAX_FANOUT 0
directive set -FSM_BINARY_ENCODING_THRESHOLD 64
directive set -FSM_ENCODING none
directive set -LOGIC_OPT false
directive set -MEM_MAP_THRESHOLD 32
directive set -REGISTER_THRESHOLD 256
directive set -MERGEABLE true
directive set -SPECULATE true
directive set -DESIGN_GOAL area
go new
solution library add mgc_Xilinx-VIRTEX-uplus-2LV_beh -- -rtlsyntool Vivado -manufacturer Xilinx -family VIRTEX-uplus -speed -2LV -part xcvu11p-flga2577-2LV-e
solution library add Xilinx_RAMS
solution library add Xilinx_ROMS
solution library add amba
solution library add ccs_fpga_hic
solution library add Xilinx_FIFO
go libraries
directive set -CLOCKS {clk {-CLOCK_PERIOD 5.0 -CLOCK_EDGE rising -CLOCK_UNCERTAINTY 0.0 -CLOCK_HIGH_TIME 2.5 -RESET_SYNC_NAME rst -RESET_ASYNC_NAME arst_n -RESET_KIND sync -RESET_SYNC_ACTIVE high -RESET_ASYNC_ACTIVE low -ENABLE_ACTIVE high}}
go assembly
directive set -FIFO_DEPTH 1
directive set /kernel0/A_IO_L2_in_intra_trans/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/A_IO_L2_in_inter_trans/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/A_IO_L2_in_inter_trans_boundary/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/A_IO_L2_in/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/A_IO_L2_in_boundary/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/B_IO_L2_in_intra_trans/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/B_IO_L2_in_inter_trans/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/B_IO_L2_in_inter_trans_boundary/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/B_IO_L2_in/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/B_IO_L2_in_boundary/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/PE/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/PE/idy:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/C_drain_IO_L1_out_intra_trans/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/C_drain_IO_L1_out_intra_trans/idy:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/C_drain_IO_L1_out_inter_trans/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/C_drain_IO_L1_out_inter_trans/idy:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/C_drain_IO_L1_out_inter_trans_boundary/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/C_drain_IO_L1_out_inter_trans_boundary/idy:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/C_drain_IO_L1_out/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/C_drain_IO_L1_out/idy:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/C_drain_IO_L1_out_boundary/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/C_drain_IO_L1_out_boundary/idy:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/C_drain_IO_L2_out/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/C_drain_IO_L2_out_boundary/idx:rsc -MAP_TO_MODULE {[DirectInput]}
directive set /kernel0/A_IO_L2_in/A_IO_L2_in_local_A_inst:cns -MAP_TO_MODULE Xilinx_RAMS.BLOCK_1R1W_RBW_DUAL
directive set /kernel0/A_IO_L2_in/A_IO_L2_in_local_A_inst:cns -STAGE_REPLICATION 2
directive set /kernel0/A_IO_L2_in/A_IO_L2_in_local_A_inst -WORD_WIDTH 256
directive set /kernel0/A_IO_L2_in_boundary/A_IO_L2_in_local_A_inst:cns -MAP_TO_MODULE Xilinx_RAMS.BLOCK_1R1W_RBW_DUAL
directive set /kernel0/A_IO_L2_in_boundary/A_IO_L2_in_local_A_inst:cns -STAGE_REPLICATION 2
directive set /kernel0/A_IO_L2_in_boundary/A_IO_L2_in_local_A_inst -WORD_WIDTH 256
directive set /kernel0/B_IO_L2_in/B_IO_L2_in_local_B_inst:cns -MAP_TO_MODULE Xilinx_RAMS.BLOCK_1R1W_RBW_DUAL
directive set /kernel0/B_IO_L2_in/B_IO_L2_in_local_B_inst:cns -STAGE_REPLICATION 2
directive set /kernel0/B_IO_L2_in/B_IO_L2_in_local_B_inst -WORD_WIDTH 256
directive set /kernel0/B_IO_L2_in_boundary/B_IO_L2_in_local_B_inst:cns -MAP_TO_MODULE Xilinx_RAMS.BLOCK_1R1W_RBW_DUAL
directive set /kernel0/B_IO_L2_in_boundary/B_IO_L2_in_local_B_inst:cns -STAGE_REPLICATION 2
directive set /kernel0/B_IO_L2_in_boundary/B_IO_L2_in_local_B_inst -WORD_WIDTH 256
directive set /kernel0/C_drain_IO_L1_out/C_drain_IO_L1_out_local_C_inst:cns -MAP_TO_MODULE Xilinx_RAMS.BLOCK_1R1W_RBW_DUAL
directive set /kernel0/C_drain_IO_L1_out/C_drain_IO_L1_out_local_C_inst:cns -STAGE_REPLICATION 1
directive set /kernel0/C_drain_IO_L1_out/C_drain_IO_L1_out_local_C_inst -WORD_WIDTH 64
directive set /kernel0/C_drain_IO_L1_out_boundary/C_drain_IO_L1_out_local_C_inst:cns -MAP_TO_MODULE Xilinx_RAMS.BLOCK_1R1W_RBW_DUAL
directive set /kernel0/C_drain_IO_L1_out_boundary/C_drain_IO_L1_out_local_C_inst:cns -STAGE_REPLICATION 1
directive set /kernel0/C_drain_IO_L1_out_boundary/C_drain_IO_L1_out_local_C_inst -WORD_WIDTH 64
go architect
# Insert directives for dependence if necessary.
# (Tcl comments start with '#'; a line starting with '//' would be evaluated as a command.)
# Example: directive set /kernel0/PE/run/for:read_mem(local_C:rsc.@) -IGNORE_DEPENDENCY_FROM {for:write_mem(local_C:rsc.@) for:write_mem(local_C:rsc.@)}
directive set /kernel0/PE/run/for#1:for:for:for:for#2:read_mem(local_C:rsc.@) -IGNORE_DEPENDENCY_FROM {for#1:for:for:for:for#2:write_mem(local_C:rsc.@)}
go allocate
go extract


================================================
FILE: autosa_tests/mm_catapult/kernel.c
================================================
#include "kernel.h"

int main(int argc, char **argv) {
  data_t A[I_P][K_P], B[J_P][K_P], C[I_P][J_P], C_golden[I_P][J_P]; // gemm0,3

  for (int i = 0; i < I_P; i++) 
    for (int k = 0; k < K_P; k++) {
      //A[i][k] = (data_t)rand() / RAND_MAX;
      A[i][k] = (data_t)1;
    }

  for (int j = 0; j < J_P; j++)
    for (int k = 0; k < K_P; k++) {
      //B[j][k] = (data_t)rand() / RAND_MAX;
      B[j][k] = (data_t)1;
    }

#pragma scop
  for (int i = 0; i < I_P; i++)
    for (int j = 0; j < J_P; j++) {
      C[i][j] = 0;
      for (int k = 0; k < K_P; k++) {
        C[i][j] = C[i][j] + A[i][k] * B[j][k];
      }
    }
#pragma endscop

  for (int i = 0; i < I_P; i++)
    for (int j = 0; j < J_P; j++) {
      C_golden[i][j] = 0;
      for (int k = 0; k < K_P; k++) {
        C_golden[i][j] = C_golden[i][j] + A[i][k] * B[j][k];
      }
    }

  int err = 0;
  for (int i = 0; i < I_P; i++)
    for (int j = 0; j < J_P; j++) {
      if (fabs((float)C_golden[i][j] - (float)C[i][j]) > 0.001)
        err++;
    }

  if (err)
    printf("Failed with %d errors!\n", err);
  else
    printf("Passed!\n");

  return 0;
}


================================================
FILE: autosa_tests/mm_catapult/kernel.h
================================================
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

//typedef float data_t;
typedef unsigned int data_t;
#define I_P 64
#define J_P 64
#define K_P 64


================================================
FILE: autosa_tests/mm_catapult/kernel_kernel_hw.h
================================================
#include "kernel_kernel.h"

/* Local buffer for array A: an 8 x 2 array of A_t8 words, wrapped in a struct
 * so that a whole buffer can be sent through an ac_channel (see its use as
 * ac_channel<A_IO_L2_in_local_A> in the A_IO_L2_in modules).
 */
struct A_IO_L2_in_local_A {
  A_t8 data[8][2];
};

/* Local buffer for array B: an 8 x 2 array of B_t8 words.
 * NOTE(review): presumably used like A_IO_L2_in_local_A, as the payload of a
 * channel between the B_IO_L2_in stages -- confirm against those modules.
 */
struct B_IO_L2_in_local_B {
  B_t8 data[8][2];
};

/* Local buffer for array C: an 8 x 4 array of C_t2 words.
 * NOTE(review): presumably the payload exchanged between the
 * C_drain_IO_L1_out stages -- confirm against those modules.
 */
struct C_drain_IO_L1_out_local_C {
  C_t2 data[8][4];
};

#include <mc_scverify.h>

/* Module Definition */
/* Pass-through stage for array A: each step reads one A_t8 word from
 * fifo_A_serialize and writes it unchanged to fifo_A_local_out.
 *
 * Outside synthesis (__SYNTHESIS__ undefined) the body is wrapped in a loop
 * nest of 4 * 4 * 4 * 2 * 8 * 2 = 2048 iterations. When __SYNTHESIS__ is
 * defined the loop headers are compiled out and one call of run() moves a
 * single word.
 */
class A_IO_L3_in {
  public:
    A_IO_L3_in() {}
    #pragma hls_design interface
    #pragma hls_pipeline_init_interval 1
    void CCS_BLOCK(run)(ac_channel<A_t8> &fifo_A_serialize, ac_channel<A_t8> &fifo_A_local_out) {
      /* Variable Declaration */
      /* Variable Declaration */

#ifndef __SYNTHESIS__
      // while () // Please add the fifo check for C sim.
      // C-simulation only: iterate over all 2048 words of one run.
      for (ac_int<3, false> c0 = 0; c0 <= 3; c0 += 1)
        for (ac_int<3, false> c1 = 0; c1 <= 3; c1 += 1)
          for (ac_int<3, false> c2 = 0; c2 <= 3; c2 += 1)
            for (ac_int<2, false> c3 = 0; c3 <= 1; c3 += 1)
              for (ac_int<4, false> c4 = 0; c4 <= 7; c4 += 1)
                for (ac_int<2, false> c5 = 0; c5 <= 1; c5 += 1)
#endif
                {
                  // hls_pipeline
                {
                  // Forward one word from the input fifo to the output fifo.
                  A_t8 fifo_data;
                  fifo_data = fifo_A_serialize.read();
                  fifo_A_local_out.write(fifo_data);
                }
                }
    }
};
/* Module Definition */

/* Module Definition */
/* Reads the 1024 A_t16 words of the array A and splits each one into two
 * 256-bit slices that are written to fifo_A_local_out, lower slice first
 * (2048 fifo writes in total).
 */
class A_IO_L3_in_serialize {
  public:
    A_IO_L3_in_serialize() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(A_t16 A[1024], ac_channel<A_t8> &fifo_A_local_out) {
      /* Variable Declaration */
      /* Variable Declaration */

#ifndef __SYNTHESIS__
      // while () // Please add the fifo check for C sim.
#endif
      A_t8 fifo_data;
      A_t16 mem_data;
      #pragma hls_pipeline_init_interval 1
      for (ac_int<11, false> i = 0; i < 1024; i++) {
        mem_data = A[i];
        for (ac_int<2, false> p = 0; p < 2; p++) {
          // Emit the low 256 bits, then shift the next slice into place.
          fifo_data = mem_data.slc<256>(0);
          mem_data = mem_data >> 256;
          fifo_A_local_out.write(fifo_data);
        }
      }
    }
};
/* Module Definition */

/* Module Definition */
/* Drains one local A buffer towards fifo_A_local_out.
 *
 * Per buffer read from the local_A channel, 8 * 8 * 8 = 512 A_t2 words are
 * written: for every (c5, c6, c7) the A_t8 word data[c7][2 * c5 / 8] is split
 * into four 64-bit slices and slice (c5 % 4) is sent. c6 does not take part
 * in the selection, so the same data is sent again for each value of c6.
 *
 * Outside synthesis the body is repeated 4 * 4 * 4 = 64 times; with
 * __SYNTHESIS__ defined one call of run() handles a single buffer.
 * idx is copied to p0, which is not used further in this module.
 */
class A_IO_L2_in_intra_trans {
  public:
    A_IO_L2_in_intra_trans() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, ac_channel<A_IO_L2_in_local_A> &local_A, ac_channel<A_t2> &fifo_A_local_out) {
      /* Variable Declaration */
      int p0 = idx; // module id
      /* Variable Declaration */


#ifndef __SYNTHESIS__
      // while () // Please add the fifo check for C sim.
      for (ac_int<3, false> c0 = 0; c0 <= 3; c0 += 1)
        for (ac_int<3, false> c1 = 0; c1 <= 3; c1 += 1)
          for (ac_int<3, false> c2 = 0; c2 <= 3; c2 += 1)
#endif
          {
            // Fetch the whole buffer once, then stream it out word by word.
            A_IO_L2_in_local_A local_A_tmp;
            local_A_tmp = local_A.read();
            // synth
            #pragma hls_pipeline_init_interval 1
            for (ac_int<4, false> c5 = 0; c5 <= 7; c5 += 1)
              for (ac_int<4, false> c6 = 0; c6 <= 7; c6 += 1)
                for (ac_int<4, false> c7 = 0; c7 <= 7; c7 += 1) {
                  // hls_pipeline
                  A_t2 fifo_data;
                  A_t8 buf_data;
                  A_t2 buf_data_split[4];
                  // Column 0 for c5 in 0..3, column 1 for c5 in 4..7.
                  buf_data = local_A_tmp.data[c7][2 * c5 / 8];
                  buf_data_split[0] = buf_data.slc<64>(0);
                  buf_data_split[1] = buf_data.slc<64>(64);
                  buf_data_split[2] = buf_data.slc<64>(128);
                  buf_data_split[3] = buf_data.slc<64>(192);
                  // Pick the 64-bit slice inside the selected word.
                  int split_i = (c5) % 4;
                  fifo_data = buf_data_split[split_i];
                  fifo_A_local_out.write(fifo_data);
                }
          }
    }
};
/* Module Definition */

/* Module Definition */
/* Fills one local A buffer from fifo_A_in and forwards the remaining data.
 *
 * c3 runs from this module's id (p0 = idx) up to 1. In the iteration where
 * c3 == p0, 8 * 2 words are read from fifo_A_in into local_A_tmp.data[c4][c5];
 * in every later iteration 8 * 2 words are read from fifo_A_in and written
 * unchanged to fifo_A_out. Afterwards the filled buffer is written to the
 * local_A channel.
 *
 * Outside synthesis the body is repeated 4 * 4 * 4 = 64 times; with
 * __SYNTHESIS__ defined one call of run() handles a single buffer.
 */
class A_IO_L2_in_inter_trans {
  public:
    A_IO_L2_in_inter_trans() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, ac_channel<A_IO_L2_in_local_A> &local_A, ac_channel<A_t8> &fifo_A_in, ac_channel<A_t8> &fifo_A_out) {
      /* Variable Declaration */
      int p0 = idx; // module id
      /* Variable Declaration */

#ifndef __SYNTHESIS__
      // while () // Please add the fifo check for C sim.
      for (ac_int<3, false> c0 = 0; c0 <= 3; c0 += 1)
        for (ac_int<3, false> c1 = 0; c1 <= 3; c1 += 1)
          for (ac_int<3, false> c2 = 0; c2 <= 3; c2 += 1)
#endif
          {
            A_IO_L2_in_local_A local_A_tmp;
            // synth
            #pragma hls_pipeline_init_interval 1
            for (ac_int<2, false> c3 = p0; c3 <= 1; c3 += 1) {
              if (c3 == p0) {
                // Data for this module: store it in the local buffer.
                for (ac_int<4, false> c4 = 0; c4 <= 7; c4 += 1)
                  for (ac_int<2, false> c5 = 0; c5 <= 1; c5 += 1) {
                    // hls_pipeline
                    A_t8 fifo_data;
                    fifo_data = fifo_A_in.read();
                    local_A_tmp.data[c4][c5] = fifo_data;
                  }
              } else {
                // Data for a later c3: pass it on through fifo_A_out.
                for (ac_int<4, false> c4 = 0; c4 <= 7; c4 += 1)
                  for (ac_int<2, false> c5 = 0; c5 <= 1; c5 += 1) {
                    // hls_pipeline
                    A_t8 fifo_data;
                    fifo_data = fifo_A_in.read();
                    fifo_A_out.write(fifo_data);
                  }
              }
            }
            // Hand the filled buffer over through the local_A channel.
            local_A.write(local_A_tmp);
          }
    }
};
/* Module Definition */

/* Module Definition */
/* Boundary variant of A_IO_L2_in_inter_trans: it has no output fifo.
 *
 * c3 runs from this module's id (p0 = idx) up to 1, but only the iteration
 * with c3 == p0 does any work: 8 * 2 words are read from fifo_A_in into
 * local_A_tmp.data[c4][c5]. Afterwards the buffer is written to the local_A
 * channel.
 *
 * Outside synthesis the body is repeated 4 * 4 * 4 = 64 times; with
 * __SYNTHESIS__ defined one call of run() handles a single buffer.
 */
class A_IO_L2_in_inter_trans_boundary {
  public:
    A_IO_L2_in_inter_trans_boundary() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, ac_channel<A_IO_L2_in_local_A> &local_A, ac_channel<A_t8> &fifo_A_in) {
      /* Variable Declaration */
      int p0 = idx; // module id
      /* Variable Declaration */

#ifndef __SYNTHESIS__
      // while () // Please add the fifo check for C sim.
      for (ac_int<3, false> c0 = 0; c0 <= 3; c0 += 1)
        for (ac_int<3, false> c1 = 0; c1 <= 3; c1 += 1)
          for (ac_int<3, false> c2 = 0; c2 <= 3; c2 += 1)
#endif
          {
            A_IO_L2_in_local_A local_A_tmp;
            // synth
            #pragma hls_pipeline_init_interval 1
            for (ac_int<2, false> c3 = p0; c3 <= 1; c3 += 1)
              if (c3 == p0)
                // Data for this module: store it in the local buffer.
                for (ac_int<4, false> c4 = 0; c4 <= 7; c4 += 1)
                  for (ac_int<2, false> c5 = 0; c5 <= 1; c5 += 1) {
                    // hls_pipeline
                    A_t8 fifo_data;
                    fifo_data = fifo_A_in.read();
                    local_A_tmp.data[c4][c5] = fifo_data;
                  }
            // Hand the filled buffer over through the local_A channel.
            local_A.write(local_A_tmp);
          }
    }
};
/* Module Definition */

/* Module Definition */
// A_IO_L2_in: L2 input module for array A with a downstream neighbour.
// It chains two sub-blocks through the private tile channel
// A_IO_L2_in_local_A_inst: the inter-module transfer stage (producer of the
// tile channel) followed by the intra-module transfer stage (which is handed
// the same tile channel together with fifo_A_local_out).
class A_IO_L2_in {
  public:
    A_IO_L2_in() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, ac_channel<A_t8> &fifo_A_in, ac_channel<A_t8> &fifo_A_out, ac_channel<A_t2> &fifo_A_local_out) {
      /* Variable Declaration */
      int p0 = idx; // module id
      /* Variable Declaration */

      // Stage 1: take words from fifo_A_in; keep this module's tile and pass
      // the remainder on through fifo_A_out.
      A_IO_L2_in_inter_trans_inst.run(p0, A_IO_L2_in_local_A_inst, fifo_A_in, fifo_A_out);

      // Stage 2: runs on the tile channel filled by stage 1 and drives
      // fifo_A_local_out.
      A_IO_L2_in_intra_trans_inst.run(p0, A_IO_L2_in_local_A_inst, fifo_A_local_out);
    }

  private:
    // Sub-block instances and the tile channel that connects them.
    A_IO_L2_in_inter_trans A_IO_L2_in_inter_trans_inst;
    A_IO_L2_in_intra_trans A_IO_L2_in_intra_trans_inst;
    ac_channel<A_IO_L2_in_local_A> A_IO_L2_in_local_A_inst;
};
/* Module Definition */

/* Module Definition */
// A_IO_L2_in_boundary: L2 input module for array A without an outgoing
// fifo_A_out. It chains the boundary inter-module transfer stage and the
// intra-module transfer stage through the private tile channel
// A_IO_L2_in_local_A_inst.
class A_IO_L2_in_boundary {
  public:
    A_IO_L2_in_boundary() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, ac_channel<A_t8> &fifo_A_in, ac_channel<A_t2> &fifo_A_local_out) {
      /* Variable Declaration */
      int p0 = idx; // module id
      /* Variable Declaration */

      // Stage 1: fill this module's tile from fifo_A_in and publish it on the
      // tile channel.
      A_IO_L2_in_inter_trans_boundary_inst.run(p0, A_IO_L2_in_local_A_inst, fifo_A_in);

      // Stage 2: runs on the tile channel filled by stage 1 and drives
      // fifo_A_local_out.
      A_IO_L2_in_intra_trans_inst.run(p0, A_IO_L2_in_local_A_inst, fifo_A_local_out);
    }

  private:
    // Sub-block instances and the tile channel that connects them.
    A_IO_L2_in_inter_trans_boundary A_IO_L2_in_inter_trans_boundary_inst;
    A_IO_L2_in_intra_trans A_IO_L2_in_intra_trans_inst;
    ac_channel<A_IO_L2_in_local_A> A_IO_L2_in_local_A_inst;
};
/* Module Definition */

/* Module Definition */
// B_IO_L3_in: pass-through stage for array B. Every B_t8 word read from
// fifo_B_serialize is written unchanged to fifo_B_local_out.
class B_IO_L3_in {
  public:
    B_IO_L3_in() {}
    #pragma hls_design interface
    #pragma hls_pipeline_init_interval 1
    void CCS_BLOCK(run)(ac_channel<B_t8> &fifo_B_serialize, ac_channel<B_t8> &fifo_B_local_out) {
      // fifo_B_serialize: input stream.
      // fifo_B_local_out: output stream.
      /* Variable Declaration */
      /* Variable Declaration */

#ifndef __SYNTHESIS__
      // C-simulation only: copy 4*4*4*2*8*2 = 2048 words per call. When
      // __SYNTHESIS__ is defined the loop headers are compiled out and a
      // single word is copied per call.
      // while () // Please add the fifo check for C sim.
      for (ac_int<3, false> c0 = 0; c0 <= 3; c0 += 1)
        for (ac_int<3, false> c1 = 0; c1 <= 3; c1 += 1)
          for (ac_int<3, false> c2 = 0; c2 <= 3; c2 += 1)
            for (ac_int<2, false> c3 = 0; c3 <= 1; c3 += 1)
              for (ac_int<4, false> c4 = 0; c4 <= 7; c4 += 1)
                for (ac_int<2, false> c5 = 0; c5 <= 1; c5 += 1)
#endif
                {
                  // hls_pipeline
                {
                  B_t8 fifo_data;
                  fifo_data = fifo_B_serialize.read();
                  fifo_B_local_out.write(fifo_data);
                }
                }
    }
};
/* Module Definition */

/* Module Definition */
// B_IO_L3_in_serialize: reads the B array word by word and streams each
// B_t16 memory word out as two B_t8 fifo words.
class B_IO_L3_in_serialize {
  public:
    B_IO_L3_in_serialize() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(B_t16 B[1024], ac_channel<B_t8> &fifo_B_local_out) {
      // B:                input array of 1024 B_t16 words.
      // fifo_B_local_out: output stream; receives 2 * 1024 = 2048 words.
      /* Variable Declaration */
      /* Variable Declaration */

#ifndef __SYNTHESIS__
      // while () // Please add the fifo check for C sim.
#endif
      B_t8 fifo_data;
      B_t16 mem_data;
      #pragma hls_pipeline_init_interval 1
      for (ac_int<11, false> i = 0; i < 1024; i++) {
        mem_data = B[i];
        // Emit the low 256 bits first, then shift the next 256 bits down so
        // the second iteration sends what was originally bits 256..511.
        for (ac_int<2, false> p = 0; p < 2; p++) {
          fifo_data = mem_data.slc<256>(0);
          mem_data = mem_data >> 256;
          fifo_B_local_out.write(fifo_data);
        }
      }
    }
};
/* Module Definition */

/* Module Definition */
// B_IO_L2_in_intra_trans: intra-module transfer stage of a B L2 input
// module. Each pass takes one tile from the local_B channel and streams
// 8*8*8 = 512 B_t2 words, selected from that tile, to fifo_B_local_out.
class B_IO_L2_in_intra_trans {
  public:
    B_IO_L2_in_intra_trans() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, ac_channel<B_IO_L2_in_local_B> &local_B, ac_channel<B_t2> &fifo_B_local_out) {
      // idx:              module id (copied to p0; not otherwise used here).
      // local_B:          input channel, one tile is read per pass.
      // fifo_B_local_out: output stream of B_t2 words.
      /* Variable Declaration */
      int p0 = idx; // module id
      /* Variable Declaration */


#ifndef __SYNTHESIS__
      // C-simulation only: repeat the pass 4*4*4 = 64 times. When
      // __SYNTHESIS__ is defined these loop headers are compiled out and the
      // braced body below is executed once per call.
      // while () // Please add the fifo check for C sim.
      for (ac_int<3, false> c0 = 0; c0 <= 3; c0 += 1)
        for (ac_int<3, false> c1 = 0; c1 <= 3; c1 += 1)
          for (ac_int<3, false> c2 = 0; c2 <= 3; c2 += 1)
#endif
          {
            B_IO_L2_in_local_B local_B_tmp;
            local_B_tmp = local_B.read();
            // synth
            #pragma hls_pipeline_init_interval 1
            for (ac_int<4, false> c5 = 0; c5 <= 7; c5 += 1)
              for (ac_int<4, false> c6 = 0; c6 <= 7; c6 += 1)
                for (ac_int<4, false> c7 = 0; c7 <= 7; c7 += 1) {
                  // hls_pipeline
                  B_t2 fifo_data;
                  B_t8 buf_data;
                  B_t2 buf_data_split[4];
                  // Tile word for row c6; the column index 2 * c5 / 8 is 0
                  // for c5 in 0..3 and 1 for c5 in 4..7.
                  buf_data = local_B_tmp.data[c6][2 * c5 / 8];
                  // Split the word into four 64-bit slices.
                  buf_data_split[0] = buf_data.slc<64>(0);
                  buf_data_split[1] = buf_data.slc<64>(64);
                  buf_data_split[2] = buf_data.slc<64>(128);
                  buf_data_split[3] = buf_data.slc<64>(192);
                  // Slice within the word is chosen by c5 % 4. The selection
                  // does not depend on c7, so the same slice is sent on each
                  // of the 8 consecutive c7 iterations.
                  int split_i = (c5) % 4;
                  fifo_data = buf_data_split[split_i];
                  fifo_B_local_out.write(fifo_data);
                }
          }
    }
};
/* Module Definition */

/* Module Definition */
// B_IO_L2_in_inter_trans: inter-module transfer stage of a B L2 input
// module that has a downstream neighbour. Each pass stores this module's
// 8x2 tile of B_t8 words from fifo_B_in, forwards any further tiles to
// fifo_B_out, and then publishes the stored tile on the local_B channel.
class B_IO_L2_in_inter_trans {
  public:
    B_IO_L2_in_inter_trans() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, ac_channel<B_IO_L2_in_local_B> &local_B, ac_channel<B_t8> &fifo_B_in, ac_channel<B_t8> &fifo_B_out) {
      // idx:       module id; it is the starting value of the c3 loop below.
      // local_B:   output channel, receives one tile per pass.
      // fifo_B_in: input stream of B_t8 words.
      // fifo_B_out: output stream for words that are passed on unchanged.
      /* Variable Declaration */
      int p0 = idx; // module id
      /* Variable Declaration */

#ifndef __SYNTHESIS__
      // C-simulation only: repeat the pass 4*4*4 = 64 times. When
      // __SYNTHESIS__ is defined these loop headers are compiled out and the
      // braced body below is executed once per call.
      // while () // Please add the fifo check for C sim.
      for (ac_int<3, false> c0 = 0; c0 <= 3; c0 += 1)
        for (ac_int<3, false> c1 = 0; c1 <= 3; c1 += 1)
          for (ac_int<3, false> c2 = 0; c2 <= 3; c2 += 1)
#endif
          {
            B_IO_L2_in_local_B local_B_tmp;
            // c3 runs from p0 up to 1. The first iteration (c3 == p0) keeps
            // 16 words in local_B_tmp; every later iteration copies 16 words
            // from fifo_B_in to fifo_B_out.
            // synth
            #pragma hls_pipeline_init_interval 1
            for (ac_int<2, false> c3 = p0; c3 <= 1; c3 += 1) {
              if (c3 == p0) {
                for (ac_int<4, false> c4 = 0; c4 <= 7; c4 += 1)
                  for (ac_int<2, false> c5 = 0; c5 <= 1; c5 += 1) {
                    // hls_pipeline
                    B_t8 fifo_data;
                    fifo_data = fifo_B_in.read();
                    local_B_tmp.data[c4][c5] = fifo_data;
                  }
              } else {
                for (ac_int<4, false> c4 = 0; c4 <= 7; c4 += 1)
                  for (ac_int<2, false> c5 = 0; c5 <= 1; c5 += 1) {
                    // hls_pipeline
                    B_t8 fifo_data;
                    fifo_data = fifo_B_in.read();
                    fifo_B_out.write(fifo_data);
                  }
              }
            }
            // Hand the completed tile to the consumer of local_B.
            local_B.write(local_B_tmp);
          }
    }
};
/* Module Definition */

/* Module Definition */
// B_IO_L2_in_inter_trans_boundary: inter-module transfer stage of a B L2
// input module that has no outgoing fifo (there is no fifo_B_out parameter).
// Each pass fills a local 8x2 tile of B_t8 words from fifo_B_in and then
// publishes the whole tile on the local_B channel.
class B_IO_L2_in_inter_trans_boundary {
  public:
    B_IO_L2_in_inter_trans_boundary() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, ac_channel<B_IO_L2_in_local_B> &local_B, ac_channel<B_t8> &fifo_B_in) {
      // idx:       module id; it is the starting value of the c3 loop below.
      // local_B:   output channel, receives one tile per pass.
      // fifo_B_in: input stream of B_t8 words.
      /* Variable Declaration */
      int p0 = idx; // module id
      /* Variable Declaration */

#ifndef __SYNTHESIS__
      // C-simulation only: repeat the pass 4*4*4 = 64 times. When
      // __SYNTHESIS__ is defined these loop headers are compiled out and the
      // braced body below is executed once per call.
      // while () // Please add the fifo check for C sim.
      for (ac_int<3, false> c0 = 0; c0 <= 3; c0 += 1)
        for (ac_int<3, false> c1 = 0; c1 <= 3; c1 += 1)
          for (ac_int<3, false> c2 = 0; c2 <= 3; c2 += 1)
#endif
          {
            B_IO_L2_in_local_B local_B_tmp;
            // Only the c3 == p0 iteration does any work: it reads 8*2 = 16
            // words into local_B_tmp.data[c4][c5]. Any later c3 iteration is
            // a no-op in this boundary variant.
            // synth
            #pragma hls_pipeline_init_interval 1
            for (ac_int<2, false> c3 = p0; c3 <= 1; c3 += 1)
              if (c3 == p0)
                for (ac_int<4, false> c4 = 0; c4 <= 7; c4 += 1)
                  for (ac_int<2, false> c5 = 0; c5 <= 1; c5 += 1) {
                    // hls_pipeline
                    B_t8 fifo_data;
                    fifo_data = fifo_B_in.read();
                    local_B_tmp.data[c4][c5] = fifo_data;
                  }
            // Hand the completed tile to the consumer of local_B.
            local_B.write(local_B_tmp);
          }
    }
};
/* Module Definition */

/* Module Definition */
// B_IO_L2_in: L2 input module for array B with a downstream neighbour.
// The inter-module transfer stage fills the private tile channel
// B_IO_L2_in_local_B_inst (and forwards other tiles to fifo_B_out); the
// intra-module transfer stage consumes that tile and feeds fifo_B_local_out.
class B_IO_L2_in {
  public:
    B_IO_L2_in() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, ac_channel<B_t8> &fifo_B_in, ac_channel<B_t8> &fifo_B_out, ac_channel<B_t2> &fifo_B_local_out) {
      /* Variable Declaration */
      int p0 = idx; // module id
      /* Variable Declaration */

      // Stage 1: producer of the tile channel.
      B_IO_L2_in_inter_trans_inst.run(p0, B_IO_L2_in_local_B_inst, fifo_B_in, fifo_B_out);

      // Stage 2: consumer of the tile channel.
      B_IO_L2_in_intra_trans_inst.run(p0, B_IO_L2_in_local_B_inst, fifo_B_local_out);
    }

  private:
    // Sub-block instances and the tile channel that connects them.
    B_IO_L2_in_inter_trans B_IO_L2_in_inter_trans_inst;
    B_IO_L2_in_intra_trans B_IO_L2_in_intra_trans_inst;
    ac_channel<B_IO_L2_in_local_B> B_IO_L2_in_local_B_inst;
};
/* Module Definition */

/* Module Definition */
// B_IO_L2_in_boundary: L2 input module for array B without an outgoing
// fifo_B_out. The boundary inter-module transfer stage fills the private
// tile channel B_IO_L2_in_local_B_inst; the intra-module transfer stage
// consumes that tile and feeds fifo_B_local_out.
class B_IO_L2_in_boundary {
  public:
    B_IO_L2_in_boundary() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, ac_channel<B_t8> &fifo_B_in, ac_channel<B_t2> &fifo_B_local_out) {
      /* Variable Declaration */
      int p0 = idx; // module id
      /* Variable Declaration */

      // Stage 1: producer of the tile channel.
      B_IO_L2_in_inter_trans_boundary_inst.run(p0, B_IO_L2_in_local_B_inst, fifo_B_in);

      // Stage 2: consumer of the tile channel.
      B_IO_L2_in_intra_trans_inst.run(p0, B_IO_L2_in_local_B_inst, fifo_B_local_out);
    }

  private:
    // Sub-block instances and the tile channel that connects them.
    B_IO_L2_in_inter_trans_boundary B_IO_L2_in_inter_trans_boundary_inst;
    B_IO_L2_in_intra_trans B_IO_L2_in_intra_trans_inst;
    ac_channel<B_IO_L2_in_local_B> B_IO_L2_in_local_B_inst;
};
/* Module Definition */

/* Module Definition */
// PE: one processing element. For every input pair it multiplies two A
// lanes with two B lanes, accumulates the sum into an 8x8 block of partial
// results, forwards the A and B words to its neighbours, and emits each
// accumulator on fifo_C_drain_out once its last contribution has been added.
class PE {
  public:
    PE() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, int idy, ac_channel<A_t2> &fifo_A_in, ac_channel<A_t2> &fifo_A_out, ac_channel<B_t2> &fifo_B_in, ac_channel<B_t2> &fifo_B_out, ac_channel<C_t1> &fifo_C_drain_out) {
      // idx, idy:         module ids (copied to p0/p1; not otherwise used here).
      // fifo_A_in/out:    A operand stream in, and the same words passed on.
      // fifo_B_in/out:    B operand stream in, and the same words passed on.
      // fifo_C_drain_out: finished accumulator values, 8*8 = 64 per pass.
      /* Variable Declaration */
      int p0 = idx, p1 = idy; // module id
      A_t1 local_A[1][2];
      B_t1 local_B[1][2];
      C_t1 local_C[8][8];
      /* Variable Declaration */

#ifndef __SYNTHESIS__
      // C-simulation only: repeat the pass 4*4 = 16 times. When
      // __SYNTHESIS__ is defined these loop headers are compiled out and the
      // braced body below is executed once per call.
      // while () // Please add the fifo check for C sim.
      for (ac_int<3, false> c0 = 0; c0 <= 3; c0 += 1)
        for (ac_int<3, false> c1 = 0; c1 <= 3; c1 += 1)
#endif
        {
          {
            // Clear all 64 accumulators at the start of the pass.
            #pragma hls_pipeline_init_interval 1
            for (ac_int<4, false> c6 = 0; c6 <= 7; c6 += 1)
              for (ac_int<4, false> c7 = 0; c7 <= 7; c7 += 1) {
                // hls_unroll
                local_C[c7][c6] = 0;
              }
            // Main compute loop: 4*8*8*8 = 2048 iterations, each consuming
            // one A_t2 and one B_t2 word.
            #pragma hls_pipeline_init_interval 1
            for (ac_int<3, false> c2 = 0; c2 <= 3; c2 += 1)
              for (ac_int<4, false> c5 = 0; c5 <= 7; c5 += 1)
                for (ac_int<4, false> c6 = 0; c6 <= 7; c6 += 1)
                  for (ac_int<4, false> c7 = 0; c7 <= 7; c7 += 1) {
                    {
                      // Unpack the A word into two 32-bit lanes, low lane first.
                      A_t2 fifo_data;
                      fifo_data = fifo_A_in.read();
                      #pragma unroll yes
                      for (ac_int<2, false> n = 0; n < 2; n++) {
                        local_A[0][n] = (A_t1)fifo_data.slc<32>(0);
                        fifo_data = fifo_data >> 32;
                      }
                    }
                    {
                      // Unpack the B word into two 32-bit lanes, low lane first.
                      B_t2 fifo_data;
                      fifo_data = fifo_B_in.read();
                      #pragma unroll yes
                      for (ac_int<2, false> n = 0; n < 2; n++) {
                        local_B[0][n] = (B_t1)fifo_data.slc<32>(0);
                        fifo_data = fifo_data >> 32;
                      }
                    }
                    // Accumulate both lane products into the (c7, c6) entry.
                    #pragma unroll yes
                    for (ac_int<2, false> c8 = 0; c8 <= 1; c8 += 1)
                      local_C[c7][c6] = (local_C[c7][c6] + (local_A[0][c8] * local_B[0][c8]));
                    // c2 == 3 && c5 == 7 is the last visit to each (c7, c6)
                    // entry in this pass, so the value is final: drain it.
                    if (c2 == 3 && c5 == 7)
                      fifo_C_drain_out.write(local_C[c7][c6]);
                    {
                      // Re-pack the two B lanes (lane 1 at bit 32, lane 0 at
                      // bit 0) and pass the word on.
                      B_t2 fifo_data;
                      fifo_data.set_slc(32, local_B[0][1]);
                      fifo_data.set_slc(0, local_B[0][0]);
                      fifo_B_out.write(fifo_data);
                    }
                    {
                      // Re-pack the two A lanes the same way and pass the
                      // word on.
                      A_t2 fifo_data;
                      fifo_data.set_slc(32, local_A[0][1]);
                      fifo_data.set_slc(0, local_A[0][0]);
                      fifo_A_out.write(fifo_data);
                    }
                  }
          }
        }
    }
};
/* Module Definition */

/* Module Definition */
// C_drain_IO_L1_out_intra_trans: intra-module transfer stage of a C drain
// L1 module. Each pass reads 8*8 = 64 C_t1 values from
// fifo_C_drain_local_in, packs them two per C_t2 word into a local tile,
// and publishes the tile on the local_C channel.
class C_drain_IO_L1_out_intra_trans {
  public:
    C_drain_IO_L1_out_intra_trans() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, int idy, ac_channel<C_drain_IO_L1_out_local_C> &local_C, ac_channel<C_t1> &fifo_C_drain_local_in) {
      // idx, idy:              module ids (copied to p0/p1; not otherwise used here).
      // local_C:               output channel, receives one tile per pass.
      // fifo_C_drain_local_in: input stream of C_t1 values.
      /* Variable Declaration */
      int p0 = idx, p1 = idy; // module id
      /* Variable Declaration */


#ifndef __SYNTHESIS__
      // C-simulation only: repeat the pass 4*4 = 16 times. When
      // __SYNTHESIS__ is defined these loop headers are compiled out and the
      // braced body below is executed once per call.
      // while () // Please add the fifo check for C sim.
      for (ac_int<3, false> c0 = 0; c0 <= 3; c0 += 1)
        for (ac_int<3, false> c1 = 0; c1 <= 3; c1 += 1)
#endif
        {
          // NOTE(review): local_C_tmp is not initialised before the first
          // read-modify-write below. Both lanes of every word touched are
          // overwritten by the end of the pass (even c6 writes lane 0, odd c6
          // writes lane 1), so the published tile is fully defined.
          C_drain_IO_L1_out_local_C local_C_tmp;
          // synth
          #pragma hls_pipeline_init_interval 1
          for (ac_int<4, false> c6 = 0; c6 <= 7; c6 += 1)
            for (ac_int<4, false> c7 = 0; c7 <= 7; c7 += 1) {
              // hls_pipeline
              C_t1 fifo_data;
              C_t2 buf_data;
              C_t1 buf_data_split[2];
              // Read the current word and split it into its two 32-bit lanes.
              buf_data = local_C_tmp.data[c7][c6 / 2];
              buf_data_split[0] = buf_data.slc<32>(0);
              buf_data_split[1] = buf_data.slc<32>(32);
              // Replace the lane selected by c6 % 2 with the incoming value.
              int split_i = (c6) % 2;
              fifo_data = fifo_C_drain_local_in.read();
              buf_data_split[split_i] = fifo_data;
              // Re-assemble the word and store it back.
                            buf_data.set_slc(0, buf_data_split[0]);
              buf_data.set_slc(32, buf_data_split[1]);

              local_C_tmp.data[c7][c6 / 2] = buf_data;
            }
          local_C.write(local_C_tmp);
        }
    }
};
/* Module Definition */

/* Module Definition */
// C_drain_IO_L1_out_inter_trans: inter-module transfer stage of a C drain
// L1 module that has an upstream neighbour. Each pass takes one tile from
// the local_C channel, sends its 8*4 = 32 C_t2 words to fifo_C_drain_out,
// and then relays further words from fifo_C_drain_in to fifo_C_drain_out.
class C_drain_IO_L1_out_inter_trans {
  public:
    C_drain_IO_L1_out_inter_trans() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, int idy, ac_channel<C_drain_IO_L1_out_local_C> &local_C, ac_channel<C_t2> &fifo_C_drain_in, ac_channel<C_t2> &fifo_C_drain_out) {
      // idx:              module id (copied to p0; not otherwise used here).
      // idy:              module id; it is the starting value of the c4 loop.
      // local_C:          input channel, one tile is read per pass.
      // fifo_C_drain_in:  input stream of words to relay.
      // fifo_C_drain_out: output stream.
      /* Variable Declaration */
      int p0 = idx, p1 = idy; // module id
      /* Variable Declaration */

#ifndef __SYNTHESIS__
      // C-simulation only: repeat the pass 4*4 = 16 times. When
      // __SYNTHESIS__ is defined these loop headers are compiled out and the
      // braced body below is executed once per call.
      // while () // Please add the fifo check for C sim.
      for (ac_int<3, false> c0 = 0; c0 <= 3; c0 += 1)
        for (ac_int<3, false> c1 = 0; c1 <= 3; c1 += 1)
#endif
        {
          C_drain_IO_L1_out_local_C local_C_tmp;
          local_C_tmp = local_C.read();
          // c4 runs from p1 up to 1. The first iteration (c4 == p1) sends
          // this module's own tile; every later iteration relays 32 words
          // from fifo_C_drain_in.
          // synth
          #pragma hls_pipeline_init_interval 1
          for (ac_int<2, false> c4 = p1; c4 <= 1; c4 += 1) {
            if (c4 == p1) {
              for (ac_int<4, false> c5 = 0; c5 <= 7; c5 += 1)
                for (ac_int<3, false> c6 = 0; c6 <= 3; c6 += 1) {
                  // hls_pipeline
                  C_t2 fifo_data;
                  fifo_data = local_C_tmp.data[c5][c6];
                  fifo_C_drain_out.write(fifo_data);
                }
            } else {
              for (ac_int<4, false> c5 = 0; c5 <= 7; c5 += 1)
                for (ac_int<3, false> c6 = 0; c6 <= 3; c6 += 1) {
                  // hls_pipeline
                  C_t2 fifo_data;
                  fifo_data = fifo_C_drain_in.read();
                  fifo_C_drain_out.write(fifo_data);
                }
            }
          }
        }
    }
};
/* Module Definition */

/* Module Definition */
// C_drain_IO_L1_out_inter_trans_boundary: inter-module transfer stage of a
// C drain L1 module that has no incoming drain fifo (there is no
// fifo_C_drain_in parameter). Each pass takes one tile from the local_C
// channel and sends its 8*4 = 32 C_t2 words to fifo_C_drain_out.
class C_drain_IO_L1_out_inter_trans_boundary {
  public:
    C_drain_IO_L1_out_inter_trans_boundary() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, int idy, ac_channel<C_drain_IO_L1_out_local_C> &local_C, ac_channel<C_t2> &fifo_C_drain_out) {
      // idx:              module id (copied to p0; not otherwise used here).
      // idy:              module id; it is the starting value of the c4 loop.
      // local_C:          input channel, one tile is read per pass.
      // fifo_C_drain_out: output stream.
      /* Variable Declaration */
      int p0 = idx, p1 = idy; // module id
      /* Variable Declaration */

#ifndef __SYNTHESIS__
      // C-simulation only: repeat the pass 4*4 = 16 times. When
      // __SYNTHESIS__ is defined these loop headers are compiled out and the
      // braced body below is executed once per call.
      // while () // Please add the fifo check for C sim.
      for (ac_int<3, false> c0 = 0; c0 <= 3; c0 += 1)
        for (ac_int<3, false> c1 = 0; c1 <= 3; c1 += 1)
#endif
        {
          C_drain_IO_L1_out_local_C local_C_tmp;
          local_C_tmp = local_C.read();
          // Only the c4 == p1 iteration does any work; any later c4
          // iteration is a no-op in this boundary variant.
          // synth
          #pragma hls_pipeline_init_interval 1
          for (ac_int<2, false> c4 = p1; c4 <= 1; c4 += 1)
            if (c4 == p1)
              for (ac_int<4, false> c5 = 0; c5 <= 7; c5 += 1)
                for (ac_int<3, false> c6 = 0; c6 <= 3; c6 += 1) {
                  // hls_pipeline
                  C_t2 fifo_data;
                  fifo_data = local_C_tmp.data[c5][c6];
                  fifo_C_drain_out.write(fifo_data);
                }
        }
    }
};
/* Module Definition */

/* Module Definition */
// C_drain_IO_L1_out: C drain L1 module with an upstream neighbour.
// The intra-module transfer stage packs values arriving on
// fifo_C_drain_local_in into a tile on the private channel
// C_drain_IO_L1_out_local_C_inst; the inter-module transfer stage then sends
// that tile to fifo_C_drain_out and relays words from fifo_C_drain_in.
class C_drain_IO_L1_out {
  public:
    C_drain_IO_L1_out() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, int idy, ac_channel<C_t2> &fifo_C_drain_in, ac_channel<C_t2> &fifo_C_drain_out, ac_channel<C_t1> &fifo_C_drain_local_in) {
      /* Variable Declaration */
      int p0 = idx, p1 = idy; // module id
      /* Variable Declaration */

      // Stage 1: producer of the tile channel.
      C_drain_IO_L1_out_intra_trans_inst.run(p0, p1, C_drain_IO_L1_out_local_C_inst, fifo_C_drain_local_in);

      // Stage 2: consumer of the tile channel.
      C_drain_IO_L1_out_inter_trans_inst.run(p0, p1, C_drain_IO_L1_out_local_C_inst, fifo_C_drain_in, fifo_C_drain_out);
    }

  private:
    // Sub-block instances and the tile channel that connects them.
    C_drain_IO_L1_out_inter_trans C_drain_IO_L1_out_inter_trans_inst;
    C_drain_IO_L1_out_intra_trans C_drain_IO_L1_out_intra_trans_inst;
    ac_channel<C_drain_IO_L1_out_local_C> C_drain_IO_L1_out_local_C_inst;
};
/* Module Definition */

/* Module Definition */
// C_drain_IO_L1_out_boundary: C drain L1 module without an incoming drain
// fifo. The intra-module transfer stage packs values arriving on
// fifo_C_drain_local_in into a tile on the private channel
// C_drain_IO_L1_out_local_C_inst; the boundary inter-module transfer stage
// then sends that tile to fifo_C_drain_out.
class C_drain_IO_L1_out_boundary {
  public:
    C_drain_IO_L1_out_boundary() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, int idy, ac_channel<C_t2> &fifo_C_drain_out, ac_channel<C_t1> &fifo_C_drain_local_in) {
      /* Variable Declaration */
      int p0 = idx, p1 = idy; // module id
      /* Variable Declaration */

      // Stage 1: producer of the tile channel.
      C_drain_IO_L1_out_intra_trans_inst.run(p0, p1, C_drain_IO_L1_out_local_C_inst, fifo_C_drain_local_in);

      // Stage 2: consumer of the tile channel.
      C_drain_IO_L1_out_inter_trans_boundary_inst.run(p0, p1, C_drain_IO_L1_out_local_C_inst, fifo_C_drain_out);
    }

  private:
    // Sub-block instances and the tile channel that connects them.
    C_drain_IO_L1_out_inter_trans_boundary C_drain_IO_L1_out_inter_trans_boundary_inst;
    C_drain_IO_L1_out_intra_trans C_drain_IO_L1_out_intra_trans_inst;
    ac_channel<C_drain_IO_L1_out_local_C> C_drain_IO_L1_out_local_C_inst;
};
/* Module Definition */

/* Module Definition */
// C_drain_IO_L2_out: C drain L2 module with an upstream neighbour. Each
// pass first copies 2*8*4 = 64 C_t2 words from fifo_C_drain_local_in to
// fifo_C_drain_out, then relays 64-word groups from fifo_C_drain_in to
// fifo_C_drain_out.
class C_drain_IO_L2_out {
  public:
    C_drain_IO_L2_out() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, ac_channel<C_t2> &fifo_C_drain_in, ac_channel<C_t2> &fifo_C_drain_out, ac_channel<C_t2> &fifo_C_drain_local_in) {
      // idx:                   module id; it is the starting value of the c3 loop.
      // fifo_C_drain_in:       input stream of words to relay.
      // fifo_C_drain_out:      output stream.
      // fifo_C_drain_local_in: input stream of this module's own words.
      /* Variable Declaration */
      int p0 = idx; // module id
      /* Variable Declaration */

#ifndef __SYNTHESIS__
      // C-simulation only: repeat the pass 4*4 = 16 times. When
      // __SYNTHESIS__ is defined these loop headers are compiled out and the
      // braced body below is executed once per call.
      // while () // Please add the fifo check for C sim.
      for (ac_int<3, false> c0 = 0; c0 <= 3; c0 += 1)
        for (ac_int<3, false> c1 = 0; c1 <= 3; c1 += 1)
#endif
        {
          // c3 runs from p0 up to 1: the first iteration (c3 == p0) drains
          // the local input, every later iteration relays from
          // fifo_C_drain_in.
          #pragma hls_pipeline_init_interval 1
          for (ac_int<2, false> c3 = p0; c3 <= 1; c3 += 1) {
            if (c3 == p0) {
              for (ac_int<2, false> c4 = 0; c4 <= 1; c4 += 1)
                for (ac_int<4, false> c5 = 0; c5 <= 7; c5 += 1)
                  for (ac_int<3, false> c6 = 0; c6 <= 3; c6 += 1) {
                    // hls_pipeline
                    C_t2 fifo_data;
                    fifo_data = fifo_C_drain_local_in.read();
                    fifo_C_drain_out.write(fifo_data);
                  }
            } else {
              for (ac_int<2, false> c4 = 0; c4 <= 1; c4 += 1)
                for (ac_int<4, false> c5 = 0; c5 <= 7; c5 += 1)
                  for (ac_int<3, false> c6 = 0; c6 <= 3; c6 += 1) {
                    // hls_pipeline
                    C_t2 fifo_data;
                    fifo_data = fifo_C_drain_in.read();
                    fifo_C_drain_out.write(fifo_data);
                  }
            }
          }
        }
    }
};
/* Module Definition */

/* Module Definition */
// C_drain_IO_L2_out_boundary: C drain L2 module without an incoming drain
// fifo (there is no fifo_C_drain_in parameter). Each pass copies
// 2*8*4 = 64 C_t2 words from fifo_C_drain_local_in to fifo_C_drain_out.
class C_drain_IO_L2_out_boundary {
  public:
    C_drain_IO_L2_out_boundary() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(int idx, ac_channel<C_t2> &fifo_C_drain_out, ac_channel<C_t2> &fifo_C_drain_local_in) {
      // idx:                   module id; it is the starting value of the c3 loop.
      // fifo_C_drain_out:      output stream.
      // fifo_C_drain_local_in: input stream of this module's own words.
      /* Variable Declaration */
      int p0 = idx; // module id
      /* Variable Declaration */

#ifndef __SYNTHESIS__
      // C-simulation only: repeat the pass 4*4 = 16 times. When
      // __SYNTHESIS__ is defined these loop headers are compiled out and the
      // braced body below is executed once per call.
      // while () // Please add the fifo check for C sim.
      for (ac_int<3, false> c0 = 0; c0 <= 3; c0 += 1)
        for (ac_int<3, false> c1 = 0; c1 <= 3; c1 += 1)
#endif
        {
          // Only the c3 == p0 iteration does any work; any later c3
          // iteration is a no-op in this boundary variant.
          #pragma hls_pipeline_init_interval 1
          for (ac_int<2, false> c3 = p0; c3 <= 1; c3 += 1)
            if (c3 == p0)
              for (ac_int<2, false> c4 = 0; c4 <= 1; c4 += 1)
                for (ac_int<4, false> c5 = 0; c5 <= 7; c5 += 1)
                  for (ac_int<3, false> c6 = 0; c6 <= 3; c6 += 1) {
                    // hls_pipeline
                    C_t2 fifo_data;
                    fifo_data = fifo_C_drain_local_in.read();
                    fifo_C_drain_out.write(fifo_data);
                  }
        }
    }
};
/* Module Definition */

/* Module Definition */
// C_drain_IO_L3_out: pass-through stage for the C drain path. Every C_t2
// word read from fifo_C_drain_local_in is written unchanged to
// fifo_C_drain_serialize.
class C_drain_IO_L3_out {
  public:
    C_drain_IO_L3_out() {}
    #pragma hls_design interface
    #pragma hls_pipeline_init_interval 1
    void CCS_BLOCK(run)(ac_channel<C_t2> &fifo_C_drain_serialize, ac_channel<C_t2> &fifo_C_drain_local_in) {
      // fifo_C_drain_serialize: output stream (note: first parameter).
      // fifo_C_drain_local_in:  input stream.
      /* Variable Declaration */
      /* Variable Declaration */

#ifndef __SYNTHESIS__
      // C-simulation only: copy 4*4*2*2*8*4 = 2048 words per call. When
      // __SYNTHESIS__ is defined the loop headers are compiled out and a
      // single word is copied per call.
      // while () // Please add the fifo check for C sim.
      for (ac_int<3, false> c0 = 0; c0 <= 3; c0 += 1)
        for (ac_int<3, false> c1 = 0; c1 <= 3; c1 += 1)
          for (ac_int<2, false> c3 = 0; c3 <= 1; c3 += 1)
            for (ac_int<2, false> c4 = 0; c4 <= 1; c4 += 1)
              for (ac_int<4, false> c5 = 0; c5 <= 7; c5 += 1)
                for (ac_int<3, false> c6 = 0; c6 <= 3; c6 += 1)
#endif
                {
                  // hls_pipeline
                {
                  C_t2 fifo_data;
                  fifo_data = fifo_C_drain_local_in.read();
                  fifo_C_drain_serialize.write(fifo_data);
                }
                }
    }
};
/* Module Definition */

/* Module Definition */
// C_drain_IO_L3_out_serialize: collects the C drain stream into memory
// words. Eight consecutive C_t2 fifo words are packed into one C_t16 word,
// which is stored to the C array.
class C_drain_IO_L3_out_serialize {
  public:
    C_drain_IO_L3_out_serialize() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(C_t16 C[256], ac_channel<C_t2> &fifo_C_drain_local_in) {
      // C:                     output array of 256 C_t16 words.
      // fifo_C_drain_local_in: input stream; 8 * 256 = 2048 words are read.
      /* Variable Declaration */
      /* Variable Declaration */

#ifndef __SYNTHESIS__
      // while () // Please add the fifo check for C sim.
#endif
      #pragma hls_pipeline_init_interval 1
      for (ac_int<9, false> i = 0; i < 256; i++) {
        C_t2 fifo_data;
        C_t16 mem_data;
        C_t2 mem_data_split[8];
        // Gather the eight fifo words that make up memory word i.
        for (ac_int<4, false> p = 0; p < 8; p++) {
          fifo_data = fifo_C_drain_local_in.read();
          mem_data_split[p] = fifo_data;
        }
        // Place word p at bit offset 64 * p, so the first word read ends up
        // in the lowest bits.
        mem_data.set_slc(0, mem_data_split[0]);
        mem_data.set_slc(64, mem_data_split[1]);
        mem_data.set_slc(128, mem_data_split[2]);
        mem_data.set_slc(192, mem_data_split[3]);
        mem_data.set_slc(256, mem_data_split[4]);
        mem_data.set_slc(320, mem_data_split[5]);
        mem_data.set_slc(384, mem_data_split[6]);
        mem_data.set_slc(448, mem_data_split[7]);
        C[i] = mem_data;
      }
    }
};
/* Module Definition */

// kernel0: top-level design. run() wires the module instances declared
// below into one dataflow network through the ac_channel members:
//   A: A_IO_L3_in_serialize -> A_IO_L3_in -> A_IO_L2_in chain -> PE A inputs
//   B: B_IO_L3_in_serialize -> B_IO_L3_in -> B_IO_L2_in chain -> PE B inputs
//   C: PE drain outputs -> C_drain_IO_L1_out chains -> C_drain_IO_L2_out
//      chain -> C_drain_IO_L3_out -> C_drain_IO_L3_out_serialize -> C
// The PEs form a 2x2 grid; each PE also forwards its A and B words to a
// neighbouring channel.
//
// NOTE(review): fifo_A_PE_0_2, fifo_A_PE_1_2, fifo_B_PE_2_0 and
// fifo_B_PE_2_1 are written by PE instances but no module called in run()
// reads them. fifo_A_A_IO_L2_in_2, fifo_B_B_IO_L2_in_2,
// fifo_C_drain_C_drain_IO_L1_out_0_2, fifo_C_drain_C_drain_IO_L1_out_1_2 and
// fifo_C_drain_C_drain_IO_L2_out_2 are declared but never referenced in
// run(). Confirm whether this is intended for the target flow.
#pragma hls_design top
class kernel0 {
  public:
    kernel0() {}
    #pragma hls_design interface
    void CCS_BLOCK(run)(A_t16 A[16384 / 16], B_t16 B[16384 / 16], C_t16 C[4096 / 16])
    {
      // ---- A input path ----
      /* Module Call */
      A_IO_L3_in_serialize_inst.run(
        /* array */ A,
        /* fifo */ fifo_A_A_IO_L3_in_serialize
      );
      /* Module Call */

      /* Module Call */
      A_IO_L3_in_inst.run(
        /* fifo */ fifo_A_A_IO_L3_in_serialize,
        /* fifo */ fifo_A_A_IO_L2_in_0
      );
      /* Module Call */

      /* Module Call */
      A_IO_L2_in_inst_0.run(
        /* module id */ 0,
        /* fifo */ fifo_A_A_IO_L2_in_0,
        /* fifo */ fifo_A_A_IO_L2_in_1,
        /* fifo */ fifo_A_PE_0_0
      );
      /* Module Call */

      /* Module Call */
      A_IO_L2_in_boundary_inst_1.run(
        /* module id */ 1,
        /* fifo */ fifo_A_A_IO_L2_in_1,
        /* fifo */ fifo_A_PE_1_0
      );
      /* Module Call */

      // ---- B input path ----
      /* Module Call */
      B_IO_L3_in_serialize_inst.run(
        /* array */ B,
        /* fifo */ fifo_B_B_IO_L3_in_serialize
      );
      /* Module Call */

      /* Module Call */
      B_IO_L3_in_inst.run(
        /* fifo */ fifo_B_B_IO_L3_in_serialize,
        /* fifo */ fifo_B_B_IO_L2_in_0
      );
      /* Module Call */

      /* Module Call */
      B_IO_L2_in_inst_0.run(
        /* module id */ 0,
        /* fifo */ fifo_B_B_IO_L2_in_0,
        /* fifo */ fifo_B_B_IO_L2_in_1,
        /* fifo */ fifo_B_PE_0_0
      );
      /* Module Call */

      /* Module Call */
      B_IO_L2_in_boundary_inst_1.run(
        /* module id */ 1,
        /* fifo */ fifo_B_B_IO_L2_in_1,
        /* fifo */ fifo_B_PE_0_1
      );
      /* Module Call */

      // ---- 2x2 PE grid ----
      // Argument order per PE: idx, idy, A in, A out, B in, B out, C drain.
      /* Module Call */
      PE_inst_0_0.run(
        /* module id */ 0,
        /* module id */ 0,
        /* fifo */ fifo_A_PE_0_0,
        /* fifo */ fifo_A_PE_0_1,
        /* fifo */ fifo_B_PE_0_0,
        /* fifo */ fifo_B_PE_1_0,
        /* fifo */ fifo_C_drain_PE_0_0
      );
      /* Module Call */

      /* Module Call */
      PE_inst_0_1.run(
        /* module id */ 0,
        /* module id */ 1,
        /* fifo */ fifo_A_PE_0_1,
        /* fifo */ fifo_A_PE_0_2,
        /* fifo */ fifo_B_PE_0_1,
        /* fifo */ fifo_B_PE_1_1,
        /* fifo */ fifo_C_drain_PE_0_1
      );
      /* Module Call */

      /* Module Call */
      PE_inst_1_0.run(
        /* module id */ 1,
        /* module id */ 0,
        /* fifo */ fifo_A_PE_1_0,
        /* fifo */ fifo_A_PE_1_1,
        /* fifo */ fifo_B_PE_1_0,
        /* fifo */ fifo_B_PE_2_0,
        /* fifo */ fifo_C_drain_PE_1_0
      );
      /* Module Call */

      /* Module Call */
      PE_inst_1_1.run(
        /* module id */ 1,
        /* module id */ 1,
        /* fifo */ fifo_A_PE_1_1,
        /* fifo */ fifo_A_PE_1_2,
        /* fifo */ fifo_B_PE_1_1,
        /* fifo */ fifo_B_PE_2_1,
        /* fifo */ fifo_C_drain_PE_1_1
      );
      /* Module Call */

      // ---- C drain path, level 1 ----
      // Each boundary module is called before the non-boundary module that
      // reads its output fifo. Drain module (x, y) is connected to the drain
      // fifo of PE (y, x).
      /* Module Call */
      C_drain_IO_L1_out_boundary_inst_0_1.run(
        /* module id */ 0,
        /* module id */ 1,
        /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_1,
        /* fifo */ fifo_C_drain_PE_1_0
      );
      /* Module Call */

      /* Module Call */
      C_drain_IO_L1_out_inst_0_0.run(
        /* module id */ 0,
        /* module id */ 0,
        /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_1,
        /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_0,
        /* fifo */ fifo_C_drain_PE_0_0
      );
      /* Module Call */

      /* Module Call */
      C_drain_IO_L1_out_boundary_inst_1_1.run(
        /* module id */ 1,
        /* module id */ 1,
        /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_1,
        /* fifo */ fifo_C_drain_PE_1_1
      );
      /* Module Call */

      /* Module Call */
      C_drain_IO_L1_out_inst_1_0.run(
        /* module id */ 1,
        /* module id */ 0,
        /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_1,
        /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_0,
        /* fifo */ fifo_C_drain_PE_0_1
      );
      /* Module Call */

      // ---- C drain path, level 2 ----
      /* Module Call */
      C_drain_IO_L2_out_boundary_inst_1.run(
        /* module id */ 1,
        /* fifo */ fifo_C_drain_C_drain_IO_L2_out_1,
        /* fifo */ fifo_C_drain_C_drain_IO_L1_out_1_0
      );
      /* Module Call */

      /* Module Call */
      C_drain_IO_L2_out_inst_0.run(
        /* module id */ 0,
        /* fifo */ fifo_C_drain_C_drain_IO_L2_out_1,
        /* fifo */ fifo_C_drain_C_drain_IO_L2_out_0,
        /* fifo */ fifo_C_drain_C_drain_IO_L1_out_0_0
      );
      /* Module Call */

      // ---- C drain path, level 3 and write-back to C ----
      /* Module Call */
      C_drain_IO_L3_out_inst.run(
        /* fifo */ fifo_C_drain_C_drain_IO_L3_out_serialize,
        /* fifo */ fifo_C_drain_C_drain_IO_L2_out_0
      );
      /* Module Call */

      /* Module Call */
      C_drain_IO_L3_out_serialize_inst.run(
        /* array */ C,
        /* fifo */ fifo_C_drain_C_drain_IO_L3_out_serialize
      );
      /* Module Call */

    }

  private:
    // One instance per module in the network.
    /* Module Declaration */
    A_IO_L3_in_serialize A_IO_L3_in_serialize_inst;
    A_IO_L3_in A_IO_L3_in_inst;
    A_IO_L2_in A_IO_L2_in_inst_0;
    A_IO_L2_in_boundary A_IO_L2_in_boundary_inst_1;
    B_IO_L3_in_serialize B_IO_L3_in_serialize_inst;
    B_IO_L3_in B_IO_L3_in_inst;
    B_IO_L2_in B_IO_L2_in_inst_0;
    B_IO_L2_in_boundary B_IO_L2_in_boundary_inst_1;
    PE PE_inst_0_0;
    PE PE_inst_0_1;
    PE PE_inst_1_0;
    PE PE_inst_1_1;
    C_drain_IO_L1_out C_drain_IO_L1_out_inst_0_0;
    C_drain_IO_L1_out_boundary C_drain_IO_L1_out_boundary_inst_0_1;
    C_drain_IO_L1_out C_drain_IO_L1_out_inst_1_0;
    C_drain_IO_L1_out_boundary C_drain_IO_L1_out_boundary_inst_1_1;
    C_drain_IO_L2_out C_drain_IO_L2_out_inst_0;
    C_drain_IO_L2_out_boundary C_drain_IO_L2_out_boundary_inst_1;
    C_drain_IO_L3_out C_drain_IO_L3_out_inst;
    C_drain_IO_L3_out_serialize C_drain_IO_L3_out_serialize_inst;
    /* Module Declaration */

    // Channels connecting the modules; the leading comment on each line
    // names the module group the channel belongs to.
    /* FIFO Declaration */
    /* A_IO_L3_in_serialize fifo */ ac_channel<A_t8> fifo_A_A_IO_L3_in_serialize;
    /* B_IO_L3_in_serialize fifo */ ac_channel<B_t8> fifo_B_B_IO_L3_in_serialize;
    /* C_drain_IO_L3_out_serialize fifo */ ac_channel<C_t2> fifo_C_drain_C_drain_IO_L3_out_serialize;
    /* A_IO_L2_in fifo */ ac_channel<A_t8> fifo_A_A_IO_L2_in_0;
    /* A_IO_L2_in fifo */ ac_channel<A_t8> fifo_A_A_IO_L2_in_1;
    /* A_IO_L2_in fifo */ ac_channel<A_t8> fifo_A_A_IO_L2_in_2;
    /* B_IO_L2_in fifo */ ac_channel<B_t8> fifo_B_B_IO_L2_in_0;
    /* B_IO_L2_in fifo */ ac_channel<B_t8> fifo_B_B_IO_L2_in_1;
    /* B_IO_L2_in fifo */ ac_channel<B_t8> fifo_B_B_IO_L2_in_2;
    /* PE fifo */ ac_channel<A_t2> fifo_A_PE_0_0;
    /* PE fifo */ ac_channel<A_t2> fifo_A_PE_0_1;
    /* PE fifo */ ac_channel<A_t2> fifo_A_PE_0_2;
    /* PE fifo */ ac_channel<A_t2> fifo_A_PE_1_0;
    /* PE fifo */ ac_channel<A_t2> fifo_A_PE_1_1;
    /* PE fifo */ ac_channel<A_t2> fifo_A_PE_1_2;
    /* PE fifo */ ac_channel<B_t2> fifo_B_PE_0_0;
    /* PE fifo */ ac_channel<B_t2> fifo_B_PE_1_0;
    /* PE fifo */ ac_channel<B_t2> fifo_B_PE_2_0;
    /* PE fifo */ ac_channel<B_t2> fifo_B_PE_0_1;
    /* PE fifo */ ac_channel<B_t2> fifo_B_PE_1_1;
    /* PE fifo */ ac_channel<B_t2> fifo_B_PE_2_1;
    /* PE fifo */ ac_channel<C_t1> fifo_C_drain_PE_0_0;
    /* PE fifo */ ac_channel<C_t1> fifo_C_drain_PE_1_0;
    /* PE fifo */ ac_channel<C_t1> fifo_C_drain_PE_0_1;
    /* PE fifo */ ac_channel<C_t1> fifo_C_drain_PE_1_1;
    /* C_drain_IO_L1_out fifo */ ac_channel<C_t2> fifo_C_drain_C_drain_IO_L1_out_0_0;
    /* C_drain_IO_L1_out fifo */ ac_channel<C_t2> fifo_C_drain_C_drain_IO_L1_out_0_1;
    /* C_drain_IO_L1_out fifo */ ac_channel<C_t2> fifo_C_drain_C_drain_IO_L1_out_0_2;
    /* C_drain_IO_L1_out fifo */ ac_channel<C_t2> fifo_C_drain_C_drain_IO_L1_out_1_0;
    /* C_drain_IO_L1_out fifo */ ac_channel<C_t2> fifo_C_drain_C_drain_IO_L1_out_1_1;
    /* C_drain_IO_L1_out fifo */ ac_channel<C_t2> fifo_C_drain_C_drain_IO_L1_out_1_2;
    /* C_drain_IO_L2_out fifo */ ac_channel<C_t2> fifo_C_drain_C_drain_IO_L2_out_0;
    /* C_drain_IO_L2_out fifo */ ac_channel<C_t2> fifo_C_drain_C_drain_IO_L2_out_1;
    /* C_drain_IO_L2_out fifo */ ac_channel<C_t2> fifo_C_drain_C_drain_IO_L2_out_2;
    /* FIFO Declaration */
};


================================================
FILE: autosa_tests/mm_catapult/simd_info.json
================================================
{
  "kernel0": {
    "reduction": ["y"]
  },
  "kernel1": {
    "reduction": ["y"]
  },
  "kernel2": {
    "reduction": ["y"]
  },
  "kernel3": {
    "reduction": ["y"]
  },
  "kernel4": {
    "reduction": ["y"]
  }
}


================================================
FILE: autosa_tests/mm_getting_started/Makefile
================================================
# Build script for the mm_getting_started example.
#
# Compiles the kernel source with v++ into an .xo object, links it into an
# .xclbin, and builds the host executable with g++.
#
# Usage:
#   make all     build the xclbin and the host executable
#   make xclbin  build only the xclbin
#   make app     build only the host executable
#   make check   build everything plus emconfig.json, then run the host
#   make clean   remove build products
#
# MODE and PLATFORM can be overridden on the command line, e.g.
#   make MODE=hw_emu all
# XILINX_VITIS and XILINX_XRT must be set in the environment.

VPP := $(XILINX_VITIS)/bin/v++
EMCONFIGUTIL := $(XILINX_VITIS)/bin/emconfigutil
MODE := hw
#PLATFORM := xilinx_u200_qdma_201920_1
PLATFORM := xilinx_u250_xdma_201830_2

# sources
KERNEL_SRC := src/kernel_kernel.cpp
HOST_SRC := src/kernel_host.cpp

# targets
HOST_EXE := host.exe

XOS := kernel0.$(MODE).xo
XCLBIN := kernel0.$(MODE).xclbin
EMCONFIG_FILE := emconfig.json

# Linker options to map kernel ports to DDR banks
VPP_LINK_OPTS := --config connectivity.cfg

VPP_COMMON_OPTS := -s -t $(MODE) --platform $(PLATFORM) -R2 -O3 --kernel_frequency 250 --vivado.prop=run.impl_1.STRATEGY=Performance_EarlyBlockPlacement
CFLAGS := -g -std=c++11 -I$(XILINX_XRT)/include
LFLAGS := -L$(XILINX_XRT)/lib -lxilinxopencl -lpthread -lrt
NUMDEVICES := 1

# run time args
EXE_OPT := kernel0.$(MODE).xclbin

# primary build targets
# clean and check produce no file of their own name, so they are declared
# phony as well; otherwise a stray file called "clean" or "check" would stop
# the recipe from running.
.PHONY: xclbin app all clean check

xclbin:  $(XCLBIN)
app: $(HOST_EXE)

all: xclbin app

clean:
	-$(RM) $(EMCONFIG_FILE) $(HOST_EXE) $(XCLBIN) *.xclbin *.xo $(XOS)

# kernel rules
# Compile step: kernel source -> .xo (the stale object is removed first).
$(XOS): $(KERNEL_SRC)
	$(RM) $@
	$(VPP) $(VPP_COMMON_OPTS) -c -k kernel0 -o $@ $+


# Link step: .xo -> .xclbin, applying the connectivity configuration.
$(XCLBIN): $(XOS)
	$(VPP) $(VPP_COMMON_OPTS) -l -o $@ $+ $(VPP_LINK_OPTS)

# host rules
$(HOST_EXE): $(HOST_SRC)
	g++ $(CFLAGS) -o $@ $+ $(LFLAGS)
	@echo 'Compiled Host Executable: $(HOST_EXE)'

$(EMCONFIG_FILE):
	$(EMCONFIGUTIL) --nd $(NUMDEVICES) --od . --platform $(PLATFORM)

# NOTE(review): XCL_EMULATION_MODE is set to $(MODE) even when MODE is "hw";
# confirm this is what a hardware run expects.
check: $(XCLBIN) $(HOST_EXE) $(EMCONFIG_FILE)
	XCL_EMULATION_MODE=${MODE} ./$(HOST_EXE) $(EXE_OPT)


================================================
FILE: autosa_tests/mm_getting_started/connectivity.cfg
================================================
[connectivity]
sp=kernel0_1.A:DDR[0]
sp=kernel0_1.B:DDR[1] 
sp=kernel0_1.C:DDR[2]


================================================
FILE: autosa_tests/mm_getting_started/hls_script.tcl
================================================
############################################################
## This file is generated automatically by Vivado HLS.
## Please DO NOT edit it.
## Copyright (C) 1986-2019 Xilinx, Inc. All Rights Reserved.
############################################################
open_project hls_prj
set_top kernel0
add_files src/kernel_kernel.h
add_files src/kernel_kernel.cpp
add_files -tb src/kernel_host.cpp
open_solution "solution1"
set_part {xcu200-fsgd2104-2-e}
create_clock -period 5 -name default
config_compile -name_max_length 50
#source "./prj/solution1/directives.tcl"
csim_design
#csynth_design
#cosim_design
#cosim_design -trace_level all
#cosim_design -setup -trace_level all
#export_design -format ip_catalog
exit


================================================
FILE: autosa_tests/mm_getting_started/kernel.c
================================================
// LAYOUT_TRANSFORM stores array B transposed (B[J][K] instead of B[K][J]) to enable SIMD vectorization.
// Comment out the macro below to fall back to the original B[K][J] layout.
#define LAYOUT_TRANSFORM

#include "kernel.h"

/*
 * Test driver for the matrix multiplication C = A * B.
 *
 * Fills A and B with pseudo-random values in [0, 1], computes C inside the
 * region delimited by `#pragma scop` / `#pragma endscop`, recomputes the same
 * product into C_golden with identical loops, and reports how many elements
 * of C differ from C_golden by more than 0.001.
 *
 * When LAYOUT_TRANSFORM is defined, B is declared and indexed as B[J][K]
 * (transposed) instead of B[K][J].
 *
 * argc/argv are unused. The function always returns 0, even when mismatches
 * are reported.
 */
int main(int argc, char **argv) {
#ifndef LAYOUT_TRANSFORM  
  data_t A[I][K], B[K][J], C[I][J], C_golden[I][J]; 
#else  
  data_t A[I][K], B[J][K], C[I][J], C_golden[I][J];
#endif

  // Initialize A with pseudo-random values in [0, 1].
  for (int i = 0; i < I; i++) 
    for (int k = 0; k < K; k++) {
      A[i][k] = (data_t)rand() / RAND_MAX;
    }

  // Initialize B the same way, using the index order of the selected layout.
  for (int j = 0; j < J; j++)
    for (int k = 0; k < K; k++) {
#ifndef LAYOUT_TRANSFORM      
      B[k][j] = (data_t)rand() / RAND_MAX;
#else      
      B[j][k] = (data_t)rand() / RAND_MAX;
#endif      
    }

  // Marked region: C = A * B.
  // NOTE(review): presumably this is the region AutoSA extracts and maps to
  // the systolic array -- confirm against the AutoSA documentation.
#pragma scop
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C[i][j] = 0;
      for (int k = 0; k < K; k++) {
#ifndef LAYOUT_TRANSFORM        
        C[i][j] = C[i][j] + A[i][k] * B[k][j];
#else        
        C[i][j] = C[i][j] + A[i][k] * B[j][k];
#endif        
      }
    }
#pragma endscop

  // Reference result computed with the same loop nest outside the marked region.
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C_golden[i][j] = 0;
      for (int k = 0; k < K; k++) {
#ifndef LAYOUT_TRANSFORM        
        C_golden[i][j] = C_golden[i][j] + A[i][k] * B[k][j];
#else
        C_golden[i][j] = C_golden[i][j] + A[i][k] * B[j][k];
#endif        
      }
    }

  // Count elements whose absolute difference exceeds the 0.001 tolerance.
  int err = 0;
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      if (fabs((float)C_golden[i][j] - (float)C[i][j]) > 0.001)
        err++;
    }

  if (err)
    printf("Failed with %d errors!\n", err);
  else
    printf("Passed!\n");

  // The exit status does not reflect the comparison result.
  return 0;
}


================================================
FILE: autosa_tests/mm_getting_started/kernel.h
================================================
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Element type of the matrices A, B, C and C_golden in kernel.c. */
typedef float data_t;
/* Problem size: A is indexed by (I, K), C by (I, J); B is indexed by (K, J),
 * or by (J, K) when kernel.c is built with LAYOUT_TRANSFORM. */
#define I 64
#define J 64
#define K 64


================================================
FILE: autosa_tests/mm_getting_started/simd_info.json
================================================
{
  "kernel0": {
    "reduction": ["y"]
  },
  "kernel1": {
    "reduction": ["y"]
  },
  "kernel2": {
    "reduction": ["y"]
  }, 
  "kernel3": {
    "reduction": ["y"]
  },
  "kernel4": {
    "reduction": ["y"]
  }
}


================================================
FILE: autosa_tests/mm_hbm/Makefile
================================================
# Build flow for the generated HBM design: compile the kernel source to an
# .xo, link it into an .xclbin with v++, and build the host executable with g++.
VPP := $(XILINX_VITIS)/bin/v++
EMCONFIGUTIL := $(XILINX_VITIS)/bin/emconfigutil
# Build target handed to v++ through -t; it is also part of the .xo/.xclbin
# file names and is exported as XCL_EMULATION_MODE by the `check` target.
MODE := hw
PLATFORM := xilinx_u280_xdma_201920_3

# sources
KERNEL_SRC := src/kernel_kernel.cpp
HOST_SRC := src/kernel_host.cpp

# targets
HOST_EXE := host.exe

XOS := kernel0.$(MODE).xo
XCLBIN := kernel0.$(MODE).xclbin
EMCONFIG_FILE := emconfig.json

# Linker options to map kernel ports to HBM banks (see connectivity.cfg)
VPP_LINK_OPTS := --config connectivity.cfg

VPP_COMMON_OPTS := -s -t $(MODE) --platform $(PLATFORM) -R2 -O3 --kernel_frequency 250 --vivado.prop=run.impl_1.STRATEGY=Performance_EarlyBlockPlacement
CFLAGS := -g -std=c++11 -I$(XILINX_XRT)/include
LFLAGS := -L$(XILINX_XRT)/lib -lxilinxopencl -lpthread -lrt
NUMDEVICES := 1

# run time args
EXE_OPT := kernel0.$(MODE).xclbin

# primary build targets
# clean and check are commands rather than files, so they are declared phony
# too; otherwise a stray file named "clean" or "check" would disable them.
.PHONY: xclbin app all clean check

xclbin:  $(XCLBIN)
app: $(HOST_EXE)

all: xclbin app

clean:
	-$(RM) $(EMCONFIG_FILE) $(HOST_EXE) $(XCLBIN) *.xclbin *.xo $(XOS)

# kernel rules
# Compile step (-c): kernel source -> kernel object (.xo).
$(XOS): $(KERNEL_SRC)
	$(RM) $@
	$(VPP) $(VPP_COMMON_OPTS) -c -k kernel0 -o $@ $+


# Link step (-l): kernel object -> device binary (.xclbin).
$(XCLBIN): $(XOS)
	$(VPP) $(VPP_COMMON_OPTS) -l -o $@ $+ $(VPP_LINK_OPTS)

# host rules
$(HOST_EXE): $(HOST_SRC)
	g++ $(CFLAGS) -o $@ $+ $(LFLAGS)
	@echo 'Compiled Host Executable: $(HOST_EXE)'

$(EMCONFIG_FILE):
	$(EMCONFIGUTIL) --nd $(NUMDEVICES) --od . --platform $(PLATFORM)

# Build everything, then run the host program against the .xclbin.
# NOTE(review): XCL_EMULATION_MODE is set to $(MODE) even when MODE is hw --
# confirm this is intended for hardware runs.
check: $(XCLBIN) $(HOST_EXE) $(EMCONFIG_FILE)
	XCL_EMULATION_MODE=${MODE} ./$(HOST_EXE) $(EXE_OPT)


================================================
FILE: autosa_tests/mm_hbm/README.md
================================================
# Matrix Multiplication (HBM)

This is an example of small-size matrix multiplication using high-bandwidth memory (HBM).

Board        | Software Version
-------------|-----------------
Xilinx Alveo U280 | Xilinx Vitis 2019.2

__Files__:
```
autosa_tests/mm_hbm/kernel.c
autosa_tests/mm_hbm/kernel.h
autosa_tests/mm_hbm/simd_info.json
autosa_tests/mm_hbm/Makefile
autosa_tests/mm_hbm/connectivity.cfg
```

__Command__:
```c
./autosa ./autosa_tests/mm_hbm/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[32,32,32];kernel[]->latency[8,8];kernel[]->simd[2];kernel[]->hbm_A[2];kernel[]->hbm_B[2];kernel[]->hbm_C_drain[2]}" --simd-info=./autosa_tests/mm_hbm/simd_info.json --hbm
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` and `connectivity.cfg` to the directory `autosa.tmp/output`.

```
cp autosa_tests/mm_hbm/Makefile autosa.tmp/output/
cp autosa_tests/mm_hbm/connectivity.cfg autosa.tmp/output/
```

Execute the makefile to build the design.

```
cd autosa.tmp/output
make all
```

================================================
FILE: autosa_tests/mm_hbm/connectivity.cfg
================================================
[connectivity]
sp=kernel0_1.A_0:HBM[0]
sp=kernel0_1.A_1:HBM[1]
sp=kernel0_1.B_0:HBM[2] 
sp=kernel0_1.B_1:HBM[3] 
sp=kernel0_1.C_0:HBM[4]
sp=kernel0_1.C_1:HBM[5]


================================================
FILE: autosa_tests/mm_hbm/hls_script.tcl
================================================
############################################################
## This file is generated automatically by Vivado HLS.
## Please DO NOT edit it.
## Copyright (C) 1986-2019 Xilinx, Inc. All Rights Reserved.
############################################################
open_project hls_prj
set_top kernel0
add_files src/kernel_kernel.h
add_files src/kernel_kernel.cpp
add_files -tb src/kernel_host.cpp
open_solution "solution1"
set_part {xcu200-fsgd2104-2-e}
create_clock -period 5 -name default
config_compile -name_max_length 50
#source "./prj/solution1/directives.tcl"
csim_design
#csynth_design
#cosim_design
#cosim_design -trace_level all
#cosim_design -setup -trace_level all
#export_design -format ip_catalog
exit


================================================
FILE: autosa_tests/mm_hbm/kernel.c
================================================
#include "kernel.h"

/*
 * Test driver for the matrix multiplication C[i][j] = sum_k A[i][k] * B[j][k],
 * i.e. B is stored as B[J][K].
 *
 * Every row of A and of B is filled with the values 0..K-1. C is computed
 * inside the region delimited by `#pragma scop` / `#pragma endscop`, the same
 * product is recomputed into C_golden, and the number of elements differing
 * by more than 0.001 is reported.
 *
 * argc/argv are unused. The function always returns 0, even when mismatches
 * are reported.
 */
int main(int argc, char **argv) {
  // Alternative declaration with B as B[K][J], left disabled:
//  data_t A[I][K], B[K][J], C[I][J], C_golden[I][J]; 
  data_t A[I][K], B[J][K], C[I][J], C_golden[I][J];

  // A[i][k] holds the column index k.
  for (int i = 0; i < I; i++) 
    for (int k = 0; k < K; k++) {
      A[i][k] = k;
    }

  // B[j][k] holds the index k as well.
  for (int j = 0; j < J; j++)
    for (int k = 0; k < K; k++) {
      B[j][k] = k;
    }

  // Marked region: the matrix product.
  // NOTE(review): presumably this is the region AutoSA extracts and maps to
  // the systolic array -- confirm against the AutoSA documentation.
#pragma scop
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C[i][j] = 0;
      for (int k = 0; k < K; k++) {
        C[i][j] = C[i][j] + A[i][k] * B[j][k];
      }
    }
#pragma endscop

  // Reference result computed with the same loop nest outside the marked region.
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C_golden[i][j] = 0;
      for (int k = 0; k < K; k++) {
        C_golden[i][j] = C_golden[i][j] + A[i][k] * B[j][k];
      }
    }

  // Count elements whose absolute difference exceeds the 0.001 tolerance.
  int err = 0;
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      if (fabs((float)C_golden[i][j] - (float)C[i][j]) > 0.001)
        err++;
    }

  if (err)
    printf("Failed with %d errors!\n", err);
  else
    printf("Passed!\n");

  // The exit status does not reflect the comparison result.
  return 0;
}


================================================
FILE: autosa_tests/mm_hbm/kernel.h
================================================
#include "stdio.h"
#include "stdlib.h"
#include "math.h"

/* Element type of the matrices A, B, C and C_golden in kernel.c. */
typedef float data_t;
/* Problem size: kernel.c declares A[I][K], B[J][K] and C[I][J]. */
#define I 64
#define J 64
#define K 64


================================================
FILE: autosa_tests/mm_hbm/simd_info.json
================================================
{
  "kernel3": {
    "reduction": ["y"]
  }
}


================================================
FILE: autosa_tests/mm_hcl/README.md
================================================
# Matrix Multiplication (Small)

Board        | Software Version
-------------|-----------------
Xilinx Alveo U250 | Xilinx Vitis 2019.2

__Files__:
```
autosa_tests/mm_hcl/kernel.c
autosa_tests/mm_hcl/kernel.h
autosa_tests/mm_hcl/simd_info.json
autosa_tests/mm_hcl/hls_script.tcl
```

__Command__:
This is an internal test example for HeteroCL integration.

## Transposition

First, HeteroCL might provide AutoSA with transposed input matrices. We consider four test cases here.

1. A_B: Both input matrices A and B are kept in row-major layout.

Set `TRANS` to `A_B` in `kernel.c`.
Use the following command to compile the program.
```bash
./autosa ./autosa_tests/mm_hcl/kernel.c \
--config=./autosa_config/autosa_config.json \
--target=autosa_hls_c \
--output-dir=./autosa.tmp/output \
--sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8]}" \
--simd-info=./autosa_tests/mm_hcl/simd_info.json \
--hls \
--hcl
```

The generated files can be found under `autosa.tmp/output`.
You may verify the design using Xilinx HLS.

```bash
cp ./autosa_tests/mm_hcl/hls_script.tcl ./autosa.tmp/output/
cd ./autosa.tmp/output
vivado_hls -f hls_script.tcl
```

You may notice here that we didn't use SIMD vectorization. The reason is that by default AutoSA only examines the time loops (i.e., the loops that are not mapped to the PE dimensions; loops mapped to the PE dimensions are called space loops). In this example, only loop k is available.
However, with the default layout `A[i][k]`, `B[k][j]`, and `C[i][j]`, as k is not the last-varying dimension of matrix B, it can't be used for vectorization.

To enable vectorization, we could enable AutoSA to use space loops as candidates as well. In this example, loop j can be used for vectorization.
Note that `A[i][k]` is invariant under loop j, and loop j leads to stride-one access for `B[k][j]`. However, before using this loop as the vectorization loop, we have to 
turn off the latency hiding optimization on loop j. The reason is that, if 
loop j is tiled for latency hiding before vectorization, the remaining tiled loop is no longer consecutive as it is now mapped to hyper tiles. As a result, the array access `B[k][j]` is no longer coalesced under this loop and 
the SIMD vectorization opportunity is lost. 

To make use of SIMD vectorization, use the following command.
```bash
./autosa ./autosa_tests/mm_hcl/kernel.c \
--config=./autosa_config/autosa_config.json \
--target=autosa_hls_c \
--output-dir=./autosa.tmp/output \
--sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,1]}" \
--simd-info=./autosa_tests/mm_hcl/simd_info.json \
--hls \
--hcl \
--simd-touch-space
```

We add `--simd-touch-space` to consider space loops as well for vectorization. To use loop j for vectorization, we set the latency tiling factors to `[8,1]`, which means that only loop i is tiled for latency hiding. AutoSA will dump out the possible loops for SIMD vectorization. Take a look at the file `tuning.json` under the directory `autosa.tmp/output`:

```json
"simd": {
    "tilable_loops": [16,16],
    "scores": [13,13],
    "legal": [1,0]
}
```

AutoSA identifies two candidate loops. The first loop is the loop j, and the second loop is the loop k.
However, layout transformation is required for loop k.
Therefore, the `legal` value is set to 0 for the second loop.

Now to apply SIMD vectorization, use the following command.

```bash
./autosa ./autosa_tests/mm_hcl/kernel.c \
--config=./autosa_config/autosa_config.json \
--target=autosa_hls_c \
--output-dir=./autosa.tmp/output \
--sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,1];kernel[]->simd[8,1]}" \
--simd-info=./autosa_tests/mm_hcl/simd_info.json \
--hls \
--hcl \
--simd-touch-space
```

A complete design with loop j vectorized is generated now.

2. AT_B: The input matrix A is transposed to column major, and the matrix B remains row major.

Set `TRANS` to `AT_B` in `kernel.c`.
Use the following command to compile the program.
```bash
./autosa ./autosa_tests/mm_hcl/kernel.c \
--config=./autosa_config/autosa_config.json \
--target=autosa_hls_c \
--output-dir=./autosa.tmp/output \
--sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8]}" \
--simd-info=./autosa_tests/mm_hcl/simd_info.json \
--hls \
--hcl
```

To enable SIMD vectorization, let's take a look at the array accesses `A[k][i]`, `B[k][j]`, and `C[i][j]`.
In this case, loop j can be used for vectorization as long as it is avoided during the latency hiding. 
Use the following command to only tile loop i for latency hiding.

```bash
./autosa ./autosa_tests/mm_hcl/kernel.c \
--config=./autosa_config/autosa_config.json \
--target=autosa_hls_c \
--output-dir=./autosa.tmp/output \
--sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,1]}" \
--simd-info=./autosa_tests/mm_hcl/simd_info.json \
--hls \
--hcl \
--simd-touch-space
```

Similarly, you may check `tuning.json` for more detailed information. Finally, use the command below to generate a vectorized design.

```bash
./autosa ./autosa_tests/mm_hcl/kernel.c \
--config=./autosa_config/autosa_config.json \
--target=autosa_hls_c \
--output-dir=./autosa.tmp/output \
--sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,1];kernel[]->simd[8,1]}" \
--simd-info=./autosa_tests/mm_hcl/simd_info.json \
--hls \
--hcl \
--simd-touch-space
```

3. A_BT: The input matrix A remains row major, and the matrix B is transposed to column major.

Set `TRANS` to `A_BT` in `kernel.c`.
Run the following command first.

```bash
./autosa ./autosa_tests/mm_hcl/kernel.c \
--config=./autosa_config/autosa_config.json \
--target=autosa_hls_c \
--output-dir=./autosa.tmp/output \
--sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8]}" \
--simd-info=./autosa_tests/mm_hcl/simd_info.json \
--hls \
--hcl
```

In this case, AutoSA already detects a SIMD candidate loop k and will stop.
Array accesses in the current layout are `A[i][k]`, `B[j][k]`, and `C[i][j]`. Therefore, loop k can be used as the SIMD loop. Let's set the SIMD factor to 8 by using the following command to generate a complete design.

```bash
./autosa ./autosa_tests/mm_hcl/kernel.c \
--config=./autosa_config/autosa_config.json \
--target=autosa_hls_c \
--output-dir=./autosa.tmp/output \
--sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[8]}" \
--simd-info=./autosa_tests/mm_hcl/simd_info.json \
--hls \
--hcl
```

4. AT_BT: Both matrix A and B are transposed to column major.

Set `TRANS` to `AT_BT` in `kernel.c`.

Run the following command first.

```bash
./autosa ./autosa_tests/mm_hcl/kernel.c \
--config=./autosa_config/autosa_config.json \
--target=autosa_hls_c \
--output-dir=./autosa.tmp/output \
--sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8]}" \
--simd-info=./autosa_tests/mm_hcl/simd_info.json \
--hls \
--hcl
```

An unvectorized design is generated.
Array accesses in the current layout are `A[k][i]`, `B[j][k]`, and `C[i][j]`. In this case, none of the loops can be used for vectorization.

In conclusion, when matrix A and B are supplied to AutoSA with different layouts, there are different rules to consider to enable full optimization (specifically, SIMD vectorization). We summarize these rules below.

| Layout |     Latency Hiding     |         SIMD        |  Compilation Flag  |
|:------:|:----------------------:|:-------------------:|:------------------:|
|   A_B  | kernel[]->latency[X,1] | kernel[]->simd[X,1] | --simd-touch-space |
|  AT_B  | kernel[]->latency[X,1] | kernel[]->simd[X,1] | --simd-touch-space |
|  A_BT  | kernel[]->latency[X,X] |  kernel[]->simd[X]  |                    |
|  AT_BT | kernel[]->latency[X,X] |         N/A         |                    |

## Data Packing

In addition to transposition, HeteroCL could also supply AutoSA with pre-packed arrays. 
By default, AutoSA will try to pack data as much as possible for each array to improve the effective DRAM bandwidth.
The data packing factors can be restrained by using the argument `--data-pack-sizes`. 
For each array, AutoSA allows users to restrain the data packing factors at three levels:

- Innermost level: Data packing factors for L1 I/O modules.
- Outermost level: Data packing factors for I/O modules accessing the DRAM.
- Intermediate level: Data packing factors for I/O modules except L1 or outermost I/O modules.

To restrain the data packing factors in the program, specify them using the following format.

```bash
--data-pack-sizes="{kernel[]->A[8,32,64]}"
```

With the above argument, we restrain the innermost level data packing factors to be no greater than 8 bytes (64 bits), 
the intermediate level to be no greater than 32 bytes (256 bits), and the outermost level to be no greater than 64 bytes (512 bits).
Due to the limitation of Xilinx devices, we require the outermost data packing factors to be no greater than 512 bits. 
In addition, as a rule of thumb, we recommend limiting the intermediate level to no greater than 256 bits to restrain the FIFO overheads.

Set `TRANS` to `A_BT` in `kernel.c`.
Use the following command to compile the design.

```bash
./autosa ./autosa_tests/mm_hcl/kernel.c \
--config=./autosa_config/autosa_config.json \
--target=autosa_hls_c \
--output-dir=./autosa.tmp/output \
--sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[8]}" \
--simd-info=./autosa_tests/mm_hcl/simd_info.json \
--hls \
--hcl \
--data-pack-sizes="{kernel[]->A[8,32,64];kernel[]->B[8,32,64];kernel[]->C[8,32,64]}"
```

Now let's take a look at the generated code.
At the top-level function `void autosa_func(A_t16 *A, B_t16 *B, C_t8 *C)`, we have array A packed with 16 elements (512 bits), array B packed with 16 elements (512 bits), and array C packed with 8 elements (256 bits). Although we have specified the maximal outermost packing factor to be 512 bits for each array, only array A and B achieved the maximal packing factor.

For array `C[I][J]`, as we partitioned the whole systolic array with factors `[16,16,16]`, each time the systolic array computes a tile of `C[16][16]`. Furthermore, as this tile is partitioned to be computed in a `2x2` array, each PE generates a sub-tile of `C[8][8]`. Therefore, when draining out the results, we transfer out the data in the size of sub-tile `C[8][8]`. The maximal data packing factor that we can achieve is 8.

If programmers hope to have a larger data packing factor for array C as well, there are two options to consider:

- Use host data serialization. 
- Partition a larger tile inside each PE.

Host serialization requires layout transformation on the host side which makes it difficult to integrate with the existing HeteroCL environment.

The command below shows an example of using a larger latency hiding factor to allocate a larger tile inside each PE.

```bash
./autosa ./autosa_tests/mm_hcl/kernel.c \
--config=./autosa_config/autosa_config.json \
--target=autosa_hls_c \
--output-dir=./autosa.tmp/output \
--sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,32,16];kernel[]->latency[8,16];kernel[]->simd[8]}" \
--simd-info=./autosa_tests/mm_hcl/simd_info.json \
--hls \
--hcl \
--data-pack-sizes="{kernel[]->A[8,32,64];kernel[]->B[8,32,64];kernel[]->C[8,32,64]}"
```

You can check the generated source code and find that we have successfully packed all arrays to 16 elements each.

The last thing to mention is that in the current flow we prioritize SIMD vectorization factors over user-specified data packing factors. In this example, as we specify the SIMD factor to be 8, arrays A and B will be packed with at least 8 elements. As an example, if you run the following command, which tries to restrain the data packing factors of A and B to 4 elements (16 bytes), AutoSA will ignore this constraint and pack A and B with 8 elements; only array C will be packed with 4 elements.

```bash
./autosa ./autosa_tests/mm_hcl/kernel.c \
--config=./autosa_config/autosa_config.json \
--target=autosa_hls_c \
--output-dir=./autosa.tmp/output \
--sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,32,16];kernel[]->latency[8,16];kernel[]->simd[8]}" \
--simd-info=./autosa_tests/mm_hcl/simd_info.json \
--hls \
--hcl \
--data-pack-sizes="{kernel[]->A[8,16,16];kernel[]->B[8,16,16];kernel[]->C[8,16,16]}"
```

================================================
FILE: autosa_tests/mm_hcl/hls_script.tcl
================================================
############################################################
## This file is generated automatically by Vivado HLS.
## Please DO NOT edit it.
## Copyright (C) 1986-2019 Xilinx, Inc. All Rights Reserved.
############################################################
open_project hls_prj
set_top kernel0
add_files src/kernel_kernel.h
add_files src/kernel_kernel.cpp
add_files -tb src/kernel_host.cpp
open_solution "solution1"
set_part {xcu200-fsgd2104-2-e}
create_clock -period 5 -name default
config_compile -name_max_length 50
#source "./prj/solution1/directives.tcl"
csim_design
#csynth_design
#cosim_design
#cosim_design -trace_level all
#cosim_design -setup -trace_level all
#export_design -format ip_catalog
exit


================================================
FILE: autosa_tests/mm_hcl/kernel.c
================================================
#include "kernel.h"

/* Layout selectors compared against TRANS in main(). A "T" suffix means that
 * matrix is declared transposed: AT -> A[K][I] (otherwise A[I][K]),
 * BT -> B[J][K] (otherwise B[K][J]). */
#define A_B 0
#define AT_B 1
#define A_BT 2
#define AT_BT 3
/* Layout used by this build; set it to one of the selectors above. */
#define TRANS A_BT

/*
 * Test driver for the matrix multiplication C = A * B with a compile-time
 * selectable storage layout for A and B (see TRANS).
 *
 * Fills A and B with pseudo-random values in [0, 1], computes C inside the
 * region delimited by `#pragma scop` / `#pragma endscop`, recomputes the same
 * product into C_golden, and reports how many elements of C differ from
 * C_golden by more than 0.001. Each #if chain below picks the declaration or
 * index order that matches the selected layout.
 *
 * argc/argv are unused. The function always returns 0, even when mismatches
 * are reported.
 */
int main(int argc, char **argv) {
#if TRANS == A_B 
  data_t A[I][K], B[K][J], C[I][J], C_golden[I][J]; 
#elif TRANS == AT_B
  data_t A[K][I], B[K][J], C[I][J], C_golden[I][J];
#elif TRANS == A_BT
  data_t A[I][K], B[J][K], C[I][J], C_golden[I][J];
#elif TRANS == AT_BT
  data_t A[K][I], B[J][K], C[I][J], C_golden[I][J];
#endif

  // Initialize A with pseudo-random values in [0, 1].
  for (int i = 0; i < I; i++) 
    for (int k = 0; k < K; k++) {
#if TRANS == A_B 
      A[i][k] = (data_t)rand() / RAND_MAX;
#elif TRANS == A_BT
      A[i][k] = (data_t)rand() / RAND_MAX;
#elif TRANS == AT_B
      A[k][i] = (data_t)rand() / RAND_MAX;
#elif TRANS == AT_BT
      A[k][i] = (data_t)rand() / RAND_MAX;
#endif
    }

  // Initialize B the same way.
  for (int j = 0; j < J; j++)
    for (int k = 0; k < K; k++) {
#if TRANS == A_B
      B[k][j] = (data_t)rand() / RAND_MAX;
#elif TRANS == A_BT
      B[j][k] = (data_t)rand() / RAND_MAX;
#elif TRANS == AT_B
      B[k][j] = (data_t)rand() / RAND_MAX;
#elif TRANS == AT_BT
      B[j][k] = (data_t)rand() / RAND_MAX;
#endif     
    }

  // Marked region: the matrix product.
  // NOTE(review): presumably this is the region AutoSA extracts and maps to
  // the systolic array -- confirm against the AutoSA documentation.
#pragma scop
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C[i][j] = 0;
      for (int k = 0; k < K; k++) {
#if TRANS == A_B
        C[i][j] = C[i][j] + A[i][k] * B[k][j];
#elif TRANS == A_BT
        C[i][j] = C[i][j] + A[i][k] * B[j][k];
#elif TRANS == AT_B
        C[i][j] = C[i][j] + A[k][i] * B[k][j];
#elif TRANS == AT_BT
        C[i][j] = C[i][j] + A[k][i] * B[j][k];
#endif          
      }
    }
#pragma endscop

  // Reference result computed with the same loop nest outside the marked region.
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C_golden[i][j] = 0;
      for (int k = 0; k < K; k++) {
#if TRANS == A_B
        C_golden[i][j] = C_golden[i][j] + A[i][k] * B[k][j];
#elif TRANS == A_BT
        C_golden[i][j] = C_golden[i][j] + A[i][k] * B[j][k];
#elif TRANS == AT_B
        C_golden[i][j] = C_golden[i][j] + A[k][i] * B[k][j];
#elif TRANS == AT_BT
        C_golden[i][j] = C_golden[i][j] + A[k][i] * B[j][k];
#endif          
      }
    }

  // Count elements whose absolute difference exceeds the 0.001 tolerance.
  int err = 0;
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      if (fabs((float)C_golden[i][j] - (float)C[i][j]) > 0.001)
        err++;
    }

  if (err)
    printf("Failed with %d errors!\n", err);
  else
    printf("Passed!\n");

  // The exit status does not reflect the comparison result.
  return 0;
}

//#include <stdio.h>
//int main(int argc, char **argv) {
//
//      float L2[1][10];
//      float FL[1][64];
//      float w2[64][10];
//#pragma scop
//      for (int j1 = 0; j1 < 10; ++j1) {
//        L2[0][j1] = 0.000000e+00f;
//        for (int k1 = 0; k1 < 64; ++k1) {
//          L2[0][j1] = (L2[0][j1] + (FL[0][k1] * w2[k1][j1]));
//        }
//      }
//#pragma endscop
//      printf("%f", L2[0][0]);
//      printf("%f", FL[0][0]);
//      printf("%f", w2[0][0]);
//}

================================================
FILE: autosa_tests/mm_hcl/kernel.h
================================================
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Element type of the matrices A, B, C and C_golden in kernel.c. */
typedef float data_t;
/* Problem size: C is C[I][J]; A is indexed by (I, K) or (K, I) and B by
 * (K, J) or (J, K), depending on the TRANS layout selected in kernel.c. */
#define I 64
#define J 64
#define K 64


================================================
FILE: autosa_tests/mm_hcl/simd_info.json
================================================
{
  "kernel0": {
    "reduction": ["y"]
  },
  "kernel1": {
    "reduction": ["y"]
  },
  "kernel2": {
    "reduction": ["y"]
  }, 
  "kernel3": {
    "reduction": ["y"]
  },
  "kernel4": {
    "reduction": ["y"]
  }
}


================================================
FILE: autosa_tests/mm_hcl_intel/Makefile
================================================
# Build flow for the Intel OpenCL target: builds the host program with $(CXX)
# and the kernel with aoc, for software emulation, hardware emulation
# (simulator) and hardware. All outputs are placed under $(TARGET_DIR).
APP ?= kernel
AOCL_BOARD ?= s10mx_hbm_es
SW_EMU_AOCX ?= $(APP)_sw_emu.aocx
HW_EMU_AOCX ?= $(APP)_hw_emu.aocx
HW_AOCX ?= $(APP)_hw.aocx
AOCO ?= $(APP).aoco
AOCR ?= $(APP).aocr

# Compiler
AOC ?= aoc
CXX ?= g++
AOC_FLAGS ?= -board=$(AOCL_BOARD) -fp-relaxed -report -hyper-optimized-handshaking=off -I $(INTELFPGAOCLSDKROOT)/include/kernel_headers

# Host executable names: $(TARGET) is the regular host build, $(SW_EMU_TARGET)
# is the same sources compiled with -DEMULATE.
TARGET ?= host
SW_EMU_TARGET ?= host_sw_emu
TARGET_DIR ?= bin
AOCL_UTILS ?= $(INTELFPGAOCLSDKROOT)/examples_aoc/common

# Directories
INC_DIRS := src $(AOCL_UTILS)/inc
LIB_DIRS := 

# Files
INCS := $(wildcard src/*.h)
HOST_SRCS := $(wildcard src/$(APP)_host.cpp $(AOCL_UTILS)/src/AOCLUtils/*.cpp)
KERNEL_SRCS := src/$(APP)_kernel.cl

# Host compile commands are hidden unless VERBOSE=1 is given.
ifeq ($(VERBOSE),1)
ECHO := 
else
ECHO := @
endif

# Where is the Intel(R) FPGA SDK for OpenCL(TM) software?
ifeq ($(wildcard $(INTELFPGAOCLSDKROOT)),)
$(error Set INTELFPGAOCLSDKROOT to the root directory of the Intel(R) FPGA SDK for OpenCL(TM) software installation)
endif
ifeq ($(wildcard $(INTELFPGAOCLSDKROOT)/host/include/CL/opencl.h),)
$(error Set INTELFPGAOCLSDKROOT to the root directory of the Intel(R) FPGA SDK for OpenCL(TM) software installation.)
endif

# OpenCL compile and link flags.
AOCL_COMPILE_CONFIG := $(shell aocl compile-config )
AOCL_LINK_LIBS := $(shell aocl ldlibs )
AOCL_LINK_FLAGS := $(shell aocl ldflags )
# Linking with defences enabled
AOCL_LINK_FLAGS += -z noexecstack
AOCL_LINK_FLAGS += -Wl,-z,relro,-z,now
AOCL_LINK_FLAGS += -Wl,-Bsymbolic
AOCL_LINK_FLAGS += -pie
AOCL_LINK_CONFIG := $(AOCL_LINK_FLAGS) $(AOCL_LINK_LIBS)

# Compilation flags
ifeq ($(DEBUG),1)
CXXFLAGS += -g
else
CXXFLAGS += -O2
endif
CXXFLAGS += -std=gnu++0x

# Compiling with defences enabled
CXXFLAGS += -fstack-protector
CXXFLAGS += -D_FORTIFY_SOURCE=2
CXXFLAGS += -Wformat -Wformat-security
CXXFLAGS += -fPIE

# We must force GCC to never assume that it can shove in its own
# sse2/sse3 versions of strlen and strcmp because they will CRASH.
# Very hard to debug!
CXXFLAGS += -fPIC

LIBS := rt pthread

## Make it all!
#all : $(TARGET_DIR)/$(TARGET)

sw_emu : $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(SW_EMU_AOCX)

hls: $(TARGET_DIR)/$(AOCR)

hw : $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(HW_AOCX)

hw_emu: $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(HW_EMU_AOCX)

# NOTE(review): the *_check targets pass the .aocx name without the
# $(TARGET_DIR)/ prefix -- presumably the host program resolves it relative to
# its own directory; confirm against the generated host code.
hw_emu_check: $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(HW_EMU_AOCX)
	CL_CONTEXT_MPSIM_DEVICE_INTELFPGA=1 $(TARGET_DIR)/$(TARGET) $(HW_EMU_AOCX)

# Runs the -DEMULATE host build, i.e. the binary this target depends on.
sw_emu_check : $(TARGET_DIR)/$(SW_EMU_TARGET) $(TARGET_DIR)/$(SW_EMU_AOCX)
	CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 $(TARGET_DIR)/$(SW_EMU_TARGET) $(SW_EMU_AOCX)

hw_check : $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(HW_AOCX)
	$(TARGET_DIR)/$(TARGET) $(HW_AOCX)

# Host executable target.
# $(TARGET_DIR) is an order-only prerequisite: it must exist, but changes to
# the directory's timestamp must not trigger a rebuild.
$(TARGET_DIR)/$(TARGET) : Makefile $(HOST_SRCS) $(INCS) | $(TARGET_DIR)
	$(ECHO)$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(EXTRACXXFLAGS) -fPIC $(foreach D,$(INC_DIRS),-I$D) \
			$(AOCL_COMPILE_CONFIG) $(HOST_SRCS) $(AOCL_LINK_CONFIG) \
			$(foreach D,$(LIB_DIRS),-L$D) \
			$(foreach L,$(LIBS),-l$L) \
			-o $(TARGET_DIR)/$(TARGET)

# Emulation host executable. It is written to its own file ($@) so that it
# neither overwrites the regular host binary nor gets rebuilt on every run.
$(TARGET_DIR)/$(SW_EMU_TARGET) : Makefile $(HOST_SRCS) $(INCS) | $(TARGET_DIR)
	$(ECHO)$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(EXTRACXXFLAGS) -fPIC $(foreach D,$(INC_DIRS),-I$D) \
			$(AOCL_COMPILE_CONFIG) $(HOST_SRCS) $(AOCL_LINK_CONFIG) \
			$(foreach D,$(LIB_DIRS),-L$D) \
			$(foreach L,$(LIBS),-l$L) \
			-o $@ -DEMULATE

$(TARGET_DIR) :
	$(ECHO)mkdir -p $(TARGET_DIR)

# Kernel targets. The output directory is an order-only prerequisite so that
# it exists before aoc writes into it; $^ still expands to the sources only.
$(TARGET_DIR)/$(SW_EMU_AOCX) : $(KERNEL_SRCS) | $(TARGET_DIR)
	$(AOC) $(AOC_FLAGS) -march=emulator -legacy-emulator -o $@ $^

$(TARGET_DIR)/$(HW_EMU_AOCX) : $(KERNEL_SRCS) | $(TARGET_DIR)
	$(AOC) $(AOC_FLAGS) -march=simulator -ghdl -o $@ $^

$(TARGET_DIR)/$(HW_AOCX) : $(KERNEL_SRCS) | $(TARGET_DIR)
	$(AOC) $(AOC_FLAGS) -o $@ $^

$(TARGET_DIR)/$(AOCO) : $(KERNEL_SRCS) | $(TARGET_DIR)
	$(AOC) $(AOC_FLAGS) -c -o $@ $^

$(TARGET_DIR)/$(AOCR) : $(TARGET_DIR)/$(AOCO)
	$(AOC) $(AOC_FLAGS) -rtl -o $@ $^

# Standard make targets
clean :
	$(ECHO)rm -rf $(TARGET_DIR)/*

# Every target that names a command rather than a file is phony.
.PHONY : all sw_emu hls hw hw_emu hw_emu_check sw_emu_check hw_check clean


================================================
FILE: autosa_tests/mm_hcl_intel/README.md
================================================
# Matrix Multiplication (Small)

Board        | Software Version
-------------|-----------------
Stratix 10 | Intel FPGA SDK for OpenCL 19.4

__Files__:
```
autosa_tests/mm_hcl_intel/kernel.c
autosa_tests/mm_hcl_intel/kernel.h
autosa_tests/mm_hcl_intel/simd_info.json
autosa_tests/mm_hcl_intel/Makefile
```

__Command__:
This is an internal test example for HeteroCL integration.

## Example 1

```c
./autosa ./autosa_tests/mm_hcl_intel/kernel.c \
--config=./autosa_config/autosa_config.json \
--target=autosa_opencl \
--output-dir=./autosa.tmp/output \
--sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" \
--simd-info=./autosa_tests/mm_hcl_intel/simd_info.json \
--host-serialize \
--loop-infinitize \
--double-buffer-style=0 \
--mem-port-map="{kernel[]->A[0];kernel[]->B[1];kernel[]->C[2]}" \
--hcl
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` to the directory `autosa.tmp/output`.

```
cp autosa_tests/mm_hcl_intel/Makefile autosa.tmp/output/
```

Execute the makefile to perform software emulation
```
make sw_emu_check
```
or synthesize the design to RTL
```
make hls
```
or generate the bitstream
```
make hw
```

## Example 2

```c
./autosa ./autosa_tests/mm_hcl_intel/kernel2.c \
--config=./autosa_config/autosa_config.json \
--target=autosa_opencl \
--output-dir=./autosa.tmp/output \
--sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[32,32,512];kernel[]->latency[8,8];kernel[]->simd[1]}" \
--simd-info=./autosa_tests/mm_hcl_intel/simd_info.json \
--host-serialize \
--loop-infinitize \
--double-buffer-style=0 \
--hcl
```

================================================
FILE: autosa_tests/mm_hcl_intel/kernel.c
================================================
#include "kernel.h"

// Self-checking driver for a small matrix multiplication.
//
// Fills A and B with A[i][k] = k and B[j][k] = k, computes
// C[i][j] = sum over k of A[i][k] * B[j][k] inside the #pragma scop region,
// recomputes the same product into C_golden with an identical loop nest
// outside that region, and prints "Passed!" or the number of mismatching
// elements. The return value is 0 regardless of the comparison result.
// argc and argv are not used.
int main(int argc, char **argv) {
  // Disabled alternative layout with B declared as B[K][J]; the active
  // declaration below uses B[J][K], matching the B[j][k] accesses.
//  data_t A[I][K], B[K][J], C[I][J], C_golden[I][J]; 
  data_t A[I][K], B[J][K], C[I][J], C_golden[I][J];

  // Every row of A holds 0, 1, ..., K-1.
  for (int i = 0; i < I; i++) 
    for (int k = 0; k < K; k++) {
      A[i][k] = k;
    }

  // Every row of B holds 0, 1, ..., K-1.
  for (int j = 0; j < J; j++)
    for (int k = 0; k < K; k++) {
      B[j][k] = k;
    }

  // NOTE(review): the region between "#pragma scop" and "#pragma endscop" is
  // presumably the part AutoSA extracts and maps to the systolic array --
  // confirm before changing any statement or array access inside it.
#pragma scop
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C[i][j] = 0;
      for (int k = 0; k < K; k++) {
        C[i][j] = C[i][j] + A[i][k] * B[j][k];
      }
    }
#pragma endscop

  // Reference result: same loop nest as above, written to C_golden.
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C_golden[i][j] = 0;
      for (int k = 0; k < K; k++) {
        C_golden[i][j] = C_golden[i][j] + A[i][k] * B[j][k];
      }
    }

  // Count elements whose float-converted values differ by more than 0.001.
  int err = 0;
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      if (fabs((float)C_golden[i][j] - (float)C[i][j]) > 0.001)
        err++;
    }

  if (err)
    printf("Failed with %d errors!\n", err);
  else
    printf("Passed!\n");

  return 0;
}


================================================
FILE: autosa_tests/mm_hcl_intel/kernel.h
================================================
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

typedef float data_t;
#define I 64
#define J 64
#define K 64


================================================
FILE: autosa_tests/mm_hcl_intel/kernel2.c
================================================
#include <stdio.h>
// 1024x1024 matrix multiplication test input.
//
// Computes Y0[i][j] = sum over k of A[i][k] * B[j][k] inside the
// #pragma scop region. A and B are never written, and as objects with static
// storage duration they start out zero-initialized. There is no golden
// comparison here; the function only prints one element of each array.
// argc and argv are not used.
int main(int argc, char **argv) {
  // static keeps the three 1024x1024 float arrays off the stack.
  static float Y0[1024][1024];
  static float A[1024][1024];
  static float B[1024][1024];

  // NOTE(review): the region between "#pragma scop" and "#pragma endscop" is
  // presumably the part AutoSA extracts and maps to hardware -- confirm
  // before changing any statement or array access inside it.
#pragma scop
  for (int i = 0; i < 1024; ++i) {
    for (int j = 0; j < 1024; ++j) {
      Y0[i][j] = 0.000000e+00f;
      for (int k = 0; k < 1024; ++k) {
        Y0[i][j] = (Y0[i][j] + (A[i][k] * B[j][k]));
      }
    }
  }
#pragma endscop

  // The three values are printed back to back, with no separator or newline.
  printf("%f", Y0[0][0]);
  printf("%f", A[0][0]);
  printf("%f", B[0][0]);
  // No explicit return: reaching the end of main is equivalent to return 0
  // in C99 and later.
}

================================================
FILE: autosa_tests/mm_hcl_intel/simd_info.json
================================================
{
  "kernel0": {
    "reduction": ["y"]
  },
  "kernel1": {
    "reduction": ["y"]
  },
  "kernel2": {
    "reduction": ["y"]
  }, 
  "kernel3": {
    "reduction": ["y"]
  },
  "kernel4": {
    "reduction": ["y"]
  },
  "kernel5": {
    "reduction": ["y"]
  }
}


================================================
FILE: autosa_tests/mm_int16/Makefile
================================================
VPP := $(XILINX_VITIS)/bin/v++
EMCONFIGUTIL := $(XILINX_VITIS)/bin/emconfigutil
MODE := hw
#PLATFORM := xilinx_u200_qdma_201920_1
PLATFORM := xilinx_u250_xdma_201830_2

# sources
KERNEL_SRC := src/kernel_kernel.cpp
HOST_SRC := src/kernel_host.cpp

# targets
HOST_EXE := host.exe

XOS := kernel0.$(MODE).xo
XCLBIN := kernel0.$(MODE).xclbin
EMCONFIG_FILE := emconfig.json

# Linker options to map kernel ports to DDR banks
VPP_LINK_OPTS := --config connectivity.cfg

VPP_COMMON_OPTS := -s -t $(MODE) --platform $(PLATFORM) -R2 -O3 --kernel_frequency 250 --vivado.prop=run.impl_1.STRATEGY=Performance_EarlyBlockPlacement
CFLAGS := -g -std=c++11 -I$(XILINX_XRT)/include
LFLAGS := -L$(XILINX_XRT)/lib -lxilinxopencl -lpthread -lrt
NUMDEVICES := 1

# run time args
EXE_OPT := kernel0.$(MODE).xclbin

# primary build targets
# None of these targets creates a file of its own name, so all of them are
# declared phony; otherwise a stray file called "clean" or "check" in the
# build directory would make the target look up to date and never run.
.PHONY: xclbin app all clean check

xclbin:  $(XCLBIN)
app: $(HOST_EXE)

all: xclbin app

# The leading '-' lets clean succeed even if rm reports an error.
clean:
	-$(RM) $(EMCONFIG_FILE) $(HOST_EXE) $(XCLBIN) *.xclbin *.xo $(XOS)

# kernel rules
$(XOS): $(KERNEL_SRC)
	$(RM) $@
	$(VPP) $(VPP_COMMON_OPTS) -c -k kernel0 -o $@ $+


$(XCLBIN): $(XOS)
	$(VPP) $(VPP_COMMON_OPTS) -l -o $@ $+ $(VPP_LINK_OPTS)

# host rules
$(HOST_EXE): $(HOST_SRC)
	g++ $(CFLAGS) -o $@ $+ $(LFLAGS)
	@echo 'Compiled Host Executable: $(HOST_EXE)'

$(EMCONFIG_FILE):
	$(EMCONFIGUTIL) --nd $(NUMDEVICES) --od . --platform $(PLATFORM)

check: $(XCLBIN) $(HOST_EXE) $(EMCONFIG_FILE)
	XCL_EMULATION_MODE=${MODE} ./$(HOST_EXE) $(EXE_OPT)


================================================
FILE: autosa_tests/mm_int16/README.md
================================================
# Matrix Multiplication in int16 (Small) 

Board        | Software Version
-------------|-----------------
Xilinx Alveo U250 | Xilinx Vitis 2019.2

__Files__:
```
autosa_tests/mm_int16/kernel.c
autosa_tests/mm_int16/kernel.h
autosa_tests/mm_int16/simd_info.json
autosa_tests/mm_int16/Makefile
autosa_tests/mm_int16/connectivity.cfg
autosa_tests/mm_int16/hls_script.tcl
```

__Command__:
To run the HLS flow for C/RTL simulation
```bash
./autosa ./autosa_tests/mm_int16/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" --simd-info=./autosa_tests/mm_int16/simd_info.json --host-serialize --hls
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `hls_script.tcl` to the directory `autosa.tmp/output`.

```
cp autosa_tests/mm_int16/hls_script.tcl autosa.tmp/output/
```

Run the TCL script to build the HLS project.

```
cd autosa.tmp/output
vivado_hls -f hls_script.tcl
```

Alternatively, if you need to generate the bitstream for on-board testing, simply remove the `--hls` flag from the AutoSA command.
```bash
./autosa ./autosa_tests/mm_int16/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" --simd-info=./autosa_tests/mm_int16/simd_info.json --host-serialize
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` and `connectivity.cfg` to the directory `autosa.tmp/output`.

```
cp autosa_tests/mm_int16/Makefile autosa.tmp/output/
cp autosa_tests/mm_int16/connectivity.cfg autosa.tmp/output/
```

Execute the makefile to build the design.

```
cd autosa.tmp/output
make all
make check
```

================================================
FILE: autosa_tests/mm_int16/connectivity.cfg
================================================
[connectivity]
sp=kernel0_1.A:DDR[0]
sp=kernel0_1.B:DDR[1] 
sp=kernel0_1.C:DDR[2]


================================================
FILE: autosa_tests/mm_int16/hls_script.tcl
================================================
############################################################
## This file is generated automatically by Vivado HLS.
## Please DO NOT edit it.
## Copyright (C) 1986-2019 Xilinx, Inc. All Rights Reserved.
############################################################
# Open the HLS project "hls_prj" and declare kernel0 as the top-level function.
open_project hls_prj
set_top kernel0
# Design sources; kernel_host.cpp is registered as the test bench (-tb).
add_files src/kernel_kernel.h
add_files src/kernel_kernel.cpp
add_files -tb src/kernel_host.cpp
open_solution "solution1"
# NOTE(review): this part string appears to name a U200 device, while the
# README and Makefile of this test target the Alveo U250 -- confirm which
# part is intended.
set_part {xcu200-fsgd2104-2-e}
# Clock period of 5 (presumably nanoseconds, the Vivado HLS default unit).
create_clock -period 5 -name default
config_compile -name_max_length 50
#source "./prj/solution1/directives.tcl"
# Only C simulation is enabled. The commented-out commands below are the
# synthesis, co-simulation and IP export steps.
csim_design
#csynth_design
#cosim_design
#cosim_design -trace_level all
#cosim_design -setup -trace_level all
#export_design -format ip_catalog
exit


================================================
FILE: autosa_tests/mm_int16/kernel.c
================================================
#include "kernel.h"

// Self-checking driver for a small matrix multiplication.
//
// Fills A and B with A[i][k] = k and B[j][k] = k, computes
// C[i][j] = sum over k of A[i][k] * B[j][k] inside the #pragma scop region,
// recomputes the same product into C_golden with an identical loop nest
// outside that region, and prints "Passed!" or the number of mismatching
// elements. The return value is 0 regardless of the comparison result.
// argc and argv are not used.
//
// The commented-out lines throughout are the alternative array layouts
// (B[K][J], and the "gemm4" variant with A[K][I] and B[K][J]); only the
// "gemm0,3" layout is active.
//
// NOTE(review): kernel.h defines data_t as unsigned short and K as 64. The
// exact sum of k*k for k < 64 is 85344, which does not fit in 16 bits, so the
// stored results presumably wrap. C and C_golden are computed by identical
// code here, so they wrap identically -- confirm the generated kernel is
// expected to match this wrapped value.
int main(int argc, char **argv) {
//  data_t A[I][K], B[K][J], C[I][J], C_golden[I][J]; 
  data_t A[I][K], B[J][K], C[I][J], C_golden[I][J]; // gemm0,3
//  data_t A[K][I], B[K][J], C[I][J], C_golden[I][J]; // gemm4

  // Every row of A holds 0, 1, ..., K-1.
  for (int i = 0; i < I; i++) 
    for (int k = 0; k < K; k++) {
      A[i][k] = k;
//      A[k][i] = k;
    }

  // Every row of B holds 0, 1, ..., K-1.
  for (int j = 0; j < J; j++)
    for (int k = 0; k < K; k++) {
      B[j][k] = k;
//      B[k][j] = k;
    }

  // NOTE(review): the region between "#pragma scop" and "#pragma endscop" is
  // presumably the part AutoSA extracts and maps to the systolic array --
  // confirm before changing any statement or array access inside it.
#pragma scop
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C[i][j] = 0;
      for (int k = 0; k < K; k++) {
        C[i][j] = C[i][j] + A[i][k] * B[j][k];
//        C[i][j] = C[i][j] + A[k][i] * B[k][j];
      }
    }
#pragma endscop

  // Reference result: same loop nest as above, written to C_golden.
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C_golden[i][j] = 0;
      for (int k = 0; k < K; k++) {
        C_golden[i][j] = C_golden[i][j] + A[i][k] * B[j][k];
//        C_golden[i][j] = C_golden[i][j] + A[k][i] * B[k][j];
      }
    }

  // Count elements whose float-converted values differ by more than 0.001.
  int err = 0;
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      if (fabs((float)C_golden[i][j] - (float)C[i][j]) > 0.001)
        err++;
    }

  if (err)
    printf("Failed with %d errors!\n", err);
  else
    printf("Passed!\n");

  return 0;
}


================================================
FILE: autosa_tests/mm_int16/kernel.h
================================================
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

typedef unsigned short data_t;
#define I 64
#define J 64
#define K 64


================================================
FILE: autosa_tests/mm_int16/simd_info.json
================================================
{
  "kernel0": {
    "reduction": ["y"]
  },
  "kernel1": {
    "reduction": ["y"]
  },
  "kernel2": {
    "reduction": ["y"]
  }, 
  "kernel3": {
    "reduction": ["y"]
  },
  "kernel4": {
    "reduction": ["y"]
  }
}


================================================
FILE: autosa_tests/mm_intel/Makefile
================================================
APP ?= kernel
AOCL_BOARD ?= s10mx_hbm_es
SW_EMU_AOCX ?= $(APP)_sw_emu.aocx
HW_EMU_AOCX ?= $(APP)_hw_emu.aocx
HW_AOCX ?= $(APP)_hw.aocx
AOCO ?= $(APP).aoco
AOCR ?= $(APP).aocr

# Compiler
AOC ?= aoc
CXX ?= g++
AOC_FLAGS ?= -board=$(AOCL_BOARD) -fp-relaxed -report -hyper-optimized-handshaking=off -I $(INTELFPGAOCLSDKROOT)/include/kernel_headers

TARGET ?= host
SW_EMU_TARGET ?= host_sw_emu
TARGET_DIR ?= bin
AOCL_UTILS ?= $(INTELFPGAOCLSDKROOT)/examples_aoc/common

# Directories
INC_DIRS := src $(AOCL_UTILS)/inc
LIB_DIRS := 

# Files
INCS := $(wildcard src/*.h)
HOST_SRCS := $(wildcard src/$(APP)_host.cpp $(AOCL_UTILS)/src/AOCLUtils/*.cpp)
KERNEL_SRCS := src/$(APP)_kernel.cl

ifeq ($(VERBOSE),1)
ECHO := 
else
ECHO := @
endif

# Where is the Intel(R) FPGA SDK for OpenCL(TM) software?
ifeq ($(wildcard $(INTELFPGAOCLSDKROOT)),)
$(error Set INTELFPGAOCLSDKROOT to the root directory of the Intel(R) FPGA SDK for OpenCL(TM) software installation)
endif
ifeq ($(wildcard $(INTELFPGAOCLSDKROOT)/host/include/CL/opencl.h),)
$(error Set INTELFPGAOCLSDKROOT to the root directory of the Intel(R) FPGA SDK for OpenCL(TM) software installation.)
endif

# OpenCL compile and link flags.
AOCL_COMPILE_CONFIG := $(shell aocl compile-config )
AOCL_LINK_LIBS := $(shell aocl ldlibs )
AOCL_LINK_FLAGS := $(shell aocl ldflags )
# Linking with defences enabled
AOCL_LINK_FLAGS += -z noexecstack
AOCL_LINK_FLAGS += -Wl,-z,relro,-z,now
AOCL_LINK_FLAGS += -Wl,-Bsymbolic
AOCL_LINK_FLAGS += -pie
AOCL_LINK_CONFIG := $(AOCL_LINK_FLAGS) $(AOCL_LINK_LIBS)

# Compilation flags
ifeq ($(DEBUG),1)
CXXFLAGS += -g
else
CXXFLAGS += -O2
endif
CXXFLAGS += -std=gnu++0x

# Compiling with defences enabled
CXXFLAGS += -fstack-protector
CXXFLAGS += -D_FORTIFY_SOURCE=2
CXXFLAGS += -Wformat -Wformat-security
CXXFLAGS += -fPIE

# We must force GCC to never assume that it can shove in its own
# sse2/sse3 versions of strlen and strcmp because they will CRASH.
# Very hard to debug!
CXXFLAGS += -fPIC

LIBS := rt pthread

## Make it all!
#all : $(TARGET_DIR)/$(TARGET)

# Build the host executable and the software-emulation kernel image.
sw_emu : $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(SW_EMU_AOCX)

# Compile the kernel down to the .aocr stage (aoc -rtl).
hls: $(TARGET_DIR)/$(AOCR)

# Build the host executable and the full hardware kernel image.
hw : $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(HW_AOCX)

# Build the host executable and the simulator kernel image.
hw_emu: $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(HW_EMU_AOCX)

hw_emu_check: $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(HW_EMU_AOCX)
	CL_CONTEXT_MPSIM_DEVICE_INTELFPGA=1 $(TARGET_DIR)/$(TARGET) $(HW_EMU_AOCX)

# Run the -DEMULATE host build, i.e. the binary this target actually depends
# on, instead of the regular host executable.
sw_emu_check : $(TARGET_DIR)/$(SW_EMU_TARGET) $(TARGET_DIR)/$(SW_EMU_AOCX)
	CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 $(TARGET_DIR)/$(SW_EMU_TARGET) $(SW_EMU_AOCX)

hw_check : $(TARGET_DIR)/$(TARGET) $(TARGET_DIR)/$(HW_AOCX)
	$(TARGET_DIR)/$(TARGET) $(HW_AOCX)

# Host executable target.
# $(TARGET_DIR) is an order-only prerequisite: the directory must exist, but
# its timestamp (which changes whenever a file is added to it) must not force
# the host to be rebuilt.
$(TARGET_DIR)/$(TARGET) : Makefile $(HOST_SRCS) $(INCS) | $(TARGET_DIR)
	$(ECHO)$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(EXTRACXXFLAGS) -fPIC $(foreach D,$(INC_DIRS),-I$D) \
			$(AOCL_COMPILE_CONFIG) $(HOST_SRCS) $(AOCL_LINK_CONFIG) \
			$(foreach D,$(LIB_DIRS),-L$D) \
			$(foreach L,$(LIBS),-l$L) \
			-o $@

# Host executable built with -DEMULATE for software emulation.
# The output goes to $@ so that (a) the rule's target file really exists and
# the rule is not re-run on every invocation, and (b) the emulation build no
# longer overwrites the regular host executable used by the hw targets.
$(TARGET_DIR)/$(SW_EMU_TARGET) : Makefile $(HOST_SRCS) $(INCS) | $(TARGET_DIR)
	$(ECHO)$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(EXTRACXXFLAGS) -fPIC $(foreach D,$(INC_DIRS),-I$D) \
			$(AOCL_COMPILE_CONFIG) $(HOST_SRCS) $(AOCL_LINK_CONFIG) \
			$(foreach D,$(LIB_DIRS),-L$D) \
			$(foreach L,$(LIBS),-l$L) \
			-o $@ -DEMULATE

# -p makes directory creation succeed when the directory already exists.
$(TARGET_DIR) :
	$(ECHO)mkdir -p $(TARGET_DIR)

$(TARGET_DIR)/$(SW_EMU_AOCX) : $(KERNEL_SRCS)
	$(AOC) $(AOC_FLAGS) -march=emulator -legacy-emulator -o $@ $^

$(TARGET_DIR)/$(HW_EMU_AOCX) : $(KERNEL_SRCS)
	$(AOC) $(AOC_FLAGS) -march=simulator -ghdl -o $@ $^

$(TARGET_DIR)/$(HW_AOCX) : $(KERNEL_SRCS)
	$(AOC) $(AOC_FLAGS) -o $@ $^

$(TARGET_DIR)/$(AOCO) : $(KERNEL_SRCS)
	$(AOC) $(AOC_FLAGS) -c -o $@ $^

$(TARGET_DIR)/$(AOCR) : $(TARGET_DIR)/$(AOCO)
	$(AOC) $(AOC_FLAGS) -rtl -o $@ $^

# Standard make targets
# Removes everything inside the output directory but keeps the directory.
clean :
	$(ECHO)rm -rf $(TARGET_DIR)/*

# These targets name actions, not files they create; declaring them phony
# keeps a stray file called e.g. "hw" or "clean" from disabling the target.
.PHONY : all clean sw_emu hls hw hw_emu hw_emu_check sw_emu_check hw_check


================================================
FILE: autosa_tests/mm_intel/README.md
================================================
# Matrix Multiplication (Small)

Board        | Software Version
-------------|-----------------
Stratix 10 | Intel FPGA SDK for OpenCL 19.4

__Files__:
```
autosa_tests/mm_intel/kernel.c
autosa_tests/mm_intel/kernel.h
autosa_tests/mm_intel/simd_info.json
autosa_tests/mm_intel/Makefile
```

__Command__:
```bash
./autosa ./autosa_tests/mm_intel/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_opencl --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->array_part_L2[2,2,2];kernel[]->latency[8,8];kernel[]->simd[2]}" --simd-info=./autosa_tests/mm_intel/simd_info.json --host-serialize --loop-infinitize --double-buffer-style=0 --mem-port-map="{kernel[]->A[0];kernel[]->B[1];kernel[]->C[2]}"
```

After compilation, you will find all generated files under the directory `autosa.tmp/output/src`. Copy the `Makefile` to the directory `autosa.tmp/output`.

```
cp autosa_tests/mm_intel/Makefile autosa.tmp/output/
```

Execute the makefile to perform software emulation
```
make sw_emu_check
```
or synthesize the design to RTL
```
make hls
```
or generate the bitstream
```
make hw
```


================================================
FILE: autosa_tests/mm_intel/kernel.c
================================================
#include "kernel.h"

// Self-checking driver for a small matrix multiplication.
//
// Fills A and B with A[i][k] = k and B[j][k] = k, computes
// C[i][j] = sum over k of A[i][k] * B[j][k] inside the #pragma scop region,
// recomputes the same product into C_golden with an identical loop nest
// outside that region, and prints "Passed!" or the number of mismatching
// elements. The return value is 0 regardless of the comparison result.
// argc and argv are not used.
int main(int argc, char **argv) {
  // Disabled alternative layout with B declared as B[K][J]; the active
  // declaration below uses B[J][K], matching the B[j][k] accesses.
//  data_t A[I][K], B[K][J], C[I][J], C_golden[I][J]; 
  data_t A[I][K], B[J][K], C[I][J], C_golden[I][J];

  // Every row of A holds 0, 1, ..., K-1.
  for (int i = 0; i < I; i++) 
    for (int k = 0; k < K; k++) {
      A[i][k] = k;
    }

  // Every row of B holds 0, 1, ..., K-1.
  for (int j = 0; j < J; j++)
    for (int k = 0; k < K; k++) {
      B[j][k] = k;
    }

  // NOTE(review): the region between "#pragma scop" and "#pragma endscop" is
  // presumably the part AutoSA extracts and maps to the systolic array --
  // confirm before changing any statement or array access inside it.
#pragma scop
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C[i][j] = 0;
      for (int k = 0; k < K; k++) {
        C[i][j] = C[i][j] + A[i][k] * B[j][k];
      }
    }
#pragma endscop

  // Reference result: same loop nest as above, written to C_golden.
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      C_golden[i][j] = 0;
      for (int k = 0; k < K; k++) {
        C_golden[i][j] = C_golden[i][j] + A[i][k] * B[j][k];
      }
    }

  // Count elements whose float-converted values differ by more than 0.001.
  int err = 0;
  for (int i = 0; i < I; i++)
    for (int j = 0; j < J; j++) {
      if (fabs((float)C_golden[i][j] - (float)C[i][j]) > 0.001)
        err++;
    }

  if (err)
    printf("Failed with %d errors!\n", err);
  else
    printf("Passed!\n");

  return 0;
}


================================================
FILE: autosa_tests/mm_intel/kernel.h
================================================
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

typedef float data_t;
#define I 64
#define J 64
#define K 64


================================================
FILE: autosa_tests/mm_intel/simd_info.json
================================================
{
  "kernel3": {
    "reduction": ["y"]
  }
}


================================================
FILE: clean.sh
================================================
#!/bin/sh
# Remove the autosa executable and the temporary output directory, then run
# "make clean" in the src directory.

# -f: do not report an error when ./autosa has not been built yet.
rm -f ./autosa
rm -rf ./autosa.tmp
# Stop here if src is missing rather than running "make clean" in the
# current directory.
cd src || exit 1
make clean
cd -


================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


================================================
FILE: docs/conf.py
================================================
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))

import sphinx_rtd_theme

# -- Project information -----------------------------------------------------

project = 'AutoSA'
copyright = '2021, Jie Wang'
author = 'Jie Wang'

# The full version, including alpha/beta/rc tags
release = '0.01'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
        "sphinx_rtd_theme"
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']


================================================
FILE: docs/docker_image.rst
================================================
.. _docker-image-label:

Docker Image
============

We provide a docker image to quickly try out the features of AutoSA.

Pull the Docker image using the following command.

.. code:: bash
    
    docker pull whbldhwj/autosa:latest

================================================
FILE: docs/examples/cnn.rst
================================================
Convolutional Neural Network (Single Layer, Small)
==================================================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

This is an example of a small-size CNN.
The design files can be found at ``${AUTOSA_ROOT}/autosa_tests/cnn``.
The testing environment is summarized in the table below.

+--------------------------+-----------------------------------------------+
| **Target FPGA**          | Xilinx Alveo U250                             |
+--------------------------+-----------------------------------------------+
| **FPGA Synthesis Tools** | Xilinx Vivado HLS 2019.2, Xilinx Vitis 2019.2 |
+--------------------------+-----------------------------------------------+
| **CPU**                  | Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz     |
+--------------------------+-----------------------------------------------+

C Simulation
------------

Run the following example command to generate one design with HLS host code.

.. code:: bash

    ./autosa ./autosa_tests/cnn/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[4];kernel[]->array_part[8,8,4,8];kernel[]->latency[4,2,4];kernel[]->simd[1,1,1,2]}" \
    --simd-info=./autosa_tests/cnn/simd_info.json \
    --host-serialize \
    --no-reverse-order \
    --hls

After compilation, you will find all generated files under the directory 
``${AUTOSA_ROOT}/autosa.tmp/output/src``. 
Copy the ``hls_script.tcl`` to the directory ``autosa.tmp/output``.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/cnn/hls_script.tcl ${AUTOSA_ROOT}/autosa.tmp/output/

Run the TCL script to perform C simulation.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output/
    vivado_hls -f hls_script.tcl

You should see ``Passed`` printed out in your terminal showing that 
C simulation is performed successfully.    

Bitstream Generation
--------------------

If you need to generate the bitstream for on-board testing, simply remove the ``--hls``
flag from the previous AutoSA command.

.. code:: bash

    ./autosa ./autosa_tests/cnn/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[4];kernel[]->array_part[8,8,4,8];kernel[]->latency[4,2,4];kernel[]->simd[1,1,1,2]}" \
    --simd-info=./autosa_tests/cnn/simd_info.json \
    --host-serialize \
    --no-reverse-order

Now instead of HLS host code, an OpenCL host code is generated.    

We have prepared a template Makefile for Xilinx Vitis tools.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/cnn/Makefile ${AUTOSA_ROOT}/autosa.tmp/output/
    cp ${AUTOSA_ROOT}/autosa_tests/cnn/connectivity.cfg ${AUTOSA_ROOT}/autosa.tmp/output/

Set the proper ``PLATFORM`` in the Makefile. 
By default, we set it to ``xilinx_u250_xdma_201830_2``.
You may notice that we also copy a file ``connectivity.cfg`` here.
This file assigns the DDR bank mapping for the design. 
By default, we map pointers A, B, C to DDR bank 0, 1, 2.
Lastly, modify the ``MODE`` in the Makefile for performing different tasks.

* ``sw_emu``: C simulation
* ``hw_emu``: RTL simulation
* ``hw``: Bitstream generation

.. note:: 

    When using Vitis flow to perform RTL simulation, nothing needs to change in the source code.
    You may directly set the ``MODE`` to ``hw_emu`` and perform RTL simulation.
    However, by default, we will run the kernel 10 times to collect the average runtime.
    This may significantly prolong the simulation time. Consider reducing the kernel
    launching times to 1 before using RTL simulation.

To generate the bitstream, set the ``MODE`` to ``hw`` and use the command below.

.. code:: bash

    make all

It will take a few hours to finish. After the bitstream is generated,
use the following command to run it on-board.    

.. code:: bash

    make check

Dataflow Exploration
--------------------

Similar to the GEMM example, we provide a more detailed discussion of the different
dataflows that AutoSA generates for this application.
The parameters used in this program include:

* `o`, `i`: output/input channel
* `r`, `c`: output image row/column
* `p`, `q`: kernel height/width

Array 1: [o]
^^^^^^^^^^^^

This is an output-stationary array that chooses the loop o as the space loop.
The input feature map cin is reused across PEs, weights w are sent directly to each PE.
Data are computed locally and drained out from each PE.

.. image:: images/cnn0_array.png
    :width: 300
    :align: center

Here is an example command for this design.

.. code:: bash

    ./autosa ./autosa_tests/cnn/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[0];kernel[]->array_part[8,4,4,8];kernel[]->latency[4,2,4];kernel[]->simd[1,1,1,2]}" \
    --simd-info=./autosa_tests/cnn/simd_info.json \
    --host-serialize \
    --hls    

Array 2: [r]
^^^^^^^^^^^^

This is an output-stationary array that chooses the loop r as the space loop.
The weights w are reused across PEs, and the input feature maps cin are sent directly to each PE.
Data are computed locally and drained out from each PE.

.. image:: images/cnn1_array.png
    :width: 300
    :align: center

Here is an example command for this design.

.. code:: bash

    ./autosa ./autosa_tests/cnn/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[1];kernel[]->array_part[4,8,4,8];kernel[]->latency[2,4,2];kernel[]->simd[1,1,1,2]}" \
    --simd-info=./autosa_tests/cnn/simd_info.json \
    --host-serialize \
    --hls        

Array 3: [c]
^^^^^^^^^^^^

This is an output-stationary array that chooses the loop c as the space loop.
The weights and input feature maps are sent directly to each PE.
Data are computed locally and drained out from each PE.

.. image:: images/cnn2_array.png
    :width: 300
    :align: center

Here is an example command for this design.

.. code:: bash

    ./autosa ./autosa_tests/cnn/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[2];kernel[]->array_part[4,8,4,8];kernel[]->latency[2,4,2];kernel[]->simd[1,1,1,2]}" \
    --simd-info=./autosa_tests/cnn/simd_info.json \
    --host-serialize \
    --hls

In this design, weights are sent directly to each PE. This is because
AutoSA uses the data reuse along the r-axis of the weight access.
As can be found in the printed compilation information on the screen, there are two reuse 
vector candidates for the weight access ``w[o][i][p][q]``.

.. image:: images/cnn_w_reuse.png
    :width: 800
    :align: center

By default, AutoSA chooses the first candidate, which reuses the data along the r-axis.
You may alter this choice by supplying the argument ``--select-rar-dep="{kernel[]->__pet_ref_4[1]}"``.
Here, we instruct AutoSA to select the candidate 1 for the array reference ``__pet_ref_4``.
``__pet_ref_4`` is the unique ID the polyhedral front-end assigned to this reference.
Using the following command, we could generate a different array that reuses the 
weights across PEs.

.. code:: bash

    ./autosa ./autosa_tests/cnn/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[2];kernel[]->array_part[4,8,4,8];kernel[]->latency[2,4,2];kernel[]->simd[1,1,1,2]}" \
    --simd-info=./autosa_tests/cnn/simd_info.json \
    --host-serialize \
    --hls \
    --select-rar-dep="{kernel[]->__pet_ref_4[1]}"

.. image:: images/cnn2_2_array.png
    :width: 300
    :align: center    

Array 4: [i]
^^^^^^^^^^^^    

This is an input-stationary array that chooses the loop i as the space loop.
The weights and input feature maps are sent directly to each PE.
Data are accumulated across PEs.

.. image:: images/cnn3_array.png
    :width: 300
    :align: center    

.. code:: bash

    ./autosa ./autosa_tests/cnn/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[4,8,4,4];kernel[]->latency[2,2,2];kernel[]->simd[1,1,2]}" \
    --simd-info=./autosa_tests/cnn/simd_info.json \
    --host-serialize \
    --hls \
    --local-reduce \
    --reduce-op="+" \
    --simd-touch-space

Array 5: [o,r]
^^^^^^^^^^^^^^

This is an output-stationary array that chooses the loop o and r as the space loops.
The weights are reused horizontally, and the input feature maps are reused vertically.

.. image:: images/cnn4_array.png
    :width: 300
    :align: center    

.. code:: bash

    ./autosa ./autosa_tests/cnn/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[4];kernel[]->array_part[8,4,4,8];kernel[]->latency[4,2,2];kernel[]->simd[1,1,1,2]}" \
    --simd-info=./autosa_tests/cnn/simd_info.json \
    --host-serialize \
    --hls

Array 6: [o,c]
^^^^^^^^^^^^^^    

This array is similar to array 5.
We could also add the same additional argument as in array 3
to choose a better reuse vector for weights to exploit more data reuse.

.. image:: images/cnn5_array.png
    :width: 300
    :align: center    

.. code:: bash

    ./autosa ./autosa_tests/cnn/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[5];kernel[]->array_part[8,4,4,8];kernel[]->latency[4,2,2];kernel[]->simd[1,1,1,2]}" \
    --simd-info=./autosa_tests/cnn/simd_info.json \
    --host-serialize \
    --hls \
    --select-rar-dep="{kernel[]->__pet_ref_4[1]}"

Array 7: [o,i]
^^^^^^^^^^^^^^     

This is an input-stationary array.
The input feature maps are reused vertically. Weights are directly sent to each PE.

.. image:: images/cnn6_array.png
    :width: 300
    :align: center    

.. code:: bash

    ./autosa ./autosa_tests/cnn/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[6];kernel[]->array_part[8,4,4,4];kernel[]->latency[2,2,4];kernel[]->simd[1,1,2]}" \
    --simd-info=./autosa_tests/cnn/simd_info.json \
    --host-serialize \
    --hls \
    --local-reduce \
    --reduce-op="+" \
    --simd-touch-space

Array 8: [r,c]
^^^^^^^^^^^^^^

This is an output-stationary array. Input feature maps are directly sent to each PE.
Weights are reused vertically.

.. image:: images/cnn7_array.png
    :width: 300
    :align: center  

.. code:: bash

    ./autosa ./autosa_tests/cnn/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[7];kernel[]->array_part[4,4,8,8];kernel[]->latency[2,2,2];kernel[]->simd[1,1,1,2]}" \
    --simd-info=./autosa_tests/cnn/simd_info.json \
    --host-serialize \
    --hls

Array 9: [r,i]
^^^^^^^^^^^^^^ 

This is an input stationary array.
Weights are reused vertically. Input feature maps are sent to each PE.

.. image:: images/cnn8_array.png
    :width: 300
    :align: center    

.. code:: bash

    ./autosa ./autosa_tests/cnn/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[8];kernel[]->array_part[4,8,8,4];kernel[]->latency[2,2,2];kernel[]->simd[1,1,2]}" \
    --simd-info=./autosa_tests/cnn/simd_info.json \
    --host-serialize \
    --hls \
    --local-reduce \
    --reduce-op="+" \
    --simd-touch-space


Array 10: [c,i]
^^^^^^^^^^^^^^^

This is an input stationary array. 
Weights are reused vertically. Input feature maps are sent to each PE.

.. image:: images/cnn9_array.png
    :width: 300
    :align: center    

.. code:: bash

    ./autosa ./autosa_tests/cnn/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[9];kernel[]->array_part[4,8,8,4];kernel[]->latency[2,2,2];kernel[]->simd[1,1,2]}" \
    --simd-info=./autosa_tests/cnn/simd_info.json \
    --host-serialize \
    --hls \
    --local-reduce \
    --reduce-op="+" \
    --simd-touch-space \
    --select-rar-dep="{kernel[]->__pet_ref_4[1]}"

================================================
FILE: docs/examples/cnn_large.rst
================================================
Convolutional Neural Network (Single Layer, Large)
==================================================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

This is an example of a large-size CNN.
The design files can be found at ``${AUTOSA_ROOT}/autosa_tests/large/cnn``.
The testing environment is summarized in the table below.

+--------------------------+-----------------------------------------------+
| **Target FPGA**          | Xilinx Alveo U250                             |
+--------------------------+-----------------------------------------------+
| **FPGA Synthesis Tools** | Xilinx Vivado HLS 2019.2, Xilinx Vitis 2019.2 |
+--------------------------+-----------------------------------------------+
| **CPU**                  | Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz     |
+--------------------------+-----------------------------------------------+

C Simulation
------------

Run the following example command to generate one design with HLS host code.

.. code:: bash

    ./autosa ./autosa_tests/large/cnn/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[4];kernel[]->array_part[64,56,14,64];kernel[]->latency[4,4,7];kernel[]->simd[1,1,8]}" \
    --simd-info=./autosa_tests/large/cnn/simd_info.json \
    --host-serialize \
    --no-reverse-order \
    --hls

After compilation, you will find all generated files under the directory 
``${AUTOSA_ROOT}/autosa.tmp/output/src``. 
Copy the ``hls_script.tcl`` to the directory ``autosa.tmp/output``.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/large/cnn/hls_script.tcl ${AUTOSA_ROOT}/autosa.tmp/output/

Run the TCL script to perform C simulation.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output/
    vivado_hls -f hls_script.tcl

You should see ``Passed`` printed out in your terminal showing that 
C simulation is performed successfully.   

Bitstream Generation
--------------------

If you need to generate the bitstream for on-board testing, simply remove the ``--hls``
flag from the previous AutoSA command.

.. code:: bash

    ./autosa ./autosa_tests/large/cnn/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[4];kernel[]->array_part[64,56,14,64];kernel[]->latency[4,4,7];kernel[]->simd[1,1,8]}" \
    --simd-info=./autosa_tests/large/cnn/simd_info.json \
    --no-reverse-order \
    --host-serialize

Now instead of HLS host code, an OpenCL host code is generated.   

We have prepared a template Makefile for Xilinx Vitis tools.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/large/cnn/Makefile ${AUTOSA_ROOT}/autosa.tmp/output/
    cp ${AUTOSA_ROOT}/autosa_tests/large/cnn/connectivity.cfg ${AUTOSA_ROOT}/autosa.tmp/output/

To generate the bitstream, use the following command.

.. code:: bash

    make all

After the bitstream is generated,
use the following command to run it on-board.    

.. code:: bash

    make check

.. note::
    
    As this design is rather large, Vitis fails to successfully route the design on-board
    in our experiment.
    We will rely on AutoBridge to route this design.

Using AutoBridge to Boost Frequency
-----------------------------------

You may also try to use `AutoBridge <https://github.com/Licheng-Guo/AutoBridge>`_ 
to boost the design frequency.
We cover how to use AutoBridge to improve the frequency in :ref:`use-autobridge-label`.

The reference AutoBridge scripts used for this example can be found at ``${AUTOSA_ROOT}/autosa_tests/large/cnn``.

The tables below show the detailed comparison results between the original design 
(unoptimized) and the design optimized with AutoBridge (optimized).

+-------------+-----+-----------------+------------------+--------------+---------------+
| Designs     | MHz | LUT             | REG              | BRAM         | DSP           |
+-------------+-----+-----------------+------------------+--------------+---------------+
| Unoptimized | N/A | N/A             | N/A              | N/A          | N/A           |
+-------------+-----+-----------------+------------------+--------------+---------------+
| Optimized   | 265 | 884520 (57.93%) | 1445020 (46.05%) | 697 (29.84%) | 8960 (72.99%) |
+-------------+-----+-----------------+------------------+--------------+---------------+

+-------------+-----------------+---------------+---------+
| Designs     | Kernel Time (s) | Host Time (s) | GFLOPs  |
+-------------+-----------------+---------------+---------+
| Unoptimized | N/A             | N/A           | N/A     |
+-------------+-----------------+---------------+---------+
| Optimized   | 0.015865        | 0.188105      | 932.714 |
+-------------+-----------------+---------------+---------+

================================================
FILE: docs/examples/dnn_ops.rst
================================================
DNN Operators (Small)
=====================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

We demonstrate three operators used in DNNs, including:
depth-wise convolution, point-wise convolution, and fully-connected layers.
The design files can be found at ``${AUTOSA_ROOT}/autosa_tests/dnn_ops``.
The testing environment is summarized in the table below.

+--------------------------+-----------------------------------------------+
| **Target FPGA**          | Xilinx Alveo U250                             |
+--------------------------+-----------------------------------------------+
| **FPGA Synthesis Tools** | Xilinx Vivado HLS 2019.2, Xilinx Vitis 2019.2 |
+--------------------------+-----------------------------------------------+
| **CPU**                  | Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz     |
+--------------------------+-----------------------------------------------+

Point-wise Convolution
----------------------

In ``${AUTOSA_ROOT}/autosa_tests/dnn_ops/kernel.h``, uncomment the macro:

.. code:: c

    #define PC

Run the following command to generate a design with HLS host.

.. code:: bash

    ./autosa ./autosa_tests/dnn_ops/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[4];kernel[]->array_part[8,8,4,8];kernel[]->latency[4,4,4];kernel[]->simd[1,1,1,2]}" \
    --simd-info=./autosa_tests/dnn_ops/pc_simd_info.json \
    --host-serialize \
    --no-reverse-order \
    --hls

This leads to a 2x2 systolic array.
The figure below shows the array architecture.

.. image:: images/pconv.png
    :width: 400
    :align: center

You will find all generated files under the directory
``${AUTOSA_ROOT}/autosa.tmp/output/src``. 
Copy the ``hls_script.tcl`` to the directory ``autosa.tmp/output``.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/dnn_ops/hls_script.tcl ${AUTOSA_ROOT}/autosa.tmp/output/

Run the TCL script to perform C simulation.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output/
    vivado_hls -f hls_script.tcl

You should see ``Passed`` printed out in your terminal showing that 
C simulation is performed successfully.    

Depth-wise Convolution
----------------------

In ``${AUTOSA_ROOT}/autosa_tests/dnn_ops/kernel.h``, uncomment the macro:

.. code:: c

    #define DC

Run the following command to generate a design with HLS host.

.. code:: bash

    ./autosa ./autosa_tests/dnn_ops/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[4];kernel[]->array_part[4,4,4,3];kernel[]->latency[1,2,1];kernel[]->simd[1,2,1,1]}" \
    --simd-info=./autosa_tests/dnn_ops/dc_simd_info.json \
    --host-serialize \
    --no-reverse-order \
    --simd-touch-space \
    --hls

This leads to a 2x2 systolic array.
The figure below shows the array architecture.

.. image:: images/dconv.png
    :width: 400
    :align: center

You will find all generated files under the directory
``${AUTOSA_ROOT}/autosa.tmp/output/src``. 
Copy the ``hls_script.tcl`` to the directory ``autosa.tmp/output``.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/dnn_ops/hls_script.tcl ${AUTOSA_ROOT}/autosa.tmp/output/

Run the TCL script to perform C simulation.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output/
    vivado_hls -f hls_script.tcl

You should see ``Passed`` printed out in your terminal showing that 
C simulation is performed successfully. 

Fully-Connected Layer
---------------------

In ``${AUTOSA_ROOT}/autosa_tests/dnn_ops/kernel.h``, uncomment the macro:

.. code:: c

    #define FC

Run the following command to generate a design with HLS host.

.. code:: bash

    ./autosa ./autosa_tests/dnn_ops/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[2];kernel[]->array_part[8,4];kernel[]->latency[4];kernel[]->simd[2]}" \
    --simd-info=./autosa_tests/dnn_ops/fc_simd_info.json \
    --host-serialize \
    --no-reverse-order \
    --simd-touch-space \
    --local-reduce \
    --reduce-op="+" \
    --hls

This leads to a 2x2 systolic array.
The figure below shows the array architecture.

.. image:: images/fc.png
    :width: 400
    :align: center

You will find all generated files under the directory
``${AUTOSA_ROOT}/autosa.tmp/output/src``. 
Copy the ``hls_script.tcl`` to the directory ``autosa.tmp/output``.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/dnn_ops/hls_script.tcl ${AUTOSA_ROOT}/autosa.tmp/output/

Run the TCL script to perform C simulation.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output/
    vivado_hls -f hls_script.tcl

You should see ``Passed`` printed out in your terminal showing that 
C simulation is performed successfully. 

Discussion
----------

Instead of generating a separate systolic array for each of the three operators, the ideal case would be 
to use one systolic array that supports all three operators at the same time.
One of the solutions is to fuse the generated designs from AutoSA manually with proper 
code optimization.
The other solution would be fusing the space loops during the polyhedral compilation, which is left 
as future work of AutoSA.

================================================
FILE: docs/examples/index.rst
================================================
AutoSA Examples
===============

This page covers a list of design examples to get you familiar with the AutoSA 
compilation process. Examples are divided into two categories: 

* Small Designs: These designs are limited in the problem size so that you could 
  easily verify and synthesize the design within hours.
* Large Designs: These designs are used for demonstrating the performance of AutoSA-generated
  designs, and the verification and synthesis flow may take several days.

Small Designs
-------------

.. toctree::
    :maxdepth: 1

    mm
    cnn
    lu
    mm_int16
    mm_hbm
    dnn_ops

Large Designs 
-------------

.. toctree::
    :maxdepth: 1

    mm_large
    cnn_large
    mm_int16_large
    mm_int8_large
    mttkrp_large
    ttmc_large

================================================
FILE: docs/examples/lu.rst
================================================
LU Decomposition (Small)
========================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

This is an example of small-size LU decomposition. 
The design files can be found at ``${AUTOSA_ROOT}/autosa_tests/lu``.
The testing environment is summarized in the table below.

+--------------------------+-----------------------------------------------+
| **Target FPGA**          | Xilinx Alveo U250                             |
+--------------------------+-----------------------------------------------+
| **FPGA Synthesis Tools** | Xilinx Vivado HLS 2019.2, Xilinx Vitis 2019.2 |
+--------------------------+-----------------------------------------------+
| **CPU**                  | Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz     |
+--------------------------+-----------------------------------------------+

C Simulation
------------

Run the following example command to generate one design with HLS host code.

.. code:: bash

    ./autosa ./autosa_tests/lu/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[-1,-1,-1];kernel[]->latency[]}" \
    --simd-info=./autosa_tests/lu/simd_info.json \
    --use-cplusplus-template \
    --no-reschedule \
    --hls

.. note:: 

    Compared to other examples, for LU decomposition, we add some additional arguments.
    ``--use-cplusplus-template``: This argument enables AutoSA to generate C code using 
    C++ template as different PEs will have different functionalities in this array.
    ``--no-reschedule``: This is due to a limitation of the current ISL scheduler, which 
    would generate a new program without any permutable loops, prohibiting the transformation
    to systolic arrays. Therefore, we disable the ISL auto-scheduling in this application.

    Besides, the input source code has been modified to make sure all dependences are uniform.
    AutoSA lacks the ability to automatically uniformize the program and requires human
    modification for such cases.

After compilation, you will find all generated files under the directory
``${AUTOSA_ROOT}/autosa.tmp/output/src``. 
Copy the ``hls_script.tcl`` to the directory ``autosa.tmp/output``.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/lu/hls_script.tcl ${AUTOSA_ROOT}/autosa.tmp/output/

Run the TCL script to perform C simulation.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output/
    vivado_hls -f hls_script.tcl

You should see ``Passed`` printed out in your terminal showing that 
C simulation is performed successfully.

Bitstream Generation
--------------------

If you need to generate the bitstream for on-board testing, simply remove the ``--hls``
flag from the previous AutoSA command.

.. code:: bash

    ./autosa ./autosa_tests/lu/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[-1,-1,-1];kernel[]->latency[]}" \
    --simd-info=./autosa_tests/lu/simd_info.json \
    --use-cplusplus-template \
    --no-reschedule

Now instead of HLS host code, an OpenCL host code is generated.  

Please refer to other examples for the instructions on using Xilinx Vitis for generating the bitstream.

================================================
FILE: docs/examples/mm.rst
================================================
Matrix Multiplication (Small)
=============================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

This is an example of small-size matrix multiplication. 
The design files can be found at ``${AUTOSA_ROOT}/autosa_tests/mm``.
The testing environment is summarized in the table below.

+--------------------------+-----------------------------------------------+
| **Target FPGA**          | Xilinx Alveo U250                             |
+--------------------------+-----------------------------------------------+
| **FPGA Synthesis Tools** | Xilinx Vivado HLS 2019.2, Xilinx Vitis 2019.2 |
+--------------------------+-----------------------------------------------+
| **CPU**                  | Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz     |
+--------------------------+-----------------------------------------------+

C Simulation
------------

Run the following example command to generate one design with HLS host code.

.. code:: bash

    ./autosa ./autosa_tests/mm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" \
    --simd-info=./autosa_tests/mm/simd_info.json \
    --host-serialize \
    --hls

After compilation, you will find all generated files under the directory 
``${AUTOSA_ROOT}/autosa.tmp/output/src``. 
Copy the ``hls_script.tcl`` to the directory ``autosa.tmp/output``.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/mm/hls_script.tcl ${AUTOSA_ROOT}/autosa.tmp/output/

Run the TCL script to perform C simulation.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output/
    vivado_hls -f hls_script.tcl

You should see ``Passed`` printed out in your terminal showing that 
C simulation is performed successfully.

RTL Simulation
--------------

If you need to verify the design using RTL simulation,
there are two more steps to complete.

Modify the Kernel Code
^^^^^^^^^^^^^^^^^^^^^^

Open the kernel code ``${AUTOSA_ROOT}/autosa.tmp/output/src/kernel_kernel.cpp``.
Locate the top function ``void kernel0(A_t16 *A, B_t16 *B, C_t16 *C)``.
You should see the following directives for mapping three global pointers to 
different AXI buses.

.. code:: c

    #pragma HLS INTERFACE m_axi port=A offset=slave bundle=gmem_A
    #pragma HLS INTERFACE m_axi port=B offset=slave bundle=gmem_B
    #pragma HLS INTERFACE m_axi port=C offset=slave bundle=gmem_C

To run RTL simulation, we will need to assign the *depth* of each AXI bus explicitly.
Refer to the host code ``kernel_host.cpp`` for the size of each array.
As we have applied host serialization, the array size might be slightly larger than 
the original array. In this example, the array A, B, C are allocated with sizes of 
16384, 16384, and 4096. Since each array is packed by 16 elements,
the depths of each array are 16384/16=1024, 16384/16=1024, 4096/16=256, respectively.
Modify the directives above to:

.. code:: c

    #pragma HLS INTERFACE m_axi port=A offset=slave bundle=gmem_A depth=1024
    #pragma HLS INTERFACE m_axi port=B offset=slave bundle=gmem_B depth=1024
    #pragma HLS INTERFACE m_axi port=C offset=slave bundle=gmem_C depth=256

Modify the TCL script
^^^^^^^^^^^^^^^^^^^^^

Open the TCL script ``hls_script.tcl``.
Uncomment the last few steps:

.. code:: tcl

    csim_design
    csynth_design
    cosim_design

* ``csim_design`` is for C simulation.
* ``csynth_design`` is for C synthesis that synthesizes C code to RTL.
* ``cosim_design`` is for RTL simulation.

We have also provided two more options in the TCL script.

* ``cosim_design -trace_level all`` is for RTL simulation while dumping out all waveforms.
* ``cosim_design -setup -trace_level all`` is for RTL simulation that only prepares the 
  simulation scripts without actually launching the simulation.

Now run the TCL script again.

.. code:: bash

    vivado_hls -f hls_script.tcl

We will perform C simulation, C synthesis, RTL simulation in order.
It will take a few minutes to finish the entire flow.
You should be able to see the following information printed in your terminal showing 
that RTL simulation finishes successfully.

.. code:: bash

    INFO: [COSIM 212-1000] *** C/RTL co-simulation finished: PASS ***

Bitstream Generation
--------------------

If you need to generate the bitstream for on-board testing, simply remove the ``--hls``
flag from the previous AutoSA command.

.. code:: bash

    ./autosa ./autosa_tests/mm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" \
    --simd-info=./autosa_tests/mm/simd_info.json \
    --host-serialize

Now instead of HLS host code, an OpenCL host code is generated.

We have prepared a template Makefile for Xilinx Vitis tools.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/mm/Makefile ${AUTOSA_ROOT}/autosa.tmp/output/
    cp ${AUTOSA_ROOT}/autosa_tests/mm/connectivity.cfg ${AUTOSA_ROOT}/autosa.tmp/output/

Set the proper ``PLATFORM`` in the Makefile. 
By default, we set it to ``xilinx_u250_xdma_201830_2``.
You may notice that we also copy a file ``connectivity.cfg`` here.
This file assigns the DDR bank mapping for the design. 
By default, we map pointers A, B, C to DDR bank 0, 1, 2.
Lastly, modify the ``MODE`` in the Makefile for performing different tasks.

* ``sw_emu``: C simulation
* ``hw_emu``: RTL simulation
* ``hw``: Bitstream generation

.. note:: 

    When using Vitis flow to perform RTL simulation, nothing needs to change in the source code.
    You may directly set the ``MODE`` to ``hw_emu`` and perform RTL simulation.
    However, by default, we will run the kernel 10 times to collect the average runtime.
    This may significantly prolong the simulation time. Consider reducing the kernel
    launching times to 1 before using RTL simulation.

To generate the bitstream, set the ``MODE`` to ``hw`` and use the command below.

.. code:: bash

    make all

It will take a few hours to finish. After the bitstream is generated,
use the following command to run it on-board.    

.. code:: bash

    make check

Auto-Tuning
-----------

We have provided an auto-tuner in the alpha version. 
The auto-tuner builds analytical models for resource usage and latency. 
Based on these models, the auto-tuner looks for designs with the least latency 
under the resource constraints.

Training Resource Models
^^^^^^^^^^^^^^^^^^^^^^^^

To use the auto-tuner, the first step is to train the resource models.
Run the command below to train the resource models.

.. code:: bash

    export AUTOSA_ROOT=$(pwd)
    python3 ./autosa_scripts/optimizer.py \
    -c './autosa ./autosa_tests/mm/kernel.c --target=autosa_hls_c --simd-info=./autosa_tests/mm/simd_info.json --host-serialize --hls --sa-sizes="{kernel[]->space_time[3]}"' \
    --info autosa_config/hw_info.json \
    -s autosa_config/optimizer_settings.json \
    --train \
    -p xilinx

.. note:: 

    Please don't forget to set up the environment variable ``AUTOSA_ROOT`` to your 
    AutoSA root directory before running the auto-tuner.

The auto-tuner requires a minimal AutoSA compilation command to start.
We use the command below.

.. code:: bash

    ./autosa ./autosa_tests/mm/kernel.c --target=autosa_hls_c --simd-info=./autosa_tests/mm/simd_info.json --host-serialize --hls --sa-sizes="{kernel[]->space_time[3]}"

As you may notice, we will need to assign the ``space_time`` to select the exact 
dataflow for auto-tuning. This is due to the reason that compiling different dataflows 
requires some additional flags as we will discuss in the next section.
As for now, we use the output-stationary 2D array with the argument ``--sa-sizes="{kernel[]->space_time[3]}"``.

``hw_info.json`` specifies the hardware resource constraints of the target FPGA board.
``optimizer_settings.json`` is the auto-tuner configuration file. 
More details about these options are covered in :ref:`auto-tuning-label`.

As the training phase will allocate many temporary files, you may consider 
adding the flag ``--tmp-dir`` to store the intermediate files in some other directories.

Once you launch the auto-tuner in the training phase, the auto-tuner will randomly
sample the design space and collect a few training samples. These training samples 
will be synthesized using HLS. We will then build resource models using linear regression
with these training samples.

This script will launch multiple processes to synthesize HLS designs. 
By default, we use 16 processes.
The training process takes around 10 minutes to finish on our workstation.

We also evaluate the resource models on the test sets. 
You will see the resource model accuracy results like below printed on your terminal once this step is finished.

.. image:: images/resource_model.png
    :align: center

Design Space Exploration
^^^^^^^^^^^^^^^^^^^^^^^^

In the next step, we will perform an exhaustive search with pruning to find the design 
with the least latency given the resource constraints. 
We will improve the DSE with more efficient methods in the future.

The pruning strategies are set in ``optimizer_settings.json``. 
Details about this file are covered in :ref:`auto-tuning-label`.
Depending on the hardware and application, the pruning strategies might be changed.
We provide an example file for this application in ``${AUTOSA_ROOT}/autosa_config/optimizer_settings_libs/mm_small.json``.

Now use the following command to perform DSE.

.. code:: bash

    python3 ./autosa_scripts/optimizer.py \
    -c './autosa ./autosa_tests/mm/kernel.c --target=autosa_hls_c --simd-info=./autosa_tests/mm/simd_info.json --host-serialize --hls --sa-sizes="{kernel[]->space_time[3]}"' \
    --info autosa_config/hw_info.json \
    -s autosa_config/optimizer_settings_libs/mm_small.json \
    --search \
    -p xilinx

This script will launch multiple processes to search the design space.
By default, we use 32 processes.
The searching process takes around 3 minutes on our workstation.

You should see the detailed information about the best design printed out in your terminal like below.

.. image:: images/mm_dse.png
    :align: center

The auto-tuner will dump out the best design found during the DSE in the file 
``DSE.log``. By default, we will record the top-10 designs found by DSE.

Dataflow Exploration
--------------------

AutoSA can help you explore different dataflow choices.
As for matrix multiplication, AutoSA finds six different systolic arrays in total.
They use the loops [i], [j], [k], [i,j], [i,k], [j,k] as space loops, respectively.
We show each of them in detail below.

Array 1: [i]
^^^^^^^^^^^^

This is a 1D systolic array using the loop i as the space loop.
The figure below shows the architecture of this array.

.. image:: images/gemm0_array.png
    :width: 400
    :align: center

This is an output-stationary array. Elements of matrix C are computed locally inside 
each PE. Data of matrix B are reused across PEs. Data of matrix A are sent 
directly into each PE.

Here is an example command to compile such a design.
Note that we use ``kernel[]->space_time[0]`` to select the first design.

.. code:: bash

    ./autosa ./autosa_tests/mm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[0];kernel[]->array_part[32,32,32];kernel[]->latency[8,8];kernel[]->simd[2]}" \
    --simd-info=./autosa_tests/mm/simd_info.json \
    --host-serialize \
    --hls

This command leads to a 1x4 1D systolic array.    

Array 2: [j]
^^^^^^^^^^^^

As you may expect, this is also an output-stationary array with loop j as the space loop.
This array is symmetric to the first array. 
The figure below shows the detailed architecture.

.. image:: images/gemm1_array.png
    :width: 400
    :align: center

Elements of matrix C are computed locally inside each PE. Data of matrix A are reused 
across PEs. Data of matrix B are sent directly to each PE.

Here is an example command to compile such a design.
Note that we use ``kernel[]->space_time[1]`` to select the second design.

.. code:: bash

    ./autosa ./autosa_tests/mm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[1];kernel[]->array_part[32,32,32];kernel[]->latency[8,8];kernel[]->simd[2]}" \
    --simd-info=./autosa_tests/mm/simd_info.json \
    --host-serialize \
    --hls

This command leads to a 1x4 1D systolic array.    

Array 3: [k]
^^^^^^^^^^^^

This array uses loop k as the space loop.
The figure below depicts the array architecture.

.. image:: images/gemm2_array.png
    :width: 400
    :align: center

This is an input-stationary array. Elements of matrix C are accumulated along 
the PEs. Data of matrix A and B need to be sent to PEs directly.

Use the command below to generate such a design.
We use ``kernel[]->space_time[2]`` to select the third design.
In addition, as AutoSA has no analysis power for reduction loops, we will 
also need to provide additional information about the reduction property. 
Note that we add the arguments ``--local-reduce --reduce-op="+"`` to let AutoSA know that 
this design performs the reduction along PEs, and the reduction operator is ``+``.

By default, when searching for SIMD loops, AutoSA only considers the time loops.
As the loop k is used as the space loop, we add the flag ``--simd-touch-space`` to 
take space loops into consideration in the command below.

.. code:: bash

    ./autosa ./autosa_tests/mm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[2];kernel[]->array_part[4,32,32];kernel[]->latency[8,8];kernel[]->simd[2]}" \
    --simd-info=./autosa_tests/mm/simd_info.json \
    --host-serialize \
    --hls \
    --local-reduce \
    --reduce-op="+" \
    --simd-touch-space \
    --array-contraction

This leads to a 1x2 1D array.

One more thing to notice here is that inside each PE, AutoSA only allocates a single register 
``local_C[1][1]`` for storing the local elements of array C. 
This is based on the fact that all time loops are parallel loops, which means that 
the PE never works on the same element again. 
As we add the flag ``--array-contraction``, AutoSA will successfully apply the array 
contraction to reduce the local buffer size.
You may turn off this optimization by removing the argument ``--array-contraction``.
When array contraction is turned off, a local buffer ``local_C[32][32]``
is allocated inside each PE.

Array 4: [i,j]
^^^^^^^^^^^^^^

This is the 2D output-stationary array as used previously. 
The figure below shows the detailed architecture.

.. image:: images/gemm3_array.png
    :width: 400
    :align: center

In this array, data of matrix C are computed locally inside PEs.
Data of matrix A are reused horizontally.
Data of matrix B are reused vertically.

Below is an example command to compile such a design.
Note that we use ``kernel[]->space_time[3]`` to select the fourth design.

.. code:: bash

    ./autosa ./autosa_tests/mm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" \
    --simd-info=./autosa_tests/mm/simd_info.json \
    --host-serialize \
    --hls

This command leads to a 2x2 2D systolic array.   

Array 5: [i,k]
^^^^^^^^^^^^^^

This array uses loops i and k as the space loops.
The figure below depicts the array architecture.

.. image:: images/gemm4_array.png
    :width: 400
    :align: center

In this array, data of matrix C are reduced horizontally. 
Data of matrix B are reused vertically. Data of matrix A are sent directly into 
each PE.

Use the command below to generate one example array.
Note that we use ``kernel[]->space_time[4]`` to select the fifth design.

.. code:: bash

    ./autosa ./autosa_tests/mm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[4];kernel[]->array_part[32,4,32];kernel[]->latency[16,16];kernel[]->simd[2]}" \
    --simd-info=./autosa_tests/mm/simd_info.json \
    --host-serialize \
    --hls \
    --local-reduce \
    --reduce-op="+" \
    --simd-touch-space \
    --array-contraction

This command leads to a 2x2 2D array.
Similar to array 3, we add additional information about reduction properties of the application
to the compiler. To let AutoSA explore the space loop as SIMD loop, we also add the flag 
``--simd-touch-space``. And we add ``--array-contraction`` to reduce the local buffer size.

Array 6: [j,k]
^^^^^^^^^^^^^^

This array uses loops j and k as the space loops.
The figure below depicts the array architecture.
This architecture is symmetric to array 5.

.. image:: images/gemm5_array.png
    :width: 400
    :align: center

In this array, data of matrix C are reduced horizontally.
Data of matrix A are reused vertically. Data of matrix B are sent directly into 
each PE.

Use the command below to generate one example array.
Note that we use ``kernel[]->space_time[5]`` to select the sixth design.

.. code:: bash

    ./autosa ./autosa_tests/mm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[5];kernel[]->array_part[32,4,32];kernel[]->latency[16,16];kernel[]->simd[2]}" \
    --simd-info=./autosa_tests/mm/simd_info.json \
    --host-serialize \
    --hls \
    --local-reduce \
    --reduce-op="+" \
    --simd-touch-space \
    --array-contraction

This command leads to a 2x2 2D array.

================================================
FILE: docs/examples/mm_block_sparse.rst
================================================


================================================
FILE: docs/examples/mm_hbm.rst
================================================
Matrix Multiplication with HBM (Small) 
======================================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

This is an example of a small-size matrix multiplication using high-bandwidth memory (HBM).
The design files can be found at ``${AUTOSA_ROOT}/autosa_tests/mm_hbm``.
The testing environment is summarized in the table below.

+--------------------------+-----------------------------------------------+
| **Target FPGA**          | Xilinx Alveo U280                             |
+--------------------------+-----------------------------------------------+
| **FPGA Synthesis Tools** | Xilinx Vivado HLS 2019.2, Xilinx Vitis 2019.2 |
+--------------------------+-----------------------------------------------+
| **CPU**                  | Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz     |
+--------------------------+-----------------------------------------------+

C Simulation
------------

Run the following example command to generate one design with HLS host code.

.. code:: bash

    ./autosa ./autosa_tests/mm_hbm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[32,32,32];kernel[]->latency[8,8];kernel[]->simd[2];kernel[]->hbm_A[2];kernel[]->hbm_B[2];kernel[]->hbm_C_drain[2]}" \
    --simd-info=./autosa_tests/mm_hbm/simd_info.json \
    --hbm \
    --hls

.. note::

    Host serialization is not supported for HBM designs.

After compilation, you will find all generated files under the directory 
``${AUTOSA_ROOT}/autosa.tmp/output/src``. 
Copy the ``hls_script.tcl`` to the directory ``autosa.tmp/output``.    

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/mm_hbm/hls_script.tcl ${AUTOSA_ROOT}/autosa.tmp/output/

Run the TCL script to perform C simulation.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output/
    vivado_hls -f hls_script.tcl

You should see ``Passed`` printed out in your terminal showing that 
C simulation is performed successfully.

To utilize the HBM, currently, we simply partition the I/O modules for each I/O group 
and assign them to multiple HBM ports. As you may notice, we add the argument
``--sa-sizes="{kernel[]->hbm_A[2];kernel[]->hbm_B[2];kernel[]->hbm_C_drain[2]}"``
to assign the I/O group ``A``, ``B``, ``C_drain`` to 2 HBM ports each.

The figure below shows the array architectures using one or two HBM/DDR banks. 

.. image:: images/array_hbm.png
    :align: center

Note that you must use the I/O group name when assigning the HBM ports.
During the compilation, AutoSA will print all the I/O groups in the array.
For more information about I/O groups, please refer to :ref:`construct-and-optimize-array-label`.    

Bitstream Generation
--------------------

If you need to generate the bitstream for on-board testing, simply remove the ``--hls``
flag from the previous AutoSA command.

.. code:: bash

    ./autosa ./autosa_tests/mm_hbm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[32,32,32];kernel[]->latency[8,8];kernel[]->simd[2];kernel[]->hbm_A[2];kernel[]->hbm_B[2];kernel[]->hbm_C_drain[2]}" \
    --simd-info=./autosa_tests/mm_hbm/simd_info.json \
    --hbm

We have prepared a template Makefile for Xilinx Vitis tools.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/mm_hbm/Makefile ${AUTOSA_ROOT}/autosa.tmp/output/
    cp ${AUTOSA_ROOT}/autosa_tests/mm_hbm/connectivity.cfg ${AUTOSA_ROOT}/autosa.tmp/output/

Set the proper ``PLATFORM`` in the Makefile. 
By default, we set it to ``xilinx_u280_xdma_201920_3``.
You may notice that we also copy a file ``connectivity.cfg`` here.
This file assigns the HBM bank mapping for the design. 
As we partition the array A, B, C to 2 HBM banks each,
we assign the newly generated pointers A_0, A_1, B_0, B_1, C_0, C_1 to 
HBM bank 0, 1, 2, 3, 4, 5.
Lastly, modify the ``MODE`` in the Makefile for performing different tasks.

* ``sw_emu``: C simulation
* ``hw_emu``: RTL simulation
* ``hw``: Bitstream generation

To generate the bitstream, set the ``MODE`` to ``hw`` and use the command below.

.. code:: bash

    make all

It will take a few hours to finish. After the bitstream is generated,
use the following command to run it on-board.    

.. code:: bash

    make check

================================================
FILE: docs/examples/mm_int16.rst
================================================
Matrix Multiplication in int16 (Small) 
======================================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

This is an example of a small-size matrix multiplication in int16.
The design files can be found at ``${AUTOSA_ROOT}/autosa_tests/mm_int16``.
The testing environment is summarized in the table below.

+--------------------------+-----------------------------------------------+
| **Target FPGA**          | Xilinx Alveo U250                             |
+--------------------------+-----------------------------------------------+
| **FPGA Synthesis Tools** | Xilinx Vivado HLS 2019.2, Xilinx Vitis 2019.2 |
+--------------------------+-----------------------------------------------+
| **CPU**                  | Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz     |
+--------------------------+-----------------------------------------------+

C Simulation
------------

Run the following example command to generate one design with HLS host code.

.. code:: bash

    ./autosa ./autosa_tests/mm_int16/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" \
    --simd-info=./autosa_tests/mm_int16/simd_info.json \
    --host-serialize \
    --hls

After compilation, you will find all generated files under the directory 
``${AUTOSA_ROOT}/autosa.tmp/output/src``. 
Copy the ``hls_script.tcl`` to the directory ``autosa.tmp/output``.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/mm_int16/hls_script.tcl ${AUTOSA_ROOT}/autosa.tmp/output/

Run the TCL script to perform C simulation.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output/
    vivado_hls -f hls_script.tcl

You should see ``Passed`` printed out in your terminal showing that 
C simulation is performed successfully.  

Bitstream Generation
--------------------

If you need to generate the bitstream for on-board testing, simply remove the ``--hls``
flag from the previous AutoSA command.

.. code:: bash

    ./autosa ./autosa_tests/mm_int16/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" \
    --simd-info=./autosa_tests/mm_int16/simd_info.json \
    --host-serialize

Now instead of HLS host code, an OpenCL host code is generated.  

Please refer to other examples for the instructions on using Xilinx Vitis for generating the bitstream.

================================================
FILE: docs/examples/mm_int16_large.rst
================================================
Matrix Multiplication in int16 (Large)
======================================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

This is an example of large-size matrix multiplication in int16.
The design files can be found at ``${AUTOSA_ROOT}/autosa_tests/large/mm_int16``.
The testing environment is summarized in the table below.

+--------------------------+-----------------------------------------------+
| **Target FPGA**          | Xilinx Alveo U250                             |
+--------------------------+-----------------------------------------------+
| **FPGA Synthesis Tools** | Xilinx Vivado HLS 2019.2, Xilinx Vitis 2019.2 |
+--------------------------+-----------------------------------------------+
| **CPU**                  | Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz     |
+--------------------------+-----------------------------------------------+

C Simulation
------------

Run the following example command to generate one design with HLS host code.

.. code:: bash

    ./autosa ./autosa_tests/large/mm_int16/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[256,256,32];kernel[]->latency[16,16];kernel[]->simd[32]}" \
    --simd-info=./autosa_tests/large/mm_int16/simd_info.json \
    --host-serialize \
    --data-pack-sizes="{kernel[]->A[32,32,64];kernel[]->B[32,32,64];kernel[]->C[32,32,64]}" \
    --hls

After compilation, you will find all generated files under the directory 
``${AUTOSA_ROOT}/autosa.tmp/output/src``. 
Copy the ``hls_script.tcl`` to the directory ``autosa.tmp/output``.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/large/mm_int16/hls_script.tcl ${AUTOSA_ROOT}/autosa.tmp/output/

Run the TCL script to perform C simulation.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output/
    vivado_hls -f hls_script.tcl

You should see ``Passed`` printed out in your terminal showing that 
C simulation is performed successfully.   

Bitstream Generation
--------------------

If you need to generate the bitstream for on-board testing, simply remove the ``--hls``
flag from the previous AutoSA command.

.. code:: bash

    ./autosa ./autosa_tests/large/mm_int16/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[256,256,32];kernel[]->latency[16,16];kernel[]->simd[32]}" \
    --simd-info=./autosa_tests/large/mm_int16/simd_info.json \
    --host-serialize \
    --data-pack-sizes="{kernel[]->A[32,32,64];kernel[]->B[32,32,64];kernel[]->C[32,32,64]}"

Now instead of HLS host code, an OpenCL host code is generated.   

We have prepared a template Makefile for Xilinx Vitis tools.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/large/mm_int16/Makefile ${AUTOSA_ROOT}/autosa.tmp/output/
    cp ${AUTOSA_ROOT}/autosa_tests/large/mm_int16/connectivity.cfg ${AUTOSA_ROOT}/autosa.tmp/output/

Set the proper ``PLATFORM`` in the Makefile. 
By default, we set it to ``xilinx_u250_xdma_201830_2``.
You may notice that we also copy a file ``connectivity.cfg`` here.
This file assigns the DDR bank mapping for the design. 
By default, we map pointers A, B, C to DDR bank 0, 1, 3.
Lastly, modify the ``MODE`` in the Makefile for performing different tasks.

* ``sw_emu``: C simulation
* ``hw_emu``: RTL simulation
* ``hw``: Bitstream generation

.. note:: 

    When using Vitis flow to perform RTL simulation, nothing needs to change in the source code.
    You may directly set the ``MODE`` to ``hw_emu`` and perform RTL simulation.
    However, by default, we will run the kernel 10 times to collect the average runtime.
    This may significantly prolong the simulation time. Consider reducing the number of
    kernel launches to 1 before using RTL simulation.

To generate the bitstream, set the ``MODE`` to ``hw`` and use the command below.

.. code:: bash

    make all

After the bitstream is generated,
use the following command to run it on-board.    

.. code:: bash

    make check

.. note::
    
    As this design is rather large, Vitis failed to route it
    in our experiments.
    We rely on AutoBridge to route this design. 

Using AutoBridge to Boost Frequency
-----------------------------------

You may also try to use `AutoBridge <https://github.com/Licheng-Guo/AutoBridge>`_ 
to boost the design frequency.
We cover how to use AutoBridge to improve the frequency in :ref:`use-autobridge-label`.

The tables below show the detailed comparison results between the original design 
(unoptimized) and the design optimized with AutoBridge (optimized).

+-------------+-----+-----------------+------------------+--------------+---------------+
| Designs     | MHz | LUT             | REG              | BRAM         | DSP           |
+-------------+-----+-----------------+------------------+--------------+---------------+
| Unoptimized | N/A | N/A             | N/A              | N/A          | N/A           |
+-------------+-----+-----------------+------------------+--------------+---------------+
| Optimized   | 261 | 607442 (39.78%) | 836031 (26.53%)  | 1655 (70.85%)| 8192 (66.75%) |
+-------------+-----+-----------------+------------------+--------------+---------------+

+-------------+-----------------+---------------+---------+
| Designs     | Kernel Time (s) | Host Time (s) | TOPs    |
+-------------+-----------------+---------------+---------+
| Unoptimized | N/A             | N/A           | N/A     |
+-------------+-----------------+---------------+---------+
| Optimized   | 0.000625233     | 0.0095829     | 3.435   |
+-------------+-----------------+---------------+---------+

================================================
FILE: docs/examples/mm_int8_large.rst
================================================
Matrix Multiplication in int8 (Large)
=====================================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

This is an example of large-size matrix multiplication in int8.
The design files can be found at ``${AUTOSA_ROOT}/autosa_tests/large/mm_int8``.
The testing environment is summarized in the table below.

+--------------------------+-----------------------------------------------+
| **Target FPGA**          | Xilinx Alveo U250                             |
+--------------------------+-----------------------------------------------+
| **FPGA Synthesis Tools** | Xilinx Vivado HLS 2019.2, Xilinx Vitis 2019.2 |
+--------------------------+-----------------------------------------------+
| **CPU**                  | Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz     |
+--------------------------+-----------------------------------------------+

C Simulation
------------

Run the following example command to generate one design with HLS host code.

.. code:: bash

    ./autosa ./autosa_tests/large/mm_int8/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[264,256,64];kernel[]->latency[11,32];kernel[]->simd[64]}" \
    --simd-info=./autosa_tests/large/mm_int8/simd_info.json \
    --host-serialize \
    --data-pack-sizes="{kernel[]->A[32,32,64];kernel[]->B[32,32,64];kernel[]->C[32,32,64]}" \
    --no-isl-sink \
    --hls

After compilation, you will find all generated files under the directory 
``${AUTOSA_ROOT}/autosa.tmp/output/src``. 
Copy the ``hls_script.tcl`` to the directory ``autosa.tmp/output``.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/large/mm_int8/hls_script.tcl ${AUTOSA_ROOT}/autosa.tmp/output/

Run the TCL script to perform C simulation.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output/
    vivado_hls -f hls_script.tcl

You should see ``Passed`` printed out in your terminal showing that 
C simulation is performed successfully.   

Bitstream Generation
--------------------

If you need to generate the bitstream for on-board testing, simply remove the ``--hls``
flag from the previous AutoSA command.

.. code:: bash

    ./autosa ./autosa_tests/large/mm_int8/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[264,256,64];kernel[]->latency[11,32];kernel[]->simd[64]}" \
    --simd-info=./autosa_tests/large/mm_int8/simd_info.json \
    --host-serialize \
    --data-pack-sizes="{kernel[]->A[32,32,64];kernel[]->B[32,32,64];kernel[]->C[32,32,64]}" \
    --no-isl-sink

Now instead of HLS host code, an OpenCL host code is generated.   

As for int8, we notice that the default coding style for reduction trees in Xilinx HLS C 
will lead to inferior performance.
The default coding style is as below:

.. code:: c

    for (ap_uint<7> c8 = 0; c8 <= 63; c8 += 1) {
    #pragma HLS UNROLL
      local_C[c7][c6] = (local_C[c7][c6] + (local_A[0][c8] * local_B[0][c8]));
    }

If we synthesize the default PE using Vitis, each MAC is mapped to one DSP and we get 64 DSPs for this 
reduction tree. 

Alternatively, if we manually unroll the reduction tree, using the following coding style,
only 32 DSPs are generated.

.. code:: c

    data_t mul_5_0_0 = local_A[0][0] * local_B[0][0];
    data_t add_5_0 = mul_5_0_0 + local_A[0][1] * local_B[0][1];
    data_t mul_5_1_0 = local_A[0][2] * local_B[0][2];
    data_t add_5_1 = mul_5_1_0 + local_A[0][3] * local_B[0][3];
    ...
    #pragma HLS RESOURCE variable=mul_5_0_0 core=Mul_LUT
    #pragma HLS RESOURCE variable=mul_5_1_0 core=Mul_LUT
    ...
    local_C[c7][c6] += add_0_0;

As you may notice, we map half the multipliers to LUTs instead. 
This helps to balance the resource usage of this design and enables us to place more 
PEs on-chip.

This part can't be done automatically at present. Instead, we provide a simple Python script 
to generate this code, and the user will have to replace the code manually in the design code.

As an example, find the script at ``${AUTOSA_ROOT}/autosa_tests/large/mm_int8/unroll.py``.
Modify the parameters ``UNROLL_FACTOR`` and ``DATA_T`` according to your current design.
Then, run:

.. code:: bash

    python3 unroll.py | tee code.c

Now copy the code in ``code.c`` to replace the original reduction loop in ``kernel_kernel.c``.
We have also provided an example file at ``${AUTOSA_ROOT}/autosa_tests/large/mm_int8/kernel_kernel_opt.cpp``.

Now you may follow the normal flow to compile the design.
We have prepared a template Makefile for Xilinx Vitis tools.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/large/mm_int8/Makefile ${AUTOSA_ROOT}/autosa.tmp/output/
    cp ${AUTOSA_ROOT}/autosa_tests/large/mm_int8/connectivity.cfg ${AUTOSA_ROOT}/autosa.tmp/output/

Set the proper ``PLATFORM`` in the Makefile. 
By default, we set it to ``xilinx_u250_xdma_201830_2``.
You may notice that we also copy a file ``connectivity.cfg`` here.
This file assigns the DDR bank mapping for the design. 
By default, we map pointers A, B, C to DDR bank 0, 1, 3.
Lastly, modify the ``MODE`` in the Makefile for performing different tasks.

* ``sw_emu``: C simulation
* ``hw_emu``: RTL simulation
* ``hw``: Bitstream generation

.. note:: 

    When using Vitis flow to perform RTL simulation, nothing needs to change in the source code.
    You may directly set the ``MODE`` to ``hw_emu`` and perform RTL simulation.
    However, by default, we will run the kernel 10 times to collect the average runtime.
    This may significantly prolong the simulation time. Consider reducing the number of
    kernel launches to 1 before using RTL simulation.

To generate the bitstream, set the ``MODE`` to ``hw`` and use the command below.

.. code:: bash

    make all

After the bitstream is generated,
use the following command to run it on-board.    

.. code:: bash

    make check

Below is the resource and frequency information we collected for this design.

+-----+-----------------+------------------+--------------+---------------+
| MHz | LUT             | REG              | BRAM         | DSP           |
+-----+-----------------+------------------+--------------+---------------+
| 136 | 653369 (42.80%) | 704056 (22.34%)  | 1364 (58.39%)| 6144 (50.05%) |
+-----+-----------------+------------------+--------------+---------------+

You could also test the generated design on board. We have listed the performance of the design 
in the table below.

+-----------------+---------------+---------+
| Kernel Time (s) | Host Time (s) | TOPs    |
+-----------------+---------------+---------+
| 0.000759123     | 0.0103696     | 2.917   |
+-----------------+---------------+---------+   

Using AutoBridge to Boost Frequency
-----------------------------------

You may also try to use `AutoBridge <https://github.com/Licheng-Guo/AutoBridge>`_ 
to boost the design frequency.
We cover how to use AutoBridge to improve the frequency in :ref:`use-autobridge-label`.

The tables below show the detailed comparison results between the original design 
(unoptimized) and the design optimized with AutoBridge (optimized).

+-------------+-----+-----------------+------------------+--------------+---------------+
| Designs     | MHz | LUT             | REG              | BRAM         | DSP           |
+-------------+-----+-----------------+------------------+--------------+---------------+
| Unoptimized | 136 | 653369 (42.80%) | 704056 (22.34%)  | 1364 (58.39%)| 6144 (50.05%) |
+-------------+-----+-----------------+------------------+--------------+---------------+
| Optimized   | 300 | 730647 (47.87%) | 786680 (24.96%)  | 1364 (58.39%)| 6144 (50.05%) |
+-------------+-----+-----------------+------------------+--------------+---------------+

+-------------+-----------------+---------------+---------+
| Designs     | Kernel Time (s) | Host Time (s) | TOPs    |
+-------------+-----------------+---------------+---------+
| Unoptimized | 0.000759123     | 0.0103696     | 2.917   |
+-------------+-----------------+---------------+---------+
| Optimized   | 0.000302619     | 0.00532768    | 7.318   |
+-------------+-----------------+---------------+---------+

================================================
FILE: docs/examples/mm_large.rst
================================================
Matrix Multiplication (Large)
=============================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

This is an example of large-size matrix multiplication.
The design files can be found at ``${AUTOSA_ROOT}/autosa_tests/large/mm``.
The testing environment is summarized in the table below.

+--------------------------+-----------------------------------------------+
| **Target FPGA**          | Xilinx Alveo U250                             |
+--------------------------+-----------------------------------------------+
| **FPGA Synthesis Tools** | Xilinx Vivado HLS 2019.2, Xilinx Vitis 2019.2 |
+--------------------------+-----------------------------------------------+
| **CPU**                  | Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz     |
+--------------------------+-----------------------------------------------+

C Simulation
------------

Run the following example command to generate one design with HLS host code.

.. code:: bash

    ./autosa ./autosa_tests/large/mm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[260,256,512];kernel[]->latency[20,16];kernel[]->simd[8]}" \
    --simd-info=./autosa_tests/large/mm/simd_info.json \
    --host-serialize \
    --hls

After compilation, you will find all generated files under the directory 
``${AUTOSA_ROOT}/autosa.tmp/output/src``. 
Copy the ``hls_script.tcl`` to the directory ``autosa.tmp/output``.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/large/mm/hls_script.tcl ${AUTOSA_ROOT}/autosa.tmp/output/

Run the TCL script to perform C simulation.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output/
    vivado_hls -f hls_script.tcl

You should see ``Passed`` printed out in your terminal showing that 
C simulation is performed successfully.   

Bitstream Generation
--------------------

If you need to generate the bitstream for on-board testing, simply remove the ``--hls``
flag from the previous AutoSA command.

.. code:: bash

    ./autosa ./autosa_tests/large/mm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[260,256,512];kernel[]->latency[20,16];kernel[]->simd[8]}" \
    --simd-info=./autosa_tests/large/mm/simd_info.json \
    --host-serialize

Now instead of HLS host code, an OpenCL host code is generated.   

We have prepared a template Makefile for Xilinx Vitis tools.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/large/mm/Makefile ${AUTOSA_ROOT}/autosa.tmp/output/
    cp ${AUTOSA_ROOT}/autosa_tests/large/mm/connectivity.cfg ${AUTOSA_ROOT}/autosa.tmp/output/

Set the proper ``PLATFORM`` in the Makefile. 
By default, we set it to ``xilinx_u250_xdma_201830_2``.
You may notice that we also copy a file ``connectivity.cfg`` here.
This file assigns the DDR bank mapping for the design. 
By default, we map pointers A, B, C to DDR bank 0, 1, 3.
Lastly, modify the ``MODE`` in the Makefile for performing different tasks.

* ``sw_emu``: C simulation
* ``hw_emu``: RTL simulation
* ``hw``: Bitstream generation

.. note:: 

    When using Vitis flow to perform RTL simulation, nothing needs to change in the source code.
    You may directly set the ``MODE`` to ``hw_emu`` and perform RTL simulation.
    However, by default, we will run the kernel 10 times to collect the average runtime.
    This may significantly prolong the simulation time. Consider reducing the number of
    kernel launches to 1 before using RTL simulation.

To generate the bitstream, set the ``MODE`` to ``hw`` and use the command below.

.. code:: bash

    make all

After the bitstream is generated,
use the following command to run it on-board.    

.. code:: bash

    make check

.. note:: 

    As the example design is rather large, it takes approximately 40 hours to finish the synthesis on our workstation.

Below is the resource and frequency information we collected for this design.

+-----+-----------------+------------------+--------------+---------------+
| MHz | LUT             | REG              | BRAM         | DSP           |
+-----+-----------------+------------------+--------------+---------------+
| 146 | 804517 (52.69%) | 1360681 (43.17%) | 953 (40.80%) | 8320 (67.78%) |
+-----+-----------------+------------------+--------------+---------------+

You could also test the generated design on board. We have listed the performance of the design 
in the table below.

+-----------------+---------------+---------+
| Kernel Time (s) | Host Time (s) | GFLOPs  |
+-----------------+---------------+---------+
| 0.00548694      | 0.0113009     | 397.496 |
+-----------------+---------------+---------+   

Using AutoBridge to Boost Frequency
-----------------------------------

You may also try to use `AutoBridge <https://github.com/Licheng-Guo/AutoBridge>`_ 
to boost the design frequency.
We cover how to use AutoBridge to improve the frequency in :ref:`use-autobridge-label`.

The tables below show the detailed comparison results between the original design 
(unoptimized) and the design optimized with AutoBridge (optimized).

+-------------+-----+-----------------+------------------+--------------+---------------+
| Designs     | MHz | LUT             | REG              | BRAM         | DSP           |
+-------------+-----+-----------------+------------------+--------------+---------------+
| Unoptimized | 146 | 804517 (52.69%) | 1360681 (43.17%) | 953 (40.80%) | 8320 (67.78%) |
+-------------+-----+-----------------+------------------+--------------+---------------+
| Optimized   | 300 | 803752 (52.64%) | 1325480 (42.05%) | 952 (40.75%) | 8320 (67.78%) |
+-------------+-----+-----------------+------------------+--------------+---------------+

+-------------+-----------------+---------------+---------+
| Designs     | Kernel Time (s) | Host Time (s) | GFLOPs  |
+-------------+-----------------+---------------+---------+
| Unoptimized | 0.00548694      | 0.0113009     | 397.496 |
+-------------+-----------------+---------------+---------+
| Optimized   | 0.00232357      | 0.0371066     | 938.658 |
+-------------+-----------------+---------------+---------+

================================================
FILE: docs/examples/mttkrp_large.rst
================================================
Matricized Tensor Times Khatri-Rao Product (MTTKRP) (Large)
===========================================================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

This is an example of large-size Matricized Tensor Times Khatri-Rao Product.
The design files can be found at ``${AUTOSA_ROOT}/autosa_tests/large/mttkrp``.
The testing environment is summarized in the table below.

+--------------------------+-----------------------------------------------+
| **Target FPGA**          | Xilinx Alveo U250                             |
+--------------------------+-----------------------------------------------+
| **FPGA Synthesis Tools** | Xilinx Vivado HLS 2019.2, Xilinx Vitis 2019.2 |
+--------------------------+-----------------------------------------------+
| **CPU**                  | Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz     |
+--------------------------+-----------------------------------------------+

C Simulation
------------

Run the following example command to generate one design with HLS host code.

.. code:: bash

    ./autosa ./autosa_tests/large/mttkrp/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[128,128,2];kernel[]->latency[16,8];kernel[]->simd[8,1]}" \
    --simd-info=./autosa_tests/large/mttkrp/simd_info.json \
    --host-serialize \
    --hls

After compilation, you will find all generated files under the directory 
``${AUTOSA_ROOT}/autosa.tmp/output/src``. 
Copy the ``hls_script.tcl`` to the directory ``autosa.tmp/output``.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/large/mttkrp/hls_script.tcl ${AUTOSA_ROOT}/autosa.tmp/output/

Run the TCL script to perform C simulation.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output/
    vivado_hls -f hls_script.tcl

You should see ``Passed`` printed out in your terminal showing that 
C simulation is performed successfully.   

Bitstream Generation
--------------------

If you need to generate the bitstream for on-board testing, simply remove the ``--hls``
flag from the previous AutoSA command.

.. code:: bash

    ./autosa ./autosa_tests/large/mttkrp/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[128,128,2];kernel[]->latency[16,8];kernel[]->simd[8,1]}" \
    --simd-info=./autosa_tests/large/mttkrp/simd_info.json \
    --host-serialize    

Now instead of HLS host code, an OpenCL host code is generated.   

We have prepared a template Makefile for Xilinx Vitis tools.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/large/mttkrp/Makefile ${AUTOSA_ROOT}/autosa.tmp/output/
    cp ${AUTOSA_ROOT}/autosa_tests/large/mttkrp/connectivity.cfg ${AUTOSA_ROOT}/autosa.tmp/output/

To generate the bitstream, use the command below.

.. code:: bash

    make all

After the bitstream is generated,
use the following command to run it on-board.    

.. code:: bash

    make check

Below is the resource and frequency information we collected for this design.

+-----+-----------------+------------------+--------------+---------------+
| MHz | LUT             | REG              | BRAM         | DSP           |
+-----+-----------------+------------------+--------------+---------------+
| 184 | 623061 (41.53%) | 1016803 (32.58%) | 599 (26.26%) | 8192 (66.75%) |
+-----+-----------------+------------------+--------------+---------------+

You could also test the generated design on board. We have listed the performance of the design 
in the table below.

+-----------------+---------------+---------+
| Kernel Time (s) | Host Time (s) | GFLOPs  |
+-----------------+---------------+---------+
| 0.0237726       | 0.288613      | 542.006 |
+-----------------+---------------+---------+   

Using AutoBridge to Boost Frequency
-----------------------------------

You may also try to use `AutoBridge <https://github.com/Licheng-Guo/AutoBridge>`_ 
to boost the design frequency.
We cover how to use AutoBridge to improve the frequency in :ref:`use-autobridge-label`.

The tables below show the detailed comparison results between the original design 
(unoptimized) and the design optimized with AutoBridge (optimized).

+-------------+-----+-----------------+------------------+--------------+---------------+
| Designs     | MHz | LUT             | REG              | BRAM         | DSP           |
+-------------+-----+-----------------+------------------+--------------+---------------+
| Unoptimized | 184 | 623061 (41.53%) | 1016803 (32.58%) | 599 (26.26%) | 8192 (66.75%) |
+-------------+-----+-----------------+------------------+--------------+---------------+
| Optimized   | 300 | 625001 (41.67%) | 1000623 (32.08%) | 599 (26.26%) | 8192 (66.75%) |
+-------------+-----+-----------------+------------------+--------------+---------------+

+-------------+-----------------+---------------+---------+
| Designs     | Kernel Time (s) | Host Time (s) | GFLOPs  |
+-------------+-----------------+---------------+---------+
| Unoptimized | 0.0237726       | 0.288613      | 542.006 |
+-------------+-----------------+---------------+---------+
| Optimized   | 0.0141298       | 0.174689      | 911.895 |
+-------------+-----------------+---------------+---------+

================================================
FILE: docs/examples/ttmc_large.rst
================================================
Chain of Tensor-matrix multiplications (TTMc) (Large)
=====================================================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

This is an example of large-size Chain of Tensor-matrix multiplications.
The design files can be found at ``${AUTOSA_ROOT}/autosa_tests/large/ttmc``.
The testing environment is summarized in the table below.

+--------------------------+-----------------------------------------------+
| **Target FPGA**          | Xilinx Alveo U250                             |
+--------------------------+-----------------------------------------------+
| **FPGA Synthesis Tools** | Xilinx Vivado HLS 2019.2, Xilinx Vitis 2019.2 |
+--------------------------+-----------------------------------------------+
| **CPU**                  | Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz     |
+--------------------------+-----------------------------------------------+

C Simulation
------------

Run the following example command to generate one design with HLS host code.

.. code:: bash

    ./autosa ./autosa_tests/large/ttmc/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[4];kernel[]->array_part[16,64,16,32];kernel[]->latency[1,8,8];kernel[]->simd[8,1]}" \
    --simd-info=./autosa_tests/large/ttmc/simd_info.json \
    --host-serialize \
    --hls

After compilation, you will find all generated files under the directory 
``${AUTOSA_ROOT}/autosa.tmp/output/src``. 
Copy the ``hls_script.tcl`` to the directory ``autosa.tmp/output``.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/large/ttmc/hls_script.tcl ${AUTOSA_ROOT}/autosa.tmp/output/

Run the TCL script to perform C simulation.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output/
    vivado_hls -f hls_script.tcl

You should see ``Passed`` printed out in your terminal showing that 
C simulation is performed successfully.   

Bitstream Generation
--------------------

If you need to generate the bitstream for on-board testing, simply remove the ``--hls``
flag from the previous AutoSA command.

.. code:: bash

    ./autosa ./autosa_tests/large/ttmc/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[4];kernel[]->array_part[16,64,16,32];kernel[]->latency[1,8,8];kernel[]->simd[8,1]}" \
    --simd-info=./autosa_tests/large/ttmc/simd_info.json \
    --host-serialize

Now instead of HLS host code, an OpenCL host code is generated.   

We have prepared a template Makefile for Xilinx Vitis tools.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/large/ttmc/Makefile ${AUTOSA_ROOT}/autosa.tmp/output/
    cp ${AUTOSA_ROOT}/autosa_tests/large/ttmc/connectivity.cfg ${AUTOSA_ROOT}/autosa.tmp/output/

To generate the bitstream, use the command below.

.. code:: bash

    make all

After the bitstream is generated, use the following command to run it on-board.    

.. code:: bash

    make check

Below is the resource and frequency information we collected for this design.

+-----+-----------------+------------------+--------------+---------------+
| MHz | LUT             | REG              | BRAM         | DSP           |
+-----+-----------------+------------------+--------------+---------------+
| 201 | 621584 (41.43%) | 1016231 (32.57%) | 479 (21.01%) | 8192 (66.75%) |
+-----+-----------------+------------------+--------------+---------------+

You could also test the generated design on board. We have listed the performance of the design 
in the table below.

+-----------------+---------------+---------+
| Kernel Time (s) | Host Time (s) | GFLOPs  |
+-----------------+---------------+---------+
| 0.168946        | 1.8771        | 610.131 |
+-----------------+---------------+---------+   

Using AutoBridge to Boost Frequency
-----------------------------------

You may also try to use `AutoBridge <https://github.com/Licheng-Guo/AutoBridge>`_ 
to boost the design frequency.
We cover how to use AutoBridge to improve the frequency in :ref:`use-autobridge-label`.

The tables below show the detailed comparison results between the original design 
(unoptimized) and the design optimized with AutoBridge (optimized).

+-------------+-----+-----------------+------------------+--------------+---------------+
| Designs     | MHz | LUT             | REG              | BRAM         | DSP           |
+-------------+-----+-----------------+------------------+--------------+---------------+
| Unoptimized | 201 | 621584 (41.43%) | 1016231 (32.57%) | 479 (21.01%) | 8192 (66.75%) |
+-------------+-----+-----------------+------------------+--------------+---------------+
| Optimized   | 300 | 622878 (41.53%) | 1010672 (32.40%) | 479 (21.01%) | 8192 (66.75%) |
+-------------+-----+-----------------+------------------+--------------+---------------+

+-------------+-----------------+---------------+---------+
| Designs     | Kernel Time (s) | Host Time (s) | GFLOPs  |
+-------------+-----------------+---------------+---------+
| Unoptimized | 0.168946        | 1.8771        | 610.131 |
+-------------+-----------------+---------------+---------+
| Optimized   | 0.112436        | 1.25489       | 916.781 |
+-------------+-----------------+---------------+---------+

================================================
FILE: docs/index.rst
================================================
.. AutoSA documentation master file, created by
   sphinx-quickstart on Sun Jan 17 15:06:11 2021.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to AutoSA's documentation!
==================================

AutoSA is an end-to-end systolic array compiler for FPGAs based on the polyhedral model. 
It takes algorithms in high-level programming languages (C) as inputs, 
performs polyhedral transformation and other architecture optimizations to map algorithms 
to systolic array architecture. 


Getting Started
---------------

.. toctree::
   :maxdepth: 1
   
   installation
   tutorials/index
   examples/index

Resources
---------
* `AutoSA Paper <https://vast.cs.ucla.edu/sites/default/files/publications/FPGA2021_AutoSA_camera.pdf>`_
* `Github Project <https://github.com/UCLA-VAST/AutoSA>`_
* `Docker Image <https://hub.docker.com/repository/docker/whbldhwj/autosa>`_
* `FCCM 2021 Tutorial Slides <https://www.dropbox.com/s/pusu5htagdvvuch/autosa_fccm21_final.pdf?dl=0>`_

Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`


================================================
FILE: docs/install_from_source.rst
================================================
.. _install-from-source-label:

Install from Source
===================

This page gives instructions on how to build and install AutoSA from scratch.
It consists of two steps.

* `Step 1: Install the Prerequisites`_
* `Step 2: Compile AutoSA`_

Step 1: Install the Prerequisites
---------------------------------
Below we list the detailed instructions about installing the prerequisites of AutoSA.

Additionally, you could take a look at our `Dockerfile <https://github.com/UCLA-VAST/AutoSA/blob/master/Dockerfile>`_ for building the Docker image 
of AutoSA for reference instructions to build all the prerequisites on Ubuntu.

PPCG
^^^^

AutoSA is developed upon PPCG (`link <https://repo.or.cz/ppcg.git>`_).
Below are the requirements of PPCG. 

* automake, autoconf, libtool (not needed when compiling a release)
* pkg-config (http://www.freedesktop.org/wiki/Software/pkg-config) (not needed when compiling a release using the included isl and pet)
* gmp (http://gmplib.org/)
* libyaml (http://pyyaml.org/wiki/LibYAML) (only needed if you want to compile the pet executable)
* LLVM/clang libraries, 2.9 or higher (http://clang.llvm.org/get_started.html) Unless you have some other reasons for wanting to use the svn version, it is best to install the latest supported release. For more details, including the latest supported release, see pet/README.

If you are installing on Ubuntu, then you can install the following packages:

.. code:: bash

    automake autoconf libtool pkg-config libgmp3-dev libyaml-dev libclang-dev llvm

Note that you need at least version 3.2 of libclang-dev (ubuntu raring).
Older versions of this package did not include the required libraries.
If you are using an older version of ubuntu, then you need to compile and
install LLVM/clang from source.


Barvinok
^^^^^^^^

AutoSA also uses the Barvinok library (`link <http://barvinok.gforge.inria.fr/>`_). 
Below are the requirements of Barvinok.

* NTL (https://libntl.org/)

The detailed instructions for installing NTL can be found at `link <https://libntl.org/doc/tour-unix.html>`_.
Note that NTL needs to be compiled with GMP support, that is, you have to specify

.. code:: bash

    NTL_GMP_LIP=on

NTL also needs to be compiled in ISO mode.   
For versions older than 5.4, this means you need an additional

.. code:: bash

    NTL_STD_CXX=on

Others
^^^^^^

* Python 3.6+ and the corresponding pip.

Step 2: Compile AutoSA
----------------------

After installing the prerequisites, this step will build AutoSA from source.

Get Source from Github
^^^^^^^^^^^^^^^^^^^^^^

Clone the source repo from Github.

.. code:: bash

    git clone https://github.com/UCLA-VAST/AutoSA.git

Run the Installation Script
^^^^^^^^^^^^^^^^^^^^^^^^^^^

Run the installation script to build and install AutoSA.

.. code:: bash

    ./install.sh

After the installation has finished, to test if AutoSA is installed correctly,
you could run the following command to obtain the help information of AutoSA.

.. code:: bash

    ./autosa --help

If the help information is printed on the screen, you are all set and may start to explore 
the magic of AutoSA!    

================================================
FILE: docs/installation.rst
================================================
Installation
============

To install AutoSA, please read :ref:`install-from-source-label`. Alternatively, 
if you would like to quickly try out AutoSA, please check the 
:ref:`docker-image-label`.

.. toctree::
   :maxdepth: 1

   install_from_source
   docker_image

================================================
FILE: docs/make.bat
================================================
@ECHO OFF

REM Windows command-line wrapper around sphinx-build for this documentation.
REM Call it with a single target argument; with no argument the Sphinx
REM make-mode help is printed instead.

REM Work from the directory that contains this script; undone by popd at :end.
pushd %~dp0

REM Command file for Sphinx documentation

REM Default to sphinx-build unless SPHINXBUILD is already set by the caller.
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
REM Sources are read from this directory and output is written to _build.
set SOURCEDIR=.
set BUILDDIR=_build

REM Without a target argument, jump straight to the help output.
if "%1" == "" goto help

REM Probe the configured command with all output discarded. An errorlevel of
REM 9009 or higher is reported below as the command not being found.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

REM Forward the requested target to sphinx-build in make-mode, together with
REM any extra options supplied through SPHINXOPTS and O.
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
REM Ask sphinx-build for its make-mode help text.
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
REM Return to the directory that was current before pushd.
popd


================================================
FILE: docs/tutorials/auto_bridge.rst
================================================
.. _use-autobridge-label:

Leveraging AutoBridge to Boost the Design Frequency
===================================================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

AutoBridge is an automation framework to boost the FPGA design frequency. 
This page explains how to leverage AutoBridge to further boost the systolic array 
frequency on Xilinx FPGAs.

The testing environment of all the designs presented in this tutorial is described by the table below.

+--------------------------+-----------------------------------------------+
| **Target FPGA**          | Xilinx Alveo U250                             |
+--------------------------+-----------------------------------------------+
| **FPGA Synthesis Tools** | Xilinx Vivado HLS 2019.2, Xilinx Vitis 2019.2 |
+--------------------------+-----------------------------------------------+
| **CPU**                  | Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz     |
+--------------------------+-----------------------------------------------+

Introduction of AutoBridge
--------------------------

AutoBridge is a floorplanning tool based on the Vivado HLS design flow. It parses the 
Xilinx HLS designs and generates the floorplanning constraints to help boost the design frequency.
More details about this tool can be found at:

* `Github repo <https://github.com/Licheng-Guo/AutoBridge>`_
* `Paper <https://vast.cs.ucla.edu/sites/default/files/publications/AutoBridge_FPGA2021.pdf>`_

Using AutoBridge to Boost the Frequency
---------------------------------------

Please follow the instructions on AutoBridge's Github repo to install the tool.

The design example used for this tutorial can be found at the directory ``${AUTOSA_ROOT}/autosa_tests/large/mm``.

Step 0: Generating the Reference Design
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

First of all, let's generate a design directly without using AutoBridge.
Use the following command to generate the systolic array design.

.. code:: bash

    ./autosa ./autosa_tests/large/mm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[260,256,512];kernel[]->latency[20,16];kernel[]->simd[8]}" \
    --simd-info=./autosa_tests/large/mm/simd_info.json \
    --host-serialize

The generated designs can be found at ``${AUTOSA_ROOT}/autosa.tmp/output/src``.

Copy the Makefile and the DRAM connectivity configuration file to the project directory.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/large/mm/Makefile ${AUTOSA_ROOT}/autosa.tmp/output/
    cp ${AUTOSA_ROOT}/autosa_tests/large/mm/connectivity.cfg ${AUTOSA_ROOT}/autosa.tmp/output/

Set up your local Xilinx Vitis environment. Note that we target the Xilinx Alveo U250 in the Makefile.
Change the Makefile and connectivity file accordingly if you target a different FPGA board. 
You may also need to change the design parameters described by ``--sa-sizes`` if your target FPGA board has 
fewer resources than the Xilinx Alveo U250.

Run the following command to synthesize the design into bitstream.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output
    make all

.. note::

    As the example design is rather large, it takes approximately 40 hours to finish the synthesis on our workstation.
    
After the synthesis is completed, you can check the design resource and frequency.
Below is the resource and frequency information we collected for this design.

+-----+-----------------+------------------+--------------+---------------+
| MHz | LUT             | REG              | BRAM         | DSP           |
+-----+-----------------+------------------+--------------+---------------+
| 146 | 804517 (52.69%) | 1360681 (43.17%) | 953 (40.80%) | 8320 (67.78%) |
+-----+-----------------+------------------+--------------+---------------+

You could also test the generated design on board. We have listed the performance of the design 
in the table below.

+-----------------+---------------+---------+
| Kernel Time (s) | Host Time (s) | GFLOPs  |
+-----------------+---------------+---------+
| 0.00548694      | 0.0113009     | 397.496 |
+-----------------+---------------+---------+

Step 1: Compiling the Design Using Vivado HLS
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Now let's use AutoBridge to generate a design with higher frequency. 

Before synthesizing the HLS design, add the pragma ``#pragma HLS dataflow disable_start_propagation`` at the top function.
In our example, open the file ``${AUTOSA_ROOT}/autosa.tmp/output/src/kernel_kernel.cpp``.
You will find the definition of the top function ``kernel0`` starting from the line 1204.

.. code:: c

    extern "C" {
    void kernel0(A_t16 *A, B_t16 *B, C_t16 *C)
    {
    #pragma HLS INTERFACE m_axi port=A offset=slave bundle=gmem_A
    #pragma HLS INTERFACE m_axi port=B offset=slave bundle=gmem_B
    #pragma HLS INTERFACE m_axi port=C offset=slave bundle=gmem_C
    #pragma HLS INTERFACE s_axilite port=A bundle=control
    #pragma HLS INTERFACE s_axilite port=B bundle=control
    #pragma HLS INTERFACE s_axilite port=C bundle=control
    #pragma HLS INTERFACE s_axilite port=return bundle=control

    #pragma HLS DATAFLOW
    ...

Add the pragma ``#pragma HLS dataflow disable_start_propagation`` into the top function.
The modified code looks like below.

.. code:: c

    extern "C" {
    void kernel0(A_t16 *A, B_t16 *B, C_t16 *C)
    {
    #pragma HLS INTERFACE m_axi port=A offset=slave bundle=gmem_A
    #pragma HLS INTERFACE m_axi port=B offset=slave bundle=gmem_B
    #pragma HLS INTERFACE m_axi port=C offset=slave bundle=gmem_C
    #pragma HLS INTERFACE s_axilite port=A bundle=control
    #pragma HLS INTERFACE s_axilite port=B bundle=control
    #pragma HLS INTERFACE s_axilite port=C bundle=control
    #pragma HLS INTERFACE s_axilite port=return bundle=control

    #pragma HLS DATAFLOW
    #pragma HLS dataflow disable_start_propagation
    ...

Next, copy the Xilinx HLS TCL file from the AutoBridge repo to the project directory to synthesize the C code 
to RTL using Xilinx HLS.

.. code:: bash

    cp ${AUTOBRIDGE_ROOT}/reference-scripts/step1-run-hls.tcl ${AUTOSA_ROOT}/autosa.tmp/output/

Modify the TCL file to add the information for our project. 
Specifically, modify the first four lines of ``step1-run-hls.tcl`` from

.. code:: tcl

    open_project PROJECT_NAME
    set_top TOP_FUNCTION_NAME
    add_files PATH_TO_SRC_FILE
    add_files -tb PATH_TO_TESTBENCH_FILE

to

.. code:: tcl

    open_project kernel0
    set_top kernel0
    add_files "src/kernel_kernel.cpp"
    #add_files -tb PATH_TO_TESTBENCH_FILE

Modify lines 25-26 of ``step1-run-hls.tcl`` from

.. code:: tcl

    csim_design
    csynth_design    

to 

.. code:: tcl

    #csim_design
    csynth_design    

Note that the target FPGA board is set to Xilinx Alveo U250 at line 9.
Modify it accordingly for your project.

Now call Xilinx Vivado HLS to synthesize the design.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output
    vivado_hls -f step1-run-hls.tcl

Step 2: Invoking AutoBridge to Generate Floorplanning Configuration for the Target Design
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

After the design is synthesized by HLS, we will invoke AutoBridge to analyze the project and generate 
the floorplanning constraints for the project.

AutoBridge provides a Python script for processing the HLS project automatically, which 
can be found at ``${AUTOBRIDGE_ROOT}/reference-scripts/step2-autobridge.py``.

Please refer to AutoBridge's `repo <https://github.com/Licheng-Guo/AutoBridge>`_ for more details about this script.

Normally, before running this script, we will have to modify the following fields in the script.

``project_path``: Modify it to the directory of the HLS project. As for our example, we set it as:

.. code:: Python

    project_path = '${AUTOSA_ROOT}/autosa.tmp/output/kernel0'

``top_name``: Modify it to the top function of the HLS project.

.. code:: Python

    top_name = 'kernel0'

``board_name``: Modify it to the target FPGA board. AutoBridge currently supports Xilinx Alveo U250 and U280.
We use the U250 by default.

.. code:: Python

    board_name = 'u250'

``DDR_loc_2d_y``, ``DDR_loc_2d_x``: Modify them to assign the locations of the AXI modules.

In the generated HLS code, we have assigned different global pointers to different AXI buses by default.
In lines 1204-1212, we have the following code:

.. code:: c

    void kernel0(A_t16 *A, B_t16 *B, C_t16 *C)
    {
    #pragma HLS INTERFACE m_axi port=A offset=slave bundle=gmem_A
    #pragma HLS INTERFACE m_axi port=B offset=slave bundle=gmem_B
    #pragma HLS INTERFACE m_axi port=C offset=slave bundle=gmem_C
    #pragma HLS INTERFACE s_axilite port=A bundle=control
    #pragma HLS INTERFACE s_axilite port=B bundle=control
    #pragma HLS INTERFACE s_axilite port=C bundle=control
    #pragma HLS INTERFACE s_axilite port=return bundle=control

We have assigned the three global pointers ``A``, ``B``, ``C`` to three different AXI buses 
``gmem_A``, ``gmem_B``, and ``gmem_C``.

There are four DDR controllers available on U250. In this design, we will assign 
``gmem_A`` to ``DDR0``, ``gmem_B`` to ``DDR1``, and ``gmem_C`` to ``DDR3``.
We have already assigned this DDR configuration in the connectivity file ``connectivity.cfg`` we mentioned previously.

We will have to modify the AutoBridge script to reflect this mapping as well.

Modify the lines 84-111 of ``step2-autobridge.py`` as follows:

.. code:: Python

    DDR_loc_2d_y['A_IO_L3_in_serialize_U0'] = 0
    DDR_loc_2d_x['A_IO_L3_in_serialize_U0'] = 0
    DDR_loc_2d_y['kernel0_gmem_A_m_axi_U'] = 0
    DDR_loc_2d_x['kernel0_gmem_A_m_axi_U'] = 0

    DDR_loc_2d_y['B_IO_L3_in_serialize_U0'] = 1
    DDR_loc_2d_x['B_IO_L3_in_serialize_U0'] = 0
    DDR_loc_2d_y['kernel0_gmem_B_m_axi_U'] = 1
    DDR_loc_2d_x['kernel0_gmem_B_m_axi_U'] = 0

    DDR_loc_2d_y['C_drain_IO_L3_out_serialize_U0'] = 3
    DDR_loc_2d_x['C_drain_IO_L3_out_serialize_U0'] = 0
    DDR_loc_2d_y['kernel0_gmem_C_m_axi_U'] = 3
    DDR_loc_2d_x['kernel0_gmem_C_m_axi_U'] = 0

    DDR_loc_2d_y['kernel0_control_s_axi_U'] = 0

    DDR_enable = [1, 1, 0, 1]

For each AXI bus, HLS generates two modules that are associated with it.
First, the hardware module in the user code that accesses the data via this bus.
As for our example, in ``kernel_kernel.cpp``, the global pointer ``A`` is used by the function
``A_IO_L3_in_serialize``. Xilinx HLS will rename the function to ``A_IO_L3_in_serialize_U0`` after 
synthesis. AutoBridge requires the RTL module name in the script. 
You may refer to the HLS report or generated RTL to find the exact RTL module name for your design.
The second module is the AXI bus module that connects the user logic to the DDR controller. 
In our design, it is named ``kernel0_gmem_A_m_axi_U``.

AutoBridge divides the FPGA on-chip area to multiple regions. The figure below shows the 
partitioned regions for both Xilinx Alveo U250 and U280 boards.

.. image:: images/ab_map.png
    :align: center

As we can see from the figure, the on-chip logic is physically scattered by die boundaries, DDR/HBM controllers,
non-programmable logic, and other peripheral IPs. AutoBridge partitions the on-chip logic based on 
these modules. 
The partitioned regions and indices are shown in the figure on the right.

As the ``gmem_A`` is connected to ``DDR0``, we assign the locations for these modules as:

.. code:: Python

    DDR_loc_2d_y['A_IO_L3_in_serialize_U0'] = 0
    DDR_loc_2d_x['A_IO_L3_in_serialize_U0'] = 0
    DDR_loc_2d_y['kernel0_gmem_A_m_axi_U'] = 0
    DDR_loc_2d_x['kernel0_gmem_A_m_axi_U'] = 0

Similarly, we add the locations for other AXI buses as shown in the code above.

For each kernel, there is a controller with S_AXI interface.
By the recommendation of AutoBridge, we will assign it to the bottom SLR as it 
talks to the PCIe IP.

.. code:: Python
    
    DDR_loc_2d_y['kernel0_control_s_axi_U'] = 0

Lastly, we will also need to update the variable ``DDR_enable`` to reflect the DDR controllers in use.
In our example, since we only use the first, second, and fourth DDR channels, we set it as:

.. code:: Python

    DDR_enable = [1, 1, 0, 1]

We are almost done here. The final step is to specify the maximal resource utilization ratio of each region.
As an example, we set the variable ``max_usage_ratio_2d`` as:

.. code:: Python

    max_usage_ratio_2d = [ [0.8, 0.7], [0.85, 0.75], [0.85, 0.85], [0.85, 0.7] ]

Please feel free to adjust these ratios according to the resource usage of your design.
Setting the upper bound of resource usage for each region helps guide AutoBridge to scatter 
the logic across the chip, which helps improve the timing. AutoBridge might fail in the case where we 
set the upper bounds lower than the required resource of the design. In that case, try to increase the 
ratio until AutoBridge can successfully place the design.
Besides, AutoBridge uses the estimated resource usage from HLS reports, which might 
be inconsistent with the synthesized resource usage. You may need to re-adjust these values 
if the design fails routing in the later stages.

Until now, you have a modified AutoBridge script customized for our design.
We also provide an example script at ``${AUTOSA_ROOT}/autosa_tests/large/mm/step2-autobridge.py``.

Now, execute the Python script to run AutoBridge.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/large/mm/step2-autobridge.py ${AUTOBRIDGE_ROOT}/reference-scripts/
    cd ${AUTOBRIDGE_ROOT}/reference-scripts
    ./step2-autobridge.py | tee autobridge.log

After it finishes, you should see a folder named ``autobridge`` in the same directory.
It contains the modified RTL code and the floorplanning constraint ``constraint.tcl``.
The AutoBridge-generated information is printed to ``autobridge.log``.

.. note:: 

    If AutoBridge fails, modify the ``max_usage_ratio_2d`` accordingly to make sure 
    there is enough area allocated for the design.

Step 3: Packing the Design
^^^^^^^^^^^^^^^^^^^^^^^^^^

AutoBridge modifies the HLS generated RTL. 
In this step, we will pack the modified design into an ``xo`` file that can be synthesized by Xilinx Vitis.
AutoBridge provides a TCL file for packing the design. Run the following command to pack the design.

.. code:: bash

    cp ${AUTOBRIDGE_ROOT}/reference-scripts/step3-pack-xo.tcl ${AUTOBRIDGE_ROOT}/reference-scripts/autobridge/
    
Now modify this TCL file according to your project.

Modify the line 1 from

.. code:: tcl

    open_project PROJECT_NAME

to 

.. code:: tcl

    open_project kernel0

Modify the line 3 from 

.. code:: tcl

    export_design -rtl verilog -format ip_catalog -xo XO_NAME.xo

to 

.. code:: tcl

    export_design -rtl verilog -format ip_catalog -xo kernel0.xo

.. note::

    We also provide an example TCL file ``step3-pack-xo.tcl`` under the design example directory ``${AUTOSA_ROOT}/autosa_tests/large/mm/pack_xo.tcl``.

Before running the TCL script, we will need to copy the original HLS source files to the working directory.

.. code:: bash

    cp -r ${AUTOSA_ROOT}/autosa.tmp/output/src ${AUTOBRIDGE_ROOT}/reference-scripts/autobridge/

Now, run the TCL script.

.. code:: bash

    cd ${AUTOBRIDGE_ROOT}/reference-scripts/autobridge
    vivado_hls -f step3-pack-xo.tcl

After Vivado HLS finishes the packing process, you will find a file named ``kernel0.xo`` under the working directory.

Step 4: Synthesizing the Design
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The last step will be synthesizing the design to bitstream using Xilinx Vitis.
Copy the script for synthesizing the design to the working directory.

.. code:: bash

    cp ${AUTOBRIDGE_ROOT}/reference-scripts/step4-run-vitis.sh ${AUTOBRIDGE_ROOT}/reference-scripts/autobridge/

Modify the file ``step4-run-vitis.sh`` according to the design configuration.
For this example, modify line 4 from 

.. code:: bash
    
    TOP="YOUR_TOP_NAME"

to 

.. code:: bash
    
    TOP=kernel0

Modify line 10 from 

.. code:: bash
    
    XO="$(pwd)/YOUR_XO_NAME"

to 

.. code:: bash
    
    XO="$(pwd)/kernel0.xo"

Modify lines 32-35 from

.. code:: bash

    ARG_FOR_DDR_1="YOUR_HLS_ARGUMENT_NAME_FOR_DDR_1"
    ARG_FOR_DDR_2="YOUR_HLS_ARGUMENT_NAME_FOR_DDR_2"
    ARG_FOR_DDR_3="YOUR_HLS_ARGUMENT_NAME_FOR_DDR_3"
    ARG_FOR_DDR_4="YOUR_HLS_ARGUMENT_NAME_FOR_DDR_4"

to 

.. code:: bash

    ARG_FOR_DDR_1=A
    ARG_FOR_DDR_2=B
    #ARG_FOR_DDR_3="YOUR_HLS_ARGUMENT_NAME_FOR_DDR_3"    
    ARG_FOR_DDR_4=C

Modify lines 58-61 from 

.. code:: bash

    --connectivity.sp ${TOP}_1.${ARG_FOR_DDR_1}:DDR[0] \
    --connectivity.sp ${TOP}_1.${ARG_FOR_DDR_2}:DDR[1] \
    --connectivity.sp ${TOP}_1.${ARG_FOR_DDR_3}:DDR[2] \
    --connectivity.sp ${TOP}_1.${ARG_FOR_DDR_4}:DDR[3] \

to 

.. code:: bash

    --connectivity.sp ${TOP}_1.${ARG_FOR_DDR_1}:DDR[0] \
    --connectivity.sp ${TOP}_1.${ARG_FOR_DDR_2}:DDR[1] \
    --connectivity.sp ${TOP}_1.${ARG_FOR_DDR_4}:DDR[3] \

An example script of this project can be also found at ``${AUTOSA_ROOT}/autosa_tests/large/mm/step4-run-vitis.sh``.

Now set up the Xilinx Vitis environment and run the script.

.. code:: bash

    chmod u+x ./step4-run-vitis.sh
    ./step4-run-vitis.sh

Please wait until the synthesis process is finished.    

Results Comparison
^^^^^^^^^^^^^^^^^^

We could now compare the designs unoptimized and optimized by AutoBridge.
The tables below show the detailed comparison results.

+-------------+-----+-----------------+------------------+--------------+---------------+
| Designs     | MHz | LUT             | REG              | BRAM         | DSP           |
+-------------+-----+-----------------+------------------+--------------+---------------+
| Unoptimized | 146 | 804517 (52.69%) | 1360681 (43.17%) | 953 (40.80%) | 8320 (67.78%) |
+-------------+-----+-----------------+------------------+--------------+---------------+
| Optimized   | 300 | 803752 (52.64%) | 1325480 (42.05%) | 952 (40.75%) | 8320 (67.78%) |
+-------------+-----+-----------------+------------------+--------------+---------------+

+-------------+-----------------+---------------+---------+
| Designs     | Kernel Time (s) | Host Time (s) | GFLOPs  |
+-------------+-----------------+---------------+---------+
| Unoptimized | 0.00548694      | 0.0113009     | 397.496 |
+-------------+-----------------+---------------+---------+
| Optimized   | 0.00232357      | 0.0371066     | 938.658 |
+-------------+-----------------+---------------+---------+

.. image:: images/autobridge.jpg
    :align: center
    
Credit: Young-kyu Choi (ykchoi@cs.ucla.edu)

================================================
FILE: docs/tutorials/auto_tuning_exhaustive.rst
================================================
.. _auto-tuning-label:

Auto-Tuning (Exhaustive Search)
===============================================================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

AutoSA introduces many tuning knobs during the compilation process, which form a large 
design space. To search for designs with good performance, we introduce a simple 
auto-tuner. This page introduces the basics of the auto-tuner and shows how to use
it for tuning arbitrary programs.

How Auto-Tuning Works
---------------------

First, let's take a look at the AutoSA compilation flow again, as shown in the figure below.

.. image:: images/flow.png
    :align: center

There are multiple optimization passes in the stages of computation and communication management. 
Each pass can be run in either the manual or the auto mode.
In the manual mode, users will need to supply AutoSA with specific optimization strategies to apply on the 
program. In the auto mode, AutoSA will proceed based on the preset policy.

In the AutoSA configuration file ``${AUTOSA_ROOT}/autosa_config/autosa_config.json``, we list the steps 
that can be manually tuned.

* **space_time**: 
  This step applies the space-time transformation to transform algorithms to systolic arrays. 
  By default, for each algorithm, multiple systolic arrays will be generated. In the auto mode,
  AutoSA will select one array based on the heuristics. In the manual mode, users will select the 
  array to be processed in the following steps.
* **array_part**: 
  This step partitions the array into smaller sub-arrays. In the auto mode, all tilable loops 
  that can be used as array partitioning loops will be tiled with a fixed factor. In the manual mode,
  users can select loops to be tiled and provide the compiler with specific tiling factors.
* **array_part_L2**:
  AutoSA can generate up to two levels of array partitioning loops. This is helpful for architectures
  with many levels of memory hierarchy. Similarly, in the auto mode, AutoSA decides which loops to be further tiled and 
  selects a fixed tiling factor. Users can make such choices in the manual mode.
* **latency**:
  This step performs latency hiding in case the innermost loop in the program carries
  a dependence which prevents the design from being fully pipelined. Parallel loops in the program can be 
  used as the latency hiding candidate loops. In the auto mode, all parallel loops will be tiled and 
  the point loops will be permuted innermost. In the manual mode, users will have to specify which loops 
  to be chosen and the corresponding tiling factors.
* **simd**:
  This step vectorizes the computation inside PEs. In the auto mode, AutoSA analyzes the program
  and selects the best vectorizable loop with heuristics. In the manual mode, users will select the 
  vectorizable loop.
* **hbm**:
  AutoSA also supports HBM memory. The systolic array will be connected to multiple HBM ports.
  In the auto mode, AutoSA allocates each array to a fixed number of HBM banks. 
  In the manual mode, users select the number of HBM banks to be connected to each array.

The auto-tuner of AutoSA takes advantage of the manual modes and will explore all the possible 
combinations of the optimization strategies to search for designs with good performance.
At present, the auto-tuner supports exploration of all the stages above except the hbm stage. 
And only the Xilinx HLS C back-end is supported.

The figure below shows the working flow of the auto-tuner.

.. image:: images/auto_tuner_flow.png
    :align: center

There are two phases in the auto-tuner: *training* and *searching*.

In the training phase, the auto-tuner will generate random sample designs from the input program,
synthesize them using Xilinx HLS, and use them as training samples to train the resource models. 

In the searching phase, the auto-tuner will explore the design space by enumerating different
optimization strategies at each stage with pruning. The design space is explored step by step following the 
sequence of the optimization steps in the compilation flow. After the final design samples are generated, 
the auto-tuner will estimate the latency and resource usage of the design samples and update the searching record.
Eventually, the design with the best performance is selected and output.

In the next subsection, we will show how to use the auto-tuner to perform the design space exploration 
with the example of matrix multiplication.

Auto-Tuning Example
-------------------

The auto-tuner is written as a Python script ``${AUTOSA_ROOT}/autosa_scripts/optimizer.py``.
It can be configured by the file ``${AUTOSA_ROOT}/autosa_config/optimizer_settings.json``.

Auto-Tuner Configuration
^^^^^^^^^^^^^^^^^^^^^^^^

The configuration file ``${AUTOSA_ROOT}/autosa_config/optimizer_settings.json`` looks like below:

.. code:: json

    "training": {
      "sample": {
        "space_time": {
          "mode": "exhaustive",
          "n": -1
        },
        "array_part": {
          "mode": "random",
          "n": 2,
          "loop_limit": -1
        },
        "latency_hiding": {
          "mode": "random",
          "n": 2,
          "loop_limit": 64
        }
        ...
      },
      "pruning": {
        "array_part": {
          "enable": 1,
          "PE_num": [8, 32]
        },
        ...
        "latency_hiding": {
          "enable": 1,
          "reg_size": [16, 256]
        },
        "SIMD_vectorization": {
          "enable": 1,
          "PE_num": [8, 32],
          "PE_ratio": 2
        }
      },
      "multiprocess": {
        "n_job": 1
      }
    },    
    "synth": {
      "multiprocess": {
        "n_job": 16
      },
      "sample": {
        "n": 16
      }
    },
    "search": {
      "metric": "latency",
      "cycle_period": 5,
      "mode": "customized",
      "n_random": 5,
      "log": {
        "n_record": 10
      },
      "resource_target": ["BRAM18K", "DSP"],
      "time_out": -1,
      "update_time_interval": 2,
      "multiprocess": {
        "n_job": 32
      },
      "sample": {
        "space_time": {
          "mode": "exhaustive",
          "n": -1
        },
        ...
        "SIMD_vectorization": {
          "mode": "exhaustive",
          "n": -1,
          "loop_limit": 8
        }
      },
      "pruning": {
        "random_start": {
          "enable": 1,
          "n_trial": 3,
          "n_random": 3
        },
        "resource": {
          "range": {
            "FF": [0.25, 0.7],
            ...
            "URAM": [0, 0.6]
          }
        },
        "array_part": {
          "enable": 1,
          "PE_num": [190, 210]
        },
        ...
        "latency_hiding": {
          "enable": 1,
          "reg_size": [64, 1280]
        },
        "SIMD_vectorization": {
          "enable": 1,
          "PE_num": [190, 210],
          "PE_ratio": 3
        }
      }
    }

We will explain the configuration in detail now. At the top level, there are three sections: 
``training``, ``synth``, and ``search``.

* ``training``: configures how the auto-tuner generates the training samples for resource models.
* ``synth``: configures how the auto-tuner synthesizes the training samples.
* ``search``: configures how the auto-tuner searches the design space.

Training
""""""""

Under the subsection of ``training``, there are three fields:
``sample``, ``pruning``, and ``multiprocess``.

* ``sample``: configures how the auto-tuner samples the design space to generate training samples.
* ``pruning``: configures how the auto-tuner prunes the design space while generating the training samples.
* ``multiprocess``: The sampling step can be multiprocessed. This field configures how many processes to be used to execute the sampling step.

As for the ``sample`` field, we could configure how we sample the design space at each optimization step.
The table below summarizes the available attributes for each step.

+---------------+---------------------------+----------------------------------------------------------------+
| Attributes    | Values                    | Explanations                                                   |
+===============+===========================+================================================================+
| ``mode``      | ``exhaustive``, ``random``| This attribute specifies how we are generating the tiling      |
|               |                           |                                                                |
|               |                           | factors for each candidate loop. When using ``exhaustive``,    |
|               |                           |                                                                |
|               |                           | we will generate a list of all the sub-multiples of the loop   |
|               |                           |                                                                |
|               |                           | bound as the tiling factors. When using ``random``, we         |
|               |                           |                                                                |
|               |                           | will randomly sample ``n`` factors from all the feasible tiling|
|               |                           |                                                                |
|               |                           | factors.                                                       |
+---------------+---------------------------+----------------------------------------------------------------+
| ``n``         | ``int``                   | The default value is -1. If the ``mode`` is set in ``random``, |
|               |                           |                                                                |
|               |                           | this value sets the number of candidate tiling factors         |
|               |                           |                                                                | 
|               |                           | generated for each loop.                                       |
+---------------+---------------------------+----------------------------------------------------------------+
| ``loop_limit``| ``int``                   | The default value is -1. It sets the upper bound of the tiling |
|               |                           |                                                                |
|               |                           | factors.                                                       |
+---------------+---------------------------+----------------------------------------------------------------+

For the ``pruning``, we implement several pruning options considering the characteristics of the systolic array architecture.
The table below explains these pruning options.

+--------------------+-------------+--------------------+-------------------------------------------------+
| Stage              | Attributes  | Values             | Explanations                                    |
+====================+=============+====================+=================================================+
| array_part         | ``enable``  | ``0``, ``1``       | Turn off/on the pruning at this step.           |
|                    +-------------+--------------------+-------------------------------------------------+
|                    | ``PE_num``  | [``int``, ``int``] | We prune the design space by restraining the    |
|                    |             |                    |                                                 |
|                    |             |                    | range of number of PEs of the design.           |
+--------------------+-------------+--------------------+-------------------------------------------------+
| latency_hiding     | ``enable``  | ``0``, ``1``       | Turn off/on the pruning at this step.           |
|                    +-------------+--------------------+-------------------------------------------------+
|                    | ``reg_size``| [``int``, ``int``] | Latency hiding creates local storage for storing|
|                    |             |                    |                                                 |
|                    |             |                    | the intermediate results. This attribute limits |
|                    |             |                    |                                                 | 
|                    |             |                    | the size of local storage introduced by latency |
|                    |             |                    |                                                 | 
|                    |             |                    | hiding.                                         | 
+--------------------+-------------+--------------------+-------------------------------------------------+
| SIMD_vectorization | ``enable``  | ``0``, ``1``       | Turn off/on the pruning at this step.           |
|                    +-------------+--------------------+-------------------------------------------------+
|                    | ``PE_num``  | [``int``, ``int``] | This attribute restrains the number of PEs.     |
|                    +-------------+--------------------+-------------------------------------------------+
|                    | ``PE_ratio``| ``int``            | This attribute restrains the width/height ratio |
|                    |             |                    |                                                 | 
|                    |             |                    | of the generated design. Default value is -1.   |
+--------------------+-------------+--------------------+-------------------------------------------------+ 

Lastly, for the ``multiprocess``, the field of ``n_job`` specifies how many processes to be used for the 
sampling process. The default value is 1.

Synth
"""""

After generating the sample designs, we will start to synthesize these designs using 
Xilinx HLS for training the resource models.
The fields under the subsection ``synth`` configure how we synthesize the sample designs.
There are two fields for this subsection.

* ``multiprocess``: configures the number of processes used to synthesize the sample designs.
* ``sample``: configures the number of designs selected for synthesizing, default value as 16.

Search
""""""

Under the subsection of ``search``, there are the following fields:

* ``metric``: The default value is ``latency``. It specifies the metric the auto-tuner
  uses to evaluate the optimal design. At present, only ``latency`` is supported. 
  The auto-tuner will select the design with the least latency.
* ``cycle_period``: The default value is ``5``, which stands for 5ns. 
  It specifies the cycle period of the designs for estimating the runtime in seconds.
* ``log``: During the design space exploration, the auto-tuner will keep the top-k designs 
  found during the searching process. This field specifies the number of records to keep.
* ``resource_target``: This is a list containing the types of resources that the auto-tuner 
  will evaluate for each design point. Users may choose among ``BRAM18K``, ``DSP``, ``FF``,
  ``LUT``, and ``URAM``.
* ``time_out``: It specifies the number of minutes after which the DSE process will time out.
  When set to -1, the DSE will not terminate until the whole design space has been explored.
* ``update_time_interval``: The auto-tuner can print out the best search results found so far
  during the DSE process. This field specifies the time period that the auto-tuner updates the 
  searching progress.
* ``multiprocess``: When the multi-processing is enabled, the design space is partitioned and 
  searched by multiple processes. This field specifies the number of processes to be used 
  for searching.
* ``mode``: The searching processes can be executed in three modes: ``random``, ``exhaustive``, and 
  ``customized``. In the exhaustive mode, all the possible tiling factors will be explored during the 
  searching process. In the random mode, for each loop to be tiled, a number of random tiling factors 
  are picked. The number of random tiling factors can be specified in the following ``n_random`` field. 
  The default value will be 2. In the customized mode, the auto-tuner will use the sampling policy
  specified in the ``sample`` field below.
* ``n_random``: It specifies the number of random tiling factors to be picked per loop.
* ``sample``: It specifies the sampling policy during the DSE. The format is similar to the 
  sampling policy used during the training step. Please refer to the training subsection for details.
* ``pruning``: The auto-tuner applies multi-level pruning to speed up the searching process. We will
  cover the details of this field below.

The field of ``pruning`` contains the following attributes.

* ``random_start``: Before we start the search process, we can first perform a quick random search.
  The best design found during this phase will be used as a baseline to prune away worse designs during 
  the later stage. This step can be configured by three attributes:

  * ``enable``: configures to turn on/off this step.
  * ``n_trial``: We could run random search multiple times. This attribute configures the number of times 
    we will run the random search.
  * ``n_random``: configures the number of random tiling factors to be chosen for each loop.

* ``resource``: We can also prune designs based on the resource usage. This attribute restrains the range 
  of resource usage for valid designs.
* The rest of fields are similar to pruning fields under the subsection ``training``.

Run the Auto-Tuner
^^^^^^^^^^^^^^^^^^

After configuring the auto-tuner properly, we may start to use the auto-tuner for DSE.
The first step is to train the resource models, for the matrix multiplication example, run this
command to train the resource models.

.. code:: bash

    python3 ./autosa_scripts/optimizer.py \
    -c './autosa ./autosa_tests/mm/kernel.c --target=autosa_hls_c --simd-info=./autosa_tests/mm/simd_info.json --host-serialize --hls --sa-sizes="{kernel[]->space_time[3]}"' \
    --info autosa_config/hw_info.json \
    -s autosa_config/optimizer_settings.json \
    --training \
    -p xilinx

The table below explains each argument of the command.

+---------------+---------------------------+----------------------------------------------------------------+
| Arguments     | Values                    | Explanations                                                   |
+===============+===========================+================================================================+
| ``-c``        | ``str``                   | This argument is the basic AutoSA compilation command for the  |
|               |                           |                                                                |
|               |                           | target kernel. Please note that the space_time step should be  |
|               |                           |                                                                |
|               |                           | specified explicitly in the current version.                   |
+---------------+---------------------------+----------------------------------------------------------------+
| ``-i``        | ``json``                  | A JSON file that states the resource upper bound for the target|
|               |                           |                                                                |
|               |                           | FPGA board.                                                    |
+---------------+---------------------------+----------------------------------------------------------------+
| ``-s``        | ``json``                  | A JSON file specifying the auto-tuner configuration.           |
+---------------+---------------------------+----------------------------------------------------------------+
| ``-p``        | ``xilinx``                | Configures the target hardware. Currently only Xilinx FPGAs are|
|               |                           |                                                                |
|               |                           | supported.                                                     |
+---------------+---------------------------+----------------------------------------------------------------+
| ``--training``|                           | Execute the auto-tuner in training phase.                      |
+---------------+---------------------------+----------------------------------------------------------------+
| ``--search``  |                           | Execute the auto-tuner in search phase.                        |
+---------------+---------------------------+----------------------------------------------------------------+
| ``--tmp-dir`` | ``str``                   | Configures the directory to store the temporary files during   |
|               |                           |                                                                |
|               |                           | the DSE.                                                       |
+---------------+---------------------------+----------------------------------------------------------------+

After resource models are trained, run the following command to search for the best design.

.. code:: bash

    python3 ./autosa_scripts/optimizer.py \
    -c './autosa ./autosa_tests/mm/kernel.c --target=autosa_hls_c --simd-info=./autosa_tests/mm/simd_info.json --host-serialize --hls --sa-sizes="{kernel[]->space_time[3]}" --tuning-method=0' \
    --info autosa_config/hw_info.json \
    -s autosa_config/optimizer_settings.json \
    --search \
    -p xilinx    


================================================
FILE: docs/tutorials/auto_tuning_genetic.rst
================================================
Auto-Tuning (Genetic Search)
===============================================================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

This page introduces an alternative auto-tuning approach in addition to the exhaustive search.
This approach leverages genetic search and provides a much faster convergence speed
than the exhaustive search. 

Auto-Tuner Overview
-------------------
.. image:: images/odyssey_flow.png
    :width: 500
    :align: center

Our auto-tuner is named Odyssey (abbreviated from AUtomatic DEsign space exploration for SYstolic arrays). The figure above 
depicts the tuning flow.
Odyssey leverages AutoSA to construct the design space automatically. 
AutoSA takes in a C program that describes the target algorithm to map to systolic arrays and generates the systolic array designs in Xilinx HLS C.
We extend the AutoSA framework to generate a design description file that covers the full details of the generated hardware. 
Odyssey uses this file to create hardware performance models as symbolic expressions of the tuning parameters that can be used by the auto-tuner. 
Inside the auto-tuner, Odyssey implements a two-stage flow that starts with a mathematical programming (MP)-based optimizer that leverages 
optimization solvers with a simplified objective function to produce an initial high-quality design, followed by the evolutionary search with 
the accurate performance models.

Auto-Tuning Example
-------------------
To tune a certain design, we will first use AutoSA to generate a description file in JSON
format. For the matrix multiplication example, use the following command.

.. code:: bash

    ./autosa ./autosa_tests/mm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3]}" \
    --simd-info=./autosa_tests/mm/simd_info.json \
    --host-serialize \
    --hls \
    --tuning-method=1 \
    --param-names=./autosa_tests/mm/param_names.json

Note that we will only need to specify the array to be explored using the argument 
``--sa-sizes="{kernel[]->space_time[3]}"``, and we add a new flag ``--tuning-method=1``
to instruct AutoSA to generate the required description file.

You will find a description file ``kernel3.json`` under the directory ``autosa.tmp/output/tuning``.
This file describes all the necessary information about the design used during the auto-tuning, including
the memory and computation information.

Next, we will call the auto-tuner to search the optimal configuration for this design.
Switch to the directory ``autosa_scripts/odyssey``.

.. code:: bash

    cd autosa_scripts/odyssey

Copy the design description file to the tuner directory.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa.tmp/output/tuning/kernel3.json ${AUTOSA_ROOT}/autosa_scripts/odyssey/designs/

Then call the tuner to start the searching.

.. code:: bash

    python main.py --workload=mm --stop-after-time=20 --cst=hw_cst

The flag ``stop-after-time=20`` tells the tuner to stop searching after 20 seconds.
The flag ``cst=hw_cst`` points to the hardware constraints file ``cst/hw_cst.json``.
The flag ``workload=mm`` points to the task configuration file ``workload/mm.json`` which describes the 
matrix dimensions of the problem. For this example, we set ``i=j=k=1024``.

You will find the detailed information of the optimal design found by the auto-tuner 
printed on the screen.

================================================
FILE: docs/tutorials/catapult_backend.rst
================================================
Generating Catapult HLS Design
==============================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

AutoSA can generate systolic arrays in Mentor Graphics HLS C. This page shows an example 
about generating a systolic array design in Mentor Graphics HLS C.

.. note::

    * The current Catapult HLS C back-end only supports two data types ``unsigned short`` and ``unsigned int``.    

Generating the Design
---------------------

`Catapult HLS <https://www.mentor.com/hls-lp/catapult-high-level-synthesis/>`_ is a HLS 
synthesis tool provided by Mentor Graphics which can target both FPGAs and ASICs.
AutoSA can generate the systolic array described in Catapult HLS C.
You may find more details about Catapult HLS at their website (`link <https://www.mentor.com/hls-lp/catapult-high-level-synthesis/>`_).

Generating the Source Code
^^^^^^^^^^^^^^^^^^^^^^^^^^
The example design used in this tutorial can be found at ``${AUTOSA_ROOT}/autosa_tests/mm_catapult``.

To generate the design in Catapult HLS C, use the following command:

.. code:: bash

    ./autosa ./autosa_tests/mm_catapult/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_catapult_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" \
    --simd-info=./autosa_tests/mm/simd_info.json \
    --host-serialize

The generated design files can be found at ``${AUTOSA_ROOT}/autosa.tmp/output/src``.
Note that apart from the C files describing the systolic array, AutoSA also emits one TCL file ``kernel_directives.tcl``.
This file is a template TCL file that covers most of the instructions that will be used when compiling using Catapult HLS.
Users still need to modify it according to their own designs to achieve the best performance.

To generate and optimize a design, programmers can either do it in GUI or use the TCL file.
We will first demonstrate the GUI approach, and show a complete TCL file later.

.. note::

    Unlike Xilinx Vivado HLS or Intel OpenCL SDK, Catapult HLS encourages programmers 
    to use GUI to develop their designs.

Using Catapult in GUI Mode
^^^^^^^^^^^^^^^^^^^^^^^^^^

After setting up your local environment for Catapult HLS properly, launch the software.

.. code:: bash

    catapult &

In the GUI window, open **Flow Manager** and select **SCVerify**, set **USE_CCS_BLOCK** to ``yes``,
as shown in the figure below.

.. image:: images/catapult_0.png
    :align: center

In the GUI window, add the following design files into the project. 

* ``kernel.h``: The original input kernel header file.
* ``kernel_host.cpp``: The host code for testing and verifying the design.
* ``kernel_kernel.h``: The header file for the host code.
* ``kernel_kernel_hw.h``: The design code describing the systolic array kernel.

Click **Input Files** in the **Synthesis Tasks** panel to add the design files in the directory ``${AUTOSA_ROOT}/autosa.tmp/output/src``.

.. image:: images/catapult_1.png
    :align: center

.. image:: images/catapult_2.png
    :align: center    

Next, click **Libraries** in the **Synthesis Tasks** to proceed to the library selection step.
Select the FPGA library properly based on your target device.
Here we select the Xilinx FPGA library and target ``Virtex-uplus`` device family. 

.. image:: images/catapult_3.png
    :align: center  

At this stage, you should be able to verify your design using software simulation.
However, the current code can't be directly used for software simulation due to some limitations of Catapult.
Open the source file ``kernel_kernel_hw.h`` and go to line 28. Note the code:

.. code:: c

    // while () // Please add the fifo check for C sim.

That's it. All the modules use FIFOs to transfer data between each other.
To correctly model the FIFO transactions, Catapult HLS requires us to specify the amount of 
input FIFO transactions so that this function only starts to be executed when all the 
input data are ready. Currently AutoSA is unable to generate this part automatically, 
users need to modify this code manually based on the design.

As an example, for this function ``A_IO_L3_in.run``, we have the input FIFO ``fifo_A_serialize``.
We can locate the read transaction of this FIFO at line 40. 
This transaction is surrounded by loops:

.. code:: c

    for (ac_int<3, false> c0 = 0; c0 <= 3; c0 += 1)
      for (ac_int<3, false> c1 = 0; c1 <= 3; c1 += 1)
        for (ac_int<3, false> c2 = 0; c2 <= 3; c2 += 1)
          for (ac_int<2, false> c3 = 0; c3 <= 1; c3 += 1)
            for (ac_int<4, false> c4 = 0; c4 <= 7; c4 += 1)
              for (ac_int<2, false> c5 = 0; c5 <= 1; c5 += 1)

We could calculate the number of read transactions as :math:`4\times 4\times 4\times 2\times 8\times 2 = 2048`.

Now, replace the line 28 from

.. code:: c

    // while () // Please add the fifo check for C sim.

to 

.. code:: c

    while (fifo_A_serialize.available(2048))

We will have to modify all the functions with FIFO read transactions in the source code.

Another issue to mention is that the current coding style of AutoSA may lead to scheduling failure in the later 
stages in Catapult HLS. To be more specific, the following coding style generated by AutoSA by default 
is not friendly to Catapult.

.. code:: c

    for (int c0 = 0; ...)
      if (c0 == p0) {
        for (int c1 = 0; ...) {
          // logic 1
          ...
        }
      } else {
        for (int c1 = 0; ...) {
          // logic 2
          ...
        }
      }

In the code above, ``if`` branch contains sub loops to be computed. 
Such coding style could lead to scheduling failure with long feedback paths.
You might see the error message below when synthesizing this design in Catapult HLS in the later steps.

.. code:: bash

    Feedback path is too long to schedule design with current pipeline and clock constraints.

To get around this problem, we need to modify the code to lower the ``if`` branch inside the sub loops.

.. code:: c

    for (int c0 = 0; ...)
      for (int c1 = 0; ...)
        if (c0 == p0) {
          // logic 1
          ...
        } else {
          // logic 2
          ...
        }

We have provided a modified example at ``${AUTOSA_ROOT}/autosa_tests/mm_catapult/kernel_kernel_hw.h``    
This file has solved above two issues including adding the FIFO guards and modifying the ``if`` branch.
We will work to automate this process in the future.

To save time, add this file into the project to replace the original one.

To perform software emulation, expand the folder of **Verification** in the **Project Files** panel and 
click **Original Design + Testbench**. Catapult HLS will compile and execute the design.
You should be able to see the message ``Passed`` in the **Message** panel if everything goes normally.

.. image:: images/catapult_4.png
    :align: center 

In the next step, click **Mapping** in the **Synthesis Tasks** panel.
This step asks you to specify the frequency target of the design.
Let's set it to 250MHz for now.

.. image:: images/catapult_5.png
    :align: center 

Click the **Apply** in the frequency setting panel to proceed.
Then click **Architecture** in the **Synthesis Tasks** panel.
Catapult HLS will infer the hierarchy of the design.
You will see a list of warning messages in the **Constraint Editor**. 
Let's fix them now.

.. image:: images/catapult_6.png
    :align: center 

These warning messages are of the same type. For example, the first warning message reads:

.. code:: text

    Resource '/kernel0/B_IO_L2_in/idx:rsc' with variable connected to multiple sub-blocks not mapped to '[DirectInput]'

Select the module ``B_IO_L2_in_inst_0`` in the **Instance Hierarchy**, expand the **Interface** folder in 
the **Module** panel. Select the interface ``idx:rsc`` and set the **Resource Type** on the right to 
``[DirectInput]``. Then click the **Apply** to apply the changes.

.. image:: images/catapult_7.png
    :align: center 

Specifically, for all the modules generated by AutoSA, we may generate an index argument if there are 
multiple instances of this module to help distinguish between each other.
Catapult HLS requires us to map such scalar arguments to ``[DirectInput]`` explicitly.

You will need to apply these modifications one by one until all the warning messages disappear to be 
able to proceed to the next step. Here is a list of modules that need modifications:

* ``A_IO_L2_in_inst_0``
* ``A_IO_L2_in_boundary_inst_1``
* ``B_IO_L2_in_inst_0``
* ``B_IO_L2_in_boundary_inst_1``
* ``PE_inst_0_0``, ``PE_inst_0_1``, ``PE_inst_1_0``, ``PE_inst_1_1``
* ``C_drain_IO_L1_out_inst_0_0``, ``C_drain_IO_L1_out_inst_1_0``
* ``C_drain_IO_L1_out_boundary_inst_0_1``, ``C_drain_IO_L1_out_boundary_inst_1_1``
* ``C_drain_IO_L2_out_inst_0``
* ``C_drain_IO_L2_out_boundary_inst_1``

There is another type of resources we need to specify explicitly, the local buffers.
I/O modules generated by AutoSA might contain local buffers.
For example, click the module ``A_IO_L2_in_inst_0`` and expand the **Interconnect** folder 
in the **Module** panel, you will find the local buffer named ``A_IO_L2_in_local_A_inst:cns``.
We will need to assign it to FPGA BRAM explicitly. Select the **Resource Type** and select 
``Xilinx_RAMS.BLOCK_1R1W_RBW`` to map it to a dual-port BRAM. By default, Catapult HLS
will assign the property **Stage Replication** to 2, which means that the buffer will be duplicated to generate 
the double buffer logic. Please refer to the Catapult HLS document for more details about these configurations.
If you want to disable the automatic double buffer inferring, modify the **Stage Replication** to 1.

As for our design, we will need to modify the local buffers inside the following modules with **Stage Replication** as 2.

* ``A_IO_L2_in_inst_0``
* ``A_IO_L2_in_boundary_inst_1``
* ``B_IO_L2_in_inst_0``
* ``B_IO_L2_in_boundary_inst_1``

And the following modules with **Stage Replication** as 1.

* ``C_drain_IO_L1_out_inst_0_0``, ``C_drain_IO_L1_out_inst_1_0``
* ``C_drain_IO_L1_out_boundary_inst_0_1``, ``C_drain_IO_L1_out_boundary_inst_1_1``

Click the **RTL** in **Synthesis Tasks** to proceed.

Catapult HLS will schedule the design and generate RTL. 
However, the scheduler of Catapult HLS is limited and you might encounter the following scheduling failure.

.. code:: bash

    Feedback path is too long to schedule design with current pipeline and clock constraints.
    Schedule failed, sequential delay violated. List of sequential operations and dependencies:
      MEMORYREAD "for#1:for:for:for:for#2:read_mem(local_C:rsc.@)" kernel_kernel_hw.h(564,41,15)
      MEMORYWRITE "for#1:for:for:for:for#2:write_mem(local_C:rsc.@)" kernel_kernel_hw.h(564,22,15)
    Feedback path is too long to schedule design with current pipeline and clock constraints.      

Catapult fails to successfully schedule certain loops in the design. 
Now let's take a look at this loop.

.. code-block:: c
    :linenos:

    class PE {
      ...
      for (ac_int<3, false> c2 = 0; c2 <= 3; c2 += 1)
        for (ac_int<4, false> c5 = 0; c5 <= 7; c5 += 1)
          for (ac_int<4, false> c6 = 0; c6 <= 7; c6 += 1)
            for (ac_int<4, false> c7 = 0; c7 <= 7; c7 += 1) {
              ...
              #pragma unroll yes
              for (ac_int<2, false> c8 = 0; c8 <= 1; c8 += 1) 
                local_C[c7][c6] = (local_C[c7][c6] + (local_A[0][c8] * local_B[0][c8]));
            }
      ...
    }

This loop is inside the PE function to update the local variable ``local_C[c7][c6]``.
However, Catapult HLS fails to pipeline the loop and complains about the dependence between the 
write access of ``local_C[c7][c6]`` at line 10 and the read access of ``local_C[c7][c6]`` at 
the same line.
However, if we take a closer look at this loop, as we have performed latency hiding by tiling and permuting 
two parallel loops ``c6`` and ``c7`` inside, and as the loop ``c8`` is unrolled, 
there shouldn't be any dependence here and the loop should be fully pipelined, as observed when 
using Xilinx HLS. 

However, since the scheduling algorithms of Catapult HLS are more conservative compared to Xilinx HLS, to 
achieve full pipelining, we will have to mark this dependence false explicitly in Catapult HLS.
To do this, we have to modify the TCL script when compiling the design.

Catapult HLS already generated a TCL file containing all the instructions we have applied 
in the previous steps in ``${CATAPULT_PRJ}/kernel0.v1/directives.tcl``. Open the file and edit the last lines from 

.. code:: tcl

    go architect
    go allocate

to 

.. code:: tcl

    go architect
    directive set /kernel0/PE/run/for#1:for:for:for:for#2:read_mem(local_C:rsc.@) -IGNORE_DEPENDENCY_FROM {for#1:for:for:for:for#2:write_mem(local_C:rsc.@)}
    go allocate

Note that we add a directive to let Catapult ignore this dependence.
Now let's use this TCL script to recompile the design.

First, move out this TCL script

.. code:: bash

  mv ${CATAPULT_PRJ}/kernel0.v1/directives.tcl ${CATAPULT_PRJ}/

Then in the Catapult GUI, click **File** -> **Run Script**. And select the ``directives.tcl``.
Catapult HLS will recompile the design using this TCL file.

You should see the design successfully scheduled without any errors.
Now click **RTL** in the **Task Bar** panel to generate the final RTL.

One more optional step is using Catapult HLS to perform RTL simulation. This requires proper 
simulation tools installed on your workstation. Please refer to Catapult manuals for 
supported simulators. Here we use the Mentor QuestaSim. To perform RTL simulation, 
click **Verification** -> **QuestaSIM** -> **Concat RTL Verilog output 'concat_sim_rtl.v' vs Untimed C++**.

Catapult HLS will launch QuestaSIM simulator as shown in the figure below.

.. image:: images/catapult_sim.png
    :align: center   

Type in ``run -all`` to start the simulation, as shown in the figure below.

.. image:: images/catapult_sim2.png
    :align: center   

Phew! Up to now we have finished the complete flow in GUI. 
Just a few things to keep in mind when using the Catapult flow:

* Specify the FIFO guards for C simulation.
* Modify the ``if`` coding style for better scheduling.
* Explicitly specify the false dependence for better scheduling.

Using Catapult in TCL Mode
^^^^^^^^^^^^^^^^^^^^^^^^^^

All the steps we have presented in the previous subsection can be executed through a TCL script.
A complete TCL file for this flow can be found at ``${AUTOSA_ROOT}/autosa_tests/mm_catapult/directives.tcl``.

Note that we have generated a template TCL file in the source directory 
``${AUTOSA_ROOT}/autosa.tmp/output/src/kernel_directives.tcl``. 
It covers most of the boilerplate code. However, you will still need to modify some parts of the file, such as 
the source code path, and insert the dependence assertion to successfully schedule the design.

To use TCL file for compilation, open the Catapult GUI,
click **File** -> **Run Script**, and select the TCL file.
Catapult HLS will compile the design and generate RTL.

================================================
FILE: docs/tutorials/getting_started.rst
================================================
Getting Started
===============

**Author**: Jie Wang (jiewang@cs.ucla.edu)

In this tutorial, we will give an overview of the compilation process of AutoSA 
and demonstrate it with an example.

The Compilation Flow of AutoSA
------------------------------

The figure below shows the overall compilation flow of AutoSA.

.. image:: images/flow.png
    :align: center

The input code of AutoSA is a C code that describes the algorithm to be mapped to
the systolic array. AutoSA is built on the polyhedral framework, which takes SCoP (static control parts) 
programs as the input. In addition, AutoSA assumes that all the dependences of the input
programs have been rendered uniform before the compilation.

The example code below describes the matrix multiplication and serves as the input to AutoSA.

.. code:: c

    #pragma scop
    for (int i = 0; i < I; i++)        
      for (int j = 0; j < J; j++)   {
        C[i][j] = 0;
        for (int k = 0; k < K; k++)
          C[i][j] += A[i][k] * B[k][j];
      }
    #pragma endscop

Note that we insert the pragma

.. code:: c

    #pragma scop

before the code fragment and insert the pragma

.. code:: c

    #pragma endscop

after the code fragment to annotate the code region to be analyzed and transformed by the compiler.    

In the next step, a polyhedral representation of the input code is extracted. AutoSA 
uses `integer set library (ISL) <http://isl.gforge.inria.fr/>`_ for manipulating the polyhedral IR.
After extracting the polyhedral IR, AutoSA will perform an initial transformation of the program using the 
ISL scheduler. The ISL scheduler aims to transform the program to maximize the locality and parallelism.
The program transformed by ISL will be the input to the remaining steps of AutoSA.
For more details about the ISL scheduler, please refer to the ISL manual. Readers are also 
recommended to read this paper [PLUTO08]_ for more details about the scheduling algorithm used by ISL.

The next stage, named as *legality check*, checks if the input program can legally be
mapped to a systolic array. At that stage, we simply check if all dependences are uniform.

A complete systolic array architecture consists of both the PE array and the on-chip I/O network. 
AutoSA separates the process of building these two components into two stages: 
*computation and communication management*. 
The stage of computation management constructs the PE and optimizes its micro-architecture. 
After that, the stage of communication management builds the I/O network for transferring data between PEs and the external memory. 

After the previous stages, AutoSA generates the AST from the optimized program. 
The AST is then traversed to generate the final design for the target hardware.
At present, AutoSA can generate Xilinx HLS C, Intel OpenCL, and Mentor Graphics Catapult C.

The stages of computation and communication management involve multiple optimization techniques, 
each introducing several tuning options. 
AutoSA implements tunable knobs for these techniques which can be set by users manually or tuned by an auto-tuner.

An Example
----------

The example code above can be found at ``${AUTOSA_ROOT}/autosa_tests/mm_getting_started/kernel.c``.

Generating Hardware Code
^^^^^^^^^^^^^^^^^^^^^^^^

To compile the code to Xilinx HLS C for Xilinx Vitis toolkit, run the code below.

.. code:: bash

    ./autosa ./autosa_tests/mm_getting_started/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" --simd-info=./autosa_tests/mm/simd_info.json --host-serialize

The generated code can be found in the directory ``${AUTOSA_ROOT}/autosa.tmp/output/src/``.
For detailed information of AutoSA compilation options, please run

.. code:: bash

    ./autosa --help

or refer to `AutoSA Compilation Options`_.

Generating FPGA Bitstream
^^^^^^^^^^^^^^^^^^^^^^^^^

Set up the Xilinx Vitis development kit. Run the following commands.

.. code:: bash

    source /opt/Xilinx/Vitis/2019.2/settings64.sh
    source /opt/xilinx/xrt/setup.sh

Execute the makefile to build the design.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/mm_getting_started/Makefile autosa.tmp/output/
    cp ${AUTOSA_ROOT}/autosa_tests/mm_getting_started/connectivity.cfg autosa.tmp/output/
    cd ${AUTOSA_ROOT}/autosa.tmp/output
    make all

.. admonition:: Makefile Options

    * ``MODE := hw_emu``: Set the build configuration mode to HW Emulation, other modes: ``sw_emu``|``hw``
    * ``PLATFORM := xilinx_u250_xdma_201830_2``: Select the target platform
    * ``KERNEL_SRC := src/kernel_kernel.cpp``: List the kernel source files
    * ``HOST_SRC := src/kernel_host.cpp``: List the host source files

The ``connectivity.cfg`` describes the DRAM port mapping. 
For more details about how to change the DRAM port mapping, 
please refer to the Xilinx tutorials: `Using Multiple DDR Banks <https://xilinx.github.io/Vitis-Tutorials/2020-1/docs/bloom/6_using-multiple-ddr.html>`_.

Generating Xilinx HLS project
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

AutoSA also supports generating HLS projects. Add the option

.. code:: bash

    --hls

to the command when compiling the program.

AutoSA will generate an HLS host file ``${AUTOSA_ROOT}/autosa.tmp/output/src/kernel_host.cpp``
instead of the OpenCL host file generated in the previous step. 
To build the HLS project, use the following commands.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_scripts/hls_scripts/hls_script.tcl ${AUTOSA_ROOT}/autosa.tmp/output/
    cd ${AUTOSA_ROOT}/autosa.tmp/output
    vivado_hls -f hls_script.tcl

Using AutoSA in Manual Mode
---------------------------

As mentioned previously, AutoSA can be used in both *manual* and *auto* mode. 
In the auto mode, AutoSA will proceed based on the pre-set policy. In the manual mode,
AutoSA will dump out the optimization choices to users. Users will then provide AutoSA with specific optimization policy, which 
will be applied by AutoSA. 

The tunable knobs of the compilation flow are included in the configuration file
``${AUTOSA_ROOT}/autosa_config/autosa_config.json``. Currently, the following optimization 
stages can be configured in AutoSA.

* **space_time**: 
  This step applies the space-time transformation to transform algorithms to systolic arrays. 
  By default, for each algorithm, multiple systolic arrays will be generated. In the auto mode,
  AutoSA will select one array based on the heuristics. In the manual mode, users will select the 
  array to be processed in the following steps.
* **array_part**: 
  This step partitions the array into smaller sub-arrays. In the auto mode, all tilable loops 
  that can be used as array partitioning loops will be tiled with a fixed factor. In the manual mode,
  users can select loops to be tiled and provide the compiler with specific tiling factors.
* **array_part_L2**:
  AutoSA allows generating up to two levels of array partitioning loops. This is helpful to architectures
  with many levels of memory hierarchy. Similarly, in the auto mode, AutoSA decides which loops to be further tiled and 
  selects a fixed tiling factor. Users can make such choices in the manual mode.
* **latency**:
  This step performs the latency hiding in case the innermost loop in the program carries
  dependence which prevents the design from being fully pipelined. Parallel loops in the program can be 
  used as the latency hiding candidate loops. In the auto mode, all parallel loops will be tiled and 
  the point loops will be permuted innermost. In the manual mode, users will have to specify which loops 
  to be chosen and the corresponding tiling factors.
* **simd**:
  This step vectorizes the computation inside PEs. In the auto mode, AutoSA analyzes the program
  and selects the best vectorizable loop with heuristics. In the manual mode, users will select the 
  vectorizable loop.
* **hbm**:
  AutoSA also supports HBM memory. The systolic array will be connected to multiple HBM ports.
  In the auto mode, AutoSA allocates each array to a fixed number of HBM banks. 
  In the manual mode, users select the number of HBM banks to be connected to each array.

.. note:: 

    For more details about the optimization steps in AutoSA, please refer to the tutorial :ref:`construct-and-optimize-array-label`.

To switch between two different modes, modify the modes in ``${AUTOSA_ROOT}/autosa_config/autosa_config.json``.
For example, modify the content in ``autosa_config.json`` to

.. code:: json

    "array_part": {
        "enable": 1,
        "mode": "auto"
    }

to enable the array partitioning to execute in the auto mode. Modify it to 

.. code:: json

    "array_part": {
        "enable": 1,
        "mode": "manual"
    }

to run it in the manual mode.

Below we show how to use AutoSA in manual mode in detail.

Space-Time Transformation
^^^^^^^^^^^^^^^^^^^^^^^^^

In this step, multiple systolic arrays are generated from the input program. We will 
need to select one systolic array to proceed. We set this step to manual mode in the 
configuration file.

.. code:: json

    "space_time": {
        "mode": "manual"
    }

Then run the command.

.. code:: bash

    ./autosa ./autosa_tests/mm_getting_started/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output

In the terminal, AutoSA displays a message.

.. code:: bash

    [AutoSA] 6 systolic arrays generated.

AutoSA also generates a file ``${AUTOSA_ROOT}/autosa.tmp/output/tuning.json``,
which includes guidance information for further optimization. In this example,
we have the content below.

.. code:: json

    "space_time": {
        "n_kernel": 6
    }

This tells the user that there are 6 different systolic array candidates generated. 
We may select one of them to proceed. 
For example, we could select the fourth candidate which is a 2D systolic array 
with the data from matrix A transferred horizontally, and data from matrix B 
transferred vertically. Each PE computes one element of ``C[i][j]`` locally, 
which is drained out at last to the external memory. 
The architecture of this array is depicted below.

.. image:: images/mm_array_opt.png
    :width: 300
    :align: center

To guide AutoSA to select this design, supply AutoSA with an additional argument.

.. code:: bash

    --sa-sizes="{kernel[]->space_time[3]}"

which tells AutoSA to select the fourth array (index starting from 0) during the space-time transformation.

Array Partitioning
^^^^^^^^^^^^^^^^^^

In this step, we will tile the space loops to partition the original array into smaller ones. The computation is then scheduled onto the sub-arrays in sequence. 
We first set this step in manual mode. Then run the command:

.. code:: bash

    ./autosa ./autosa_tests/mm_getting_started/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3]}"

AutoSA displays new information on the terminal.

.. code:: bash

    [AutoSA] Appy PE optimization.
    [AutoSA] Apply array partitioning.

The ``tuning.json`` contains the content below:

.. code:: json

    "array_part": {
        "tilable_loops": [64, 64, 64],
        "n_sa_dim": 2
    }

This tells users there are three candidate loops that can be tiled. 
The upper bound of each loop is 64. We may select any tiling factor no greater than 64. 
Besides, AutoSA only supports tiling factors as sub-multiples of the loop bounds for now. 
If the user is interested to understand which three loops are selected as the candidate loops, 
add the option ``--AutoSA-verbose`` to the command and run again.

.. code:: bash

    ./autosa ./autosa_tests/mm_getting_started/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3]}" --AutoSA-verbose

Below is the printed message from AutoSA.

.. code:: text

    domain: "{ S_0[i, j] : 0 <= i <= 63 and 0 <= j <= 63; S_1[i, j, k] : 0 <= i <= 63 and 0 <= j <= 63 and 0 <= k <= 63 }"
    child:
        context: "{ [] }"        
        child:
            schedule: "[{ S_0[i, j] -> [(i)]; S_1[i, j, k] -> [(i)] }, { S_0[i, j] -> [(j)]; S_1[i, j, k] -> [(j)] }, { S_0[i, j] -> [(0)]; S_1[i, j, k] -> [(k)] }]"
            permutable: 1
            coincident: [ 1, 1, 0 ]
            space_time: [ space, space, time ]
            pe_opt: [ array_part, array_part, array_part ]
            sched_pos: [ 0, 1, 2 ]       
            child:
                sequence:
                - filter: "{ S_0[i, j] }"
                - filter: "{ S_1[i, j, k] }"    

This is the schedule tree of the current program. More details about the schedule tree can be found
in the paper [SCHEDTREE14]_.
The first *domain* node represents the iteration domain of the input program.
The "band" node contains the partial schedule of the loops. 
In the current program, there are three loops :math:`i`, :math:`j`, and :math:`k`.
AutoSA provides verbose loop information. For example, the attribute of coincident indicates 
if the loop is parallel. The pe_opt attribute annotates the candidate loops that can be 
used for array partitioning. In this case, all three loops are tilable and can be used for 
array partitioning.

As an example, we select the tiling factors ``[16,16,16]``. Run the command below.

.. code:: bash

    ./autosa ./autosa_tests/mm_getting_started/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16]}"

Latency Hiding
^^^^^^^^^^^^^^

This step performs latency hiding. We will select parallel loops, tile them, and permute the point 
loops innermost to hide the computation latency. 
After the previous step, we will find the content below in the ``tuning.json``.

.. code:: json

    "latency": {
        "tilable_loops": [16,16]
    }

Similarly, you may add the argument ``--AutoSA-verbose`` to find out which loops have 
been selected as the latency hiding candidate loops.

We select the tiling factors ``[8,8]`` to proceed. Run the command below.

.. code:: bash

    ./autosa ./autosa_tests/mm_getting_started/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8]}"

SIMD Vectorization    
^^^^^^^^^^^^^^^^^^

In this step, we select the vectorizable loop, tile it, and permute the point loop innermost.
The point loop will be unrolled by HLS at last. At present, a loop is set as the candidate loop if 
meeting the following criteria:

* It is a parallel loop or reduction loop that is annotated by users.
* All array references within the loop are stride-one or stride-zero with regard to this loop.
  
.. note::
    
    For the reduction loops, AutoSA requires users to annotate the loop manually. This 
    is done by providing a ``simd_info.json`` file to the compiler. 
    For our example, we can provide a ``simd_info.json`` file with the content below.
    
    .. code:: json

        "kernel3": {
            "reduction": ["y"]
        }

    The ``kernel[index]`` indicates the current array to be analyzed. As mentioned in the step of 
    space-time transformation, we select the array with index 3 (the fourth array) to proceed.
    The ``reduction`` attribute indicates if the candidate loop is a reduction loop.
    When running the last command
    
    .. code:: bash

        ./autosa ./autosa_tests/mm_getting_started/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8]}"

    AutoSA will check all the non-parallel loops and prompt messages to ask if the loop is a 
    reduction loop. Alternatively, users can prepare the information in ``simd_info.json`` following the loop sequence 
    as shown in the prompted compilation message.
    
In this example, loops :math:`i` and :math:`j` have been selected as the space loops. Only the loop :math:`k` is left
which is a non-parallel loop. Therefore, we provide the attribute ``"reduction": ["y"]`` to the compiler
as the loop :math:`k` is a reduction loop.

With this information, AutoSA further checks if all array accesses under the loop :math:`k` are 
stride-one or stride-zero. Note that among three array accesses ``C[i][j]``, ``A[i][k]``, and ``B[k][j]``,
access ``C[i][j]`` is stride-zero in regard to loop :math:`k`, and ``A[i][k]`` is stride-one.
However, ``B[k][j]`` is neither stride-one nor stride-zero. 
A layout transformation is required to make this array 
access to stride-one/zero.
AutoSA will examine the possibility of performing layout transformation to expose more
vectorization possibility. In this case, the following information will be printed in the terminal.

.. code:: bash

    [AutoSA] Array reference (R): { S_1[i, j, k] -> B[k, j] }
    [AutoSA] Layout transform: Permute dim (0) to the innermost

This indicates that AutoSA suggests permuting the first dimension of the array B to innermost to make the loop vectorizable.

.. note:: 

    In the example code, simply uncomment the line below to apply the layout transformation.

    .. code:: c

        #define LAYOUT_TRANSFORM

After modifying the input code with this layout transformation, run the following command.

.. code:: bash

    ./autosa ./autosa_tests/mm_getting_started/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8]}" --simd-info=./autosa_tests/mm_getting_started/simd_info.json

And we can find the updated ``tuning.json``.

.. code:: json

    "simd": {
        "tilable_loops": [16],
        "scores": [15],
        "legal": [1],
        "sa_dims": [2, 2]
    }

This indicates that the candidate loop has the upper bound of 16. 
We assign a score based on heuristics to each candidate loop. 
The higher the score is, the more hardware-friendly it is when being selected as the SIMD loop. 
The item legal indicates that this loop can be directly used for optimization. 
Otherwise, we will need to perform further layout transformation on the arrays used by the program to expose the SIMD opportunity. 
Since we have already applied the layout transformation, this attribute is set to 1.

We select the tiling factor ``[2]`` and proceed. Run the command below.

.. code:: bash

    ./autosa ./autosa_tests/mm_getting_started/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" --simd-info=./autosa_tests/mm_getting_started/simd_info.json

After this step, you should be able to find the files of the generated arrays in ``${AUTOSA_ROOT}/autosa.tmp/output/src``.

AutoSA Compilation Options
--------------------------

* ``--autosa-autosa, --autosa``: generate systolic arrays using AutoSA [default: yes]
* ``--autosa-block-sparse, --block-sparse``: use block sparsity [default: no]
* ``--autosa-block-sparse-ratio, --block-sparse-ratio``: block sparsity ratio (e.g., kernel[]->A[2,4])
* ``--autosa-config, --config``: AutoSA configuration file
* ``--autosa-data-pack, --data-pack``: enable data packing [default: yes]
* ``--autosa-data-pack-sizes, --data-pack-sizes``: data pack sizes upper bounds (bytes) at 
  innermost, intermediate, outermost I/O level [default: kernel[]->data_pack[8,32,64]]
* ``--autosa-double-buffer, --double-buffer``: enable double-buffering for data transfer [default: yes]
* ``--autosa-double-buffer-style, --double-buffer-style``: change double-buffering logic coding style
  (0: while loop 1: for loop) [default: 1]
* ``--autosa-fifo-depth, --fifo-depth``: default FIFO depth [default: 2]
* ``--autosa-hbm, --hbm``: use multi-port DRAM/HBM [default: no]
* ``--autosa-hbm-port-num, --hbm-port-num``: default HBM port number per array [default: 2]
* ``--autosa-hls, --hls``: generate Xilinx HLS host [default: no]
* ``--autosa-host-serialize, --host-serialize``: serialize/deserialize the host data [default: no]
* ``--autosa-insert-hls-dependence, --insert-hls-dependence``: insert Xilinx HLS dependence pragma (alpha version) [default: no]
* ``--autosa-int-io-dir, --int-io-dir``: set the default interior I/O direction (0: [1,x] 1: [x,1]) [default: 0]
* ``--autosa-io-module-embedding, --io-module-embedding``: embed the I/O modules inside PEs if possible [default: no]
* ``--autosa-loop-infinitize, --loop-infinitize``: apply loop infinitization optimization (Intel OpenCL only) [default: no]
* ``--autosa-local-reduce, --local-reduce``: generate non-output-stationary array with local reduction [default: no]
* ``--autosa-reduce-op, --reduce-op``: reduction operator (must be used with local-reduce together)
* ``--autosa-lower-int-io-L1-buffer, --lower-int-io-L1-buffer``: lower the L1 buffer for interior I/O modules [default: no]
* ``--autosa-max-sa-dim, --max-sa-dim``: maximal systolic array dimension [default: 2]
* ``--autosa-output-dir, --output-dir``: AutoSA Output directory [default: ./autosa.tmp/output]
* ``--autosa-sa-sizes, --sa-sizes``: per kernel PE optimization tile sizes
* ``--autosa-sa-type=sync|async, --sa-type=sync|async``: systolic array type [default: async]
* ``--autosa-simd-info, --simd-info``: per kernel SIMD information
* ``--autosa-simd-touch-space, --simd-touch-space``: use space loops as SIMD vectorization loops [default: no]
* ``--autosa-two-level-buffer, --two-level-buffer``: enable two-level buffering in I/O modules [default: no]
* ``--autosa-uram, --uram``: use Xilinx FPGA URAM [default: no]
* ``--autosa-use-cplusplus-template, --use-cplusplus-template``: use C++ template in codegen (necessary for irregular PEs) [default: no]
* ``--autosa-verbose, --verbose``: print verbose compilation information [default: no]
* ``--autosa-hcl, --hcl``: generate code for integrating with HeteroCL [default: yes]

Bibliography
------------

.. [PLUTO08] Bondhugula, Uday, et al. "A practical automatic polyhedral parallelizer and locality optimizer." Proceedings of the 29th ACM SIGPLAN Conference on Programming Language Design and Implementation. 2008.
.. [SCHEDTREE14] Verdoolaege, Sven, et al. "Schedule trees." International Workshop on Polyhedral Compilation Techniques, Date: 2014/01/20-2014/01/20, Location: Vienna, Austria. 2014.

================================================
FILE: docs/tutorials/hcl_integrate.rst
================================================
HeteroCL Integration
====================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

This page summarizes some issues when integrating AutoSA with HeteroCL.

Issue 1: Generating HCL-compatible outputs
------------------------------------------

To generate HCL-compatible code, we will need to add the flags ``--hcl --hls`` when compiling the program.
Below is the example command:

.. code:: bash

    ./autosa ./autosa_tests/mm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" \
    --simd-info=./autosa_tests/mm/simd_info.json \
    --host-serialize \
    --hcl \
    --hls

Issue 2: Generating kernels with AXI Stream interface
-----------------------------------------------------

To generate the AXI Stream interface, we will need to enable host serialization and generate
the HLS host by adding the flags ``--axi-stream --hls --host-serialize``.
Below is the example command:

.. code:: bash

    ./autosa ./autosa_tests/mm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" \
    --simd-info=./autosa_tests/mm/simd_info.json \
    --host-serialize \
    --hcl \
    --axi-stream \
    --hls

Issue 3: Hanging kernels (pending)
----------------------------------

The 8x8 GEMM kernel without host serialization will hang on-board.
The kernel with host serialization can pass the on-board testing.
We are still debugging this issue.
The command for this design:

.. code:: bash

    ./autosa ./autosa_tests/large/mm/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[256,256,512];kernel[]->latency[32,32];kernel[]->simd[8]}" \
    --simd-info=./autosa_tests/large/mm/simd_info.json \
    --hcl \
    --hls    

================================================
FILE: docs/tutorials/host_serialize.rst
================================================
Understanding Host Serialization
================================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

AutoSA supports serializing the data on the host side to increase the memory burst length.
This technique is important in achieving high effective DRAM bandwidth. 
This page explains the mechanisms of host serialization.

How It Works
------------

Host serialization is enabled by supplying AutoSA with the flag ``--host-serialize``.
The figure below explains the current mechanisms of serialization.

.. image:: images/serialize_example.png
    :align: center

The upper part of the figure shows a piece of code that accesses a tiled matrix block by block.
Inside each block, data are loaded sequentially in row major.
We pipeline the innermost loop. The array ``A`` is stored in DRAM.

When synthesizing such a code in Xilinx HLS, HLS will automatically infer a burst length of :math:`4\times 4` for the DRAM 
access based on the inner loops.
However, this burst length is rather small to make use of the DRAM bandwidth.

The figure below from the paper [CHOI16]_ shows the profiled effective DRAM bandwidth versus burst length on Xilinx FPGAs.

.. image:: images/dram_bw.png
    :width: 500
    :align: center

As can be seen in the figure above, a minimal burst length of 128KB is required to reach the maximal effective bandwidth 
on Xilinx devices. The low burst length in the current design will lead to a rather 
low DRAM effective bandwidth that will eventually limit the performance.

This phenomenon makes it critical to perform data serialization.
The code in the middle shows the current method of data serialization implemented in AutoSA.
Simply, we will allocate a new array to hold the serialized data. The new array is filled 
based on the original data access pattern with an increasing counter.

This leads to a new matrix as shown in the bottom part of the figure. Now we can simply 
replace the original code that accesses DRAM with this new code.
HLS will then infer the burst length of :math:`2\times 2\times 4\times 4`, which is the maximal burst length 
we can achieve for this design.

As for the systolic array design, after supplying AutoSA with the flag ``--host-serialize``, 
you will notice a separate serialization module (S) created between the original outermost I/O module and the DRAM.
The figure below compares the systolic array architecture w/o and w/ data serialization.

.. image:: images/array_serialize.png
    :align: center

We plug in the serialized data access logic into these serialization modules to achieve the maximal burst length.

Pitfalls
--------

The current serialization approach is a temporary solution, as it will create 
redundant data in the serialized matrix which bloats the size of this matrix.
The figure below shows one of such examples.

.. image:: images/serialize_example2.png
    :align: center

In this example, when accessing the matrix, we introduce one additional level of loop ``r1`` to 
visit each tile twice before moving to the next tile.
In such a case, using the current method, we will generate a serialized matrix which is 
two times larger than the original matrix. Things will become worse if such reuse happens more often.
Please keep this shortcoming of serialization in mind when using it in AutoSA.
We will improve it in the future.

Bibliography
------------

.. [CHOI16] Choi, Young-kyu, et al. "A quantitative analysis on microarchitectures of modern CPU-FPGA platforms." Proceedings of the 53rd Annual Design Automation Conference. 2016.

================================================
FILE: docs/tutorials/index.rst
================================================
AutoSA Tutorials
================

This page contains a series of tutorials to get you familiar with the systolic array 
architectures and the compilation process of AutoSA.

.. toctree::
    :maxdepth: 1

    theory_background
    optimize_array
    getting_started    
    matrix_multiplication
    auto_tuning_exhaustive
    auto_tuning_genetic
    auto_bridge
    structural_sparsity    
    intel_backend
    catapult_backend
    host_serialize
    hcl_integrate

================================================
FILE: docs/tutorials/intel_backend.rst
================================================
Generating Intel OpenCL Design
==============================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

AutoSA can generate systolic arrays in Intel OpenCL. This page shows an example 
about generating a systolic array design for Intel FPGAs. 

.. note:: 

    The Intel OpenCL back-end is not performant currently due to the channel overheads
    and may halt on-board for certain test cases.
    This back-end is provided only for demo purpose. 
    Please consider Xilinx or Catapult back-end for stable use.

Generating the Design
---------------------

The design example used by this tutorial is at ``${AUTOSA_ROOT}/autosa_tests/mm_intel``.
Run the following command to generate the systolic array.

.. code:: bash

    ./autosa ./autosa_tests/mm_intel/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_opencl \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->array_part_L2[2,2,2];kernel[]->latency[8,8];kernel[]->simd[2]}" \
    --simd-info=./autosa_tests/mm_intel/simd_info.json \
    --host-serialize \
    --loop-infinitize \
    --double-buffer-style=0 \
    --mem-port-map="{kernel[]->A[0];kernel[]->B[1];kernel[]->C[2]}"

After compilation, you will find the generated designs under the directory
``${AUTOSA_ROOT}/autosa.tmp/output/src``.

We also provide an example Makefile for testing the design.
Copy it to the design directory.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/mm_intel/Makefile ${AUTOSA_ROOT}/autosa.tmp/output/

You may modify the Makefile based on your target FPGA board or use your own Makefile.
In the example Makefile, we target the Intel Stratix 10 board with HBM memory.

.. code:: bash

    AOCL_BOARD ?= s10mx_hbm_es

Set up your local Intel OpenCL SDK environment. Make sure the environment variable 
``INTELFPGAOCLSDKROOT`` is set properly. Then, to perform software emulation, run:

.. code:: bash

    make sw_emu_check

The design will be compiled and simulated on CPU. You should be able to see the following information printed on your terminal.

.. code:: bash

    AOCX file: kernel_sw_emu.aocx

    FPGA Time: 0.146633 s
    Host Time: 0.14696 s
    Passed!

which shows the design is successfully compiled and the simulation passed successfully.

To synthesize the design to RTL, run:

.. code:: bash

    make hls

The design will be synthesized to RTL. This process will take some time to finish.
Intel OpenCL SDK generates the detailed hardware information in HTML format, which 
can be found at ``${AUTOSA_ROOT}/autosa.tmp/output/bin/kernel/reports``.

Lastly, to generate the bitstream, run:

.. code:: bash

    make hw

More Details
------------

Compared to generating Xilinx HLS designs, when generating the Intel OpenCL code, we add the following 
three arguments to the compilation command.

``--loop-infinitize``: Xilinx HLS requires the loops to be bounded. Such a limitation is 
no longer required for Intel OpenCL. Loops can be eliminated if possible as the function can be 
run infinitely. Performing loop infinitization will eliminate the unnecessary outer loops 
in each function to reduce the hardware overheads.

``--double-buffer-style=0``: When generating the double buffer logic, by default, 
we will generate the ping-pong logic explicitly as you may see in the Xilinx HLS code as below.

.. code:: c

    // outer loops
    for (...)
      for (...) {
        // double buffer logic
        if (arb == 0) {
          func1(ping_array);
          func2(pong_array);
        } else if (arb == 1) {
          func1(pong_array);
          func2(ping_array);
        }
      }
      
However, such a coding style no longer works in Intel OpenCL design as Intel OpenCL SDK 
lacks the ability to identify that ``func1`` and ``func2`` can be executed in parallel.
As a temporary solution, we will modify this coding style by inlining the function contents of 
``func1`` and ``func2`` directly. By setting ``--double-buffer-style=0``, we will generate the 
functional double buffering logic for Intel OpenCL. The generated logic looks like below:

.. code:: c

    while (1) {
      if (func1_en) {
        // func1 logic
        ...
      }
      if (func2_en) {
        // func2 logic
        ...
      }      
    }

``--mem-port-map="{kernel[]->A[0];kernel[]->B[1];kernel[]->C[2]}"``: 
As the target FPGA board is equipped with HBM memory, we may assign the global pointer to 
different HBM banks. In Xilinx Vitis flow, we will write a separate configuration file 
to map global pointers to different banks. However, in Intel flow, we will need to code it 
explicitly in the OpenCL kernel code. This argument is optional. It maps the global pointers 
``A``, ``B``, and ``C`` to bank 0, 1, and 2. You should find the following code in the OpenCL code.

.. code:: c

    __kernel void A_IO_L3_in_serialize(__global volatile __attribute__((buffer_location("HBM0"))) A_t16 *restrict A)

in which we use the ``__attribute__((buffer_location("HBM0")))`` to assign the pointer ``A`` to the bank ``HBM0``.

================================================
FILE: docs/tutorials/matrix_multiplication.rst
================================================
How Systolic Array Works: A Case Study on Matrix Multiplication
===============================================================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

This page gives a detailed explanation about the AutoSA generated systolic array architecture
for matrix multiplication.

Generating the Systolic Array
-----------------------------

We will use the example code in ``${AUTOSA_ROOT}/autosa_tests/mm/kernel.c``.

.. code:: c

    #pragma scop
    for (int i = 0; i < 64; i++)
      for (int j = 0; j < 64; j++) {
        C[i][j] = 0;
        for (int k = 0; k < 64; k++)
          C[i][j] = C[i][j] + A[i][k] * B[j][k];
      }
    #pragma endscop

Use the following command to generate the systolic array.

.. code:: bash

    ./autosa ./autosa_tests/mm/kernel.c --config=./autosa_config/autosa_config.json --target=autosa_hls_c --output-dir=./autosa.tmp/output --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[2]}" --simd-info=./autosa_tests/mm/simd_info.json --hls

This will generate a :math:`2\times 2` 2D systolic array as shown below.

.. image:: images/mm_array_opt.png
    :width: 300
    :align: center

Understanding the Systolic Array
--------------------------------

The systolic array architecture is composed of two parts: the processing elements (PE) and the 
I/O network. We will explain these two components in sequence.

Processing Elements (PE)
^^^^^^^^^^^^^^^^^^^^^^^^

Below is the AutoSA generated HLS code for the PE.

.. code-block:: c
    :linenos:

    /* Module Definition */
    void PE(int idx, int idy, hls::stream<A_t2> &fifo_A_in, hls::stream<A_t2> &fifo_A_out, hls::stream<B_t2> &fifo_B_in, hls::stream<B_t2> &fifo_B_out, hls::stream<float> &fifo_C_drain_out) {
    #pragma HLS INLINE OFF
      /* Variable Declaration */
      int p0 = idx, p1 = idy; // module id
      A_t1 local_A[1][2];
      #pragma HLS ARRAY_PARTITION variable=local_A dim=0 complete
      B_t1 local_B[1][2];
      #pragma HLS ARRAY_PARTITION variable=local_B dim=0 complete
      C_t1 local_C[8][8];
      #pragma HLS RESOURCE variable=local_C core=RAM_2P_BRAM
      /* Variable Declaration */

      for (ap_uint<3> c0 = 0; c0 <= 3; c0 += 1)
        for (ap_uint<3> c1 = 0; c1 <= 3; c1 += 1) {
          // array
          // pe
          // latency
          for (ap_uint<4> c6 = 0; c6 <= 7; c6 += 1) {
            // latency
            for (ap_uint<4> c7 = 0; c7 <= 7; c7 += 1) {
            #pragma HLS PIPELINE II=1
              // simd
              // hls_unroll
              local_C[c7][c6] = 0;
            }
          }
          for (ap_uint<3> c2 = 0; c2 <= 3; c2 += 1) {
            // array
            // pe
            for (ap_uint<4> c5 = 0; c5 <= 7; c5 += 1) {
              // latency
              for (ap_uint<4> c6 = 0; c6 <= 7; c6 += 1) {
                // latency
                for (ap_uint<4> c7 = 0; c7 <= 7; c7 += 1) {
                #pragma HLS PIPELINE II=1
                  {
                    {
                      A_t2 fifo_data;
                      fifo_data = fifo_A_in.read();
                      for (ap_uint<2> n = 0; n < 2; n++) {
                      #pragma HLS UNROLL
                        union {unsigned int ui; float ut;} u;
                        u.ui = (unsigned int)fifo_data(31, 0);
                        local_A[0][n] = u.ut;
                        fifo_data = fifo_data >> 32;
                      }
                    }
                    {
                      B_t2 fifo_data;
                      fifo_data = fifo_B_in.read();
                      for (ap_uint<2> n = 0; n < 2; n++) {
                      #pragma HLS UNROLL
                        union {unsigned int ui; float ut;} u;
                        u.ui = (unsigned int)fifo_data(31, 0);
                        local_B[0][n] = u.ut;
                        fifo_data = fifo_data >> 32;
                      }
                    }
                    // simd
                    for (ap_uint<2> c8 = 0; c8 <= 1; c8 += 1) {
                    #pragma HLS UNROLL
                      local_C[c7][c6] = (local_C[c7][c6] + (local_A[0][c8] * local_B[0][c8]));
                    }
                    if (c2 == 3 && c5 == 7)
                      fifo_C_drain_out.write(local_C[c7][c6]);
                    {
                      B_t2 fifo_data;
                      union {unsigned int ui; float ut;} u1, u0;
                      u1.ut = local_B[0][1];
                      u0.ut = local_B[0][0];
                      fifo_data = (ap_uint<32>(u1.ui), ap_uint<32>(u0.ui));
                      fifo_B_out.write(fifo_data);
                    }
                    {
                      A_t2 fifo_data;
                      union {unsigned int ui; float ut;} u1, u0;
                      u1.ut = local_A[0][1];
                      u0.ut = local_A[0][0];
                      fifo_data = (ap_uint<32>(u1.ui), ap_uint<32>(u0.ui));
                      fifo_A_out.write(fifo_data);
                    }
                  }
                }
              }
            }
          }
        }
    }
    /* Module Definition */

In this 2D systolic array, data of matrix A are reused horizontally across PEs, data of matrix B are reused vertically. Each PE computes elements of matrix C locally. After the computation is done, final results of matrix C will be drained out to the external memory.

The PE interface (line 2) contains the following components:

* Module index (``idx``, ``idy``): Indices of the PE module.
* FIFO (``fifo_A_in``, ``fifo_A_out``, ``fifo_B_in``, ``fifo_B_out``, ``fifo_C_drain_out``): FIFOs for transferring data.

While generating this array, we applied latency hiding on the original loops :math:`i` and :math:`j` with the factor :math:`(8,8)`, and SIMD vectorization on the loop :math:`k` with a factor of 2. With the latency hiding, each PE will compute a tile of :math:`8\times 8` of the matrix C. With the SIMD vectorization, at each cycle, two elements of matrix A and two elements of matrix B are required to update the local elements of matrix C.

With this knowledge, we could take a look at the local variable declarations in lines 5-11 now. Line 5 is simply storing the module indices. Lines 6-11 allocate local storage inside PEs for storing the data of matrix A, B, and C.

The rest of the code performs the computation. At each cycle, PE reads data of matrix A and B from neighbor PEs at lines 38-59 and passes the data to neighbor PEs at lines 67-82. PE performs the computation at lines 61-64. 
When the final results of matrix C are derived, PE writes out the final results at lines 65-66.

I/O Network
^^^^^^^^^^^

I/O network is composed of a series of I/O modules for transferring data between the external memory and PEs. We will use the I/O modules of array A as an example.

There are two types of I/O modules for array A: 

* Level-3 (L3) I/O modules: modules that read data from the external memory and send to the array.
* Level-2 (L2) I/O modules: modules that pass data between each other. Data that belong to the PEs that the module is connected to are kept locally; the rest of the data are passed to the downstream I/O modules.

Below is the code of the L3 I/O module.

.. code-block:: c
    :linenos:

    /* Module Definition */
    void A_IO_L3_in(A_t8 *A, hls::stream<A_t8> &fifo_A_local_out) {
    #pragma HLS INLINE OFF
      /* Variable Declaration */
      /* Variable Declaration */

      for (ap_uint<3> c0 = 0; c0 <= 3; c0 += 1)
        for (ap_uint<3> c1 = 0; c1 <= 3; c1 += 1)
          for (ap_uint<3> c2 = 0; c2 <= 3; c2 += 1) {
            // array
            // io_L3
            for (ap_uint<2> c3 = 0; c3 <= 1; c3 += 1) {
              // io_L2
              for (ap_uint<4> c4 = 0; c4 <= 7; c4 += 1) {
                // access_coalesce
                for (ap_uint<2> c5 = 0; c5 <= 1; c5 += 1) {
                #pragma HLS PIPELINE II=1
                {
                  A_t8 fifo_data;
                  fifo_data = A[128*c0 + 2*c2 + 64*c3 + 8*c4 + c5];
                  fifo_A_local_out.write(fifo_data);
                }
                }
              }
            }
          }
    }
    /* Module Definition */   

In this design, we apply the array partitioning on the original loops :math:`i`, :math:`j`, and :math:`k` with the factors :math:`(16,16,16)`. The original loop bounds for these three loops are :math:`(64,64,64)`. 
Therefore, array partitioning loops at lines 7-9 have loop bounds of :math:`(4,4,4)`. 

When transferring the data to the PEs, we will pass data through the chain of L2 I/O modules. In this design, there are two such modules. The loop for traversing the L2 I/O modules is at line 12. 
Inside each L2 I/O module, we will need to load the data tile required by the PEs that it is connected to. 

With the array partitioning factors :math:`(16,16,16)`, at each array partition, a sub tile of matrix A with the size :math:`16\times 16` is loaded from the external memory. As this array has the dimension of :math:`2\times 2`, each L2 I/O module will store a tile with the size :math:`8\times 16`.
The loops for loading the data tiles for each I/O module can be found at lines 14-16. Note that AutoSA will pack data together to increase the I/O throughput. In this case, every 8 elements are packed together. Therefore, the size of the local tile is :math:`8\times 2`, with a data width of 8 data elements.

Next, we will look at the L2 I/O module. The figure below shows the micro-architecture of the L2 I/O module.

.. image:: images/io_module_arch.png
    :width: 500
    :align: center

L2 I/O module loads data from the upstream I/O modules, keeps the data that belong to it, and sends the rest to the downstream modules. 
For I/O modules with local buffers inside, AutoSA automatically applies double buffering to overlap the data transfer between the I/O modules and data transfer to/from PEs. 

Below is the code of L2 I/O module.

.. code-block:: c
    :linenos:

    /* Module Definition */
    void A_IO_L2_in(int idx, hls::stream<A_t8> &fifo_A_in, hls::stream<A_t8> &    fifo_A_out, hls::stream<A_t2> &fifo_A_local_out) {
    #pragma HLS INLINE OFF
      /* Variable Declaration */
      int p0 = idx; // module id
      A_t8 local_A_ping[8][2];
      #pragma HLS RESOURCE variable=local_A_ping core=RAM_2P_BRAM
      A_t8 local_A_pong[8][2];
      #pragma HLS RESOURCE variable=local_A_pong core=RAM_2P_BRAM
      bool arb = 0;
      bool inter_trans_en = 1;
      bool intra_trans_en = 0;
      int c0, c0_prev;
      int c1, c1_prev;
      int c2, c2_prev;
      /* Variable Declaration */

      {
        for (ap_uint<3> c0 = 0; c0 <= 3; c0 += 1)
          for (ap_uint<3> c1 = 0; c1 <= 3; c1 += 1)
            for (ap_uint<3> c2 = 0; c2 <= 3; c2 += 1) {
              // array
              // io_L3
              {
                if (arb == 0) {
                  A_IO_L2_in_inter_trans(
                    /* module id */ idx, 
                    /* host iter */ c0, 
                    /* host iter */ c1, 
                    /* host iter */ c2, 
                    /* array */ local_A_pong, 
                    /* fifo */ fifo_A_in, 
                    /* fifo */ fifo_A_out, 
                    /* enable */ inter_trans_en
                  );
                  A_IO_L2_in_intra_trans(
                    /* module id */ idx, 
                    /* host iter */ c0_prev, 
                    /* host iter */ c1_prev, 
                    /* host iter */ c2_prev, 
                    /* array */ local_A_ping, 
                    /* fifo */ fifo_A_local_out, 
                    /* enable */ intra_trans_en
                  );
                } else {
                  A_IO_L2_in_inter_trans(
                    /* module id */ idx, 
                    /* host iter */ c0, 
                    /* host iter */ c1, 
                    /* host iter */ c2, 
                    /* array */ local_A_ping, 
                    /* fifo */ fifo_A_in, 
                    /* fifo */ fifo_A_out, 
                    /* enable */ inter_trans_en
                  );
                  A_IO_L2_in_intra_trans(
                    /* module id */ idx, 
                    /* host iter */ c0_prev, 
                    /* host iter */ c1_prev, 
                    /* host iter */ c2_prev, 
                    /* array */ local_A_pong, 
                    /* fifo */ fifo_A_local_out, 
                    /* enable */ intra_trans_en
                  );
                }
                intra_trans_en = 1;
                arb = !arb;
                c0_prev = c0;
                c1_prev = c1;
                c2_prev = c2;
              }
            }
        if (arb == 0) {
          A_IO_L2_in_intra_trans(
            /* module id */ idx, 
            /* host iter */ c0_prev, 
            /* host iter */ c1_prev, 
            /* host iter */ c2_prev, 
            /* array */ local_A_ping, 
            /* fifo */ fifo_A_local_out, 
            /* enable */ intra_trans_en
          );
        } else {
          A_IO_L2_in_intra_trans(
            /* module id */ idx, 
            /* host iter */ c0_prev, 
            /* host iter */ c1_prev, 
            /* host iter */ c2_prev, 
            /* array */ local_A_pong, 
            /* fifo */ fifo_A_local_out, 
            /* enable */ intra_trans_en
          );
        }
      }
    }
    /* Module Definition */    

Lines 6-9 define the double buffers inside the I/O module.
Lines 19-95 perform the double buffering to overlap the data transfer between I/O modules (defined in the function ``A_IO_L2_in_inter_trans``) and data transfer to/from PEs (defined in the function ``A_IO_L2_in_intra_trans``).

Please refer to the generated code for more details of the functions ``A_IO_L2_in_inter_trans`` and ``A_IO_L2_in_intra_trans``.

Similar principles apply to the other I/O modules. Together with both the I/O modules and PEs, we have a complete functional systolic array that can be synthesized and executed on FPGAs.

.. note:: 

    When adding the argument ``--host-serialize`` to the AutoSA command, the data of each array will be serialized on the host and transferred to the systolic array. AutoSA will introduce an additional I/O module for loading/writing the serialized data from/to the external memory before the original I/O modules. Feel free to try it out and compare with the code without serialization. The major benefit of using host serialization is to increase the DDR bus width and burst length to improve the effective DRAM bandwidth.

================================================
FILE: docs/tutorials/optimize_array.rst
================================================
.. _construct-and-optimize-array-label:

Constructing and Optimizing a Systolic Array
============================================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

This page takes an in-depth look at how AutoSA constructs and optimizes a systolic array to 
achieve high performance on FPGAs. 

.. note:: 
    This page will be helpful to readers who are interested in the implementation of AutoSA. 
    More details are covered in the `AutoSA Paper <https://vast.cs.ucla.edu/sites/default/files/publications/FPGA2021_AutoSA_camera.pdf>`_.
    Feel free to skip this one if you focus on using AutoSA to generate systolic arrays only.

Prerequisites
-------------
Please finish the tutorial :ref:`theoretical-background-label` first.

A complete systolic array architecture consists of both the PE array and the on-chip 
I/O network. 
AutoSA separates the process of building these two components into two stages: 
*computation* and *communication management*. 
The stage of computation management constructs the PE and optimizes its micro-architecture. 
After that, the stage of communication management builds the I/O network for transferring data between PEs and the external memory. 
Details of these two stages will be covered in the subsequent sections, respectively.

In the subsequent sections, we use the example code below to illustrate different steps.

.. code:: c

  for (int i = 0; i < M; i++)
    for (int j = 0; j < N; j++)
      for (int k = 0; k < K; k++) 
  S0:   C[i][j] += A[i][k] * B[k][j];

This code performs matrix multiplication (the initialization is omitted for brevity).
With the help of `integer set library (ISL) <http://isl.gforge.inria.fr/>`_, we can
extract the initial schedule tree of the program as shown below.

.. image:: images/mm_tree_param.png
    :width: 400
    :align: center

Computation Management
----------------------

The stage of computation management consists of four steps: 
*space-time transformation*, *array partitioning*, *latency hiding*, 
and *SIMD vectorization*. We will go through each step in the following subsections. 

Space-Time Transformation
^^^^^^^^^^^^^^^^^^^^^^^^^

This step performs the space-time transformation to map the input program to a systolic array.
Details of the space-time transformation are covered in :ref:`theoretical-background-label`.
The algorithm below describes how AutoSA applies the space-time transformation.

.. admonition:: Algorithm 1: Space-time transformation

    | Inputs: A schedule tree :math:`s` 
    | Outputs: A list of schedule trees :math:`S`
    | Initialize the space loop candidate pool :math:`P\gets \emptyset`;
    | Extract the outermost permutable loop band :math:`d` from :math:`s`;
    | for each loop :math:`l` in the band :math:`d` do
    |  if all flow/read dependence distances on loop :math:`l \leq 1` then
    |    :math:`P \gets P \cup l`;
    | /* Generate 2D systolic array. \*/
    | for each pair of loops :math:`(l_1, l_2)` in the pool :math:`P` do
    |  Duplicate the schedule tree :math:`s' \gets s`;
    |  Modify :math:`s'` by permuting the loops :math:`l_1, l_2` to outermost;
    |  :math:`S\gets S\cup s'`;
    | /* Generate 1D systolic array (omitted), similar to 2D case with only one space loop selected. \*/

AutoSA searches for the loops in the outermost loop band with flow/read dependence distances no greater than one. 
Those loops are put into a pool as the candidate space loops. 
Next, AutoSA enumerates all space loop combinations from the candidate pool. 
The selected space loops are permuted outermost. 
All the loops below the space loops are assigned as time loops. 
At present, AutoSA generates 1D and 2D systolic arrays. 
This constraint can be relaxed to generate higher-dimensional arrays 
if necessary. 
There will be multiple systolic arrays generated from this step, 
each with a unique schedule. 
Users can choose which array to process manually, 
or leave it to be explored by the auto-tuner.

Array Partitioning
^^^^^^^^^^^^^^^^^^

Given the limited on-chip resource, array partitioning is mandatory when mapping a large array to FPGA.  
To achieve this, AutoSA tiles the outermost permutable loop band in the schedule 
tree which contains the space loops. 
The tiling factors can be chosen by the users or set by the auto-tuner during the 
design space exploration. 
The schedule tree below shows one example in which we tile the outermost loop band 
in the MM example with the tiling factors of :math:`(4,4,4)`. 

.. image:: images/mm_tree_array_part.png
    :width: 400
    :align: center

The point loops from the original loops :math:`i` and :math:`j` are kept as the space loops. 
This will lead to a 2D systolic array with the dimensions of :math:`4\times4`.

Latency Hiding
^^^^^^^^^^^^^^

Latency hiding helps hide the pipeline stalls caused by the loop-carried dependence 
of the compute statements. In the MM example, the multiply-and-add (MAC) operation 
in the statement S0 introduces loop-carried dependence on the loop :math:`k`, 
resulting in an initiation interval (II) greater than one. 
To resolve this issue, AutoSA looks for parallel loops in the schedule tree, 
strip-mines them and permutes the point loops innermost. 
As an example, loops :math:`i` and :math:`j` are parallel loops in the MM example. 
We will strip-mine them with the tiling factors of :math:`(2,2)` and permute the point 
loops innermost. Since there is no loop-carried dependence on the innermost loop, 
the PE could now achieve II=1. 
The newly generated schedule is shown below.

.. image:: images/mm_tree_latency.png
    :width: 400
    :align: center

Similar to the previous stage, AutoSA allows users to specify the loops to be tiled 
and the tiling factors. 
Alternatively, such choices will be explored by the auto-tuner to maximize the performance.

SIMD Vectorization
^^^^^^^^^^^^^^^^^^

SIMD vectorization duplicates the compute units inside each PE, 
which still share the same control logic. 
This helps amortize the control overheads and improve the resource efficiency of the 
design. At present, AutoSA detects the vectorizable loop by 
examining the following two criteria:

* The loop should be a parallel loop or a reduction loop. 
* All array references within the loop are stride-one or stride-zero in regard to this loop. 

.. note:: 

    The current polyhedral framework that AutoSA builds on lacks the capability 
    to detect the reduction loop, which requires the user annotation prior to 
    the compilation.

In the MM example, the loop :math:`k` is a reduction loop. Array references ``C[i][j]`` and ``A[i][k]`` 
are stride-zero and stride-one with regard to loop :math:`k`. 
The array reference ``B[k][j]`` requires a layout transformation to ``B[j][k]`` so that 
it becomes a stride-one access that enables the vectorization. 
Below is the updated schedule tree in which we strip-mine the loop :math:`k` 
with a factor of 2.

.. image:: images/mm_tree_simd.png
    :width: 400
    :align: center

The point loop is permuted innermost and marked ``unroll``, which will eventually be handled by the HLS tools. 

Communication Management
------------------------

So far we have finished the PE construction and optimization. 
However, the current array is still not functional as we are missing the other key component, 
the I/O network. 
The I/O network is a network on chip that supports two types of data communication:

* **Inner-array communication**, the data communication between PEs.
* **Outer-array communication**, the data communication between PEs and the external memory (e.g., DRAM).

The stage of communication management in AutoSA analyzes the program and constructs 
the I/O network as mentioned above.
We show that the I/O network can be built automatically via data dependence analysis 
in the polyhedral model. 
Furthermore, as the topology of the I/O network plays an important role in the 
frequency of the design, we extend the algorithm to build an I/O network that 
only involves local interconnects, 
hence, guaranteeing the sustained high frequency. 

The following subsections explain our approaches in detail. 
`I/O Analysis`_ describes how we analyze the dependences in the program to extract 
the necessary information for constructing the I/O network. 
`I/O Construction`_ builds the I/O network using the information extracted from the 
previous step. 
`I/O Optimization`_ discusses several I/O optimization techniques to further 
improve the I/O performance.

I/O Analysis
^^^^^^^^^^^^

The data communication is associated with the data dependences.
To build the I/O network, AutoSA analyzes the following three types of data dependences:

* Read dependence: for transferring the read-only data.
* Flow dependence: for transferring the intermediate results.
* Output dependence: for transferring the final results. 

The table below lists the dependences extracted from the MM example. 

.. csv-table::
    :header: "Type", "Dependence Relation", "Array Access"

    "Read", ":math:`D1:=\{S0[i,j,k]\to S0[i,j+1,k]\}`", ``A[i][k]``
    "Read", ":math:`D2:=\{S0[i,j,k]\to S0[i+1,j,k]\}`", ``B[k][j]``
    "Flow", ":math:`D3:=\{S0[i,j,k]\to S0[i,j,k+1]\}`", ``C[i][j]``
    "Output", ":math:`D4:=\{S0[i,j,k]\to S0[i,j,k+1]\}`", ``C[i][j]``

The step of I/O analysis interprets such dependences and extracts a data structure 
named *I/O group* that contains the necessary information required to construct the I/O network. 
Please refer to the `AutoSA Paper <https://vast.cs.ucla.edu/sites/default/files/publications/FPGA2021_AutoSA_camera.pdf>`_ 
for more details about how we derive the I/O groups.

An I/O group :math:`g` is defined as a tuple of :math:`g=(A,D)` where :math:`A` is a
set of array accesses that are associated with the current group and 
:math:`D` is the set of data dependences associated with the array accesses in :math:`A`. 
For each I/O group, the following properties are computed:

* **I/O direction**. This is the component of the dependence distance vector on the space loops.
* **I/O type**. The I/O group is classified as *exterior I/O* if the dependence is carried by 
  the space loops. Otherwise, it is classified as *interior I/O*.

As an example, in the MM example, for the array access ``B[k][j]``, 
we construct an I/O group :math:`g` from the array access ``B[k][j]`` and 
its associated dependence :math:`D2` as shown in the table above.
The dependence distance of :math:`D2` on the space loops is :math:`(1,0)`. 
Therefore, we assign the I/O direction as :math:`g.dir=(1,0)` and the 
I/O type as :math:`g.type=exterior`.

The I/O groups are then merged if they share the same properties.
Later, AutoSA will allocate a set of I/O modules for each I/O group.

The last step is to compute the statement instances that require such data.
We divide them into two sets: copy-in set :math:`W_{in}` and 
copy-out set :math:`W_{out}`. 
These sets contain the statement instances that require the data to be copied in 
or copied out, respectively.

The table below includes the final I/O groups extracted from the MM example and 
their copy-in/copy-out sets. 
They will be used for I/O network construction in the next section.

.. csv-table::
    :header: "No.", ":math:`A`", ":math:`D`", ":math:`W_{in}/W_{out}`"

    ":math:`g_1`", ``A[i][k]``, ":math:`D_1`", ":math:`W_{in}:=\{S0[i,j,k]:0\leq i< M \land 0\leq j< N \land 0\leq k<K\}`"
    ":math:`g_2`", ``B[k][j]``, ":math:`D_2`", ":math:`W_{in}:=\{S0[i,j,k]:0\leq i< M \land 0\leq j< N \land 0\leq k<K\}`"
    ":math:`g_3`", ``C[i][j]``, ":math:`D_3`", ":math:`W_{in}:=\{S0[i,j,k]:0\leq i< M \land 0\leq j< N \land 0< k<K\}`"
    , , , ":math:`W_{out}:=\{S0[i,j,k]:0\leq i< M \land 0\leq j< N \land 0\leq k<K-1\}`"
    ":math:`g_4`", ``C[i][j]``, ":math:`D_4`", ":math:`W_{out}:=\{S0[i,j,k=K-1]:0\leq i< M \land 0\leq j< N\}`"

I/O Construction    
^^^^^^^^^^^^^^^^

This step constructs the I/O modules based on the I/O grouping information extracted 
from the previous step. 
For each I/O group, AutoSA allocates a set of I/O modules for transferring the 
data between PEs and the external memory.

We start with the optimized schedule from the computation management. 
In the first step, we isolate the statement instances that are involved with 
the data communication from the current group by inserting a filter node into 
the schedule tree with the copy-in/copy-out set. 
The filter node restrains the iteration domains of its children nodes by intersecting the current iteration domain with the filter set.

As an example, below is the updated schedule with the filter domain of the I/O group
:math:`g_2` (loops inside the space loops are omitted for brevity).

.. image:: images/mm_tree_isolate.png
    :width: 500
    :align: center

At this stage, we could already generate a set of I/O modules that load the data from the external memory and send the data directly to each PE.
This can be realized by equating the space loops to the PE indices ``idx`` and ``idy`` in the updated schedule and using it to generate the code inside each I/O module.
The figure below shows the generated array and the corresponding schedule for each I/O module.

.. image:: images/mm_array_b.png
    :width: 500
    :align: center

However, this architecture may not be scalable as the data are scattered directly from the external memory which causes high fan-outs and could lead to routing failure.
To resolve this issue, we choose to *localize* the I/O network by using a daisy-chain architecture.
In this architecture, each I/O module fetches data from the upper-stream I/O modules.
The I/O module works as a filter that keeps the data belonging to the PEs that it is associated with and passes the rest of the data to the down-stream I/O modules.
As for the architecture in the figure above, 
we name the I/O modules that are directly connected to PEs as level-one (L1) I/O modules.
We could first cluster the L1 I/O modules along the :math:`x`-axis, as shown in the figure below.

.. image:: images/mm_array_L1.png
    :width: 400
    :align: center

Every two L1 modules along the :math:`x`-axis are connected to an upper-level (L2) I/O module, which helps to reduce the memory fan-outs from four to two.
We name such a process as *I/O clustering*.
I/O clustering can be applied multiple times in a hierarchical way.
For example, we could apply the I/O clustering again on the L2 I/O modules, generating one L3 I/O module that connects to the DRAM, as shown in the figure below.
Eventually, we reduce the memory fan-outs from four to one.

.. image:: images/mm_array_L2.png
    :width: 250
    :align: center

The figure below depicts the final array architecture after the I/O clustering for all the I/O groups.

.. image:: images/mm_array_unopt.png
    :width: 400
    :align: center

I/O Optimization
^^^^^^^^^^^^^^^^

In this step, AutoSA applies multiple passes to further optimize the I/O network. 

**I/O module embedding**: L1 I/O modules with exterior I/O are embedded into the PEs to save the resource.

**I/O module pruning**: When transferring the data between different sub-array tiles, 
AutoSA checks if the copy-out set of the previous tile equals the copy-in set of the 
current tile at the PE level. If two sets are equal at the PE level, 
it indicates the data are located on-chip and hence the data transfer from the external 
memory is unnecessary. For such a case, the I/O modules for this I/O group are pruned 
away to save the off-chip communication and on-chip resource. 
As an example, for the MM example, the I/O modules for the group :math:`g_3` 
will be pruned away since the data of matrix C are accumulated locally inside each PE. 
The figure below shows the optimized array by applying two techniques as mentioned above.

.. image:: images/mm_array_opt.png
    :width: 300
    :align: center

**Data packing**: To reduce the data transfer latency between the I/O modules, 
AutoSA performs data packing between I/O modules. 
Packing more data helps reduce the data transfer latency, 
however, it leads to FIFOs with a larger width and higher resource usage. 
Therefore, AutoSA offers options to set the data packing factor at each I/O level, 
which can also be set by the auto-tuner during the design space exploration.

**Double buffering**: By default, AutoSA allocates a local buffer inside the L1 I/O modules 
for I/O groups with interior I/O or inside the L2 I/O modules for I/O groups 
with the exterior I/O. For such I/O modules with local buffers inside, 
AutoSA offers options to enable the double buffering that helps overlap the 
memory transfer with the PE computation.

After the above steps, we obtain a complete systolic array with both PEs and I/O network.

================================================
FILE: docs/tutorials/structural_sparsity.rst
================================================
Supporting Structural Sparsity
==============================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

Structural sparsity can be useful for DNN networks. This page discusses how structural 
sparsity is supported in AutoSA.

What is Structural Sparsity?
----------------------------

AutoSA supports the similar structural sparsity that can be found in the recent Nvidia 
Ampere GPU (`link <https://developer.nvidia.com/blog/exploiting-ampere-structured-sparsity-with-cusparselt/>`_). 
The figure below shows the supported sparse matrix-dense matrix multiplication.

.. image:: images/sparse_mm.png
    :align: center

The figure above performs the computation of :math:`C=A\times B`.
The first input matrix A is a structural-sparse matrix, and the second input matrix B is 
a dense matrix.
As for the matrix A, every ``VEC_LEN`` adjacent elements are grouped together. In every group,
we allow up to ``NON_ZERO_NUM`` non-zero elements. Therefore, the sparsity of the matrix A is
``1-NON_ZERO_NUM/VEC_LEN``.

The sparse matrix A is then stored in a compressed format, in which only the non-zero elements 
are stored, along with their relative indices inside each group.

The benefits of structural sparsity are clear. It allows the hardware to achieve higher 
effective throughput with the same amount of resource. 
It is also easy to be implemented on the systolic array architecture. 
We will show how to modify the systolic array to support the structural sparsity in the next section.

How is Structural Sparsity Implemented in AutoSA?
-------------------------------------------------

As a comparison, we first present how the dense matrix multiplication is mapped to the 
systolic array.

.. image:: images/dense_array.png
    :width: 500
    :align: center

In the figure above, we show an example of a 2D :math:`2\times 2` systolic array.
Each PE computes a sub tile of the matrix C with the size :math:`4\times 2`.
With SIMD vectorization, each time, two vectors of 4 elements from the matrix A and 
matrix B are loaded into the PE. The PE computes the inner product of the two vectors 
and updates the elements of matrix C.

This array can be easily extended to support the structural sparsity.
The figure below shows an example in which we set the vector size :math:`v` as 4 and 
the number of non-zero elements ``NON_ZERO_NUM`` as 2.

.. image:: images/sparse_array.png
    :align: center

As the new matrix A is sparse, instead of packing 4 elements and sending them to the PE each time, 
we will only pack 2 elements, along with their indices in the original group vector, and send them 
to the PEs. When a PE loads the packed data, it will use the indices of the A elements to select 
the corresponding elements from the vector of matrix B. 
Compared to the dense architecture, we introduce the packed indices of the sparse data and a MUX 
for selecting the data from matrix B.

Generating the Design
---------------------

Now let's use AutoSA to generate one sparse design.
The example used here can be found in the directory ``${AUTOSA_ROOT}/autosa_tests/mm_block_sparse``.

Use the following command to generate the design.

.. code:: bash

    ./autosa ./autosa_tests/mm_block_sparse/kernel.c \
    --config=./autosa_config/autosa_config.json \
    --target=autosa_hls_c \
    --output-dir=./autosa.tmp/output \
    --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[16,16,16];kernel[]->latency[8,8];kernel[]->simd[8]}" \
    --simd-info=./autosa_tests/mm_block_sparse/simd_info.json \
    --host-serialize \
    --hls \
    --block-sparse --block-sparse-ratio="{kernel[]->A[2,4]}"

The generated designs can be found at the directory ``${AUTOSA_ROOT}/autosa.tmp/output/src``

This command generates a design in Xilinx HLS C. You can use Xilinx HLS to verify the correctness of the design.

Copy the TCL script to the output directory.

.. code:: bash

    cp ${AUTOSA_ROOT}/autosa_tests/mm_block_sparse/hls_script.tcl ${AUTOSA_ROOT}/autosa.tmp/output/

Run the TCL script to verify the design.

.. code:: bash

    cd ${AUTOSA_ROOT}/autosa.tmp/output
    vivado_hls -f hls_script.tcl

You should be able to see the following content in the terminal if the HLS design is executed successfully.

.. code:: bash

    INFO: [SIM 211-2] *************** CSIM start ***************
    INFO: [SIM 211-4] CSIM will launch GCC as the compiler.
    make: 'csim.exe' is up to date.
    Passed!
    INFO: [SIM 211-1] CSim done with 0 errors.
    INFO: [SIM 211-3] *************** CSIM finish ***************

Now let's take a closer look at the design code.
The input code can be found at ``${AUTOSA_ROOT}/autosa_tests/mm_block_sparse/kernel.c``

At line 28, we define the original matrices used for the matrix multiplication.

.. code:: c

    data_t A[I][K], B[J][K], C[I][J], C_golden[I][J];

In this example, matrix A will be sparsified. 
The figure below illustrates how we store the sparse information.

.. image:: images/sparse_example1.png    
    :align: center
    
In this figure, we set the vector length ``VEC_LEN`` as 4, and 
number of non-zero elements ``NON_ZERO_NUM`` as 2.
Array ``A_d`` stores the non-zero data elements. 
The relative index of the data elements in each group is stored in the array ``A_i``.
The data and index arrays are concatenated and stored in the array ``A_s``.
For each group vector, we store the index information using an ``unsigned char`` right 
after the data elements. Currently we assume the group vector length to be a 
power of two and no greater than 8. Besides, the data width of the matrices must be 
no less than 8 bits. All of these limitations can be relaxed in the future. 

After concatenating the index with the data elements, we will also pad empty elements to align the array.
Specifically, we compute the number of elements, except the data elements, denoted by 
``META_DATA_NUM`` using the following formula:

.. math::
    
    META\_DATA\_NUM = 2^{ceil(log2(NON\_ZERO\_NUM+1))} - NON\_ZERO\_NUM

In this example, we compute ``META_DATA_NUM`` as 2. Two additional data elements are inserted after 
the original data elements, and we store the index in the third element, as shown in the figure above.

Another example is shown in the figure below.

.. image:: images/sparse_example2.png    
    :align: center

In this example, we have ``VEC_LEN`` as 4, ``NON_ZERO_NUM`` as 1, and ``META_DATA_NUM`` as 1.

For compilation, we still use the original dense matrix multiplication, as shown in lines 89-97.
We provide the sparse information to the compiler through command arguments:

* ``--block-sparse``: Specifies to use block sparsity.
* ``--block-sparse-ratio="{kernel[]->A[2,4]}"``: Specifies the sparse array as array ``A``, and the 
  number of non-zero elements and the group vector length as ``[NON_ZERO_NUM, VEC_LEN]``.

================================================
FILE: docs/tutorials/theory_background.rst
================================================
.. _theoretical-background-label:

Theoretical Background
======================

**Author**: Jie Wang (jiewang@cs.ucla.edu)

This page covers the theoretical background of mapping algorithms to systolic arrays. 
We will start by giving an example of a systolic array for matrix multiplication to show
what systolic arrays look like and how they work. Then we will cover some basics about the 
polyhedral model and the algorithm (i.e., space-time transformation) that AutoSA uses to map
an algorithm to a systolic array.

An Example of Systolic Array
----------------------------

The example code below describes the matrix multiplication :math:`C=A\times B`.

.. code:: c

  int A[3][3], B[3][3], C[3][3];
  for (int i = 0; i < 3; i++)
    for (int j = 0; j < 3; j++) {
  S0: C[i][j] = 0;
      for (int k = 0; k < 3; k++) 
  S1:   C[i][j] += A[i][k] * B[k][j];
    }
      
This algorithm can be mapped to a systolic array depicted in the figure below.

.. image:: images/2d_array_mm.png
    :width: 200
    :align: center

In this figure, a 3x3 2D systolic array is generated for this algorithm.
The processing elements (PE) are connected only through local interconnects, the most 
important signature of systolic array architecture. 
The figure below further depicts the detailed computation scheduling of this array.

.. image:: images/2d_array_mm_schedule.png
    :width: 500
    :align: center

Specifically, each PE computes one element of matrix C locally. Data of matrix A and B 
are fed at the boundaries and reused across PEs. The timing to feed data to different rows and columns of PEs
are skewed to match the computation scheduling inside PEs.
To explain further, at the first cycle (when t = 0), the data of ``A[0][0]`` and ``B[0][0]`` are 
fed to the PE on the top-left corner. At the next cycle (t = 1), ``A[0][0]`` is sent downward and 
``B[0][0]`` is sent rightward. In the meantime, new data ``A[0][1]`` and ``B[1][0]`` are sent to the original PE
and we also start to feed boundary PEs in the second column and row. 

After the computation is completed, each PE contains the final result of matrix C. The final results
will be drained out to the external memory at last.

As shown in the figure above, at each cycle, data are pumped into arrays and transferred across PEs rhythmically. 
This is how the name **systolic array** is coined for this type of architecture.

There are two major benefits of such an architecture.

* *Performance*. Systolic array exploits parallelism with a large number of PEs to achieve high performance.
* *Energy efficiency*. The local interconnects maximize data reuse and reduce the energy cost of data transfer, thus leading to high energy efficiency.

Due to such benefits, in the recent years, we have seen systolic arrays being widely adopted in various application domains, e.g., genomics, machine learning, 
to accelerate the computation.

Polyhedral Model
----------------

The polyhedral model is a mathematical framework for loop nest optimization. 
Loop nests that satisfy the requirements of the polyhedral model are called 
static control parts (SCoP). A SCoP is defined as a set of statements with loop bounds
and conditions as affine functions of the enclosing loop iterators and variables that are
constant during the SCoP execution.

A program in the polyhedral model is typically represented by three components: 
*iteration domains*, *access relations*, and a *schedule*. We will keep using the running example of 
matrix multiplication in the previous section to illustrate these concepts.

The iteration domain contains the loop instances of the program. The iteration domain of the statement
S1 in the example program has the form

.. math::

    \{S1[i,j,k]:0\leq i< 3 \land 0\leq j< 3 \land 0\leq k<3\}

Throughout this tutorial, to represent the components of the polyhedral model, we use the same
format as `integer set library (ISL) <http://isl.gforge.inria.fr/>`_, which is a library
for polyhedral compilation. In addition, we will only show the representation with regard to the statement
S1 for brevity.

The access relation maps a statement instance to an array index. For example, 
the access relations for the read accesses in the statement S1 have the form

.. math::

    \{S1[i,j,k]\to A[i,k];S1[i,j,k]\to B[k,j];S1[i,j,k]\to C[i,j]\}

Finally, a schedule maps instance sets to multi-dimensional time. 
The statement instances are executed following the lexicographic 
order of the multi-dimensional time. 
As an example, the schedule of the statement S1 has the form 

.. math::

    \{S1[i,j,k]\to [i,j,k]\} 
    
The schedule of a SCoP program can be represented by 
`schedule trees <http://impact.gforge.inria.fr/impact2014/papers/impact2014-verdoolaege.pdf>`_.
The figure below shows the schedule tree of the example program. 

.. image:: images/mm_tree.png
    :width: 400
    :align: center
    
The schedule tree starts with a domain node that defines the iteration domain of 
the program, followed with band nodes that encode the partial schedules at each 
loop dimension. 
The isl library manipulates the schedule tree of the program to perform the loop transformation. To generate the final code, an AST is obtained from the schedule tree which is then lowered to the target code (e.g., C).

For readers who are interested in learning more about the polyhedral model, we recommend some resources below.

* `ISL manual <http://isl.gforge.inria.fr//manual.pdf>`_, the manual contains all the basic concepts and APIs of ISL.
* `ISCC online demonstrator <https://polyhedral.info/2014/01/21/ISCC-demo-online.html>`_, an interactive interface to most of the ISL functionalities. Don't forget to check out `this tutorial <http://barvinok.gforge.inria.fr/tutorial.pdf>`_ before using ISCC.
* `Pluto framework <http://pluto-compiler.sourceforge.net/>`_, a milestone framework to get familiar with the polyhedral scheduling algorithms.
* `PPCG <https://github.com/Meinersbur/ppcg>`_, a polyhedral-model-based C-to-CUDA compiler. The original paper is `here <https://dl.acm.org/doi/pdf/10.1145/2400682.2400713>`_.
* Some recent polyhedral-model-based compilation frameworks

    * `Tensor Comprehension <https://research.fb.com/downloads/tensor-comprehensions/>`_ (discontinued)
    * `Tiramisu <http://tiramisu-compiler.org/#:~:text=Tiramisu%20is%20a%20polyhedral%20compiler,be%20optimized%20by%20the%20compiler.>`_

Space-Time transformation
-------------------------

In the last section of this tutorial, we will touch another important topic that lays the foundation of AutoSA, 
the space-time transformation.
The space-time transformation applies loop transformations on the target program and assigns new semantics
*space* and *time* to the generated loops. Space loops map loop instances to different PEs that execute concurrently, while time loops describe the computation inside each PE. 

To generate a legal systolic array, the following constraints should be satisfied by the loop transformation: 

* First, the transformation should be semantics-preserving. 
* Second, all dependences should be uniform (with constant dependence distance). 
* Third, the dependence distances on space loops should be no greater than one so that the data communication only happens between neighbor PEs. 

Note that for the first and second constraints, we consider all types of dependences (flow, anti, output and input/read dependences). 
We take into account the read dependences since the data transfer needs to be managed explicitly in systolic arrays including the read-only data. 
As for the third constraint, we only examine the flow and read dependences which are associated with the inter-PE communication. 
Since each PE has its own address space, anti and output dependences do not contribute to the data communication between PEs.

For the matrix multiplication example, we obtain one flow dependence (domain constraints and the statement S0 omitted for brevity) as 

.. math::

    D1 := \{S1[i,j,k]\to S1[i,j,k+1]\} 
    
and two read dependences for array references ``A[i][k]`` and ``B[k][j]`` as 

.. math::

    D2 := \{S1[i,j,k]\to S1[i,j+1,k]\} \\
    D3 := \{S1[i,j,k]\to S1[i+1,j,k]\}
    
One possible space-time transformation is 

.. math::

    S := \{S1[i,j,k]\to[i,j,k]\}
    
which is an identity mapping that keeps the original loop. 
We could calculate the dependence distances for the above-mentioned three dependences 
:math:`D1`, :math:`D2`, and :math:`D3` under the schedule :math:`S`, which are :math:`(0,0,1)`, :math:`(0,1,0)`, 
and :math:`(1,0,0)`. 
All dependences are uniform (we omit the discussion about output and anti dependences for brevity). 
Besides, dependence distances on all three loops are no greater than one. 
Therefore, all three loops are eligible to be selected as the space loops. 
As an example, we select the first two loops :math:`i` and :math:`j` as 
space loops and leave the loop :math:`k` as the time loop. 
The transformed code after space-time transformation is shown below.

.. image::  images/mm_st_code.png
    :width: 400
    :align: center

This transformation leads to the 2D systolic array as shown in `An Example of Systolic Array`_.


================================================
FILE: install.sh
================================================
#!/bin/sh
# AutoSA installation script.
#
# All paths below are relative, so this script is expected to be run from the
# repository root. It fetches the submodules, installs the Python
# requirements, patches ISL, builds the sources under src/, and finally
# creates the ./autosa launcher and the autosa.tmp working directories.
#
# Each directory-specific step runs in a subshell guarded by 'cd ... &&' so
# that a failed 'cd' skips the step instead of running its commands in the
# wrong directory.

# Initialize ISL and PET
git submodule init
git submodule update
(cd src/isl && { git submodule init imath; git submodule update imath; })
(cd src/barvinok && ./get_submodules.sh)

# Install python packages
pip3 install -r requirements.txt

# Patch ISL
echo "Patch ISL"
(cd ./autosa_scripts/ppcg_changes/isl && ./isl_patch.sh)

# Compilation
(cd src && { echo "autogen"; ./autogen.sh; echo "configure"; ./configure; echo "make"; make -j4; })

# Install the launcher and create the working directory tree.
cp ./autosa_scripts/autosa.py ./autosa
# 'mkdir -p' creates missing parents and does not fail when a directory
# already exists, so re-running the installer is harmless.
mkdir -p autosa.tmp/optimizer \
         autosa.tmp/output/src \
         autosa.tmp/output/latency_est \
         autosa.tmp/output/resource_est \
         autosa.tmp/output/tuning


================================================
FILE: ltmain.sh
================================================
#! /bin/sh
## DO NOT EDIT - This file generated from ./build-aux/ltmain.in
##               by inline-source v2014-01-03.01

# libtool (GNU libtool) 2.4.6
# Provide generalized library-building support services.
# Written by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996

# Copyright (C) 1996-2015 Free Software Foundation, Inc.
# This is free software; see the source for copying conditions.  There is NO
# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

# GNU Libtool is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# As a special exception to the GNU General Public License,
# if you distribute this file as part of a program or library that
# is built using GNU Libtool, you may include this file under the
# same distribution terms that you use for the rest of that program.
#
# GNU Libtool is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


# Name and version identification for this script.
PROGRAM=libtool
PACKAGE=libtool
VERSION="2.4.6 Debian-2.4.6-0.1"
package_revision=2.4.6


## ------ ##
## Usage. ##
## ------ ##

# Run './libtool --help' for help with using this script from the
# command line.


## ------------------------------- ##
## User overridable command paths. ##
## ------------------------------- ##

# After configure completes, it has a better idea of some of the
# shell tools we need than the defaults used by the functions shared
# with bootstrap, so set those here where they can still be over-
# ridden by the user, but otherwise take precedence.

# ': ${VAR="default"}' assigns the default only when VAR is unset, so a value
# already provided in the environment is kept.
: ${AUTOCONF="autoconf"}
: ${AUTOMAKE="automake"}


## -------------------------- ##
## Source external libraries. ##
## -------------------------- ##

# Much of our low-level functionality needs to be sourced from external
# libraries, which are installed to $pkgauxdir.

# Set a version string for this script.
# The value is date-stamped; the trailing note marks it as UTC.
scriptversion=2015-01-20.17; # UTC

# General shell script boiler plate, and helper functions.
# Written by Gary V. Vaughan, 2004

# Copyright (C) 2004-2015 Free Software Foundation, Inc.
# This is free software; see the source for copying conditions.  There is NO
# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.

# As a special exception to the GNU General Public License, if you distribute
# this file as part of a program or library that is built using GNU Libtool,
# you may include this file under the same distribution terms that you use
# for the rest of that program.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Please report bugs or propose patches to gary@gnu.org.


## ------ ##
## Usage. ##
## ------ ##

# Evaluate this file near the top of your script to gain access to
# the functions and variables defined here:
#
#   . `echo "$0" | ${SED-sed} 's|[^/]*$||'`/build-aux/funclib.sh
#
# If you need to override any of the default environment variable
# settings, do that before evaluating this file.


## -------------------- ##
## Shell normalisation. ##
## -------------------- ##

# Some shells need a little help to be as Bourne compatible as possible.
# Before doing anything else, make sure all that help has been provided!

# Export DUALCASE for the benefit of MKS sh.
DUALCASE=1; export DUALCASE # for MKS sh
# When running under zsh (and it accepts 'emulate sh'), switch it to sh
# emulation; for every other shell, turn on posix mode if 'set -o' lists
# such an option.
if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then :
  emulate sh
  NULLCMD=:
  # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
  # is contrary to our usage.  Disable this feature.
  alias -g '${1+"$@"}'='"$@"'
  setopt NO_GLOB_SUBST
else
  case `(set -o) 2>/dev/null` in *posix*) set -o posix ;; esac
fi

# NLS nuisances: We save the old values in case they are required later.
#
# For every locale variable that is currently set, this loop:
#   * copies its value to save_<VAR>,
#   * forces <VAR>=C and exports it,
#   * prepends "<VAR>=$save_<VAR>; " to $_G_user_locale, and
#   * prepends "<VAR>=C; " to $_G_safe_locale.
# The two accumulated strings are shell snippets that can later be
# eval'ed to switch to the user's locale and back to the C locale.
_G_user_locale=
_G_safe_locale=
for _G_var in LANG LANGUAGE LC_ALL LC_CTYPE LC_COLLATE LC_MESSAGES
do
  eval "if test set = \"\${$_G_var+set}\"; then
          save_$_G_var=\$$_G_var
          $_G_var=C
	  export $_G_var
	  _G_user_locale=\"$_G_var=\\\$save_\$_G_var; \$_G_user_locale\"
	  _G_safe_locale=\"$_G_var=C; \$_G_safe_locale\"
	fi"
done

# CDPATH.  Try the unset in a subshell first, and only unset CDPATH in
# this shell when that trial succeeded.
(unset CDPATH) >/dev/null 2>&1 && unset CDPATH

# Make sure IFS has a sensible default: space, tab and newline.  $sp and
# $nl stay defined afterwards, and $nl is used by functions below to
# split messages into lines.
sp=' '
nl='
'
IFS="$sp	$nl"

# There are apparently some systems that use ';' as a PATH separator!
# Unless the caller already set PATH_SEPARATOR, default to ':' and switch
# to ';' only when 'sh' can be found through a ';'-separated PATH but
# not through the equivalent ':'-separated one.
if test "${PATH_SEPARATOR+set}" != set; then
  PATH_SEPARATOR=:
  (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
    (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
      PATH_SEPARATOR=';'
  }
fi


## ------------------------- ##
## Locate command utilities. ##
## ------------------------- ##


# func_executable_p FILE
# ----------------------
# Succeed only when FILE is a regular file that is also executable.
func_executable_p ()
{
    if test -f "$1"; then
      test -x "$1"
    else
      return 1
    fi
}


# func_path_progs PROGS_LIST CHECK_FUNC [PATH]
# --------------------------------------------
# Search the directories of PATH (default: $PATH, split on
# $PATH_SEPARATOR) for the programs named in the space separated
# PROGS_LIST, and leave the selected candidate in
# $func_path_progs_result.
#
# A candidate whose '--version' output contains "GNU" is accepted
# immediately and ends the search.  Any other candidate is passed to
# CHECK_FUNC, which should accept the path to the candidate program and
# set $func_check_prog_result when that candidate truncates its output
# less than the best one seen so far (tracked in $_G_path_prog_max).
#
# If no candidate is acceptable, a diagnostic naming PROGS_LIST is
# written to stderr and the script exits with status 1.
func_path_progs ()
{
    _G_progs_list=$1
    _G_check_func=$2
    _G_PATH=${3-"$PATH"}

    # Forget anything left behind by an earlier search, otherwise a stale
    # result (say, the sed that was found first) would be reported as the
    # outcome of a search that actually found nothing.
    func_path_progs_result=
    func_check_prog_result=

    _G_path_prog_max=0
    _G_path_prog_found=false
    _G_save_IFS=$IFS; IFS=${PATH_SEPARATOR-:}
    for _G_dir in $_G_PATH; do
      IFS=$_G_save_IFS
      # An empty PATH element means the current directory.
      test -z "$_G_dir" && _G_dir=.
      for _G_prog_name in $_G_progs_list; do
        for _exeext in '' .EXE; do
          _G_path_prog=$_G_dir/$_G_prog_name$_exeext
          func_executable_p "$_G_path_prog" || continue
          case `"$_G_path_prog" --version 2>&1` in
            *GNU*) func_path_progs_result=$_G_path_prog _G_path_prog_found=: ;;
            *)     # Quote the candidate so directories containing
                   # whitespace reach CHECK_FUNC as a single argument.
                   $_G_check_func "$_G_path_prog"
                   func_path_progs_result=$func_check_prog_result
                   ;;
          esac
          $_G_path_prog_found && break 3
        done
      done
    done
    IFS=$_G_save_IFS

    # Use 'if' rather than '&&' so that a successful search returns 0.
    if test -z "$func_path_progs_result"; then
      echo "no acceptable program among '$_G_progs_list' could be found in \$PATH" >&2
      exit 1
    fi
}


# We want to be able to use the functions in this file before configure
# has figured out where the best binaries are kept, which means we have
# to search for them ourselves - except when the results are already set
# where we skip the searches.

# Unless the user overrides by setting SED, search the path for either GNU
# sed, or the sed that truncates its output the least.
test -z "$SED" && {
  # Build a long sed script (the same substitution repeated, capped at
  # 99 lines) that candidate seds must be able to load with -f.
  _G_sed_script=s/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb/
  for _G_i in 1 2 3 4 5 6 7; do
    _G_sed_script=$_G_sed_script$nl$_G_sed_script
  done
  echo "$_G_sed_script" 2>/dev/null | sed 99q >conftest.sed
  _G_sed_script=

  # func_check_prog_sed CANDIDATE
  # Feed CANDIDATE ever longer single-line inputs (doubling each round)
  # and count how many rounds it reproduces unchanged.  A candidate that
  # beats $_G_path_prog_max is recorded in $func_check_prog_result.
  func_check_prog_sed ()
  {
    _G_path_prog=$1

    _G_count=0
    printf 0123456789 >conftest.in
    while :
    do
      cat conftest.in conftest.in >conftest.tmp
      mv conftest.tmp conftest.in
      cp conftest.in conftest.nl
      echo '' >> conftest.nl
      "$_G_path_prog" -f conftest.sed <conftest.nl >conftest.out 2>/dev/null || break
      diff conftest.out conftest.nl >/dev/null 2>&1 || break
      _G_count=`expr $_G_count + 1`
      if test "$_G_count" -gt "$_G_path_prog_max"; then
        # Best one so far, save it but keep looking for a better one
        func_check_prog_result=$_G_path_prog
        _G_path_prog_max=$_G_count
      fi
      # 10*(2^10) chars as input seems more than enough
      test 10 -lt "$_G_count" && break
    done
    rm -f conftest.in conftest.tmp conftest.nl conftest.out
  }

  # Pass the search path as ONE quoted argument (an unquoted $PATH is
  # word-split and globbed, which truncates it at the first blank), and
  # join the extra directory with the separator func_path_progs splits on.
  func_path_progs "sed gsed" func_check_prog_sed "$PATH${PATH_SEPARATOR-:}/usr/xpg4/bin"
  rm -f conftest.sed
  SED=$func_path_progs_result
}


# Unless the user overrides by setting GREP, search the path for either GNU
# grep, or the grep that truncates its output the least.
test -z "$GREP" && {
  # func_check_prog_grep CANDIDATE
  # Feed CANDIDATE ever longer inputs (doubling each round) whose last
  # line is 'GREP', and count how many rounds it reproduces unchanged.
  # A candidate that beats $_G_path_prog_max is recorded in
  # $func_check_prog_result.  $_G_path_prog_max is initialised by
  # func_path_progs and deliberately NOT reset here, so that the
  # comparison really is against the best candidate seen so far.
  func_check_prog_grep ()
  {
    _G_path_prog=$1

    _G_count=0
    printf 0123456789 >conftest.in
    while :
    do
      cat conftest.in conftest.in >conftest.tmp
      mv conftest.tmp conftest.in
      cp conftest.in conftest.nl
      echo 'GREP' >> conftest.nl
      "$_G_path_prog" -e 'GREP$' -e '-(cannot match)-' <conftest.nl >conftest.out 2>/dev/null || break
      diff conftest.out conftest.nl >/dev/null 2>&1 || break
      _G_count=`expr $_G_count + 1`
      if test "$_G_count" -gt "$_G_path_prog_max"; then
        # Best one so far, save it but keep looking for a better one
        func_check_prog_result=$_G_path_prog
        _G_path_prog_max=$_G_count
      fi
      # 10*(2^10) chars as input seems more than enough
      test 10 -lt "$_G_count" && break
    done
    rm -f conftest.in conftest.tmp conftest.nl conftest.out
  }

  # Pass the search path as ONE quoted argument (an unquoted $PATH is
  # word-split and globbed, which truncates it at the first blank), and
  # join the extra directory with the separator func_path_progs splits on.
  func_path_progs "grep ggrep" func_check_prog_grep "$PATH${PATH_SEPARATOR-:}/usr/xpg4/bin"
  GREP=$func_path_progs_result
}


## ------------------------------- ##
## User overridable command paths. ##
## ------------------------------- ##

# All uppercase variable names are used for environment variables.  These
# variables can be overridden by the user before calling a script that
# uses them if a suitable command of that name is not already available
# in the command search PATH.

# Each default below is applied only when the variable is entirely unset;
# a value supplied by the caller (even an empty one) is left alone.
test set = "${CP+set}"    || CP='cp -f'
test set = "${ECHO+set}"  || ECHO='printf %s\n'
test set = "${EGREP+set}" || EGREP="$GREP -E"
test set = "${FGREP+set}" || FGREP="$GREP -F"
test set = "${LN_S+set}"  || LN_S='ln -s'
test set = "${MAKE+set}"  || MAKE=make
test set = "${MKDIR+set}" || MKDIR=mkdir
test set = "${MV+set}"    || MV='mv -f'
test set = "${RM+set}"    || RM='rm -f'
test set = "${SHELL+set}" || SHELL=${CONFIG_SHELL-/bin/sh}


## -------------------- ##
## Useful sed snippets. ##
## -------------------- ##

# Strip the final '/component' from a path, and strip everything up to
# and including the last '/', respectively.
sed_dirname='s|/[^/]*$||'
sed_basename='s|^.*/||'

# Sed substitution that helps us do robust quoting.  It backslashifies
# metacharacters that are still active within double-quoted strings.
sed_quote_subst='s|\([`"$\\]\)|\\\1|g'

# Same as above, but do not quote variable references.
sed_double_quote_subst='s/\(["`\\]\)/\\\1/g'

# Sed substitution that turns a string into a regex matching for the
# string literally.
sed_make_literal_regex='s|[].[^$\\*\/]|\\&|g'

# Sed substitution that converts a w32 file name or path
# that contains forward slashes, into one that contains
# (escaped) backslashes.  A very naive implementation.
sed_naive_backslashify='s|\\\\*|\\|g;s|/|\\|g;s|\\|\\\\|g'

# Re-'\' parameter expansions in output of sed_double_quote_subst that
# were '\'-ed in input to the same.  If an odd number of '\' preceded a
# '$' in input to sed_double_quote_subst, that '$' was protected from
# expansion.  Since each input '\' is now two '\'s, look for any number
# of runs of four '\'s followed by two '\'s and then a '$'.  '\' that '$'.
#
# The _G_bs* helpers hold one, two and four regex-escaped backslashes and
# _G_dollar an escaped '$'; they are only building blocks for the
# multi-line sed program assembled in sed_double_backslash.
_G_bs='\\'
_G_bs2='\\\\'
_G_bs4='\\\\\\\\'
_G_dollar='\$'
sed_double_backslash="\
  s/$_G_bs4/&\\
/g
  s/^$_G_bs2$_G_dollar/$_G_bs&/
  s/\\([^$_G_bs]\\)$_G_bs2$_G_dollar/\\1$_G_bs2$_G_bs$_G_dollar/g
  s/\n//g"


## ----------------- ##
## Global variables. ##
## ----------------- ##

# Except for the global variables explicitly listed below, the following
# functions in the '^func_' namespace, and the '^require_' namespace
# variables initialised in the 'Resource management' section, sourcing
# this file will not pollute your global namespace with anything
# else. There's no portable way to scope variables in Bourne shell
# though, so actually running these functions will sometimes place
# results into a variable named after the function, and often use
# temporary variables in the '^_G_' namespace. If you are careful to
# avoid using those namespaces casually in your sourcing script, things
# should continue to work as you expect. And, of course, you can freely
# overwrite any of the functions or variables defined here before
# calling anything to customize them.

# Exit status values used by the functions in this file and by scripts
# that source it.
EXIT_SUCCESS=0
EXIT_FAILURE=1
EXIT_MISMATCH=63  # $? = 63 is used to indicate version mismatch to missing.
EXIT_SKIP=77	  # $? = 77 is used to indicate a skipped test to automake.

# Allow overriding, eg assuming that you follow the convention of
# putting '$debug_cmd' at the start of all your functions, you can get
# bash to show function call trace with:
#
#    debug_cmd='eval echo "${FUNCNAME[0]} $*" >&2' bash your-script-name
#
# When debug_cmd is not already set it defaults to the no-op ':'.
debug_cmd=${debug_cmd-":"}
exit_cmd=:

# By convention, finish your script with:
#
#    exit $exit_status
#
# so that you can set exit_status to non-zero if you want to indicate
# something went wrong during execution without actually bailing out at
# the point of failure.
exit_status=$EXIT_SUCCESS

# Work around backward compatibility issue on IRIX 6.5. On IRIX 6.4+, sh
# is ksh but when the shell is invoked as "sh" and the current value of
# the _XPG environment variable is not equal to 1 (one), the special
# positional parameter $0, within a function call, is the name of the
# function.
progpath=$0

# The name of this program.
progname=`$ECHO "$progpath" |$SED "$sed_basename"`

# Make sure we have an absolute progpath for reexecution:
case $progpath in
  # Already absolute (POSIX style, or a drive-letter path).
  [\\/]*|[A-Za-z]:\\*) ;;
  # Relative path containing a directory part: resolve that directory.
  *[\\/]*)
     progdir=`$ECHO "$progpath" |$SED "$sed_dirname"`
     progdir=`cd "$progdir" && pwd`
     progpath=$progdir/$progname
     ;;
  # Bare program name: look it up along $PATH.
  *)
     _G_IFS=$IFS
     IFS=${PATH_SEPARATOR-:}
     _G_progdir_found=no
     for progdir in $PATH; do
       IFS=$_G_IFS
       test -x "$progdir/$progname" && { _G_progdir_found=yes; break; }
     done
     IFS=$_G_IFS
     # When the loop ran to completion without a hit, $progdir merely
     # holds the last PATH entry; discard it so that we fall back to the
     # current directory instead of fabricating a bogus location.
     test yes = "$_G_progdir_found" || progdir=
     test -n "$progdir" || progdir=`pwd`
     progpath=$progdir/$progname
     ;;
esac


## ----------------- ##
## Standard options. ##
## ----------------- ##

# The following options affect the operation of the functions defined
# below, and should be set appropriately depending on run-time para-
# meters passed on the command line.

# Each of these holds a command name (':' or 'false') and is consulted
# by the functions below either by executing it ('$opt_quiet || ...') or
# by comparing it against ':'.
opt_dry_run=false
opt_quiet=false
opt_verbose=false

# Categories 'all' and 'none' are always available.  Append any others
# you will pass as the first argument to func_warning from your own
# code.
warning_categories=

# By default, display warnings according to 'opt_warning_types'.  Set
# 'warning_func'  to ':' to elide all warnings, or func_fatal_error to
# treat the next displayed warning as a fatal error.
warning_func=func_warn_and_continue

# Set to 'all' to display all warnings, 'none' to suppress all
# warnings, or a space delimited list of some subset of
# 'warning_categories' to display only the listed warnings.
opt_warning_types=all


## -------------------- ##
## Resource management. ##
## -------------------- ##

# This section contains definitions for functions that each ensure a
# particular resource (a file, or a non-empty configuration variable for
# example) is available, and if appropriate to extract default values
# from pertinent package files. Call them using their associated
# 'require_*' variable to ensure that they are executed, at most, once.
#
# It's entirely deliberate that calling these functions can set
# variables that don't obey the namespace limitations obeyed by the rest
# of this file, in order that they be as useful as possible to
# callers.


# require_term_colors
# -------------------
# Allow display of bold text on terminals that support it.
#
# When stdout is a terminal, populate the tc_* variables (tc_reset,
# tc_bold, tc_standout, tc_red, tc_green, tc_blue, tc_cyan) with the
# control sequences for the corresponding text attributes; variables for
# attributes that cannot be determined are left untouched.  Afterwards
# $require_term_colors is set to ':' so the probe runs at most once.
require_term_colors=func_require_term_colors
func_require_term_colors ()
{
    $debug_cmd

    test -t 1 && {
      # COLORTERM and USE_ANSI_COLORS environment variables take
      # precedence, because most terminfo databases neglect to describe
      # whether color sequences are supported.
      test -n "${COLORTERM+set}" && : ${USE_ANSI_COLORS="1"}

      if test 1 = "$USE_ANSI_COLORS"; then
        # Standard ANSI escape sequences.  The leading ESC byte is
        # produced with printf instead of being embedded raw in this
        # file: a raw control character is invisible and easily lost,
        # and without it the "sequences" are printed as literal text.
        _G_esc=`printf '\033'`
        tc_reset="${_G_esc}[0m"
        tc_bold="${_G_esc}[1m";   tc_standout="${_G_esc}[7m"
        tc_red="${_G_esc}[31m";   tc_green="${_G_esc}[32m"
        tc_blue="${_G_esc}[34m";  tc_cyan="${_G_esc}[36m"
      else
        # Otherwise trust the terminfo database after all.
        test -n "`tput sgr0 2>/dev/null`" && {
          tc_reset=`tput sgr0`
          test -n "`tput bold 2>/dev/null`" && tc_bold=`tput bold`
          tc_standout=$tc_bold
          test -n "`tput smso 2>/dev/null`" && tc_standout=`tput smso`
          test -n "`tput setaf 1 2>/dev/null`" && tc_red=`tput setaf 1`
          test -n "`tput setaf 2 2>/dev/null`" && tc_green=`tput setaf 2`
          test -n "`tput setaf 4 2>/dev/null`" && tc_blue=`tput setaf 4`
          # ANSI color 6 is cyan (5 is magenta), matching the '36m'
          # sequence used in the ANSI branch above.
          test -n "`tput setaf 6 2>/dev/null`" && tc_cyan=`tput setaf 6`
        }
      fi
    }

    require_term_colors=:
}


## ----------------- ##
## Function library. ##
## ----------------- ##

# This section contains a variety of useful functions to call in your
# scripts. Take note of the portable wrappers for features provided by
# some modern shells, which will fall back to slower equivalents on
# less featureful shells.


# func_append VAR VALUE
# ---------------------
# Append VALUE onto the existing contents of VAR.
#
# Two implementations are provided; which one is defined depends on
# whether the running shell supports the '+=' assignment operator
# (recorded as _G_HAVE_PLUSEQ_OP=yes).

  # We should try to minimise forks, especially on Windows where they are
  # unreasonably slow, so skip the feature probes when bash or zsh are
  # being used:
  if test set = "${BASH_VERSION+set}${ZSH_VERSION+set}"; then
    : ${_G_HAVE_ARITH_OP="yes"}
    : ${_G_HAVE_XSI_OPS="yes"}
    # The += operator was introduced in bash 3.1
    case $BASH_VERSION in
      [12].* | 3.0 | 3.0*) ;;
      *)
        : ${_G_HAVE_PLUSEQ_OP="yes"}
        ;;
    esac
  fi

  # _G_HAVE_PLUSEQ_OP
  # Can be empty, in which case the shell is probed, "yes" if += is
  # useable or anything else if it does not work.
  test -z "$_G_HAVE_PLUSEQ_OP" \
    && (eval 'x=a; x+=" b"; test "a b" = "$x"') 2>/dev/null \
    && _G_HAVE_PLUSEQ_OP=yes

if test yes = "$_G_HAVE_PLUSEQ_OP"
then
  # This shell supports '+=', allowing a faster implementation.  The
  # definition is hidden inside eval so that shells without '+=' never
  # have to parse it...
  eval 'func_append ()
  {
    $debug_cmd

    eval "$1+=\$2"
  }'
else
  # ...otherwise fall back to re-assigning VAR with VALUE appended to
  # its current contents.
  func_append ()
  {
    $debug_cmd

    eval "$1=\$$1\$2"
  }
fi


# func_append_quoted VAR VALUE
# ----------------------------
# Quote VALUE and append to the end of shell variable VAR, separated
# by a space.
#
# VALUE is quoted with func_quote_for_eval; a space followed by
# $func_quote_for_eval_result is what gets appended.  As for func_append,
# the '+=' form is used when _G_HAVE_PLUSEQ_OP is 'yes', and it is hidden
# inside eval so other shells never parse it.
if test yes = "$_G_HAVE_PLUSEQ_OP"; then
  eval 'func_append_quoted ()
  {
    $debug_cmd

    func_quote_for_eval "$2"
    eval "$1+=\\ \$func_quote_for_eval_result"
  }'
else
  func_append_quoted ()
  {
    $debug_cmd

    func_quote_for_eval "$2"
    eval "$1=\$$1\\ \$func_quote_for_eval_result"
  }
fi


# func_append_uniq VAR VALUE
# --------------------------
# Append unique VALUE onto the existing contents of VAR, assuming
# entries are delimited by the first character of VALUE.  For example:
#
#   func_append_uniq options " --another-option option-argument"
#
# will only append to $options if " --another-option option-argument "
# is not already present somewhere in $options already (note spaces at
# each end implied by leading space in second argument).
func_append_uniq ()
{
    $debug_cmd

    # Read VAR's contents verbatim.  Piping an unquoted expansion through
    # $ECHO would word-split (and glob) the value, and with
    # ECHO='printf %s\n' the words come back joined by newlines, so the
    # delimiter-based match below would never recognise an existing entry.
    eval "_G_current_value=\$$1"

    # First character of VALUE; the 'X' guard keeps expr from mistaking a
    # VALUE that starts with '-' (or looks like an operator) for syntax.
    _G_delim=`expr "X$2" : 'X\(.\)'`

    case $_G_delim$_G_current_value$_G_delim in
      *"$2$_G_delim"*) ;;
      *) func_append "$@" ;;
    esac
}


# func_arith TERM...
# ------------------
# Set func_arith_result to the result of evaluating TERMs.
#
# Unless _G_HAVE_ARITH_OP is already set, probe (in a subshell) whether
# the shell understands $(( ... )) arithmetic expansion.
  test -z "$_G_HAVE_ARITH_OP" \
    && (eval 'test 2 = $(( 1 + 1 ))') 2>/dev/null \
    && _G_HAVE_ARITH_OP=yes

if test yes = "$_G_HAVE_ARITH_OP"; then
  # Built-in arithmetic; hidden in eval so that shells lacking the
  # syntax never have to parse it.
  eval 'func_arith ()
  {
    $debug_cmd

    func_arith_result=$(( $* ))
  }'
else
  # No arithmetic expansion available: delegate to expr, passing each
  # TERM as a separate argument.
  func_arith ()
  {
    $debug_cmd

    func_arith_result=`expr "$@"`
  }
fi


# func_basename FILE
# ------------------
# Set func_basename_result to FILE with everything up to and including
# the last / stripped.
#
# The function bodies for func_basename ($_b) and func_dirname ($_d) are
# prepared here as strings and spliced into the eval'ed definitions
# below; func_dirname and func_dirname_and_basename reuse them.
if test yes = "$_G_HAVE_XSI_OPS"; then
  # If this shell supports suffix pattern removal, then use it to avoid
  # forking. Hide the definitions in single quotes in case the shell
  # chokes on unsupported syntax...
  _b='func_basename_result=${1##*/}'
  _d='case $1 in
        */*) func_dirname_result=${1%/*}$2 ;;
        *  ) func_dirname_result=$3        ;;
      esac'

else
  # ...otherwise fall back to using sed.
  _b='func_basename_result=`$ECHO "$1" |$SED "$sed_basename"`'
  _d='func_dirname_result=`$ECHO "$1"  |$SED "$sed_dirname"`
      if test "X$func_dirname_result" = "X$1"; then
        func_dirname_result=$3
      else
        func_append func_dirname_result "$2"
      fi'
fi

eval 'func_basename ()
{
    $debug_cmd

    '"$_b"'
}'


# func_dirname FILE APPEND NONDIR_REPLACEMENT
# -------------------------------------------
# Compute the dirname of FILE.  If nonempty, add APPEND to the result,
# otherwise set result to NONDIR_REPLACEMENT.
# The value is returned in $func_dirname_result.  The body is the $_d
# snippet prepared alongside func_basename.
eval 'func_dirname ()
{
    $debug_cmd

    '"$_d"'
}'


# func_dirname_and_basename FILE APPEND NONDIR_REPLACEMENT
# --------------------------------------------------------
# Perform func_basename and func_dirname in a single function
# call:
#   dirname:  Compute the dirname of FILE.  If nonempty,
#             add APPEND to the result, otherwise set result
#             to NONDIR_REPLACEMENT.
#             value returned in "$func_dirname_result"
#   basename: Compute filename of FILE.
#             value returned in "$func_basename_result"
# For efficiency, we do not delegate to the functions above but instead
# duplicate the functionality here.
eval 'func_dirname_and_basename ()
{
    $debug_cmd

    '"$_b"'
    '"$_d"'
}'


# func_echo ARG...
# ----------------
# Echo program name prefixed message.
# The ARGs are joined into one message, which is then split at newlines;
# every resulting line is printed with $ECHO, prefixed by "$progname: ".
func_echo ()
{
    $debug_cmd

    _G_message=$*

    # Split on newlines only, and restore IFS inside the loop so that
    # $ECHO itself runs with the caller's IFS.
    func_echo_IFS=$IFS
    IFS=$nl
    for _G_line in $_G_message; do
      IFS=$func_echo_IFS
      $ECHO "$progname: $_G_line"
    done
    IFS=$func_echo_IFS
}


# func_echo_all ARG...
# --------------------
# Invoke $ECHO with all args, space-separated.
# The ARGs are passed to $ECHO as a single "$*" word, with no program
# name prefix.
func_echo_all ()
{
    $ECHO "$*"
}


# func_echo_infix_1 INFIX ARG...
# ------------------------------
# Echo program name, followed by INFIX on the first line, with any
# additional lines not showing INFIX.
# Output goes to standard error.  Continuation lines are indented so
# that their text lines up under the text of the first line.
func_echo_infix_1 ()
{
    $debug_cmd

    $require_term_colors

    _G_infix=$1; shift
    _G_indent=$_G_infix
    _G_prefix="$progname: $_G_infix: "
    _G_message=$*

    # Strip color escape sequences before counting printable length
    for _G_tc in "$tc_reset" "$tc_bold" "$tc_standout" "$tc_red" "$tc_green" "$tc_blue" "$tc_cyan"
    do
      test -n "$_G_tc" && {
        _G_esc_tc=`$ECHO "$_G_tc" | $SED "$sed_make_literal_regex"`
        _G_indent=`$ECHO "$_G_indent" | $SED "s|$_G_esc_tc||g"`
      }
    done
    # Turn every remaining character of INFIX into a blank, and add two
    # more blanks to stand in for the ": " that follows INFIX.
    _G_indent="$progname: "`echo "$_G_indent" | $SED 's|.| |g'`"  " ## exclude from sc_prohibit_nested_quotes

    # Print the message line by line; only the first line carries the
    # INFIX prefix, later ones use the blank indent computed above.
    func_echo_infix_1_IFS=$IFS
    IFS=$nl
    for _G_line in $_G_message; do
      IFS=$func_echo_infix_1_IFS
      $ECHO "$_G_prefix$tc_bold$_G_line$tc_reset" >&2
      _G_prefix=$_G_indent
    done
    IFS=$func_echo_infix_1_IFS
}


# func_error ARG...
# -----------------
# Echo program name prefixed message to standard error.
# The message is tagged with the word "error", wrapped in the standout
# and red terminal sequences when those are available.
func_error ()
{
    $debug_cmd

    $require_term_colors

    func_echo_infix_1 "  $tc_standout${tc_red}error$tc_reset" "$*" >&2
}


# func_fatal_error ARG...
# -----------------------
# Echo program name prefixed message to standard error, and exit.
# The message is reported through func_error and the exit status is
# $EXIT_FAILURE.
func_fatal_error ()
{
    $debug_cmd

    func_error "$*"
    exit $EXIT_FAILURE
}


# func_grep EXPRESSION FILENAME
# -----------------------------
# Check whether EXPRESSION matches any line of FILENAME, without output.
# The return status is that of $GREP; both its stdout and stderr are
# discarded.
func_grep ()
{
    $debug_cmd

    $GREP "$1" "$2" >/dev/null 2>&1
}


# func_len STRING
# ---------------
# Set func_len_result to the length of STRING. STRING may not
# start with a hyphen.
#
# Unless _G_HAVE_XSI_OPS is already set, probe (in a subshell) whether
# the shell supports ${#var} and the #, ##, % and %% pattern-removal
# expansions.
  test -z "$_G_HAVE_XSI_OPS" \
    && (eval 'x=a/b/c;
      test 5aa/bb/cc = "${#x}${x%%/*}${x%/*}${x#*/}${x##*/}"') 2>/dev/null \
    && _G_HAVE_XSI_OPS=yes

if test yes = "$_G_HAVE_XSI_OPS"; then
  # Hidden in eval so that shells lacking ${#1} never parse it.
  eval 'func_len ()
  {
    $debug_cmd

    func_len_result=${#1}
  }'
else
  # Count characters with expr; if expr fails, fall back to the value
  # of $max_cmd_len (which is not set anywhere in this file).
  func_len ()
  {
    $debug_cmd

    func_len_result=`expr "$1" : ".*" 2>/dev/null || echo $max_cmd_len`
  }
fi


# func_mkdir_p DIRECTORY-PATH
# ---------------------------
# Make sure the entire path to DIRECTORY-PATH is available.
# Missing leading directories are created with $MKDIR, outermost first.
# Nothing is done when DIRECTORY-PATH is empty or in dry-run mode
# ($opt_dry_run is ':').  If DIRECTORY-PATH still does not exist
# afterwards, func_fatal_error is called.
func_mkdir_p ()
{
    $debug_cmd

    _G_directory_path=$1
    _G_dir_list=

    if test -n "$_G_directory_path" && test : != "$opt_dry_run"; then

      # Protect directory names starting with '-'
      case $_G_directory_path in
        -*) _G_directory_path=./$_G_directory_path ;;
      esac

      # Remember what was actually asked for: the loop below shortens
      # _G_directory_path to its nearest existing ancestor (or to the
      # empty string for a first-level absolute path), so it can no
      # longer be used for the final existence check.
      _G_requested_path=$_G_directory_path

      # While some portion of DIR does not yet exist...
      while test ! -d "$_G_directory_path"; do
        # ...make a list in topmost first order.  Use a colon delimited
        # list in case some portion of path contains whitespace.
        _G_dir_list=$_G_directory_path:$_G_dir_list

        # If the last portion added has no slash in it, the list is done
        case $_G_directory_path in */*) ;; *) break ;; esac

        # ...otherwise throw away the child directory and loop
        _G_directory_path=`$ECHO "$_G_directory_path" | $SED -e "$sed_dirname"`
      done
      _G_dir_list=`$ECHO "$_G_dir_list" | $SED 's|:*$||'`

      func_mkdir_p_IFS=$IFS; IFS=:
      for _G_dir in $_G_dir_list; do
        IFS=$func_mkdir_p_IFS
        # mkdir can fail with a 'File exist' error if two processes
        # try to create one of the directories concurrently.  Don't
        # stop in that case!
        $MKDIR "$_G_dir" 2>/dev/null || :
      done
      IFS=$func_mkdir_p_IFS

      # Bail out if we (or some other process) failed to create a directory.
      test -d "$_G_requested_path" || \
        func_fatal_error "Failed to create '$1'"
    fi
}


# func_mktempdir [BASENAME]
# -------------------------
# Print the name of a fresh temporary directory located under
# ${TMPDIR-/tmp}, named after BASENAME (default: $progname) so that it
# won't clash with other running libtool processes, avoiding race
# conditions if possible.  In dry-run mode only a name is produced and
# nothing is created.
func_mktempdir ()
{
    $debug_cmd

    _G_template=${TMPDIR-/tmp}/${1-$progname}

    case $opt_dry_run in
      :)
        # Dry run: hand back a plausible name without touching the disk.
        _G_tmpdir=$_G_template-$$
        ;;
      *)
        # Prefer mktemp whenever it does the job.
        _G_tmpdir=`mktemp -d "$_G_template-XXXXXXXX" 2>/dev/null`

        if test -d "$_G_tmpdir"; then
          :
        else
          # No luck; at least mix in $RANDOM to make a race less likely,
          # and create the directory with a restrictive umask.
          _G_tmpdir=$_G_template-${RANDOM-0}$$

          func_mktempdir_umask=`umask`
          umask 0077
          $MKDIR "$_G_tmpdir"
          umask $func_mktempdir_umask
        fi

        # Outside dry-run mode a missing directory is fatal.
        test -d "$_G_tmpdir" || \
          func_fatal_error "cannot create temporary directory '$_G_tmpdir'"
        ;;
    esac

    $ECHO "$_G_tmpdir"
}


# func_normal_abspath PATH
# ------------------------
# Remove doubled-up and trailing slashes, "." path components,
# and cancel out any ".." path components in PATH after making
# it an absolute path.
# The value is returned in $func_normal_abspath_result.  The work is
# purely textual: relative paths are anchored at `pwd`, and no
# filesystem lookups are made on the components themselves.
func_normal_abspath ()
{
    $debug_cmd

    # These SED scripts presuppose an absolute path with a trailing slash.
    # _G_pathcar yields the first component, _G_pathcdr removes it.
    _G_pathcar='s|^/\([^/]*\).*$|\1|'
    _G_pathcdr='s|^/[^/]*||'
    _G_removedotparts=':dotsl
		s|/\./|/|g
		t dotsl
		s|/\.$|/|'
    _G_collapseslashes='s|/\{1,\}|/|g'
    _G_finalslash='s|/*$|/|'

    # Start from root dir and reassemble the path.
    func_normal_abspath_result=
    func_normal_abspath_tpath=$1
    func_normal_abspath_altnamespace=
    case $func_normal_abspath_tpath in
      "")
        # Empty path, that just means $cwd.
        func_stripname '' '/' "`pwd`"
        func_normal_abspath_result=$func_stripname_result
        return
        ;;
      # The next three entries are used to spot a run of precisely
      # two leading slashes without using negated character classes;
      # we take advantage of case's first-match behaviour.
      ///*)
        # Unusual form of absolute path, do nothing.
        ;;
      //*)
        # Not necessarily an ordinary path; POSIX reserves leading '//'
        # and for example Cygwin uses it to access remote file shares
        # over CIFS/SMB, so we conserve a leading double slash if found.
        func_normal_abspath_altnamespace=/
        ;;
      /*)
        # Absolute path, do nothing.
        ;;
      *)
        # Relative path, prepend $cwd.
        func_normal_abspath_tpath=`pwd`/$func_normal_abspath_tpath
        ;;
    esac

    # Cancel out all the simple stuff to save iterations.  We also want
    # the path to end with a slash for ease of parsing, so make sure
    # there is one (and only one) here.
    func_normal_abspath_tpath=`$ECHO "$func_normal_abspath_tpath" | $SED \
          -e "$_G_removedotparts" -e "$_G_collapseslashes" -e "$_G_finalslash"`
    # Consume the path one leading component per iteration, building the
    # result up as we go.
    while :; do
      # Processed it all yet?
      if test / = "$func_normal_abspath_tpath"; then
        # If we ascended to the root using ".." the result may be empty now.
        if test -z "$func_normal_abspath_result"; then
          func_normal_abspath_result=/
        fi
        break
      fi
      func_normal_abspath_tcomponent=`$ECHO "$func_normal_abspath_tpath" | $SED \
          -e "$_G_pathcar"`
      func_normal_abspath_tpath=`$ECHO "$func_normal_abspath_tpath" | $SED \
          -e "$_G_pathcdr"`
      # Figure out what to do with it
      case $func_normal_abspath_tcomponent in
        "")
          # Trailing empty path component, ignore it.
          ;;
        ..)
          # Parent dir; strip last assembled component from result.
          func_dirname "$func_normal_abspath_result"
          func_normal_abspath_result=$func_dirname_result
          ;;
        *)
          # Actual path component, append it.
          func_append func_normal_abspath_result "/$func_normal_abspath_tcomponent"
          ;;
      esac
    done
    # Restore leading double-slash if one was found on entry.
    func_normal_abspath_result=$func_normal_abspath_altnamespace$func_normal_abspath_result
}


# func_notquiet ARG...
# --------------------
# Pass ARGs on to func_echo (program name prefixed output) unless quiet
# mode is active, i.e. unless running $opt_quiet succeeds.
func_notquiet ()
{
    $debug_cmd

    if $opt_quiet; then
      :
    else
      func_echo ${1+"$@"}
    fi

    # Always finish on a successful command: a bug in bash halts the
    # script if the last line of a function fails while set -e is in
    # force.
    :
}


# func_relative_path SRCDIR DSTDIR
# --------------------------------
# Set func_relative_path_result to the relative path from SRCDIR to DSTDIR.
# Both arguments are first normalised with func_normal_abspath.  When
# the two directories are the same the result is '.'.
func_relative_path ()
{
    $debug_cmd

    func_relative_path_result=
    func_normal_abspath "$1"
    func_relative_path_tlibdir=$func_normal_abspath_result
    func_normal_abspath "$2"
    func_relative_path_tbindir=$func_normal_abspath_result

    # Ascend the tree starting from libdir
    while :; do
      # check if we have found a prefix of bindir
      # The directory name is quoted inside the patterns so that it is
      # compared literally even when it contains glob characters.
      # NOTE(review): the prefix test below is textual, so '/usr/lib'
      # also counts as a prefix of '/usr/lib64' -- confirm whether
      # callers can hit that.
      case $func_relative_path_tbindir in
        "$func_relative_path_tlibdir")
          # found an exact match
          func_relative_path_tcancelled=
          break
          ;;
        "$func_relative_path_tlibdir"*)
          # found a matching prefix
          func_stripname "$func_relative_path_tlibdir" '' "$func_relative_path_tbindir"
          func_relative_path_tcancelled=$func_stripname_result
          if test -z "$func_relative_path_result"; then
            func_relative_path_result=.
          fi
          break
          ;;
        *)
          # No match yet: step up one level (quoted, so a directory name
          # containing whitespace stays a single argument) and record
          # the ascent as one more '../'.
          func_dirname "$func_relative_path_tlibdir"
          func_relative_path_tlibdir=$func_dirname_result
          if test -z "$func_relative_path_tlibdir"; then
            # Have to descend all the way to the root!
            func_relative_path_result=../$func_relative_path_result
            func_relative_path_tcancelled=$func_relative_path_tbindir
            break
          fi
          func_relative_path_result=../$func_relative_path_result
          ;;
      esac
    done

    # Now calculate path; take care to avoid doubling-up slashes.
    func_stripname '' '/' "$func_relative_path_result"
    func_relative_path_result=$func_stripname_result
    func_stripname '/' '/' "$func_relative_path_tcancelled"
    if test -n "$func_stripname_result"; then
      func_append func_relative_path_result "/$func_stripname_result"
    fi

    # Normalisation. If bindir is libdir, return '.' else relative path.
    if test -n "$func_relative_path_result"; then
      func_stripname './' '' "$func_relative_path_result"
      func_relative_path_result=$func_stripname_result
    fi

    test -n "$func_relative_path_result" || func_relative_path_result=.

    # Finish on a successful command regardless of the test above.
    :
}


# func_quote_for_eval ARG...
# --------------------------
# Aesthetically quote ARGs to be evaled later.
# This function returns two values:
#   i) func_quote_for_eval_result
#      double-quoted, suitable for a subsequent eval
#  ii) func_quote_for_eval_unquoted_result
#      has all characters that are still active within double
#      quotes backslashified.
# With several ARGs, the per-argument results are joined with single
# spaces in both variables.
func_quote_for_eval ()
{
    $debug_cmd

    func_quote_for_eval_unquoted_result=
    func_quote_for_eval_result=
    while test 0 -lt $#; do
      # Only fork sed when the argument actually contains a backslash,
      # backquote, double quote or dollar sign.
      case $1 in
        *[\\\`\"\$]*)
	  _G_unquoted_arg=`printf '%s\n' "$1" |$SED "$sed_quote_subst"` ;;
        *)
          _G_unquoted_arg=$1 ;;
      esac
      if test -n "$func_quote_for_eval_unquoted_result"; then
	func_append func_quote_for_eval_unquoted_result " $_G_unquoted_arg"
      else
        func_append func_quote_for_eval_unquoted_result "$_G_unquoted_arg"
      fi

      case $_G_unquoted_arg in
        # Double-quote args containing shell metacharacters to delay
        # word splitting, command substitution and variable expansion
        # for a subsequent eval.
        # Many Bourne shells cannot handle close brackets correctly
        # in scan sets, so we specify it separately.
        *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
          _G_quoted_arg=\"$_G_unquoted_arg\"
          ;;
        *)
          _G_quoted_arg=$_G_unquoted_arg
	  ;;
      esac

      if test -n "$func_quote_for_eval_result"; then
	func_append func_quote_for_eval_result " $_G_quoted_arg"
      else
        func_append func_quote_for_eval_result "$_G_quoted_arg"
      fi
      shift
    done
}


# func_quote_for_expand ARG
# -------------------------
# Aesthetically quote ARG to be evaled later; same as above,
# but do not quote variable references.
# The value is returned in $func_quote_for_expand_result.
func_quote_for_expand ()
{
    $debug_cmd

    # Only fork sed when ARG contains a backslash, backquote or double
    # quote; '$' is deliberately left alone so it expands at eval time.
    case $1 in
      *[\\\`\"]*)
	_G_arg=`$ECHO "$1" | $SED \
	    -e "$sed_double_quote_subst" -e "$sed_double_backslash"` ;;
      *)
        _G_arg=$1 ;;
    esac

    case $_G_arg in
      # Double-quote args containing shell metacharacters to delay
      # word splitting and command substitution for a subsequent eval.
      # Many Bourne shells cannot handle close brackets correctly
      # in scan sets, so we specify it separately.
      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
        _G_arg=\"$_G_arg\"
        ;;
    esac

    func_quote_for_expand_result=$_G_arg
}


# func_stripname PREFIX SUFFIX NAME
# ---------------------------------
# strip PREFIX and SUFFIX from NAME, and store in func_stripname_result.
# PREFIX and SUFFIX must not contain globbing or regex special
# characters, hashes, percent signs, but SUFFIX may contain a leading
# dot (in which case that matches only a dot).
# Pick one of two implementations when this file is loaded: parameter
# expansion if _G_HAVE_XSI_OPS is 'yes', a sed pipeline otherwise.  The
# fast variant is defined through eval, presumably so that shells lacking
# these expansions never have to parse them.
if test yes = "$_G_HAVE_XSI_OPS"; then
  eval 'func_stripname ()
  {
    $debug_cmd

    # pdksh 5.2.14 does not do ${X%$Y} correctly if both X and Y are
    # positional parameters, so assign one to ordinary variable first.
    func_stripname_result=$3
    func_stripname_result=${func_stripname_result#"$1"}
    func_stripname_result=${func_stripname_result%"$2"}
  }'
else
  func_stripname ()
  {
    $debug_cmd

    # PREFIX ($1) and SUFFIX ($2) are spliced straight into sed regular
    # expressions.  A SUFFIX that starts with a dot gets that dot
    # backslash-escaped so that it only matches a literal dot; any other
    # SUFFIX is used as is.
    case $2 in
      .*) func_stripname_result=`$ECHO "$3" | $SED -e "s%^$1%%" -e "s%\\\\$2\$%%"`;;
      *)  func_stripname_result=`$ECHO "$3" | $SED -e "s%^$1%%" -e "s%$2\$%%"`;;
    esac
  }
fi


# func_show_eval CMD [FAIL_EXP]
# -----------------------------
# Output CMD unless opt_quiet is true.  Then, unless opt_dry_run is
# true, evaluate CMD.  If the evaluation of CMD fails, and FAIL_EXP
# is given, then evaluate it.
func_show_eval ()
{
    $debug_cmd

    _G_cmd=$1
    _G_fail_exp=${2-':'}

    # Announce the command (func_notquiet decides whether it is shown).
    func_quote_for_expand "$_G_cmd"
    eval "func_notquiet $func_quote_for_expand_result"

    # A dry run stops after the announcement.
    $opt_dry_run && return 0

    eval "$_G_cmd"
    _G_status=$?

    # On failure, re-create the failing status in a subshell so that
    # FAIL_EXP can inspect $? before it runs.
    test 0 -eq "$_G_status" || eval "(exit $_G_status); $_G_fail_exp"
}


# func_show_eval_locale CMD [FAIL_EXP]
# ------------------------------------
# Output CMD unless opt_quiet is true.  Then, unless opt_dry_run is
# true, evaluate CMD.  If the evaluation of CMD fails, and FAIL_EXP
# is given, then evaluate it.  Use the saved locale for evaluation.
func_show_eval_locale ()
{
    $debug_cmd

    _G_cmd=$1
    _G_fail_exp=${2-':'}

    # Echo the command unless the user asked for silence.
    if $opt_quiet; then
      :
    else
      func_quote_for_expand "$_G_cmd"
      eval "func_echo $func_quote_for_expand_result"
    fi

    # A dry run stops after the announcement.
    $opt_dry_run && return 0

    # Run CMD with the user locale settings in front of it, then put the
    # safe locale back before looking at the result.
    eval "$_G_user_locale
	    $_G_cmd"
    _G_status=$?
    eval "$_G_safe_locale"

    # On failure, re-create the failing status in a subshell so that
    # FAIL_EXP can inspect $? before it runs.
    test 0 -eq "$_G_status" || eval "(exit $_G_status); $_G_fail_exp"
}


# func_tr_sh
# ----------
# Turn $1 into a string suitable for a shell variable name.
# Result is stored in $func_tr_sh_result.  All characters
# not in the set a-zA-Z0-9_ are replaced with '_'. Further,
# if $1 begins with a digit, a '_' is prepended as well.
func_tr_sh ()
{
    $debug_cmd

    # Start from the input unchanged; most names are already valid.
    func_tr_sh_result=$1

    # Only fork sed when the name starts with a digit or contains a
    # character that is not legal in a shell variable name.
    case $1 in
      [0-9]* | *[!a-zA-Z0-9_]*)
        func_tr_sh_result=`$ECHO "$1" | $SED -e 's/^\([0-9]\)/_\1/' -e 's/[^a-zA-Z0-9_]/_/g'`
        ;;
    esac
}


# func_verbose ARG...
# -------------------
# Echo program name prefixed message in verbose mode only.
func_verbose ()
{
    $debug_cmd

    # opt_verbose holds a command name; it is run to decide whether the
    # message is shown.
    if $opt_verbose; then
      func_echo "$*"
    fi

    # Always report success, whether or not anything was printed.
    :
}


# func_warn_and_continue ARG...
# -----------------------------
# Echo program name prefixed warning message to standard error.
func_warn_and_continue ()
{
    $debug_cmd

    # Run whatever command $require_term_colors currently holds before the
    # tc_* colour variables are expanded below.
    # NOTE(review): presumably this initialises tc_red and tc_reset on
    # first use -- confirm against its definition outside this chunk.
    $require_term_colors

    # "$*" joins all arguments into one message; the output of the helper
    # is redirected to standard error.
    func_echo_infix_1 "${tc_red}warning$tc_reset" "$*" >&2
}


# func_warning CATEGORY ARG...
# ----------------------------
# Echo program name prefixed warning message to standard error. Warning
# messages can be filtered according to CATEGORY, where this function
# elides messages where CATEGORY is not listed in the global variable
# 'opt_warning_types'.
func_warning ()
{
    $debug_cmd

    # Remember the category before it is shifted away.
    _G_category=$1

    # CATEGORY must be one of the names in warning_categories; anything
    # else is a programming error in the caller.
    case " $warning_categories " in
      *" $_G_category "*)
        ;;
      *)
        func_internal_error "invalid warning category '$1'"
        ;;
    esac
    shift

    # Emit the remaining arguments through the configured warning
    # function, but only when the category is currently enabled.
    case " $opt_warning_types " in
      *" $_G_category "*)
        $warning_func ${1+"$@"}
        ;;
    esac
}


# func_sort_ver VER1 VER2
# -----------------------
# 'sort -V' is not generally available.
# Note this deviates from the version comparison in automake
# in that it treats 1.5 < 1.5.0, and treats 1.4.4a < 1.4-p3a
# but this should suffice as we won't be specifying old
# version formats or redundant trailing .0 in bootstrap.conf.
# If we did want full compatibility then we should probably
# use m4_version_compare from autoconf.
func_sort_ver ()
{
    $debug_cmd

    # Write each version on its own line (printf reuses the format for
    # the second operand), then sort numerically on up to nine
    # dot-separated fields.
    printf '%s\n' "$1" "$2" |
      sort -t. -k 1,1n -k 2,2n -k 3,3n -k 4,4n -k 5,5n -k 6,6n -k 7,7n -k 8,8n -k 9,9n
}

# func_lt_ver PREV CURR
# ---------------------
# Return true if PREV and CURR are in the correct order according to
# func_sort_ver, otherwise false.  Use it like this:
#
#  func_lt_ver "$prev_ver" "$proposed_ver" || func_fatal_error "..."
func_lt_ver ()
{
    $debug_cmd

    # Succeed when PREV ($1) is the version that func_sort_ver lists first
    # of PREV and CURR ($2); fail otherwise.
    #
    # The first line of the sorted output is captured in a variable and
    # then compared with both operands quoted.  Expanding the command
    # substitution unquoted inside 'test' would subject it to word
    # splitting and pathname expansion, which breaks the comparison for
    # versions containing whitespace or glob characters.  Two steps are
    # used because double quotes inside backquotes inside double quotes
    # are not portable.
    _G_lt_ver_first=`func_sort_ver "$1" "$2" | $SED 1q`
    test "x$1" = "x$_G_lt_ver_first"
}


# Local variables:
# mode: shell-script
# sh-indentation: 2
# eval: (add-hook 'before-save-hook 'time-stamp)
# time-stamp-pattern: "10/scriptversion=%:y-%02m-%02d.%02H; # UTC"
# time-stamp-time-zone: "UTC"
# End:
#! /bin/sh

# Set a version string for this script.  func_version prints it after
# the program name.
scriptversion=2014-01-07.03; # UTC

# A portable, pluggable option parser for Bourne shell.
# Written by Gary V. Vaughan, 2010

# Copyright (C) 2010-2015 Free Software Foundation, Inc.
# This is free software; see the source for copying conditions.  There is NO
# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# Please report bugs or propose patches to gary@gnu.org.


## ------ ##
## Usage. ##
## ------ ##

# This file is a library for parsing options in your shell scripts along
# with assorted other useful supporting features that you can make use
# of too.
#
# For the simplest scripts you might need only:
#
#   #!/bin/sh
#   . relative/path/to/funclib.sh
#   . relative/path/to/options-parser
#   scriptversion=1.0
#   func_options ${1+"$@"}
#   eval set dummy "$func_options_result"; shift
#   ...rest of your script...
#
# In order for the '--version' option to work, you will need to have a
# suitably formatted comment like the one at the top of this file
# starting with '# Written by ' and ending with '# warranty; '.
#
# For '-h' and '--help' to work, you will also need a one line
# description of your script's purpose in a comment directly above the
# '# Written by ' line, like the one at the top of this file.
#
# The default options also support '--debug', which will turn on shell
# execution tracing (see the comment above debug_cmd below for another
# use), and '--verbose' and the func_verbose function to allow your script
# to display verbose messages only when your user has specified
# '--verbose'.
#
# After sourcing this file, you can plug processing for additional
# options by amending the variables from the 'Configuration' section
# below, and following the instructions in the 'Option parsing'
# section further down.

## -------------- ##
## Configuration. ##
## -------------- ##

# You should override these variables in your script after sourcing this
# file so that they reflect the customisations you have added to the
# option parser.

# The usage line for option parsing errors and the start of '-h' and
# '--help' output messages. You can embed shell variables for delayed
# expansion at the time the message is displayed, but you will need to
# quote other shell meta-characters carefully to prevent them being
# expanded when the contents are evaled.
# It is single-quoted here so that $progpath is only expanded when
# func_usage_message or func_fatal_help pass the text through eval.
usage='$progpath [OPTION]...'

# Short help message in response to '-h' and '--help'.  Add to this or
# override it after sourcing this library to reflect the full set of
# options your script accepts.
usage_message="\
       --debug        enable verbose shell tracing
   -W, --warnings=CATEGORY
                      report the warnings falling in CATEGORY [all]
   -v, --verbose      verbosely report processing
       --version      print version information and exit
   -h, --help         print short or long help message and exit
"

# Additional text appended to 'usage_message' in response to '--help'.
long_help_message="
Warning categories include:
       'all'          show all warnings
       'none'         turn off all the warnings
       'error'        warnings are treated as fatal errors"

# Help message printed before fatal option parsing errors.
# The backslash keeps $progname unexpanded in the stored text;
# func_fatal_help expands it through eval when the message is shown.
fatal_help="Try '\$progname --help' for more information."


## ------------------------- ##
## Hook function management. ##
## ------------------------- ##

# This section contains functions for adding, removing, and running hooks
# to the main code.  A hook is just a named list of functions that can
# be run in order later on.

# func_hookable FUNC_NAME
# -----------------------
# Declare that FUNC_NAME will run hooks added with
# 'func_add_hook FUNC_NAME ...'.
func_hookable ()
{
    $debug_cmd

    # hookable_fns is a space-separated list.  Each name is added with a
    # leading space so that func_add_hook and func_run_hooks can search
    # for " NAME " inside " $hookable_fns ".
    func_append hookable_fns " $1"
}


# func_add_hook FUNC_NAME HOOK_FUNC
# ---------------------------------
# Request that FUNC_NAME call HOOK_FUNC before it returns.  FUNC_NAME must
# first have been declared "hookable" by a call to 'func_hookable'.
func_add_hook ()
{
    $debug_cmd

    # Refuse to register a hook on a function that was never declared
    # with func_hookable.
    case " $hookable_fns " in
      *" $1 "*) ;;
      *) func_fatal_error "'$1' does not accept hook functions." ;;
    esac

    # Append " HOOK_FUNC" to the variable named <FUNC_NAME>_hooks.  The
    # variable name is built before eval runs; the single quotes delay
    # the expansion of $2 until eval executes the func_append call.
    eval func_append ${1}_hooks '" $2"'
}


# func_remove_hook FUNC_NAME HOOK_FUNC
# ------------------------------------
# Remove HOOK_FUNC from the list of functions called by FUNC_NAME.
func_remove_hook ()
{
    $debug_cmd

    # Rewrite the variable <FUNC_NAME>_hooks with the first occurrence of
    # " HOOK_FUNC" deleted by sed.  The substitution has no 'g' flag and
    # does not check what follows the name, so it also matches when
    # HOOK_FUNC is a prefix of a longer registered name.
    # No check against hookable_fns is made here.
    eval ${1}_hooks='`$ECHO "\$'$1'_hooks" |$SED "s| '$2'||"`'
}


# func_run_hooks FUNC_NAME [ARG]...
# ---------------------------------
# Run all hook functions registered to FUNC_NAME.
# It is assumed that the list of hook functions contains nothing more
# than a whitespace-delimited list of legal shell function names, and
# no effort is wasted trying to catch shell meta-characters or preserve
# whitespace.
func_run_hooks ()
{
    $debug_cmd

    # Only functions declared with func_hookable may run hooks.
    case " $hookable_fns " in
      *" $1 "*) ;;
      *) func_fatal_error "'$1' does not support hook functions." ;;
    esac

    # Fetch the hook list registered for FUNC_NAME, then drop FUNC_NAME
    # so that the positional parameters are just the ARGs.
    eval _G_hook_fns=\$$1_hooks; shift

    for _G_hook in $_G_hook_fns; do
      # Call the hook with the current argument list.
      eval $_G_hook '"$@"'

      # store returned options list back into positional
      # parameters for next 'cmd' execution.
      eval _G_hook_result=\$${_G_hook}_result
      eval set dummy "$_G_hook_result"; shift
    done

    # Return whatever arguments are left, quoted for a later eval.
    func_quote_for_eval ${1+"$@"}
    func_run_hooks_result=$func_quote_for_eval_result
}


## --------------- ##
## Option parsing. ##
## --------------- ##

# In order to add your own option parsing hooks, you must accept the
# full positional parameter list in your hook function, remove any
# options that you action, and then pass back the remaining unprocessed
# options in '<hooked_function_name>_result', escaped suitably for
# 'eval'.  Like this:
#
#    my_options_prep ()
#    {
#        $debug_cmd
#
#        # Extend the existing usage message.
#        usage_message=$usage_message'
#      -s, --silent       don'\''t print informational messages
#    '
#
#        func_quote_for_eval ${1+"$@"}
#        my_options_prep_result=$func_quote_for_eval_result
#    }
#    func_add_hook func_options_prep my_options_prep
#
#
#    my_silent_option ()
#    {
#        $debug_cmd
#
#        # Note that for efficiency, we parse as many options as we can
#        # recognise in a loop before passing the remainder back to the
#        # caller on the first unrecognised argument we encounter.
#        while test $# -gt 0; do
#          _G_opt=$1; shift
#          case $_G_opt in
#            --silent|-s) opt_silent=: ;;
#            # Separate non-argument short options:
#            -s*)         func_split_short_opt "$_G_opt"
#                         set dummy "$func_split_short_opt_name" \
#                             "-$func_split_short_opt_arg" ${1+"$@"}
#                         shift
#                         ;;
#            *)            set dummy "$_G_opt" ${1+"$@"}; shift; break ;;
#          esac
#        done
#
#        func_quote_for_eval ${1+"$@"}
#        my_silent_option_result=$func_quote_for_eval_result
#    }
#    func_add_hook func_parse_options my_silent_option
#
#
#    my_option_validation ()
#    {
#        $debug_cmd
#
#        $opt_silent && $opt_verbose && func_fatal_help "\
#    '--silent' and '--verbose' options are mutually exclusive."
#
#        func_quote_for_eval ${1+"$@"}
#        my_option_validation_result=$func_quote_for_eval_result
#    }
#    func_add_hook func_validate_options my_option_validation
#
# You'll also need to manually amend $usage_message to reflect the extra
# options you parse.  It's preferable to append if you can, so that
# multiple option parsing hooks can be added safely.


# func_options [ARG]...
# ---------------------
# All the functions called inside func_options are hookable. See the
# individual implementations for details.
func_hookable func_options
func_options ()
{
    $debug_cmd

    # Run the three stages in order.  Each stage leaves its remaining
    # arguments, quoted for eval, in <stage>_result; eval re-splits that
    # string into the argument list of the next stage.  The ${var+...}
    # form passes nothing at all when the result variable is unset.
    func_options_prep ${1+"$@"}
    eval func_parse_options \
        ${func_options_prep_result+"$func_options_prep_result"}
    eval func_validate_options \
        ${func_parse_options_result+"$func_parse_options_result"}

    # Finally run any hooks registered on func_options itself.
    eval func_run_hooks func_options \
        ${func_validate_options_result+"$func_validate_options_result"}

    # save modified positional parameters for caller
    func_options_result=$func_run_hooks_result
}


# func_options_prep [ARG]...
# --------------------------
# All initialisations required before starting the option parse loop.
# Note that when calling hook functions, we pass through the list of
# positional parameters.  If a hook function modifies that list, and
# needs to propagate that back to the rest of this script, then the complete
# modified list must be put in 'func_run_hooks_result' before
# returning.
func_hookable func_options_prep
func_options_prep ()
{
    $debug_cmd

    # Option defaults:
    # opt_verbose holds a command name ('false' or ':') that is executed
    # to test it; an empty opt_warning_types is replaced with the full
    # category list by func_validate_options.
    opt_verbose=false
    opt_warning_types=

    # Let registered hooks set their own defaults and, if they wish,
    # rewrite the argument list.
    func_run_hooks func_options_prep ${1+"$@"}

    # save modified positional parameters for caller
    func_options_prep_result=$func_run_hooks_result
}


# func_parse_options [ARG]...
# ---------------------------
# The main option parsing loop.
func_hookable func_parse_options
func_parse_options ()
{
    $debug_cmd

    func_parse_options_result=

    # Each pass first lets the hooks consume what they recognise, then
    # handles at most one built-in option.  Options that need splitting
    # are pushed back onto the argument list in split form and picked up
    # by the next pass.
    # this just eases exit handling
    while test $# -gt 0; do
      # Defer to hook functions for initial option parsing, so they
      # get priority in the event of reusing an option name.
      func_run_hooks func_parse_options ${1+"$@"}

      # Adjust func_parse_options positional parameters to match
      eval set dummy "$func_run_hooks_result"; shift

      # Break out of the loop if we already parsed every option.
      test $# -gt 0 || break

      _G_opt=$1
      shift
      case $_G_opt in
        # Store the tracing command and run it immediately; every
        # function that starts with $debug_cmd runs it again.
        --debug|-x)   debug_cmd='set -x'
                      func_echo "enabling shell trace mode"
                      $debug_cmd
                      ;;

        # Rewritten as '--warnings none' and handled on the next pass.
        --no-warnings|--no-warning|--no-warn)
                      set dummy --warnings none ${1+"$@"}
                      shift
		      ;;

        # With no argument left, func_missing_arg reports the error and
        # the loop stops.  Otherwise CATEGORY ($1) is appended to the
        # category list: the first pattern matches when CATEGORY is one
        # of the listed names, the following ones match the appended
        # CATEGORY itself at the end of the string.
        --warnings|--warning|-W)
                      test $# = 0 && func_missing_arg $_G_opt && break
                      case " $warning_categories $1" in
                        *" $1 "*)
                          # trailing space prevents matching last $1 above
                          func_append_uniq opt_warning_types " $1"
                          ;;
                        *all)
                          opt_warning_types=$warning_categories
                          ;;
                        *none)
                          opt_warning_types=none
                          warning_func=:
                          ;;
                        *error)
                          opt_warning_types=$warning_categories
                          warning_func=func_fatal_error
                          ;;
                        *)
                          func_fatal_error \
                             "unsupported warning category: '$1'"
                          ;;
                      esac
                      shift
                      ;;

        --verbose|-v) opt_verbose=: ;;
        --version)    func_version ;;
        -\?|-h)       func_usage ;;
        --help)       func_help ;;

	# Separate optargs to long options (plugins may need this):
	--*=*)        func_split_equals "$_G_opt"
	              set dummy "$func_split_equals_lhs" \
                          "$func_split_equals_rhs" ${1+"$@"}
                      shift
                      ;;

       # Separate optargs to short options:
        -W*)
                      func_split_short_opt "$_G_opt"
                      set dummy "$func_split_short_opt_name" \
                          "$func_split_short_opt_arg" ${1+"$@"}
                      shift
                      ;;

        # Separate non-argument short options:
        # The remainder is pushed back with a new leading dash, so it is
        # parsed as further short options on the next pass.
        -\?*|-h*|-v*|-x*)
                      func_split_short_opt "$_G_opt"
                      set dummy "$func_split_short_opt_name" \
                          "-$func_split_short_opt_arg" ${1+"$@"}
                      shift
                      ;;

        # '--' ends option parsing and is itself dropped; the first
        # non-option argument also ends parsing but is put back first.
        --)           break ;;
        -*)           func_fatal_help "unrecognised option: '$_G_opt'" ;;
        *)            set dummy "$_G_opt" ${1+"$@"}; shift; break ;;
      esac
    done

    # save modified positional parameters for caller
    func_quote_for_eval ${1+"$@"}
    func_parse_options_result=$func_quote_for_eval_result
}


# func_validate_options [ARG]...
# ------------------------------
# Perform any sanity checks on option settings and/or unconsumed
# arguments.
func_hookable func_validate_options
func_validate_options ()
{
    $debug_cmd

    # Display all warnings if -W was not given.
    test -n "$opt_warning_types" || opt_warning_types=" $warning_categories"

    # Give registered hooks a chance to run their own checks.
    func_run_hooks func_validate_options ${1+"$@"}

    # Bail if the options were screwed!
    # func_missing_arg sets exit_cmd to 'exit', which makes this line
    # terminate the script.  NOTE(review): the default value of exit_cmd
    # is set outside this chunk; presumably it is a no-op -- confirm.
    $exit_cmd $EXIT_FAILURE

    # save modified positional parameters for caller
    func_validate_options_result=$func_run_hooks_result
}


## ----------------- ##
## Helper functions. ##
## ----------------- ##

# This section contains the helper functions used by the rest of the
# hookable option parser framework in ascii-betical order.


# func_fatal_help ARG...
# ----------------------
# Echo program name prefixed message to standard error, followed by
# a help hint, and exit.
func_fatal_help ()
{
    $debug_cmd

    # eval expands the variable references stored inside $usage and
    # $fatal_help at display time.  Neither of these two commands is
    # redirected here; only func_error handles the message arguments.
    eval \$ECHO \""Usage: $usage"\"
    eval \$ECHO \""$fatal_help"\"
    func_error ${1+"$@"}
    exit $EXIT_FAILURE
}


# func_help
# ---------
# Echo long help message to standard output and exit.
func_help ()
{
    $debug_cmd

    # Short usage text first, then the extra '--help' text, then exit
    # successfully.
    func_usage_message
    $ECHO "$long_help_message"
    exit 0
}


# func_missing_arg ARGNAME
# ------------------------
# Echo program name prefixed message to standard error and set global
# exit_cmd.
func_missing_arg ()
{
    $debug_cmd

    # Report the problem now, but do not exit here: setting exit_cmd to
    # 'exit' makes func_validate_options terminate the script later.
    func_error "Missing argument for '$1'."
    exit_cmd=exit
}


# func_split_equals STRING
# ------------------------
# Set func_split_equals_lhs and func_split_equals_rhs shell variables after
# splitting STRING at the '=' sign.
# Unless already decided, probe in a subshell whether this shell
# implements the ${#x}, ${x%...} and ${x#...} parameter expansions.
test -z "$_G_HAVE_XSI_OPS" \
    && (eval 'x=a/b/c;
      test 5aa/bb/cc = "${#x}${x%%/*}${x%/*}${x#*/}${x##*/}"') 2>/dev/null \
    && _G_HAVE_XSI_OPS=yes

# Both variants set func_split_equals_lhs to the text before the first
# equals sign and func_split_equals_rhs to the text after it (empty when
# STRING has no equals sign), and both always return success so that the
# exit status does not depend on the shape of STRING.
if test yes = "$_G_HAVE_XSI_OPS"
then
  # This is an XSI compatible shell, allowing a faster implementation...
  eval 'func_split_equals ()
  {
      $debug_cmd

      func_split_equals_lhs=${1%%=*}
      func_split_equals_rhs=${1#*=}

      # Without an equals sign both expansions return STRING unchanged,
      # so the right-hand side has to be cleared.  An if statement is
      # used so that a false test does not become the return status.
      if test "x$func_split_equals_lhs" = "x$1"; then
        func_split_equals_rhs=
      fi
  }'
else
  # ...otherwise fall back to using expr, which is often a shell builtin.
  func_split_equals ()
  {
      $debug_cmd

      func_split_equals_lhs=`expr "x$1" : 'x\([^=]*\)'`
      func_split_equals_rhs=
      if test "x$func_split_equals_lhs" != "x$1"; then
        func_split_equals_rhs=`expr "x$1" : 'x[^=]*=\(.*\)$'`
      fi

      # expr exits non-zero when the text it matched is empty; do not
      # let that leak out as the return status.
      :
  }
fi #func_split_equals


# func_split_short_opt SHORTOPT
# -----------------------------
# Set func_split_short_opt_name and func_split_short_opt_arg shell
# variables after splitting SHORTOPT after the 2nd character.
# Both variants split SHORTOPT after its second character, but they do
# not store the same name: the parameter-expansion variant keeps the
# leading dash in func_split_short_opt_name, while the expr variant
# captures only the option letter.
if test yes = "$_G_HAVE_XSI_OPS"
then
  # This is an XSI compatible shell, allowing a faster implementation...
  eval 'func_split_short_opt ()
  {
      $debug_cmd

      func_split_short_opt_arg=${1#??}
      func_split_short_opt_name=${1%"$func_split_short_opt_arg"}
  }'
else
  # ...otherwise fall back to using expr, which is often a shell builtin.
  func_split_short_opt ()
  {
      $debug_cmd

      func_split_short_opt_name=`expr "x$1" : 'x-\(.\)'`
      func_split_short_opt_arg=`expr "x$1" : 'x-.\(.*\)$'`
  }
fi #func_split_short_opt


# func_usage
# ----------
# Echo short help message to standard output and exit.
func_usage ()
{
    $debug_cmd

    # Print the short usage text, then a hint on how to page through the
    # full help; ${PAGER-more} names 'more' when PAGER is unset.
    func_usage_message
    $ECHO "Run '$progname --help |${PAGER-more}' for full usage"
    exit 0
}


# func_usage_message
# ------------------
# Echo short help message to standard output.
func_usage_message ()
{
    $debug_cmd

    # eval expands the variable references stored inside $usage.
    eval \$ECHO \""Usage: $usage"\"
    echo
    # Print the one-line description of the script: sed strips the
    # leading "# " from each line of $progpath, keeps the previous line
    # in its hold space, and on reaching the "Written by" line prints
    # that held line and quits.
    $SED -n 's|^# ||
        /^Written by/{
          x;p;x
        }
	h
	/^Written by/q' < "$progpath"
    echo
    eval \$ECHO \""$usage_message"\"
}


# func_version
# ------------
# Echo version message to standard output and exit.
func_version ()
{
    $debug_cmd

    printf '%s\n' "$progname $scriptversion"
    # Print the authorship and copyright notice found in $progpath, from
    # the "# Written by " line through the "# warranty; " line, with the
    # comment leaders removed.  A line containing "(C)" is first joined
    # with the lines that follow it until it contains a period, and a
    # list or range of copyright years is reduced to its last year.
    $SED -n '
        /(C)/!b go
        :more
        /\./!{
          N
          s|\n# | |
          b more
        }
        :go
        /^# Written by /,/# warranty; / {
          s|^# ||
          s|^# *$||
          s|\((C)\)[ 0-9,-]*[ ,-]\([1-9][0-9]* \)|\1 \2|
          p
        }
        /^# Written by / {
          s|^# ||
          p
        }
        /^warranty; /q' < "$progpath"

    # Exit with the status of the sed command above.
    exit $?
}


# Local variables:
# mode: shell-script
# sh-indentation: 2
# eval: (add-hook 'before-save-hook 'time-stamp)
# time-stamp-pattern: "10/scriptversion=%:y-%02m-%02d.%02H; # UTC"
# time-stamp-time-zone: "UTC"
# End:

# Set a version string.  This replaces the date-style value assigned to
# scriptversion earlier in this file.
scriptversion='(GNU libtool) 2.4.6'


# func_echo ARG...
# ----------------
# Libtool also displays the current mode in messages, so override
# funclib.sh func_echo with this custom definition.
func_echo ()
{
    $debug_cmd

    _G_message=$*

    # Split the message on $nl only, so that each line is printed
    # separately with its own prefix.  IFS is put back inside the loop so
    # that the $ECHO call runs with the caller's IFS, and once more after
    # the loop for the case where the message is empty.
    func_echo_IFS=$IFS
    IFS=$nl
    for _G_line in $_G_message; do
      IFS=$func_echo_IFS
      # ": MODE" is only inserted while opt_mode is set.
      $ECHO "$progname${opt_mode+: $opt_mode}: $_G_line"
    done
    IFS=$func_echo_IFS
}


# func_warning ARG...
# -------------------
# Libtool warnings are not categorized, so override funclib.sh
# func_warning with this simpler definition.
func_warning ()
{
    $debug_cmd

    # Hand every argument to the configured warning function without any
    # category filtering; ${1+"$@"} passes nothing when there are no
    # arguments.
    $warning_func ${1+"$@"}
}


## ---------------- ##
## Options parsing. ##
## ---------------- ##

# Hook in the functions to make sure our own options are parsed during
# the option parsing loop.

# Usage line; this assignment replaces the generic one made earlier in
# this file.  It is single-quoted so that $progpath is expanded only
# when the text is passed through eval for display.
usage='$progpath [OPTION]... [MODE-ARG]...'

# Short help message in response to '-h'.
# This assignment replaces the generic option list made earlier in this
# file with the full set of libtool options.
usage_message="Options:
       --config             show all configuration variables
       --debug              enable verbose shell tracing
   -n, --dry-run            display commands without modifying any files
       --features           display basic configuration information and exit
       --mode=MODE          use operation mode MODE
       --no-warnings        equivalent to '-Wnone'
       --preserve-dup-deps  don't remove duplicate dependency libraries
       --quiet, --silent    don't print informational messages
       --tag=TAG            use configuration variables from tag TAG
   -v, --verbose            print more informational messages than default
       --version            print version information
   -W, --warnings=CATEGORY  report the warnings falling in CATEGORY [all]
   -h, --help, --help-all   print short, long, or detailed help message
"

# Additional text appended to 'usage_message' in response to '--help'.
func_help ()
{
    $debug_cmd

    # Libtool replacement for the generic func_help: print the short
    # usage text, then $long_help_message followed by the list of modes
    # and a bug-report template, then exit successfully.  The template is
    # filled in from configuration variables and from the first line of
    # the '--version' output of $AUTOMAKE and $AUTOCONF; errors from those
    # two commands are discarded.
    func_usage_message
    $ECHO "$long_help_message

MODE must be one of the following:

       clean           remove files from the build directory
       compile         compile a source file into a libtool object
       execute         automatically set library path, then run a program
       finish          complete the installation of libtool libraries
       install         install libraries or executables
       link            create a library or an executable
       uninstall       remove libraries from an installed directory

MODE-ARGS vary depending on the MODE.  When passed as first option,
'--mode=MODE' may be abbreviated as 'MODE' or a unique abbreviation of that.
Try '$progname --help --mode=MODE' for a more detailed description of MODE.

When reporting a bug, please describe a test case to reproduce it and
include the following information:

       host-triplet:   $host
       shell:          $SHELL
       compiler:       $LTCC
       compiler flags: $LTCFLAGS
       linker:         $LD (gnu? $with_gnu_ld)
       version:        $progname (GNU libtool) 2.4.6
       automake:       `($AUTOMAKE --version) 2>/dev/null |$SED 1q`
       autoconf:       `($AUTOCONF --version) 2>/dev/null |$SED 1q`

Report bugs to <bug-libtool@gnu.org>.
GNU libtool home page: <http://www.gnu.org/s/libtool/>.
General help using GNU software: <http://www.gnu.org/gethelp/>."
    exit 0
}


# func_lo2o OBJECT-NAME
# ---------------------
# Transform OBJECT-NAME from a '.lo' suffix to the platform specific
# object suffix.

# sed scripts that swap a trailing '.lo' for '.$objext' and back.  The
# value of $objext is substituted when these assignments run.
lo2o=s/\\.lo\$/.$objext/
o2lo=s/\\.$objext\$/.lo/

# Choose between parameter-expansion and sed implementations, depending
# on whether _G_HAVE_XSI_OPS is 'yes'.  Results are returned in
# func_lo2o_result and func_xform_result.
if test yes = "$_G_HAVE_XSI_OPS"; then
  eval 'func_lo2o ()
  {
    case $1 in
      *.lo) func_lo2o_result=${1%.lo}.$objext ;;
      *   ) func_lo2o_result=$1               ;;
    esac
  }'

  # func_xform LIBOBJ-OR-SOURCE
  # ---------------------------
  # Transform LIBOBJ-OR-SOURCE from a '.o' or '.c' (or otherwise)
  # suffix to a '.lo' libtool-object suffix.
  eval 'func_xform ()
  {
    func_xform_result=${1%.*}.lo
  }'
else
  # ...otherwise fall back to using sed.
  func_lo2o ()
  {
    func_lo2o_result=`$ECHO "$1" | $SED "$lo2o"`
  }

  func_xform ()
  {
    func_xform_result=`$ECHO "$1" | $SED 's|\.[^.]*$|.lo|'`
  }
fi


# func_fatal_configuration ARG...
# -------------------------------
# Echo program name prefixed message to standard error, followed by
# a configuration failure hint, and exit.
func_fatal_configuration ()
{
    # Pass the caller's message lines, followed by two fixed hint lines,
    # to func_fatal_error, which reports them and exits.  The function
    # name used to be misspelled with a double underscore, so the call
    # failed with "command not found" and the script carried on after
    # what should have been a fatal error.
    func_fatal_error ${1+"$@"} \
      "See the $PACKAGE documentation for more information." \
      "Fatal configuration error."
}


# func_config
# -----------
# Display the configuration for all the tags in this script.
func_config ()
{
    # Marker prefixes of the configuration sections embedded as comments
    # in the script file $progpath.
    re_begincf='^# ### BEGIN LIBTOOL'
    re_endcf='^# ### END LIBTOOL'

    # Default configuration.
    # Delete everything up to and including the BEGIN ... CONFIG marker
    # and everything from the END ... CONFIG marker onwards, and print
    # what is left.
    $SED "1,/$re_begincf CONFIG/d;/$re_endcf CONFIG/,\$d" < "$progpath"

    # Now print the configurations for the tags.
    # Unlike the default section, each tag section is printed together
    # with its BEGIN and END marker lines.
    for tagname in $taglist; do
      $SED -n "/$re_begincf TAG CONFIG: $tagname\$/,/$re_endcf TAG CONFIG: $tagname\$/p" < "$progpath"
    done

    # Exit with the status of the last command run above.
    exit $?
}


# func_features
# -------------
# Display the features supported by this script.
func_features ()
{
    echo "host: $host"

    # Shared libraries are reported as enabled only for the exact
    # value 'yes'.
    case $build_libtool_libs in
      yes) echo "enable shared libraries" ;;
      *)   echo "disable shared libraries" ;;
    esac

    # Likewise for static (old-style) libraries.
    case $build_old_libs in
      yes) echo "enable static libraries" ;;
      *)   echo "disable static libraries" ;;
    esac

    # Exit with the status of the last echo.
    exit $?
}


# func_enable_tag TAGNAME
# -----------------------
# Verify that TAGNAME is valid, and either flag an error and exit, or
# enable the TAGNAME tag.  We also add TAGNAME to the global $taglist
# variable here.
func_enable_tag ()
{
    # Global variable:
    tagname=$1

    # Markers that delimit this tag's configuration section inside the
    # script file, and the sed script that prints that section.
    re_begincf="^# ### BEGIN LIBTOOL TAG CONFIG: $tagname\$"
    re_endcf="^# ### END LIBTOOL TAG CONFIG: $tagname\$"
    sed_extractcf=/$re_begincf/,/$re_endcf/p

    # Validate tagname.
    case $tagname in
      *[!-_A-Za-z0-9,/]*)
        func_fatal_error "invalid tag name: $tagname"
        ;;
    esac

    # Don't test for the "default" C tag, as we know it's
    # there but not specially marked.
    test CC = "$tagname" && return 0

    # A tag without a configuration section is reported and ignored.
    $GREP "$re_begincf" "$progpath" >/dev/null 2>&1 || {
      func_error "ignoring unknown tag $tagname"
      return
    }

    taglist="$taglist $tagname"

    # Evaluate the configuration.  Be careful to quote the path
    # and the sed script, to avoid splitting on whitespace, but
    # also don't use non-portable quotes within backquotes within
    # quotes we have to do it in 2 steps:
    extractedcf=`$SED -n -e "$sed_extractcf" < "$progpath"`
    eval "$extractedcf"
}


# func_check_version_match
# ------------------------
# Ensure that we are using m4 macros, and libtool script from the same
# release of libtool.
func_check_version_match ()
{
    # Nothing to report when script and macros carry the same revision.
    test "$package_revision" != "$macro_revision" || return 0

    # Pick the diagnostic that fits: same version but another revision,
    # macros that carry no version at all, or a different version.
    if test "$VERSION" = "$macro_version"; then
      cat >&2 <<_LT_EOF
$progname: Version mismatch error.  This is $PACKAGE $VERSION, revision $package_revision,
$progname: but the definition of this LT_INIT comes from revision $macro_revision.
$progname: You should recreate aclocal.m4 with macros from revision $package_revision
$progname: of $PACKAGE $VERSION and run autoconf again.
_LT_EOF
    elif test -z "$macro_version"; then
      cat >&2 <<_LT_EOF
$progname: Version mismatch error.  This is $PACKAGE $VERSION, but the
$progname: definition of this LT_INIT comes from an older release.
$progname: You should recreate aclocal.m4 with macros from $PACKAGE $VERSION
$progname: and run autoconf again.
_LT_EOF
    else
      cat >&2 <<_LT_EOF
$progname: Version mismatch error.  This is $PACKAGE $VERSION, but the
$progname: definition of this LT_INIT comes from $PACKAGE $macro_version.
$progname: You should recreate aclocal.m4 with macros from $PACKAGE $VERSION
$progname: and run autoconf again.
_LT_EOF
    fi

    # Any mismatch is fatal.
    exit $EXIT_MISMATCH
}


# libtool_options_prep [ARG]...
# -----------------------------
# Preparation for options parsed by libtool.
libtool_options_prep ()
{
    # This used to expand $debug_mode, a variable nothing in the script
    # sets, so the tracing command stored in debug_cmd was never run in
    # this function.
    $debug_cmd

    # Option defaults:
    opt_config=false
    opt_dlopen=
    opt_dry_run=false
    opt_help=false
    opt_mode=
    opt_preserve_dup_deps=false
    opt_quiet=false

    nonopt=
    preserve_args=

    # Shorthand for --mode=foo, only valid as the first argument
    # Each branch replaces the (possibly abbreviated) mode name with the
    # two words '--mode MODE' in front of the remaining arguments.
    case $1 in
    clean|clea|cle|cl)
      shift; set dummy --mode clean ${1+"$@"}; shift
      ;;
    compile|compil|compi|comp|com|co|c)
      shift; set dummy --mode compile ${1+"$@"}; shift
      ;;
    execute|execut|execu|exec|exe|ex|e)
      shift; set dummy --mode execute ${1+"$@"}; shift
      ;;
    finish|finis|fini|fin|fi|f)
      shift; set dummy --mode finish ${1+"$@"}; shift
      ;;
    install|instal|insta|inst|ins|in|i)
      shift; set dummy --mode install ${1+"$@"}; shift
      ;;
    link|lin|li|l)
      shift; set dummy --mode link ${1+"$@"}; shift
      ;;
    uninstall|uninstal|uninsta|uninst|unins|unin|uni|un|u)
      shift; set dummy --mode uninstall ${1+"$@"}; shift
      ;;
    esac

    # Pass back the list of options.
    func_quote_for_eval ${1+"$@"}
    libtool_options_prep_result=$func_quote_for_eval_result
}
func_add_hook func_options_prep libtool_options_prep


# libtool_parse_options [ARG]...
# ------------------------------
# Provide handling for libtool specific options.
# Consumes leading arguments for as long as they are options known to this
# function, recording them in the opt_* variables (and in $preserve_args
# for the ones that are appended there).  Stops at the first argument it
# does not recognise and returns that argument plus everything after it in
# $libtool_parse_options_result, quoted by func_quote_for_eval.
libtool_parse_options ()
{
    $debug_cmd

    # Perform our own loop to consume as many options as possible in
    # each iteration.
    while test $# -gt 0; do
      _G_opt=$1
      shift
      case $_G_opt in
        --dry-run|--dryrun|-n)
                        opt_dry_run=:
                        ;;

        --config)       func_config ;;

        --dlopen|-dlopen)
                        # Same guard as --mode and --tag: without it, a
                        # trailing --dlopen would 'shift' with nothing left.
                        test $# = 0 && func_missing_arg $_G_opt && break
                        # Files are accumulated one per line.
                        opt_dlopen="${opt_dlopen+$opt_dlopen
}$1"
                        shift
                        ;;

        --preserve-dup-deps)
                        opt_preserve_dup_deps=: ;;

        --features)     func_features ;;

        # Re-queue as '--mode finish' so the --mode branch handles it.
        --finish)       set dummy --mode finish ${1+"$@"}; shift ;;

        --help)         opt_help=: ;;

        --help-all)     opt_help=': help-all' ;;

        --mode)         test $# = 0 && func_missing_arg $_G_opt && break
                        opt_mode=$1
                        case $1 in
                          # Valid mode arguments:
                          clean|compile|execute|finish|install|link|relink|uninstall) ;;

                          # Catch anything else as an error
                          *) func_error "invalid argument for $_G_opt"
                             exit_cmd=exit
                             break
                             ;;
                        esac
                        shift
                        ;;

        --no-silent|--no-quiet)
                        opt_quiet=false
                        func_append preserve_args " $_G_opt"
                        ;;

        --no-warnings|--no-warning|--no-warn)
                        opt_warning=false
                        func_append preserve_args " $_G_opt"
                        ;;

        --no-verbose)
                        opt_verbose=false
                        func_append preserve_args " $_G_opt"
                        ;;

        --silent|--quiet)
                        opt_quiet=:
                        opt_verbose=false
                        func_append preserve_args " $_G_opt"
                        ;;

        --tag)          test $# = 0 && func_missing_arg $_G_opt && break
                        opt_tag=$1
                        func_append preserve_args " $_G_opt $1"
                        func_enable_tag "$1"
                        shift
                        ;;

        --verbose|-v)   opt_quiet=false
                        opt_verbose=:
                        func_append preserve_args " $_G_opt"
                        ;;

        # An option not handled by this hook function: put it back in
        # front of the remaining arguments and stop.
        *)              set dummy "$_G_opt" ${1+"$@"}; shift; break ;;
      esac
    done


    # save modified positional parameters for caller
    func_quote_for_eval ${1+"$@"}
    libtool_parse_options_result=$func_quote_for_eval_result
}
# Attach libtool_parse_options to func_parse_options.  func_add_hook is
# defined outside this section; presumably the hook then runs as part of
# option parsing -- confirm against its definition.
func_add_hook func_parse_options libtool_parse_options


# libtool_validate_options [ARG]...
# ---------------------------------
# Perform any sanity checks on option settings and/or unconsumed
# arguments.
# The first remaining argument, if any, is moved into $nonopt; the others
# are returned in $libtool_validate_options_result, quoted by
# func_quote_for_eval.
libtool_validate_options ()
{
    # save first non-option argument
    if test 0 -lt $#; then
      nonopt=$1
      shift
    fi

    # preserve --debug
    test : = "$debug_cmd" || func_append preserve_args " --debug"

    case $host in
      # Solaris2 added to fix http://debbugs.gnu.org/cgi/bugreport.cgi?bug=16452
      # see also: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59788
      *cygwin* | *mingw* | *pw32* | *cegcc* | *solaris2* | *os2*)
        # don't eliminate duplications in $postdeps and $predeps
        opt_duplicate_compiler_generated_deps=:
        ;;
      *)
        opt_duplicate_compiler_generated_deps=$opt_preserve_dup_deps
        ;;
    esac

    # The checks in the braces are skipped when help was requested.
    $opt_help || {
      # Sanity checks first:
      func_check_version_match

      test yes != "$build_libtool_libs" \
        && test yes != "$build_old_libs" \
        && func_fatal_configuration "not configured to build any kind of library"

      # Expand $shrext_cmds once and keep the result in $std_shrext.
      # NOTE(review): the eval is presumably needed because shrext_cmds can
      # hold shell code on Darwin -- confirm.
      eval std_shrext=\"$shrext_cmds\"

      # Only execute mode is allowed to have -dlopen flags.
      if test -n "$opt_dlopen" && test execute != "$opt_mode"; then
        func_error "unrecognized option '-dlopen'"
        $ECHO "$help" 1>&2
        exit $EXIT_FAILURE
      fi

      # Change the help message to a mode-specific one.
      generic_help=$help
      help="Try '$progname --help --mode=$opt_mode' for more information."
    }

    # Pass back the unparsed argument list
    func_quote_for_eval ${1+"$@"}
    libtool_validate_options_result=$func_quote_for_eval_result
}
# Attach libtool_validate_options to func_validate_options.  func_add_hook
# is defined outside this section; presumably the hook then runs as part of
# option validation -- confirm against its definition.
func_add_hook func_validate_options libtool_validate_options


# Process options as early as possible so that --help and --version
# can return quickly.
# The arguments left over by func_options come back in
# $func_options_result (presumably already quoted for eval -- the hooks in
# this section return their lists through func_quote_for_eval) and become
# the new positional parameters.  The 'dummy' word keeps 'set' from
# treating a leading '-' argument as one of its own options; it is
# shifted away immediately.
func_options ${1+"$@"}
eval set dummy "$func_options_result"; shift


## ----------- ##
##    Main.    ##
## ----------- ##

# Marker strings.  $magic_exe is the text func_ltwrapper_executable_p
# greps for; $magic is not referenced in this part of the file.
magic='%%%MAGIC variable%%%'
magic_exe='%%%MAGIC EXE variable%%%'

# Global variables.
extracted_archives=
extracted_serial=0

# If this variable is set in any of the actions, the command in it
# will be execed at the end.  This prevents here-documents from being
# left over by shells.
exec_cmd=


# A function that is used when there is no print builtin or printf.
# Writes its first argument, followed by a newline, to stdout using only
# 'cat' and a here-document.
# NOTE(review): the here-document is wrapped in an eval'd string,
# presumably so the shell does not set it up until the function is
# actually called -- confirm.
func_fallback_echo ()
{
  eval 'cat <<_LTECHO_EOF
$1
_LTECHO_EOF'
}

# func_generated_by_libtool_p
# True iff stdin has been generated by Libtool, i.e. some line read from
# stdin starts with '# Generated by ' and later contains $PACKAGE.  This
# function is only a basic sanity check; it will hardly flush out
# determined imposters.
func_generated_by_libtool_p ()
{
  $GREP "^# Generated by .*$PACKAGE" > /dev/null 2>&1
}

# func_lalib_p file
# Succeeds when FILE is a regular file whose first four lines pass
# func_generated_by_libtool_p, i.e. FILE looks like a libtool '.la'
# library or '.lo' object file.  This is a cheap plausibility test only;
# a file that deliberately imitates the marker line will pass.
func_lalib_p ()
{
    if test -f "$1"; then
      # Feed just the first four lines to the marker check.
      $SED -e 4q "$1" 2>/dev/null | func_generated_by_libtool_p
    else
      false
    fi
}

# func_lalib_unsafe_p file
# True iff FILE is a libtool '.la' library or '.lo' object file.
# This function implements the same check as func_lalib_p without
# resorting to external programs.  To this end, it temporarily reads FILE
# through stdin, parking the original stdin on file descriptor 5 and
# closing fd 5 afterwards; whatever fd 5 referred to before is lost.
# As a safety measure, use it only where a negative result would be
# fatal anyway.  Works if 'file' does not exist.
# Side effects: sets $lalib_p, $lalib_p_l and $lalib_p_line.
func_lalib_unsafe_p ()
{
    lalib_p=no
    # Save stdin on fd 5, then make FILE the new stdin.
    if test -f "$1" && test -r "$1" && exec 5<&0 <"$1"; then
	# Only the first four lines are inspected.
	for lalib_p_l in 1 2 3 4
	do
	    read lalib_p_line
	    case $lalib_p_line in
		\#\ Generated\ by\ *$PACKAGE* ) lalib_p=yes; break;;
	    esac
	done
	# Put the saved stdin back on fd 0 and close fd 5.
	exec 0<&5 5<&-
    fi
    test yes = "$lalib_p"
}

# func_ltwrapper_script_p file
# Succeeds when FILE is a regular file and the part of it that
# $lt_truncate_bin lets through passes func_generated_by_libtool_p,
# i.e. FILE looks like a libtool wrapper script.  This is a cheap
# plausibility test only; a determined imposter will pass.
func_ltwrapper_script_p ()
{
    if test -f "$1"; then
      $lt_truncate_bin < "$1" 2>/dev/null | func_generated_by_libtool_p
    else
      false
    fi
}

# func_ltwrapper_executable_p file
# Succeeds when FILE (with '.exe' appended unless the name already ends
# in it) contains the $magic_exe marker, i.e. it looks like a libtool
# wrapper executable.  The suffix that was appended, possibly empty, is
# left in $func_ltwrapper_exec_suffix.  This is a cheap plausibility test
# only; a determined imposter will pass.
func_ltwrapper_executable_p ()
{
    case $1 in
    *.exe) func_ltwrapper_exec_suffix= ;;
    *)     func_ltwrapper_exec_suffix=.exe ;;
    esac
    $GREP "$magic_exe" "$1$func_ltwrapper_exec_suffix" >/dev/null 2>&1
}

# func_ltwrapper_scriptname file
# Assumes FILE is an ltwrapper_executable.
# Uses FILE to determine the appropriate filename for a temporary
# ltwrapper_script and stores it in $func_ltwrapper_scriptname_result:
# FILE's directory, then $objdir, then FILE's basename with any '.exe'
# suffix removed and '_ltshwrapper' appended.
func_ltwrapper_scriptname ()
{
    func_dirname_and_basename "$1" "" "."
    func_stripname '' '.exe' "$func_basename_result"
    func_ltwrapper_scriptname_result=$func_dirname_result/$objdir/${func_stripname_result}_ltshwrapper
}

# func_ltwrapper_p file
# Succeeds when FILE passes either wrapper test: the script check is
# tried first, and the executable check only if that fails.  This is a
# cheap plausibility test only; a determined imposter will pass.
func_ltwrapper_p ()
{
    if func_ltwrapper_script_p "$1"; then
      :
    else
      func_ltwrapper_executable_p "$1"
    fi
}


# func_execute_cmds commands fail_cmd
# Execute tilde-delimited COMMANDS.
# If FAIL_CMD is given, eval that upon failure.
# FAIL_CMD may read-access the current command in variable CMD!
# Each command is first expanded with eval and then handed to
# func_show_eval together with FAIL_CMD (':' when FAIL_CMD is not given).
func_execute_cmds ()
{
    $debug_cmd

    # Split $1 on '~'.  IFS is switched to $sp$nl for the eval and back to
    # the caller's value before func_show_eval runs.
    save_ifs=$IFS; IFS='~'
    for cmd in $1; do
      IFS=$sp$nl
      eval cmd=\"$cmd\"
      IFS=$save_ifs
      func_show_eval "$cmd" "${2-:}"
    done
    # Restores IFS when the loop body never ran.
    IFS=$save_ifs
}


# func_source file
# Source FILE, adding directory component if necessary.
# A FILE without '/' or '\' in its name is sourced as './FILE', so the
# '.' command never has to search $PATH for it.
# Note that it is not necessary on cygwin/mingw to append a dot to
# FILE even if both FILE and FILE.exe exist: automatic-append-.exe
# behavior happens only for exec(3), not for open(2)!  Also, sourcing
# 'FILE.' does not work on cygwin managed mounts.
func_source ()
{
    $debug_cmd

    case $1 in
    */* | *\\*)	. "$1" ;;
    *)		. "./$1" ;;
    esac
}


# func_resolve_sysroot PATH
# Expand a leading '=' in PATH to $lt_sysroot and leave the outcome in
# $func_resolve_sysroot_result.  A PATH without a leading '=' is stored
# unchanged.
func_resolve_sysroot ()
{
  case $1 in
  =*)
    # Drop the '=' and put the sysroot in its place.
    func_stripname '=' '' "$1"
    func_resolve_sysroot_result=$lt_sysroot$func_stripname_result
    ;;
  *)
    func_resolve_sysroot_result=$1
    ;;
  esac
}

# func_replace_sysroot PATH
# If PATH begins with the sysroot, replace it with = and
# store the result into func_replace_sysroot_result.
# Otherwise, and whenever $lt_sysroot is empty, PATH is stored unchanged.
func_replace_sysroot ()
{
  # The leading '?*' of the first pattern needs at least one character
  # before the ':', so that branch is never taken with an empty sysroot.
  case $lt_sysroot:$1 in
  ?*:"$lt_sysroot"*)
    func_stripname "$lt_sysroot" '' "$1"
    func_replace_sysroot_result='='$func_stripname_result
    ;;
  *)
    # Including no sysroot.
    func_replace_sysroot_result=$1
    ;;
  esac
}

# func_infer_tag arg
# Infer tagged configuration to use if any are available and
# if one wasn't chosen via the "--tag" command line option.
# Only attempt this if the compiler in the base compile
# command doesn't match the default compiler.
# arg is usually of the form 'gcc ...'
# On success $tagname names the matching tag and the variables from that
# tag's configuration section of $progpath have been evaluated.  If no tag
# matches, the function reports a fatal error.
func_infer_tag ()
{
    $debug_cmd

    if test -n "$available_tags" && test -z "$tagname"; then
      # Build the quoted and echo-expanded spellings of $CC to compare with.
      CC_quoted=
      for arg in $CC; do
	func_append_quoted CC_quoted "$arg"
      done
      CC_expanded=`func_echo_all $CC`
      CC_quoted_expanded=`func_echo_all $CC_quoted`
      case $@ in
      # Blanks in the command may have been stripped by the calling shell,
      # but not from the CC environment variable when configure was run.
      " $CC "* | "$CC "* | " $CC_expanded "* | "$CC_expanded "* | \
      " $CC_quoted"* | "$CC_quoted "* | " $CC_quoted_expanded "* | "$CC_quoted_expanded "*) ;;
      # Blanks at the start of $base_compile will cause this to fail
      # if we don't check for them as well.
      *)
	for z in $available_tags; do
	  if $GREP "^# ### BEGIN LIBTOOL TAG CONFIG: $z$" < "$progpath" > /dev/null; then
	    # Evaluate the configuration.
	    eval "`$SED -n -e '/^# ### BEGIN LIBTOOL TAG CONFIG: '$z'$/,/^# ### END LIBTOOL TAG CONFIG: '$z'$/p' < $progpath`"
	    # $CC may have changed with the tag; rebuild the spellings.
	    CC_quoted=
	    for arg in $CC; do
	      # Double-quote args containing other shell metacharacters.
	      func_append_quoted CC_quoted "$arg"
	    done
	    CC_expanded=`func_echo_all $CC`
	    CC_quoted_expanded=`func_echo_all $CC_quoted`
	    case "$@ " in
	    " $CC "* | "$CC "* | " $CC_expanded "* | "$CC_expanded "* | \
	    " $CC_quoted"* | "$CC_quoted "* | " $CC_quoted_expanded "* | "$CC_quoted_expanded "*)
	      # The compiler in the base compile command matches
	      # the one in the tagged configuration.
	      # Assume this is the tagged configuration we want.
	      tagname=$z
	      break
	      ;;
	    esac
	  fi
	done
	# If $tagname still isn't set, then no tagged configuration
	# was found and let the user know that the "--tag" command
	# line option must be used.
	if test -z "$tagname"; then
	  func_echo "unable to infer tagged configuration"
	  func_fatal_error "specify a tag with '--tag'"
#	else
#	  func_verbose "using $tagname tagged configuration"
	fi
	;;
      esac
    fi
}


# func_write_libtool_object output_name pic_name nonpic_name
# Create a libtool object file (analogous to a ".la" file),
# but don't create it if we're doing a dry run.
# PIC_NAME is recorded only when $build_libtool_libs is 'yes' and
# NONPIC_NAME only when $build_old_libs is 'yes'; otherwise the
# corresponding entry is written as 'none'.
func_write_libtool_object ()
{
    write_libobj=$1
    # Recorded names are wrapped in single quotes.
    if test yes = "$build_libtool_libs"; then
      write_lobj=\'$2\'
    else
      write_lobj=none
    fi

    if test yes = "$build_old_libs"; then
      write_oldobj=\'$3\'
    else
      write_oldobj=none
    fi

    # Write to OUTPUT_NAME with a 'T' suffix first, then move it into
    # place with $MV.
    $opt_dry_run || {
      cat >${write_libobj}T <<EOF
# $write_libobj - a libtool object file
# Generated by $PROGRAM (GNU $PACKAGE) $VERSION
#
# Please DO NOT delete this file!
# It is necessary for linking the library.

# Name of the PIC object.
pic_object=$write_lobj

# Name of the non-PIC object
non_pic_object=$write_oldobj

EOF
      $MV "${write_libobj}T" "$write_libobj"
    }
}


##################################################
# FILE NAME AND PATH CONVERSION HELPER FUNCTIONS #
##################################################

# func_convert_core_file_wine_to_w32 ARG
# Helper function used by file name conversion functions when $build is *nix,
# and $host is mingw, cygwin, or some other w32 environment. Relies on a
# correctly configured wine environment available, with the winepath program
# in $build's $PATH.
#
# ARG is the $build file name to be converted to w32 format.
# Result is available in $func_convert_core_file_wine_to_w32_result, and will
# be empty on error (or when ARG is empty).  A successful result has been
# passed through $sed_naive_backslashify.
func_convert_core_file_wine_to_w32 ()
{
  $debug_cmd

  func_convert_core_file_wine_to_w32_result=$1
  if test -n "$1"; then
    # Unfortunately, winepath does not exit with a non-zero error code, so we
    # are forced to check the contents of stdout. On the other hand, if the
    # command is not found, the shell will set an exit code of 127 and print
    # *an error message* to stdout. So we must check for both error code of
    # zero AND non-empty stdout, which explains the odd construction:
    func_convert_core_file_wine_to_w32_tmp=`winepath -w "$1" 2>/dev/null`
    if test "$?" -eq 0 && test -n "$func_convert_core_file_wine_to_w32_tmp"; then
      func_convert_core_file_wine_to_w32_result=`$ECHO "$func_convert_core_file_wine_to_w32_tmp" |
        $SED -e "$sed_naive_backslashify"`
    else
      # Conversion failed: report that with an empty result.
      func_convert_core_file_wine_to_w32_result=
    fi
  fi
}
# end: func_convert_core_file_wine_to_w32


# func_convert_core_path_wine_to_w32 ARG
# Helper function used by path conversion functions when $build is *nix, and
# $host is mingw, cygwin, or some other w32 environment. Relies on a correctly
# configured wine environment available, with the winepath program in $build's
# $PATH. Assumes ARG has no leading or trailing path separator characters.
#
# ARG is path to be converted from $build format to win32.
# Result is available in $func_convert_core_path_wine_to_w32_result.
# Unconvertible file (directory) names in ARG are skipped; if no directory names
# are convertible, then the result may be empty.
func_convert_core_path_wine_to_w32 ()
{
  $debug_cmd

  # unfortunately, winepath doesn't convert paths, only file names
  func_convert_core_path_wine_to_w32_result=
  if test -n "$1"; then
    # Split ARG on ':' and convert each element on its own; IFS is put
    # back before the per-element helper runs.
    oldIFS=$IFS
    IFS=:
    for func_convert_core_path_wine_to_w32_f in $1; do
      IFS=$oldIFS
      func_convert_core_file_wine_to_w32 "$func_convert_core_path_wine_to_w32_f"
      if test -n "$func_convert_core_file_wine_to_w32_result"; then
        # Join the converted elements with ';'.
        if test -z "$func_convert_core_path_wine_to_w32_result"; then
          func_convert_core_path_wine_to_w32_result=$func_convert_core_file_wine_to_w32_result
        else
          func_append func_convert_core_path_wine_to_w32_result ";$func_convert_core_file_wine_to_w32_result"
        fi
      fi
    done
    IFS=$oldIFS
  fi
}
# end: func_convert_core_path_wine_to_w32


# func_cygpath ARGS...
# Wrapper around calling the cygpath program via LT_CYGPATH. This is used
# when (1) $build is *nix and Cygwin is hosted via a wine environment; or (2)
# $build is MSYS and $host is Cygwin, or (3) $build is Cygwin. In case (1) or
# (2), returns the Cygwin file name or path in func_cygpath_result (input
# file name or path is assumed to be in w32 format, as previously converted
# from $build's *nix or MSYS format). In case (3), returns the w32 file name
# or path in func_cygpath_result (input file name or path is assumed to be in
# Cygwin format). Returns an empty string on error.
#
# ARGS are passed to cygpath, with the last one being the file name or path to
# be converted.
#
# Specify the absolute *nix (or w32) name to cygpath in the LT_CYGPATH
# environment variable; do not put it in $PATH.
func_cygpath ()
{
  $debug_cmd

  if test -z "$LT_CYGPATH" || test ! -f "$LT_CYGPATH"; then
    # No usable cygpath: empty result plus a diagnostic.
    func_cygpath_result=
    func_error "LT_CYGPATH is empty or specifies non-existent file: '$LT_CYGPATH'"
  else
    # on failure, ensure result is empty
    func_cygpath_result=`$LT_CYGPATH "$@" 2>/dev/null` || func_cygpath_result=
  fi
}
#end: func_cygpath


# func_convert_core_msys_to_w32 ARG
# Convert file name or path ARG from MSYS format to w32 format.  Return
# result in func_convert_core_msys_to_w32_result.
# The conversion is obtained by having 'cmd' echo ARG back; trailing
# spaces are stripped from the output, which is then passed through
# $sed_naive_backslashify.
func_convert_core_msys_to_w32 ()
{
  $debug_cmd

  # awkward: cmd appends spaces to result
  func_convert_core_msys_to_w32_result=`( cmd //c echo "$1" ) 2>/dev/null |
    $SED -e 's/[ ]*$//' -e "$sed_naive_backslashify"`
}
#end: func_convert_core_msys_to_w32


# func_convert_file_check ARG1 ARG2
# Sanity-check a file name conversion: ARG1 is the name in $build format,
# ARG2 the converted name in $host format.  When ARG2 is empty although
# ARG1 is not, the conversion is taken to have failed: three error lines
# are emitted and func_to_host_file_result falls back to ARG1.  Execution
# continues either way.
func_convert_file_check ()
{
  $debug_cmd

  # Nothing to do when a result exists or there was nothing to convert.
  test -z "$2" || return 0
  test -n "$1" || return 0

  func_error "Could not determine host file name corresponding to"
  func_error "  '$1'"
  func_error "Continuing, but uninstalled executables may not work."
  # Fallback:
  func_to_host_file_result=$1
}
# end func_convert_file_check


# func_convert_path_check FROM_PATHSEP TO_PATHSEP FROM_PATH TO_PATH
# Verify that FROM_PATH (a path in $build format) was converted to $host
# format in TO_PATH. Otherwise, emit an error message, but continue, resetting
# func_to_host_path_result to a simplistic fallback value (see below).
func_convert_path_check ()
{
  $debug_cmd

  # An empty TO_PATH for a non-empty FROM_PATH means the conversion failed.
  if test -z "$4" && test -n "$3"; then
    func_error "Could not determine the host path corresponding to"
    func_error "  '$3'"
    func_error "Continuing, but uninstalled executables may not work."
    # Fallback.  This is a deliberately simplistic "conversion" and
    # should not be "improved".  See libtool.info.
    # It is FROM_PATH with every FROM_PATHSEP replaced by TO_PATHSEP, or
    # FROM_PATH unchanged when the two separators are the same.
    if test "x$1" != "x$2"; then
      lt_replace_pathsep_chars="s|$1|$2|g"
      func_to_host_path_result=`echo "$3" |
        $SED -e "$lt_replace_pathsep_chars"`
    else
      func_to_host_path_result=$3
    fi
  fi
}
# end func_convert_path_check


# func_convert_path_front_back_pathsep FRONTPAT BACKPAT REPL ORIG
# Modifies func_to_host_path_result by prepending REPL if ORIG matches FRONTPAT
# and appending REPL if ORIG matches BACKPAT.
# FRONTPAT and BACKPAT are used as shell 'case' patterns (they are expanded
# unquoted), e.g. ':*' and '*:'.  The two tests are independent, so REPL
# can be added at both ends.
func_convert_path_front_back_pathsep ()
{
  $debug_cmd

  case $4 in
  $1 ) func_to_host_path_result=$3$func_to_host_path_result
    ;;
  esac
  case $4 in
  $2 ) func_append func_to_host_path_result "$3"
    ;;
  esac
}
# end func_convert_path_front_back_pathsep


##################################################
# $build to $host FILE NAME CONVERSION FUNCTIONS #
##################################################
# invoked via '$to_host_file_cmd ARG'
#
# In each case, ARG is the path to be converted from $build to $host format.
# Result will be available in $func_to_host_file_result.


# func_to_host_file ARG
# Converts the file name ARG from $build format to $host format. Return result
# in func_to_host_file_result.
# The work is delegated to the function named by $to_host_file_cmd.
func_to_host_file ()
{
  $debug_cmd

  $to_host_file_cmd "$1"
}
# end func_to_host_file


# func_to_tool_file ARG LAZY
# converts the file name ARG from $build format to toolchain format. Return
# result in func_to_tool_file_result.  If the conversion in use is listed
# in (the comma separated) LAZY, no conversion takes place.
# "The conversion in use" is the function named by $to_tool_file_cmd.
func_to_tool_file ()
{
  $debug_cmd

  # Wrapping LAZY in commas lets one pattern match any list position.
  case ,$2, in
    *,"$to_tool_file_cmd",*)
      func_to_tool_file_result=$1
      ;;
    *)
      # The conversion functions leave their output in
      # func_to_host_file_result, which is copied from here.
      $to_tool_file_cmd "$1"
      func_to_tool_file_result=$func_to_host_file_result
      ;;
  esac
}
# end func_to_tool_file


# func_convert_file_noop ARG
# Copy ARG to func_to_host_file_result.
# This is the identity conversion: ARG is stored unchanged.
func_convert_file_noop ()
{
  func_to_host_file_result=$1
}
# end func_convert_file_noop


# func_convert_file_msys_to_w32 ARG
# Translate the (mingw) MSYS file name ARG into (mingw) w32 form and leave
# it in func_to_host_file_result.  This has to be done explicitly because
# automatic conversion to w32 is not available inside the cwrapper.  An
# empty ARG is passed through without calling the converter.
func_convert_file_msys_to_w32 ()
{
  $debug_cmd

  if test -z "$1"; then
    func_to_host_file_result=$1
  else
    func_convert_core_msys_to_w32 "$1"
    func_to_host_file_result=$func_convert_core_msys_to_w32_result
  fi
  # Falls back to ARG if the converter produced nothing.
  func_convert_file_check "$1" "$func_to_host_file_result"
}
# end func_convert_file_msys_to_w32


# func_convert_file_cygwin_to_w32 ARG
# Translate the Cygwin file name ARG into w32 form and leave it in
# func_to_host_file_result.  An empty ARG is passed through without
# running cygpath.
func_convert_file_cygwin_to_w32 ()
{
  $debug_cmd

  if test -z "$1"; then
    func_to_host_file_result=$1
  else
    # because $build is cygwin, we call "the" cygpath in $PATH; no need to use
    # LT_CYGPATH in this case.
    func_to_host_file_result=`cygpath -m "$1"`
  fi
  # Falls back to ARG if cygpath produced nothing.
  func_convert_file_check "$1" "$func_to_host_file_result"
}
# end func_convert_file_cygwin_to_w32


# func_convert_file_nix_to_w32 ARG
# Translate the *nix file name ARG into w32 form and leave it in
# func_to_host_file_result.  Needs a wine environment with a working
# winepath.  An empty ARG is passed through without calling the
# converter.
func_convert_file_nix_to_w32 ()
{
  $debug_cmd

  if test -z "$1"; then
    func_to_host_file_result=$1
  else
    func_convert_core_file_wine_to_w32 "$1"
    func_to_host_file_result=$func_convert_core_file_wine_to_w32_result
  fi
  # Falls back to ARG if the converter produced nothing.
  func_convert_file_check "$1" "$func_to_host_file_result"
}
# end func_convert_file_nix_to_w32


# func_convert_file_msys_to_cygwin ARG
# Translate the MSYS file name ARG into Cygwin form and leave it in
# func_to_host_file_result.  Needs LT_CYGPATH to be set.  An empty ARG is
# passed through without calling the converters.
func_convert_file_msys_to_cygwin ()
{
  $debug_cmd

  if test -z "$1"; then
    func_to_host_file_result=$1
  else
    # MSYS -> w32 first, then w32 -> Cygwin through cygpath.
    func_convert_core_msys_to_w32 "$1"
    func_cygpath -u "$func_convert_core_msys_to_w32_result"
    func_to_host_file_result=$func_cygpath_result
  fi
  # Falls back to ARG if the conversion produced nothing.
  func_convert_file_check "$1" "$func_to_host_file_result"
}
# end func_convert_file_msys_to_cygwin


# func_convert_file_nix_to_cygwin ARG
# Translate the *nix file name ARG into Cygwin form and leave it in
# func_to_host_file_result.  Needs Cygwin installed in a wine environment,
# a working winepath, and LT_CYGPATH set.  An empty ARG is passed through
# without calling the converters.
func_convert_file_nix_to_cygwin ()
{
  $debug_cmd

  if test -z "$1"; then
    func_to_host_file_result=$1
  else
    # convert from *nix to w32, then use cygpath to convert from w32 to cygwin.
    func_convert_core_file_wine_to_w32 "$1"
    func_cygpath -u "$func_convert_core_file_wine_to_w32_result"
    func_to_host_file_result=$func_cygpath_result
  fi
  # Falls back to ARG if the conversion produced nothing.
  func_convert_file_check "$1" "$func_to_host_file_result"
}
# end func_convert_file_nix_to_cygwin


#############################################
# $build to $host PATH CONVERSION FUNCTIONS #
#############################################
# invoked via '$to_host_path_cmd ARG'
#
# In each case, ARG is the path to be converted from $build to $host format.
# The result will be available in $func_to_host_path_result.
#
# Path separators are also converted from $build format to $host format.  If
# ARG begins or ends with a path separator character, it is preserved (but
# converted to $host format) on output.
#
# All path conversion functions are named using the following convention:
#   file name conversion function    : func_convert_file_X_to_Y ()
#   path conversion function         : func_convert_path_X_to_Y ()
# where, for any given $build/$host combination the 'X_to_Y' value is the
# same.  If conversion functions are added for new $build/$host combinations,
# the two new functions must follow this pattern, or func_init_to_host_path_cmd
# will break.


# func_init_to_host_path_cmd
# Make sure the function "pointer" variable $to_host_path_cmd is set.
# When it is still empty it is derived from $to_host_file_cmd by swapping
# the 'func_convert_file_' prefix for 'func_convert_path_'; a value that
# is already set is left alone.
to_host_path_cmd=
func_init_to_host_path_cmd ()
{
  $debug_cmd

  # Already initialised: nothing to do.
  test -z "$to_host_path_cmd" || return 0

  func_stripname 'func_convert_file_' '' "$to_host_file_cmd"
  to_host_path_cmd=func_convert_path_$func_stripname_result
}


# func_to_host_path ARG
# Converts the path ARG from $build format to $host format. Return result
# in func_to_host_path_result.
# $to_host_path_cmd is initialised on first use and then called with ARG.
func_to_host_path ()
{
  $debug_cmd

  func_init_to_host_path_cmd
  $to_host_path_cmd "$1"
}
# end func_to_host_path


# func_convert_path_noop ARG
# Copy ARG to func_to_host_path_result.
# This is the identity conversion: ARG is stored unchanged.
func_convert_path_noop ()
{
  func_to_host_path_result=$1
}
# end func_convert_path_noop


# func_convert_path_msys_to_w32 ARG
# Convert path ARG from (mingw) MSYS to (mingw) w32 format; automatic
# conversion to w32 is not available inside the cwrapper.  Returns result in
# func_to_host_path_result.  An empty ARG is returned unchanged.
func_convert_path_msys_to_w32 ()
{
  $debug_cmd

  func_to_host_path_result=$1
  if test -n "$1"; then
    # Remove leading and trailing path separator characters from ARG.  MSYS
    # behavior is inconsistent here; cygpath turns them into '.;' and ';.';
    # and winepath ignores them completely.
    func_stripname : : "$1"
    func_to_host_path_tmp1=$func_stripname_result
    func_convert_core_msys_to_w32 "$func_to_host_path_tmp1"
    func_to_host_path_result=$func_convert_core_msys_to_w32_result
    # Use the simplistic ':' -> ';' fallback if the conversion came back empty.
    func_convert_path_check : ";" \
      "$func_to_host_path_tmp1" "$func_to_host_path_result"
    # Re-attach the stripped leading/trailing separators as ';'.
    func_convert_path_front_back_pathsep ":*" "*:" ";" "$1"
  fi
}
# end func_convert_path_msys_to_w32


# func_convert_path_cygwin_to_w32 ARG
# Convert path ARG from Cygwin to w32 format.  Returns result in
# func_to_host_path_result.  An empty ARG is returned unchanged.
func_convert_path_cygwin_to_w32 ()
{
  $debug_cmd

  func_to_host_path_result=$1
  if test -n "$1"; then
    # See func_convert_path_msys_to_w32:
    func_stripname : : "$1"
    func_to_host_path_tmp1=$func_stripname_result
    func_to_host_path_result=`cygpath -m -p "$func_to_host_path_tmp1"`
    # Use the simplistic ':' -> ';' fallback if the conversion came back empty.
    func_convert_path_check : ";" \
      "$func_to_host_path_tmp1" "$func_to_host_path_result"
    # Re-attach the stripped leading/trailing separators as ';'.
    func_convert_path_front_back_pathsep ":*" "*:" ";" "$1"
  fi
}
# end func_convert_path_cygwin_to_w32


# func_convert_path_nix_to_w32 ARG
# Convert path ARG from *nix to w32 format.  Requires a wine environment and
# a working winepath.  Returns result in func_to_host_path_result.  An empty
# ARG is returned unchanged.
func_convert_path_nix_to_w32 ()
{
  $debug_cmd

  func_to_host_path_result=$1
  if test -n "$1"; then
    # See func_convert_path_msys_to_w32:
    func_stripname : : "$1"
    func_to_host_path_tmp1=$func_stripname_result
    func_convert_core_path_wine_to_w32 "$func_to_host_path_tmp1"
    func_to_host_path_result=$func_convert_core_path_wine_to_w32_result
    # Use the simplistic ':' -> ';' fallback if the conversion came back empty.
    func_convert_path_check : ";" \
      "$func_to_host_path_tmp1" "$func_to_host_path_result"
    # Re-attach the stripped leading/trailing separators as ';'.
    func_convert_path_front_back_pathsep ":*" "*:" ";" "$1"
  fi
}
# end func_convert_path_nix_to_w32


# func_convert_path_msys_to_cygwin ARG
# Convert path ARG from MSYS to Cygwin format.  Requires LT_CYGPATH set.
# Returns result in func_to_host_path_result.  An empty ARG is returned
# unchanged.
func_convert_path_msys_to_cygwin ()
{
  $debug_cmd

  func_to_host_path_result=$1
  if test -n "$1"; then
    # See func_convert_path_msys_to_w32:
    func_stripname : : "$1"
    func_to_host_path_tmp1=$func_stripname_result
    # MSYS -> w32 first, then w32 -> Cygwin through cygpath.
    func_convert_core_msys_to_w32 "$func_to_host_path_tmp1"
    func_cygpath -u -p "$func_convert_core_msys_to_w32_result"
    func_to_host_path_result=$func_cygpath_result
    # Both sides use ':' as separator, so the fallback is the stripped ARG.
    func_convert_path_check : : \
      "$func_to_host_path_tmp1" "$func_to_host_path_result"
    # Re-attach the stripped leading/trailing separators.
    func_convert_path_front_back_pathsep ":*" "*:" : "$1"
  fi
}
# end func_convert_path_msys_to_cygwin


# func_convert_path_nix_to_cygwin ARG
# Convert path ARG from *nix to Cygwin format.  Requires Cygwin installed in
# a wine environment, working winepath, and LT_CYGPATH set.  Returns result in
# func_to_host_path_result.  An empty ARG is returned unchanged.
func_convert_path_nix_to_cygwin ()
{
  $debug_cmd

  func_to_host_path_result=$1
  if test -n "$1"; then
    # Remove leading and trailing path separator characters from
    # ARG. msys behavior is inconsistent here, cygpath turns them
    # into '.;' and ';.', and winepath ignores them completely.
    func_stripname : : "$1"
    func_to_host_path_tmp1=$func_stripname_result
    # *nix -> w32 through winepath, then w32 -> Cygwin through cygpath.
    func_convert_core_path_wine_to_w32 "$func_to_host_path_tmp1"
    func_cygpath -u -p "$func_convert_core_path_wine_to_w32_result"
    func_to_host_path_result=$func_cygpath_result
    # Both sides use ':' as separator, so the fallback is the stripped ARG.
    func_convert_path_check : : \
      "$func_to_host_path_tmp1" "$func_to_host_path_result"
    # Re-attach the stripped leading/trailing separators.
    func_convert_path_front_back_pathsep ":*" "*:" : "$1"
  fi
}
# end func_convert_path_nix_to_cygwin


# func_dll_def_p FILE
# True iff FILE is a Windows DLL '.def' file.
# Keep in sync with _LT_DLL_DEF_P in libtool.m4
# The test looks only at the first line that is neither blank nor a ';'
# comment: FILE qualifies when that line is EXPORTS or LIBRARY, alone or
# followed by a blank and further text.
func_dll_def_p ()
{
  $debug_cmd

  # sed script, in order: strip leading blanks; drop empty and ';' lines;
  # print DEF for an EXPORTS/LIBRARY line; quit at the first line that was
  # not dropped.
  func_dll_def_p_tmp=`$SED -n \
    -e 's/^[	 ]*//' \
    -e '/^\(;.*\)*$/d' \
    -e 's/^\(EXPORTS\|LIBRARY\)\([	 ].*\)*$/DEF/p' \
    -e q \
    "$1"`
  test DEF = "$func_dll_def_p_tmp"
}


# func_mode_compile arg...
func_mode_compile ()
{
    $debug_cmd

    # Get the compilation command and the source file.
    base_compile=
    srcfile=$nonopt  #  always keep a non-empty value in "srcfile"
    suppress_opt=yes
    suppress_output=
    arg_mode=normal
    libobj=
    later=
    pie_flag=

    for arg
    do
      case $arg_mode in
      arg  )
	# do not "continue".  Instead, add this to base_compile
	lastarg=$arg
	arg_mode=normal
	;;

      target )
	libobj=$arg
	arg_mode=normal
	continue
	;;

      normal )
	# Accept any command-line options.
	case $arg in
	-o)
	  test -n "$libobj" && \
	    func_fatal_error "you cannot specify '-o' more than once"
	  arg_mode=target
	  continue
	  ;;

	-pie | -fpie | -fPIE)
          func_append pie_flag " $arg"
	  continue
	  ;;

	-shared | -static | -prefer-pic | -prefer-non-pic)
	  func_append later " $arg"
	  continue
	  ;;

	-no-suppress)
	  suppress_opt=no
	  continue
	  ;;

	-Xcompiler)
	  arg_mode=arg  #  the next one goes into the "base_compile" arg list
	  continue      #  The current "srcfile" will either be retained or
	  ;;            #  replaced later.  I would guess that would be a bug.

	-Wc,*)
	  func_stripname '-Wc,' '' "$arg"
	  args=$func_stripname_result
	  lastarg=
	  save_ifs=$IFS; IFS=,
	  for arg in $args; do
	    IFS=$save_ifs
	    func_append_quoted lastarg "$arg"
	  done
	  IFS=$save_ifs
	  func_stripname ' ' '' "$lastarg"
	  lastarg=$func_stripname_result

	  # Add the arguments to base_compile.
	  func_append base_compile " $lastarg"
	  continue
	  ;;

	*)
	  # Accept the current argument as the source file.
	  # The previous "srcfile" becomes the current argument.
	  #
	  lastarg=$srcfile
	  srcfile=$arg
	  ;;
	esac  #  case $arg
	;;
      esac    #  case $arg_mode

      # Aesthetically quote the previous argument.
      func_append_quoted base_compile "$lastarg"
    done # for arg

    case $arg_mode in
    arg)
      func_fatal_error "you must specify an argument for -Xcompile"
      ;;
    target)
      func_fatal_error "you must specify a target with '-o'"
      ;;
    *)
      # Get the name of the library object.
      test -z "$libobj" && {
	func_basename "$srcfile"
	libobj=$func_basename_result
      }
      ;;
    esac

    # Recognize several different file suffixes.
    # If the user specifies -o file.o, it is replaced with file.lo
    case $libobj in
    *.[cCFSifmso] | \
    *.ada | *.adb | *.ads | *.asm | \
    *.c++ | *.cc | *.ii | *.class | *.cpp | *.cxx | \
    *.[fF][09]? | *.for | *.java | *.go | *.obj | *.sx | *.cu | *.cup)
      func_xform "$libobj"
      libobj=$func_xform_result
      ;;
    esac

    case $libobj in
    *.lo) func_lo2o "$libobj"; obj=$func_lo2o_result ;;
    *)
      func_fatal_error "cannot determine name of library object from '$libobj'"
      ;;
    esac

    func_infer_tag $base_compile

    for arg in $later; do
      case $arg in
      -shared)
	test yes = "$build_libtool_libs" \
	  || func_fatal_configuration "cannot build a shared library"
	build_old_libs=no
	continue
	;;

      -static)
	build_libtool_libs=no
	build_old_libs=yes
	continue
	;;

      -prefer-pic)
	pic_mode=yes
	continue
	;;

      -prefer-non-pic)
	pic_mode=no
	continue
	;;
      esac
    done

    func_quote_for_eval "$libobj"
    test "X$libobj" != "X$func_quote_for_eval_result" \
      && $ECHO "X$libobj" | $GREP '[]~#^*{};<>?"'"'"'	 &()|`$[]' \
      && func_warning "libobj name '$libobj' may not contain shell special characters."
    func_dirname_and_basename "$obj" "/" ""
    objname=$func_basename_result
    xdir=$func_dirname_result
    lobj=$xdir$objdir/$objname

    test -z "$base_compile" && \
      func_fatal_help "you must specify a compilation command"

    # Delete any leftover library objects.
    if test yes = "$build_old_libs"; then
      removelist="$obj $lobj $libobj ${libobj}T"
    else
      removelist="$lobj $libobj ${libobj}T"
    fi

    # On Cygwin there's no "real" PIC flag so we must build both object types
    case $host_os in
    cygwin* | mingw* | pw32* | os2* | cegcc*)
      pic_mode=default
      ;;
    esac
    if test no = "$pic_mode" && test pass_all != "$deplibs_check_method"; then
      # non-PIC code in shared libraries is not supported
      pic_mode=default
    fi

    # Calculate the filename of the output object if compiler does
    # not support -o with -c
    if test no = "$compiler_c_o"; then
      output_obj=`$ECHO "$srcfile" | $SED 's%^.*/%%; s%\.[^.]*$%%'`.$objext
      lockfile=$output_obj.lock
    else
      output_obj=
      need_locks=no
      lockfile=
    fi

    # Lock this critical section if it is needed
    # We use this script file to make the link, it avoids creating a new file
    if test yes = "$need_locks"; then
      until $opt_dry_run || ln "$progpath" "$lockfile" 2>/dev/null; do
	func_echo "Waiting for $lockfile to be removed"
	sleep 2
      done
    elif test warn = "$need_locks"; then
      if test -f "$lockfile"; then
	$ECHO "\
*** ERROR, $lockfile exists and contains:
`cat $lockfile 2>/dev/null`

This indicates that another process is trying to use the same
temporary object file, and libtool could not work around it because
your compiler does not support '-c' and '-o' together.  If you
repeat this compilation, it may succeed, by chance, but you had better
avoid parallel builds (make -j) in this platform, or get a better
compiler."

	$opt_dry_run || $RM $removelist
	exit $EXIT_FAILURE
      fi
      func_append removelist " $output_obj"
      $ECHO "$srcfile" > "$lockfile"
    fi

    $opt_dry_run || $RM $removelist
    func_append removelist " $lockfile"
    trap '$opt_dry_run || $RM $removelist; exit $EXIT_FAILURE' 1 2 15

    func_to_tool_file "$srcfile" func_convert_file_msys_to_w32
    srcfile=$func_to_tool_file_result
    func_quote_for_eval "$srcfile"
    qsrcfile=$func_quote_for_eval_result

    # Only build a PIC object if we are building libtool libraries.
    if test yes = "$build_libtool_libs"; then
      # Without this assignment, base_compile gets emptied.
      fbsd_hideous_sh_bug=$base_compile

      if test no != "$pic_mode"; then
	command="$base_compile $qsrcfile $pic_flag"
      else
	# Don't build PIC code
	command="$base_compile $qsrcfile"
      fi

      func_mkdir_p "$xdir$objdir"

      if test -z "$output_obj"; then
	# Place PIC objects in $objdir
	func_append command " -o $lobj"
      fi

      func_show_eval_locale "$command"	\
          'test -n "$output_obj" && $RM $removelist; exit $EXIT_FAILURE'

      if test warn = "$need_locks" &&
	 test "X`cat $lockfile 2>/dev/null`" != "X$srcfile"; then
	$ECHO "\
*** ERROR, $lockfile contains:
`cat $lockfile 2>/dev/null`

but it should contain:
$srcfile

This indicates that another process is trying to use the same
temporary object file, and libtool could not work around it because
your compiler does not support '-c' and '-o' together.  If you
repeat this compilation, it may succeed, by chance, but you had better
avoid parallel builds (make -j) in this platform, or get a better
compiler."

	$opt_dry_run || $RM $removelist
	exit $EXIT_FAILURE
      fi

      # Just move the object if needed, then go on to compile the next one
      if test -n "$output_obj" && test "X$output_obj" != "X$lobj"; then
	func_show_eval '$MV "$output_obj" "$lobj"' \
	  'error=$?; $opt_dry_run || $RM $removelist; exit $error'
      fi

      # Allow error messages only from the first compilation.
      if test yes = "$suppress_opt"; then
	suppress_output=' >/dev/null 2>&1'
      fi
    fi

    # Only build a position-dependent object if we build old libraries.
    if test yes = "$build_old_libs"; then
      if test yes != "$pic_mode"; then
	# Don't build PIC code
	command="$base_compile $qsrcfile$pie_flag"
      else
	command="$base_compile $qsrcfile $pic_flag"
      fi
      if test yes = "$compiler_c_o"; then
	func_append command " -o $obj"
      fi

      # Suppress compiler output if we already did a PIC compilation.
      func_append command "$suppress_output"
      func_show_eval_locale "$command" \
        '$opt_dry_run || $RM $removelist; exit $EXIT_FAILURE'

      if test warn = "$need_locks" &&
	 test "X`cat $lockfile 2>/dev/null`" != "X$srcfile"; then
	$ECHO "\
*** ERROR, $lockfile contains:
`cat $lockfile 2>/dev/null`

but it should contain:
$srcfile

This indicates that another process is trying to use the same
temporary object file, and libtool could not work around it because
your compiler does not support '-c' and '-o' together.  If you
repeat this compilation, it may succeed, by chance, but you had better
avoid parallel builds (make -j) in this platform, or get a better
compiler."

	$opt_dry_run || $RM $removelist
	exit $EXIT_FAILURE
      fi

      # Just move the object if needed
      if test -n "$output_obj" && test "X$output_obj" != "X$obj"; then
	func_show_eval '$MV "$output_obj" "$obj"' \
	  'error=$?; $opt_dry_run || $RM $removelist; exit $error'
      fi
    fi

    $opt_dry_run || {
      func_write_libtool_object "$libobj" "$objdir/$objname" "$objname"

      # Unlock the critical section if it was locked
      if test no != "$need_locks"; then
	removelist=$lockfile
        $RM "$lockfile"
      fi
    }

    exit $EXIT_SUCCESS
}

# Unless help output was requested, hand control to compile mode when
# $opt_mode selects it.
if $opt_help; then
  :
else
  if test compile = "$opt_mode"; then
    func_mode_compile ${1+"$@"}
  fi
fi

# func_mode_help
# Print the usage text for the operation mode held in $opt_mode.
#   - An empty $opt_mode delegates to func_help (generic help).
#   - A known mode (clean, compile, execute, finish, install, link,
#     uninstall) prints that mode's usage message via $ECHO.
#   - Any other value is reported through func_fatal_help.
# After the mode text, a blank line and a pointer to '--help' are printed.
# The caller post-processes this output with $SED when showing help for all
# modes, so every message must keep its leading "Usage:" line.
func_mode_help ()
{
    # We need to display help for each of the modes.
    case $opt_mode in
      "")
        # Generic help is extracted from the usage comments
        # at the start of this file.
        func_help
        ;;

      clean)
        $ECHO \
"Usage: $progname [OPTION]... --mode=clean RM [RM-OPTION]... FILE...

Remove files from the build directory.

RM is the name of the program to use to delete files associated with each FILE
(typically '/bin/rm').  RM-OPTIONS are options (such as '-f') to be passed
to RM.

If FILE is a libtool library, object or program, all the files associated
with it are deleted. Otherwise, only FILE itself is deleted using RM."
        ;;

      compile)
        $ECHO \
"Usage: $progname [OPTION]... --mode=compile COMPILE-COMMAND... SOURCEFILE

Compile a source file into a libtool library object.

This mode accepts the following additional options:

  -o OUTPUT-FILE    set the output file name to OUTPUT-FILE
  -no-suppress      do not suppress compiler output for multiple passes
  -prefer-pic       try to build PIC objects only
  -prefer-non-pic   try to build non-PIC objects only
  -shared           do not build a '.o' file suitable for static linking
  -static           only build a '.o' file suitable for static linking
  -Wc,FLAG          pass FLAG directly to the compiler

COMPILE-COMMAND is a command to be used in creating a 'standard' object file
from the given SOURCEFILE.

The output file name is determined by removing the directory component from
SOURCEFILE, then substituting the C source code suffix '.c' with the
library object suffix, '.lo'."
        ;;

      execute)
        $ECHO \
"Usage: $progname [OPTION]... --mode=execute COMMAND [ARGS]...

Automatically set library path, then run a program.

This mode accepts the following additional options:

  -dlopen FILE      add the directory containing FILE to the library path

This mode sets the library path environment variable according to '-dlopen'
flags.

If any of the ARGS are libtool executable wrappers, then they are translated
into their corresponding uninstalled binary, and any of their required library
directories are added to the library path.

Then, COMMAND is executed, with ARGS as arguments."
        ;;

      finish)
        $ECHO \
"Usage: $progname [OPTION]... --mode=finish [LIBDIR]...

Complete the installation of libtool libraries.

Each LIBDIR is a directory that contains libtool libraries.

The commands that this mode executes may require superuser privileges.  Use
the '--dry-run' option if you just want to see what would be executed."
        ;;

      install)
        $ECHO \
"Usage: $progname [OPTION]... --mode=install INSTALL-COMMAND...

Install executables or libraries.

INSTALL-COMMAND is the installation command.  The first component should be
either the 'install' or 'cp' program.

The following components of INSTALL-COMMAND are treated specially:

  -inst-prefix-dir PREFIX-DIR  Use PREFIX-DIR as a staging area for installation

The rest of the components are interpreted as arguments to that command (only
BSD-compatible install options are recognized)."
        ;;

      link)
        $ECHO \
"Usage: $progname [OPTION]... --mode=link LINK-COMMAND...

Link object files or libraries together to form another library, or to
create an executable program.

LINK-COMMAND is a command using the C compiler that you would use to create
a program from several object files.

The following components of LINK-COMMAND are treated specially:

  -all-static       do not do any dynamic linking at all
  -avoid-version    do not add a version suffix if possible
  -bindir BINDIR    specify path to binaries directory (for systems where
                    libraries must be found in the PATH setting at runtime)
  -dlopen FILE      '-dlpreopen' FILE if it cannot be dlopened at runtime
  -dlpreopen FILE   link in FILE and add its symbols to lt_preloaded_symbols
  -export-dynamic   allow symbols from OUTPUT-FILE to be resolved with dlsym(3)
  -export-symbols SYMFILE
                    try to export only the symbols listed in SYMFILE
  -export-symbols-regex REGEX
                    try to export only the symbols matching REGEX
  -LLIBDIR          search LIBDIR for required installed libraries
  -lNAME            OUTPUT-FILE requires the installed library libNAME
  -module           build a library that can be dlopened
  -no-fast-install  disable the fast-install mode
  -no-install       link a not-installable executable
  -no-undefined     declare that a library does not refer to external symbols
  -o OUTPUT-FILE    create OUTPUT-FILE from the specified objects
  -objectlist FILE  use a list of object files found in FILE to specify objects
  -os2dllname NAME  force a short DLL name on OS/2 (no effect on other OSes)
  -precious-files-regex REGEX
                    don't remove output files matching REGEX
  -release RELEASE  specify package release information
  -rpath LIBDIR     the created library will eventually be installed in LIBDIR
  -R[ ]LIBDIR       add LIBDIR to the runtime path of programs and libraries
  -shared           only do dynamic linking of libtool libraries
  -shrext SUFFIX    override the standard shared library file extension
  -static           do not do any dynamic linking of uninstalled libtool libraries
  -static-libtool-libs
                    do not do any dynamic linking of libtool libraries
  -version-info CURRENT[:REVISION[:AGE]]
                    specify library version info [each variable defaults to 0]
  -weak LIBNAME     declare that the target provides the LIBNAME interface
  -Wc,FLAG
  -Xcompiler FLAG   pass linker-specific FLAG directly to the compiler
  -Wl,FLAG
  -Xlinker FLAG     pass linker-specific FLAG directly to the linker
  -XCClinker FLAG   pass link-specific FLAG to the compiler driver (CC)

All other options (arguments beginning with '-') are ignored.

Every other argument is treated as a filename.  Files ending in '.la' are
treated as uninstalled libtool libraries, other files are standard or library
object files.

If the OUTPUT-FILE ends in '.la', then a libtool library is created,
only library objects ('.lo' files) may be specified, and '-rpath' is
required, except when creating a convenience library.

If OUTPUT-FILE ends in '.a' or '.lib', then a standard library is created
using 'ar' and 'ranlib', or on Windows using 'lib'.

If OUTPUT-FILE ends in '.lo' or '.$objext', then a reloadable object file
is created, otherwise an executable program is created."
        ;;

      uninstall)
        $ECHO \
"Usage: $progname [OPTION]... --mode=uninstall RM [RM-OPTION]... FILE...

Remove libraries from an installation directory.

RM is the name of the program to use to delete files associated with each FILE
(typically '/bin/rm').  RM-OPTIONS are options (such as '-f') to be passed
to RM.

If FILE is a libtool library, all the files associated with it are deleted.
Otherwise, only FILE itself is deleted using RM."
        ;;

      *)
        func_fatal_help "invalid operation mode '$opt_mode'"
        ;;
    esac

    echo
    $ECHO "Try '$progname --help' for more information about other modes."
}

# With the optional --mode argument known, emit help output if it was
# requested and then exit with the status of the last help command.
if $opt_help; then
  case $opt_help in
  :)
    # Plain help request: describe only the selected mode.
    func_mode_help
    ;;
  *)
    # Any other value: describe every mode.  The first pass keeps line 1
    # of the generic help and rewrites each later "Usage:" line as an
    # "  or: " alternative.
    {
      func_help noexit
      for opt_mode in compile link execute install finish uninstall clean
      do
	func_mode_help
      done
    } | $SED -n '1p; 2,$s/^Usage:/  or: /p'
    # The second pass prints the per-mode descriptions: the
    # "When reporting ... Report" section is held back and emitted at the
    # very end, per-mode pointers to other help are dropped, and each
    # "Usage:" line becomes a "Description of MODE mode:" heading.
    {
      func_help noexit
      for opt_mode in compile link execute install finish uninstall clean
      do
	echo
	func_mode_help
      done
    } |
    $SED '1d
      /^When reporting/,/^Report/{
	H
	d
      }
      $x
      /information about other modes/d
      /more detailed .*MODE/d
      s/^Usage:.*--mode=\([^ ]*\) .*/Description of \1 mode:/'
    ;;
  esac
  exit $?
fi


# func_mode_execute arg...
# Implement '--mode=execute'.
#   - The command to run is taken from $nonopt; ARG... are its arguments.
#   - For every file in $opt_dlopen, the directory holding the dlopenable
#     library (or the .lo file) is prepended to the environment variable
#     whose name is stored in $shlibpath_var.
#   - Arguments that are libtool wrapper scripts/executables are replaced
#     by $progdir/$program as set by sourcing the wrapper.
# In dry-run mode the resulting environment and command are printed and the
# script exits with $EXIT_SUCCESS.  Otherwise the variable is exported, the
# saved locale variables are restored, and the command line is left in
# $exec_cmd for the caller to run.
func_mode_execute ()
{
    $debug_cmd

    # The first argument is the command name.
    cmd=$nonopt
    test -z "$cmd" && \
      func_fatal_help "you must specify a COMMAND"

    # Handle -dlopen flags immediately.
    for file in $opt_dlopen; do
      test -f "$file" \
	|| func_fatal_help "'$file' is not a file"

      dir=
      case $file in
      *.la)
	func_resolve_sysroot "$file"
	file=$func_resolve_sysroot_result

	# Check to see that this really is a libtool archive.
	# Report the file being examined; '$lib' is not set in this function.
	func_lalib_unsafe_p "$file" \
	  || func_fatal_help "'$file' is not a valid libtool archive"

	# Read the libtool library.
	dlname=
	library_names=
	func_source "$file"

	# Skip this library if it cannot be dlopened.
	if test -z "$dlname"; then
	  # Warn if it was a shared library.
	  test -n "$library_names" && \
	    func_warning "'$file' was not linked with '-export-dynamic'"
	  continue
	fi

	func_dirname "$file" "" "."
	dir=$func_dirname_result

	# Prefer the copy inside $objdir; fall back to the .la's own directory.
	if test -f "$dir/$objdir/$dlname"; then
	  func_append dir "/$objdir"
	else
	  if test ! -f "$dir/$dlname"; then
	    func_fatal_error "cannot find '$dlname' in '$dir' or '$dir/$objdir'"
	  fi
	fi
	;;

      *.lo)
	# Just add the directory containing the .lo file.
	func_dirname "$file" "" "."
	dir=$func_dirname_result
	;;

      *)
	func_warning "'-dlopen' is ignored for non-libtool libraries and objects"
	continue
	;;
      esac

      # Get the absolute pathname.
      absdir=`cd "$dir" && pwd`
      test -n "$absdir" && dir=$absdir

      # Now add the directory to shlibpath_var.
      if eval "test -z \"\$$shlibpath_var\""; then
	eval "$shlibpath_var=\"\$dir\""
      else
	eval "$shlibpath_var=\"\$dir:\$$shlibpath_var\""
      fi
    done

    # This variable tells wrapper scripts just to set shlibpath_var
    # rather than running their programs.
    libtool_execute_magic=$magic

    # Check if any of the arguments is a wrapper script.
    args=
    for file
    do
      case $file in
      -* | *.la | *.lo ) ;;
      *)
	# Do a test to see if this is really a libtool program.
	if func_ltwrapper_script_p "$file"; then
	  func_source "$file"
	  # Transform arg to wrapped name.
	  file=$progdir/$program
	elif func_ltwrapper_executable_p "$file"; then
	  func_ltwrapper_scriptname "$file"
	  func_source "$func_ltwrapper_scriptname_result"
	  # Transform arg to wrapped name.
	  file=$progdir/$program
	fi
	;;
      esac
      # Quote arguments (to preserve shell metacharacters).
      func_append_quoted args "$file"
    done

    if $opt_dry_run; then
      # Display what would be done.
      if test -n "$shlibpath_var"; then
	eval "\$ECHO \"\$shlibpath_var=\$$shlibpath_var\""
	echo "export $shlibpath_var"
      fi
      $ECHO "$cmd$args"
      exit $EXIT_SUCCESS
    else
      if test -n "$shlibpath_var"; then
	# Export the shlibpath_var.
	eval "export $shlibpath_var"
      fi

      # Restore saved environment variables
      for lt_var in LANG LANGUAGE LC_ALL LC_CTYPE LC_COLLATE LC_MESSAGES
      do
	eval "if test \"\${save_$lt_var+set}\" = set; then
                $lt_var=\$save_$lt_var; export $lt_var
	      else
		$lt_unset $lt_var
	      fi"
      done

      # Now prepare to actually exec the command.
      exec_cmd=\$cmd$args
    fi
}

# Run execute mode when $opt_mode selects it.
if test execute = "$opt_mode"; then
  func_mode_execute ${1+"$@"}
fi


# func_mode_finish arg...
# Implement '--mode=finish'.
# $nonopt and each ARG are classified as either a directory (collected in
# $libdirs) or a libtool archive file (collected in $libs); anything else is
# a fatal error.  For the archives, sysroot references and '=' prefixes are
# stripped in place (or only announced in dry-run mode).  For the
# directories, $finish_cmds and $finish_eval are run once per directory.
# Unless $opt_quiet is set, a summary explaining how to link against and
# run with the installed libraries is printed.  Always exits with
# $EXIT_SUCCESS.
func_mode_finish ()
{
    $debug_cmd

    libs=
    libdirs=
    admincmds=

    for opt in "$nonopt" ${1+"$@"}
    do
      if test -d "$opt"; then
	func_append libdirs " $opt"

      elif test -f "$opt"; then
	if func_lalib_unsafe_p "$opt"; then
	  func_append libs " $opt"
	else
	  func_warning "'$opt' is not a valid libtool archive"
	fi

      else
	func_fatal_error "invalid argument '$opt'"
      fi
    done

    if test -n "$libs"; then
      if test -n "$lt_sysroot"; then
        sysroot_regex=`$ECHO "$lt_sysroot" | $SED "$sed_make_literal_regex"`
        sysroot_cmd="s/\([ ']\)$sysroot_regex/\1/g;"
      else
        sysroot_cmd=
      fi

      # Remove sysroot references
      if $opt_dry_run; then
        for lib in $libs; do
          echo "removing references to $lt_sysroot and '=' prefixes from $lib"
        done
      else
        tmpdir=`func_mktempdir`
        for lib in $libs; do
	  $SED -e "$sysroot_cmd s/\([ ']-[LR]\)=/\1/g; s/\([ ']\)=/\1/g" $lib \
	    > $tmpdir/tmp-la
	  mv -f $tmpdir/tmp-la $lib
	done
        ${RM}r "$tmpdir"
      fi
    fi

    if test -n "$finish_cmds$finish_eval" && test -n "$libdirs"; then
      for libdir in $libdirs; do
	if test -n "$finish_cmds"; then
	  # Do each command in the finish commands.
	  # A failing command is recorded in admincmds for the final report.
	  func_execute_cmds "$finish_cmds" 'admincmds="$admincmds
'"$cmd"'"'
	fi
	if test -n "$finish_eval"; then
	  # Do the single finish_eval.
	  eval cmds=\"$finish_eval\"
	  $opt_dry_run || eval "$cmds" || func_append admincmds "
       $cmds"
	fi
      done
    fi

    # Exit here if they wanted silent mode.
    $opt_quiet && exit $EXIT_SUCCESS

    if test -n "$finish_cmds$finish_eval" && test -n "$libdirs"; then
      echo "----------------------------------------------------------------------"
      echo "Libraries have been installed in:"
      for libdir in $libdirs; do
	$ECHO "   $libdir"
      done
      echo
      echo "If you ever happen to want to link against installed libraries"
      echo "in a given directory, LIBDIR, you must either use libtool, and"
      echo "specify the full pathname of the library, or use the '-LLIBDIR'"
      echo "flag during linking and do at least one of the following:"
      if test -n "$shlibpath_var"; then
	echo "   - add LIBDIR to the '$shlibpath_var' environment variable"
	echo "     during execution"
      fi
      if test -n "$runpath_var"; then
	echo "   - add LIBDIR to the '$runpath_var' environment variable"
	echo "     during linking"
      fi
      if test -n "$hardcode_libdir_flag_spec"; then
	libdir=LIBDIR
	eval flag=\"$hardcode_libdir_flag_spec\"

	$ECHO "   - use the '$flag' linker flag"
      fi
      if test -n "$admincmds"; then
	$ECHO "   - have your system administrator run these commands:$admincmds"
      fi
      if test -f /etc/ld.so.conf; then
	echo "   - have your system administrator add LIBDIR to '/etc/ld.so.conf'"
      fi
      echo

      echo "See any operating system documentation about shared libraries for"
      # Match on the OS component: these patterns have no leading wildcard,
      # so they cannot match a full cpu-vendor-os host triplet.
      case $host_os in
	solaris2.[6789]|solaris2.1[0-9])
	  echo "more information, such as the ld(1), crle(1) and ld.so(8) manual"
	  echo "pages."
	  ;;
	*)
	  echo "more information, such as the ld(1) and ld.so(8) manual pages."
	  ;;
      esac
      echo "----------------------------------------------------------------------"
    fi
    exit $EXIT_SUCCESS
}

# Run finish mode when $opt_mode selects it.
if test finish = "$opt_mode"; then
  func_mode_finish ${1+"$@"}
fi


# func_mode_install arg...
# Implement '--mode=install'.
# $nonopt supplies the first word of the install command and ARG... the
# rest.  Options are accumulated into $install_prog (and, with an optional
# mode override, $install_shared_prog).  Once the first file operand has
# been seen, every later argument replaces it as the destination and the
# previous one joins $files, so the last argument ends up as $dest.
# Each file is then handled according to its name: static archives
# (*.$libext) are deferred, libtool archives (*.la) and libtool objects
# (*.lo) have dedicated branches, and anything else is installed as a
# program (after checking whether it is a libtool wrapper script).
# On completion either $exec_cmd is set to re-run this script with
# '--finish' for the libdirs that were installed into, or the function
# exits with $EXIT_SUCCESS.
func_mode_install ()
{
    $debug_cmd

    # There may be an optional sh(1) argument at the beginning of
    # install_prog (especially on Windows NT).
    if test "$SHELL" = "$nonopt" || test /bin/sh = "$nonopt" ||
       # Allow the use of GNU shtool's install command.
       case $nonopt in *shtool*) :;; *) false;; esac
    then
      # Aesthetically quote it.
      func_quote_for_eval "$nonopt"
      install_prog="$func_quote_for_eval_result "
      arg=$1
      shift
    else
      install_prog=
      arg=$nonopt
    fi

    # The real first argument should be the name of the installation program.
    # Aesthetically quote it.
    func_quote_for_eval "$arg"
    func_append install_prog "$func_quote_for_eval_result"
    install_shared_prog=$install_prog
    # Remember whether the install program is 'cp'; it changes how '-f' and
    # the mode override are treated below.
    case " $install_prog " in
      *[\\\ /]cp\ *) install_cp=: ;;
      *) install_cp=false ;;
    esac

    # We need to accept at least all the BSD install flags.
    dest=
    files=
    opts=
    prev=
    install_type=
    isdir=false
    stripme=
    no_mode=:
    for arg
    do
      arg2=
      # Once a destination candidate exists, each further argument becomes
      # the new candidate and the old one is treated as a file to install.
      if test -n "$dest"; then
	func_append files " $dest"
	dest=$arg
	continue
      fi

      case $arg in
      -d) isdir=: ;;
      -f)
	if $install_cp; then :; else
	  prev=$arg
	fi
	;;
      -g | -m | -o)
	prev=$arg
	;;
      -s)
	stripme=" -s"
	continue
	;;
      -*)
	;;
      *)
	# If the previous option needed an argument, then skip it.
	if test -n "$prev"; then
	  if test X-m = "X$prev" && test -n "$install_override_mode"; then
	    arg2=$install_override_mode
	    no_mode=false
	  fi
	  prev=
	else
	  dest=$arg
	  continue
	fi
	;;
      esac

      # Aesthetically quote the argument.
      func_quote_for_eval "$arg"
      func_append install_prog " $func_quote_for_eval_result"
      # When arg2 is set, the shared-library install command receives the
      # override mode in place of the user-supplied '-m' value.
      if test -n "$arg2"; then
	func_quote_for_eval "$arg2"
      fi
      func_append install_shared_prog " $func_quote_for_eval_result"
    done

    test -z "$install_prog" && \
      func_fatal_help "you must specify an install program"

    test -n "$prev" && \
      func_fatal_help "the '$prev' option requires an argument"

    # No '-m' was given: add the override mode for shared libraries, except
    # when installing with 'cp'.
    if test -n "$install_override_mode" && $no_mode; then
      if $install_cp; then :; else
	func_quote_for_eval "$install_override_mode"
	func_append install_shared_prog " -m $func_quote_for_eval_result"
      fi
    fi

    if test -z "$files"; then
      if test -z "$dest"; then
	func_fatal_help "no file or destination specified"
      else
	func_fatal_help "you must specify a destination"
      fi
    fi

    # Strip any trailing slash from the destination.
    func_stripname '' '/' "$dest"
    dest=$func_stripname_result

    # Check to see that the destination is a directory.
    test -d "$dest" && isdir=:
    if $isdir; then
      destdir=$dest
      destname=
    else
      func_dirname_and_basename "$dest" "" "."
      destdir=$func_dirname_result
      destname=$func_basename_result

      # Not a directory, so check to see that there is only one file specified.
      set dummy $files; shift
      test "$#" -gt 1 && \
	func_fatal_help "'$dest' is not a directory"
    fi
    # A relative destination directory is only tolerated when every file
    # being installed is a libtool object (*.lo).
    case $destdir in
    [\\/]* | [A-Za-z]:[\\/]*) ;;
    *)
      for file in $files; do
	case $file in
	*.lo) ;;
	*)
	  func_fatal_help "'$destdir' must be an absolute directory name"
	  ;;
	esac
      done
      ;;
    esac

    # This variable tells wrapper scripts just to set variables rather
    # than running their programs.
    libtool_install_magic=$magic

    staticlibs=
    future_libdirs=
    current_libdirs=
    for file in $files; do

      # Do each installation.
      case $file in
      *.$libext)
	# Do the static libraries later.
	func_append staticlibs " $file"
	;;

      *.la)
	func_resolve_sysroot "$file"
	file=$func_resolve_sysroot_result

	# Check to see that this really is a libtool archive.
	func_lalib_unsafe_p "$file" \
	  || func_fatal_help "'$file' is not a valid libtool archive"

	# Clear the variables the .la file is expected to set, then source it.
	library_names=
	old_library=
	relink_command=
	func_source "$file"

	# Add the libdir to current_libdirs if it is the destination.
	if test "X$destdir" = "X$libdir"; then
	  case "$current_libdirs " in
	  *" $libdir "*) ;;
	  *) func_append current_libdirs " $libdir" ;;
	  esac
	else
	  # Note the libdir as a future libdir.
	  case "$future_libdirs " in
	  *" $libdir "*) ;;
	  *) func_append future_libdirs " $libdir" ;;
	  esac
	fi

	func_dirname "$file" "/" ""
	dir=$func_dirname_result
	func_append dir "$objdir"

	if test -n "$relink_command"; then
	  # Determine the prefix the user has applied to our future dir.
	  inst_prefix_dir=`$ECHO "$destdir" | $SED -e "s%$libdir\$%%"`

	  # Don't allow the user to place us outside of our expected
	  # location b/c this prevents finding dependent libraries that
	  # are installed to the same prefix.
	  # At present, this check doesn't affect windows .dll's that
	  # are installed into $libdir/../bin (currently, that works fine)
	  # but it's something to keep an eye on.
	  test "$inst_prefix_dir" = "$destdir" && \
	    func_fatal_error "error: cannot install '$file' to a directory not ending in $libdir"

	  if test -n "$inst_prefix_dir"; then
	    # Stick the inst_prefix_dir data into the link command.
	    relink_command=`$ECHO "$relink_command" | $SED "s%@inst_prefix_dir@%-inst-prefix-dir $inst_prefix_dir%"`
	  else
	    relink_command=`$ECHO "$relink_command" | $SED "s%@inst_prefix_dir@%%"`
	  fi

	  func_warning "relinking '$file'"
	  func_show_eval "$relink_command" \
	    'func_fatal_error "error: relink '\''$file'\'' with the above command before installing it"'
	fi

	# See the names of the shared library.
	# The first name is the real file; any remaining names are symlinks.
	set dummy $library_names; shift
	if test -n "$1"; then
	  realname=$1
	  shift

	  # A relinked library is installed from the '${realname}T' file.
	  srcname=$realname
	  test -n "$relink_command" && srcname=${realname}T

	  # Install the shared library and build the symlinks.
	  func_show_eval "$install_shared_prog $dir/$srcname $destdir/$realname" \
	      'exit $?'
	  # Do not strip import libraries on these hosts.
	  tstripme=$stripme
	  case $host_os in
	  cygwin* | mingw* | pw32* | cegcc*)
	    case $realname in
	    *.dll.a)
	      tstripme=
	      ;;
	    esac
	    ;;
	  os2*)
	    case $realname in
	    *_dll.a)
	      tstripme=
	      ;;
	    esac
	    ;;
	  esac
	  if test -n "$tstripme" && test -n "$striplib"; then
	    func_show_eval "$striplib $destdir/$realname" 'exit $?'
	  fi

	  if test "$#" -gt 0; then
	    # Delete the old symlinks, and create new ones.
	    # Try 'ln -sf' first, because the 'ln' binary might depend on
	    # the symlink we replace!  Solaris /bin/ln does not understand -f,
	    # so we also need to try rm && ln -s.
	    for linkname
	    do
	      test "$linkname" != "$realname" \
		&& func_show_eval "(cd $destdir && { $LN_S -f $realname $linkname || { $RM $linkname && $LN_S $realname $linkname; }; })"
	    done
	  fi

	  # Do each command in the postinstall commands.
	  lib=$destdir/$realname
	  func_execute_cmds "$postinstall_cmds" 'exit $?'
	fi

	# Install the pseudo-library for information purposes.
	# The source is the '${name}i' file kept next to the objects.
	func_basename "$file"
	name=$func_basename_result
	instname=$dir/${name}i
	func_show_eval "$install_prog $instname $destdir/$name" 'exit $?'

	# Maybe install the static library, too.
	test -n "$old_library" && func_append staticlibs " $dir/$old_library"
	;;

      *.lo)
	# Install (i.e. copy) a libtool object.

	# Figure out destination file name, if it wasn't already specified.
	if test -n "$destname"; then
	  destfile=$destdir/$destname
	else
	  func_basename "$file"
	  destfile=$func_basename_result
	  destfile=$destdir/$destfile
	fi

	# Deduce the name of the destination old-style object file.
	case $destfile in
	*.lo)
	  func_lo2o "$destfile"
	  staticdest=$func_lo2o_result
	  ;;
	*.$objext)
	  staticdest=$destfile
	  destfile=
	  ;;
	*)
	  func_fatal_help "cannot copy a libtool object to '$destfile'"
	  ;;
	esac

	# Install the libtool object if requested.
	test -n "$destfile" && \
	  func_show_eval "$install_prog $file $destfile" 'exit $?'

	# Install the old object if enabled.
	if test yes = "$build_old_libs"; then
	  # Deduce the name of the old-style object file.
	  func_lo2o "$file"
	  staticobj=$func_lo2o_result
	  func_show_eval "$install_prog \$staticobj \$staticdest" 'exit $?'
	fi
	# Installing a libtool object ends the whole run here; any files
	# remaining in the list are not processed.
	exit $EXIT_SUCCESS
	;;

      *)
	# Figure out destination file name, if it wasn't already specified.
	if test -n "$destname"; then
	  destfile=$destdir/$destname
	else
	  func_basename "$file"
	  destfile=$func_basename_result
	  destfile=$destdir/$destfile
	fi

	# If the file is missing, and there is a .exe on the end, strip it
	# because it is most likely a libtool script we actually want to
	# install
	stripped_ext=
	case $file in
	  *.exe)
	    if test ! -f "$file"; then
	      func_stripname '' '.exe' "$file"
	      file=$func_stripname_result
	      stripped_ext=.exe
	    fi
	    ;;
	esac

	# Do a test to see if this is really a libtool program.
	case $host in
	*cygwin* | *mingw*)
	    if func_ltwrapper_executable_p "$file"; then
	      func_ltwrapper_scriptname "$file"
	      wrapper=$func_ltwrapper_scriptname_result
	    else
	      func_stripname '' '.exe' "$file"
	      wrapper=$func_stripname_result
	    fi
	    ;;
	*)
	    wrapper=$file
	    ;;
	esac
	if func_ltwrapper_script_p "$wrapper"; then
	  notinst_deplibs=
	  relink_command=

	  func_source "$wrapper"

	  # Check the variables that should have been set.
	  test -z "$generated_by_libtool_version" && \
	    func_fatal_error "invalid libtool wrapper script '$wrapper'"

	  # finalize stays true only if every not-yet-installed dependency
	  # library is now present in its libdir.
	  finalize=:
	  for lib in $notinst_deplibs; do
	    # Check to see that each library is installed.
	    libdir=
	    if test -f "$lib"; then
	      func_source "$lib"
	    fi
	    libfile=$libdir/`$ECHO "$lib" | $SED 's%^.*/%%g'`
	    if test -n "$libdir" && test ! -f "$libfile"; then
	      func_warning "'$lib' has not been installed in '$libdir'"
	      finalize=false
	    fi
	  done

	  # Source the wrapper again: sourcing the dependency libraries above
	  # may have overwritten variables it set.
	  relink_command=
	  func_source "$wrapper"

	  outputname=
	  if test no = "$fast_install" && test -n "$relink_command"; then
	    $opt_dry_run || {
	      if $finalize; then
	        tmpdir=`func_mktempdir`
		func_basename "$file$stripped_ext"
		file=$func_basename_result
	        outputname=$tmpdir/$file
	        # Replace the output file specification.
	        relink_command=`$ECHO "$relink_command" | $SED 's%@OUTPUT@%'"$outputname"'%g'`

	        $opt_quiet || {
	          func_quote_for_expand "$relink_command"
		  eval "func_echo $func_quote_for_expand_result"
	        }
	        if eval "$relink_command"; then :
	          else
		  func_error "error: relink '$file' with the above command before installing it"
		  $opt_dry_run || ${RM}r "$tmpdir"
		  continue
	        fi
	        file=$outputname
	      else
	        func_warning "cannot relink '$file'"
	      fi
	    }
	  else
	    # Install the binary that we compiled earlier.
	    file=`$ECHO "$file$stripped_ext" | $SED "s%\([^/]*\)$%$objdir/\1%"`
	  fi
	fi

	# remove .exe since cygwin /usr/bin/install will append another
	# one anyway
	case $install_prog,$host in
	*/usr/bin/install*,*cygwin*)
	  case $file:$destfile in
	  *.exe:*.exe)
	    # this is ok
	    ;;
	  *.exe:*)
	    destfile=$destfile.exe
	    ;;
	  *:*.exe)
	    func_stripname '' '.exe' "$destfile"
	    destfile=$func_stripname_result
	    ;;
	  esac
	  ;;
	esac
	func_show_eval "$install_prog\$stripme \$file \$destfile" 'exit $?'
	# Clean up the temporary relink directory, if one was created.
	$opt_dry_run || if test -n "$outputname"; then
	  ${RM}r "$tmpdir"
	fi
	;;
      esac
    done

    # Static archives were deferred until every other file was handled.
    for file in $staticlibs; do
      func_basename "$file"
      name=$func_basename_result

      # Set up the ranlib parameters.
      oldlib=$destdir/$name
      func_to_tool_file "$oldlib" func_convert_file_msys_to_w32
      tool_oldlib=$func_to_tool_file_result

      func_show_eval "$install_prog \$file \$oldlib" 'exit $?'

      if test -n "$stripme" && test -n "$old_striplib"; then
	func_show_eval "$old_striplib $tool_oldlib" 'exit $?'
      fi

      # Do each command in the postinstall commands.
      func_execute_cmds "$old_postinstall_cmds" 'exit $?'
    done

    test -n "$future_libdirs" && \
      func_warning "remember to run '$progname --finish$future_libdirs'"

    if test -n "$current_libdirs"; then
      # Maybe just do a dry run.
      $opt_dry_run && current_libdirs=" -n$current_libdirs"
      exec_cmd='$SHELL "$progpath" $preserve_args --finish$current_libdirs'
    else
      exit $EXIT_SUCCESS
    fi
}

# Run install mode when $opt_mode selects it.
if test install = "$opt_mode"; then
  func_mode_install ${1+"$@"}
fi


# func_generate_dlsyms outputname originator pic_p
# Extract symbols from dlprefiles and create ${outputname}S.o with
# a dlpreopen symbol table.
#
# Arguments:
#   outputname  ($1) stem of the generated files: the C source
#               ${outputname}S.c, the symbol list ${outputname}.nm and
#               the object ${outputname}S.$objext, all in $output_objdir.
#   originator  ($2) string stored as the name of the first table entry.
#               With every non-alphanumeric character mapped to '_' it
#               also forms the table identifier
#               lt_<prefix>_LTX_preloaded_symbols.
#   pic_p       ($3, default 'false') run as a command; when it succeeds
#               $pic_flag is used to compile the table on hosts that have
#               no special case below.
#
# Side effects: replaces the @SYMFILE@ placeholder in $compile_command
# and $finalize_command with the generated object (plus a .def file on
# cygwin/mingw/cegcc when one exists), or removes the placeholder when
# no symbol table is generated.
func_generate_dlsyms ()
{
    $debug_cmd

    my_outputname=$1
    my_originator=$2
    my_pic_p=${3-false}
    my_prefix=`$ECHO "$my_originator" | $SED 's%[^a-zA-Z0-9]%_%g'`
    my_dlsyms=

    # A table is wanted when $dlfiles/$dlprefiles are non-empty or $dlself
    # is not 'no'; it can only be produced when both $NM and
    # $global_symbol_pipe are configured.
    if test -n "$dlfiles$dlprefiles" || test no != "$dlself"; then
      if test -n "$NM" && test -n "$global_symbol_pipe"; then
	my_dlsyms=${my_outputname}S.c
      else
	func_error "not configured to extract global symbols from dlpreopened files"
      fi
    fi

    if test -n "$my_dlsyms"; then
      case $my_dlsyms in
      "") ;;
      *.c)
	# Discover the nlist of each of the dlfiles.
	nlist=$output_objdir/$my_outputname.nm

	func_show_eval "$RM $nlist ${nlist}S ${nlist}T"

	# Parse the name list into a source file.
	func_verbose "creating $output_objdir/$my_dlsyms"

	$opt_dry_run || $ECHO > "$output_objdir/$my_dlsyms" "\
/* $my_dlsyms - symbol resolution table for '$my_outputname' dlsym emulation. */
/* Generated by $PROGRAM (GNU $PACKAGE) $VERSION */

#ifdef __cplusplus
extern \"C\" {
#endif

#if defined __GNUC__ && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)) || (__GNUC__ > 4))
#pragma GCC diagnostic ignored \"-Wstrict-prototypes\"
#endif

/* Keep this code in sync between libtool.m4, ltmain, lt_system.h, and tests.  */
#if defined _WIN32 || defined __CYGWIN__ || defined _WIN32_WCE
/* DATA imports from DLLs on WIN32 can't be const, because runtime
   relocations are performed -- see ld's documentation on pseudo-relocs.  */
# define LT_DLSYM_CONST
#elif defined __osf__
/* This system does not cope well with relocations in const data.  */
# define LT_DLSYM_CONST
#else
# define LT_DLSYM_CONST const
#endif

#define STREQ(s1, s2) (strcmp ((s1), (s2)) == 0)

/* External symbol declarations for the compiler. */\
"

	# With dlself=yes the program's own objects contribute symbols; the
	# list is then filtered by $exclude_expsyms, $export_symbols_regex
	# and $export_symbols.
	if test yes = "$dlself"; then
	  func_verbose "generating symbol list for '$output'"

	  $opt_dry_run || echo ': @PROGRAM@ ' > "$nlist"

	  # Add our own program objects to the symbol list.
	  progfiles=`$ECHO "$objs$old_deplibs" | $SP2NL | $SED "$lo2o" | $NL2SP`
	  for progfile in $progfiles; do
	    func_to_tool_file "$progfile" func_convert_file_msys_to_w32
	    func_verbose "extracting global C symbols from '$func_to_tool_file_result'"
	    $opt_dry_run || eval "$NM $func_to_tool_file_result | $global_symbol_pipe >> '$nlist'"
	  done

	  if test -n "$exclude_expsyms"; then
	    $opt_dry_run || {
	      eval '$EGREP -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T'
	      eval '$MV "$nlist"T "$nlist"'
	    }
	  fi

	  if test -n "$export_symbols_regex"; then
	    $opt_dry_run || {
	      eval '$EGREP -e "$export_symbols_regex" "$nlist" > "$nlist"T'
	      eval '$MV "$nlist"T "$nlist"'
	    }
	  fi

	  # Prepare the list of exported symbols
	  if test -z "$export_symbols"; then
	    export_symbols=$output_objdir/$outputname.exp
	    $opt_dry_run || {
	      $RM $export_symbols
	      eval "$SED -n -e '/^: @PROGRAM@ $/d' -e 's/^.* \(.*\)$/\1/p' "'< "$nlist" > "$export_symbols"'
	      case $host in
	      *cygwin* | *mingw* | *cegcc* )
                eval "echo EXPORTS "'> "$output_objdir/$outputname.def"'
                eval 'cat "$export_symbols" >> "$output_objdir/$outputname.def"'
	        ;;
	      esac
	    }
	  else
	    $opt_dry_run || {
	      eval "$SED -e 's/\([].[*^$]\)/\\\\\1/g' -e 's/^/ /' -e 's/$/$/'"' < "$export_symbols" > "$output_objdir/$outputname.exp"'
	      eval '$GREP -f "$output_objdir/$outputname.exp" < "$nlist" > "$nlist"T'
	      eval '$MV "$nlist"T "$nlist"'
	      case $host in
	        *cygwin* | *mingw* | *cegcc* )
	          eval "echo EXPORTS "'> "$output_objdir/$outputname.def"'
	          eval 'cat "$nlist" >> "$output_objdir/$outputname.def"'
	          ;;
	      esac
	    }
	  fi
	fi

	# Append one ': <name>' header line plus the piped $NM output for
	# every dlpreopened file.
	for dlprefile in $dlprefiles; do
	  func_verbose "extracting global C symbols from '$dlprefile'"
	  func_basename "$dlprefile"
	  name=$func_basename_result
          case $host in
	    *cygwin* | *mingw* | *cegcc* )
	      # if an import library, we need to obtain dlname
	      if func_win32_import_lib_p "$dlprefile"; then
	        func_tr_sh "$dlprefile"
	        eval "curr_lafile=\$libfile_$func_tr_sh_result"
	        dlprefile_dlbasename=
	        if test -n "$curr_lafile" && func_lalib_p "$curr_lafile"; then
	          # Use subshell, to avoid clobbering current variable values
	          dlprefile_dlname=`source "$curr_lafile" && echo "$dlname"`
	          if test -n "$dlprefile_dlname"; then
	            func_basename "$dlprefile_dlname"
	            dlprefile_dlbasename=$func_basename_result
	          else
	            # no lafile. user explicitly requested -dlpreopen <import library>.
	            $sharedlib_from_linklib_cmd "$dlprefile"
	            dlprefile_dlbasename=$sharedlib_from_linklib_result
	          fi
	        fi
	        $opt_dry_run || {
	          if test -n "$dlprefile_dlbasename"; then
	            eval '$ECHO ": $dlprefile_dlbasename" >> "$nlist"'
	          else
	            func_warning "Could not compute DLL name from $name"
	            eval '$ECHO ": $name " >> "$nlist"'
	          fi
	          func_to_tool_file "$dlprefile" func_convert_file_msys_to_w32
	          eval "$NM \"$func_to_tool_file_result\" 2>/dev/null | $global_symbol_pipe |
	            $SED -e '/I __imp/d' -e 's/I __nm_/D /;s/_nm__//' >> '$nlist'"
	        }
	      else # not an import lib
	        $opt_dry_run || {
	          eval '$ECHO ": $name " >> "$nlist"'
	          func_to_tool_file "$dlprefile" func_convert_file_msys_to_w32
	          eval "$NM \"$func_to_tool_file_result\" 2>/dev/null | $global_symbol_pipe >> '$nlist'"
	        }
	      fi
	    ;;
	    *)
	      $opt_dry_run || {
	        eval '$ECHO ": $name " >> "$nlist"'
	        func_to_tool_file "$dlprefile" func_convert_file_msys_to_w32
	        eval "$NM \"$func_to_tool_file_result\" 2>/dev/null | $global_symbol_pipe >> '$nlist'"
	      }
	    ;;
          esac
	done

	$opt_dry_run || {
	  # Make sure we have at least an empty file.
	  test -f "$nlist" || : > "$nlist"

	  if test -n "$exclude_expsyms"; then
	    $EGREP -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T
	    $MV "$nlist"T "$nlist"
	  fi

	  # Try sorting and uniquifying the output.
	  if $GREP -v "^: " < "$nlist" |
	      if sort -k 3 </dev/null >/dev/null 2>&1; then
		sort -k 3
	      else
		sort +2
	      fi |
	      uniq > "$nlist"S; then
	    :
	  else
	    $GREP -v "^: " < "$nlist" > "$nlist"S
	  fi

	  # Emit the extern declarations for the collected symbols.
	  if test -f "$nlist"S; then
	    eval "$global_symbol_to_cdecl"' < "$nlist"S >> "$output_objdir/$my_dlsyms"'
	  else
	    echo '/* NONE */' >> "$output_objdir/$my_dlsyms"
	  fi

	  # ${nlist}I receives the output of $global_symbol_to_import, when
	  # that filter is configured; a non-empty file triggers the
	  # lt_syminit code below.
	  func_show_eval '$RM "${nlist}I"'
	  if test -n "$global_symbol_to_import"; then
	    eval "$global_symbol_to_import"' < "$nlist"S > "$nlist"I'
	  fi

	  echo >> "$output_objdir/$my_dlsyms" "\

/* The mapping between symbol names and symbols.  */
typedef struct {
  const char *name;
  void *address;
} lt_dlsymlist;
extern LT_DLSYM_CONST lt_dlsymlist
lt_${my_prefix}_LTX_preloaded_symbols[];\
"

	  if test -s "$nlist"I; then
	    echo >> "$output_objdir/$my_dlsyms" "\
static void lt_syminit(void)
{
  LT_DLSYM_CONST lt_dlsymlist *symbol = lt_${my_prefix}_LTX_preloaded_symbols;
  for (; symbol->name; ++symbol)
    {"
	    $SED 's/.*/      if (STREQ (symbol->name, \"&\")) symbol->address = (void *) \&&;/' < "$nlist"I >> "$output_objdir/$my_dlsyms"
	    echo >> "$output_objdir/$my_dlsyms" "\
    }
}"
	  fi
	  echo >> "$output_objdir/$my_dlsyms" "\
LT_DLSYM_CONST lt_dlsymlist
lt_${my_prefix}_LTX_preloaded_symbols[] =
{ {\"$my_originator\", (void *) 0},"

	  if test -s "$nlist"I; then
	    echo >> "$output_objdir/$my_dlsyms" "\
  {\"@INIT@\", (void *) &lt_syminit},"
	  fi

	  # Emit the {name, address} initializers; which filter is used
	  # depends on $need_lib_prefix.
	  case $need_lib_prefix in
	  no)
	    eval "$global_symbol_to_c_name_address" < "$nlist" >> "$output_objdir/$my_dlsyms"
	    ;;
	  *)
	    eval "$global_symbol_to_c_name_address_lib_prefix" < "$nlist" >> "$output_objdir/$my_dlsyms"
	    ;;
	  esac
	  echo >> "$output_objdir/$my_dlsyms" "\
  {0, (void *) 0}
};

/* This works around a problem in FreeBSD linker */
#ifdef FREEBSD_WORKAROUND
static const void *lt_preloaded_setup() {
  return lt_${my_prefix}_LTX_preloaded_symbols;
}
#endif

#ifdef __cplusplus
}
#endif\
"
	} # !$opt_dry_run

	# Decide whether the symbol table is compiled with $pic_flag.
	pic_flag_for_symtable=
	case "$compile_command " in
	*" -static "*) ;;
	*)
	  case $host in
	  # compiling the symbol table file with pic_flag works around
	  # a FreeBSD bug that causes programs to crash when -lm is
	  # linked before any other PIC object.  But we must not use
	  # pic_flag when linking with -static.  The problem exists in
	  # FreeBSD 2.2.6 and is fixed in FreeBSD 3.1.
	  *-*-freebsd2.*|*-*-freebsd3.0*|*-*-freebsdelf3.0*)
	    pic_flag_for_symtable=" $pic_flag -DFREEBSD_WORKAROUND" ;;
	  *-*-hpux*)
	    pic_flag_for_symtable=" $pic_flag"  ;;
	  *)
	    $my_pic_p && pic_flag_for_symtable=" $pic_flag"
	    ;;
	  esac
	  ;;
	esac
	# Reuse $LTCFLAGS for the compile, minus -pie/-fpie/-fPIE.
	symtab_cflags=
	for arg in $LTCFLAGS; do
	  case $arg in
	  -pie | -fpie | -fPIE) ;;
	  *) func_append symtab_cflags " $arg" ;;
	  esac
	done

	# Now compile the dynamic symbol file.
	func_show_eval '(cd $output_objdir && $LTCC$symtab_cflags -c$no_builtin_flag$pic_flag_for_symtable "$my_dlsyms")' 'exit $?'

	# Clean up the generated files.
	func_show_eval '$RM "$output_objdir/$my_dlsyms" "$nlist" "${nlist}S" "${nlist}T" "${nlist}I"'

	# Transform the symbol file into the correct name.
	symfileobj=$output_objdir/${my_outputname}S.$objext
	case $host in
	*cygwin* | *mingw* | *cegcc* )
	  if test -f "$output_objdir/$my_outputname.def"; then
	    compile_command=`$ECHO "$compile_command" | $SED "s%@SYMFILE@%$output_objdir/$my_outputname.def $symfileobj%"`
	    finalize_command=`$ECHO "$finalize_command" | $SED "s%@SYMFILE@%$output_objdir/$my_outputname.def $symfileobj%"`
	  else
	    compile_command=`$ECHO "$compile_command" | $SED "s%@SYMFILE@%$symfileobj%"`
	    finalize_command=`$ECHO "$finalize_command" | $SED "s%@SYMFILE@%$symfileobj%"`
	  fi
	  ;;
	*)
	  compile_command=`$ECHO "$compile_command" | $SED "s%@SYMFILE@%$symfileobj%"`
	  finalize_command=`$ECHO "$finalize_command" | $SED "s%@SYMFILE@%$symfileobj%"`
	  ;;
	esac
	;;
      *)
	func_fatal_error "unknown suffix for '$my_dlsyms'"
	;;
      esac
    else
      # We keep going just in case the user didn't refer to
      # lt_preloaded_symbols.  The linker will fail if global_symbol_pipe
      # really was required.

      # Nullify the symbol file.
      compile_command=`$ECHO "$compile_command" | $SED "s% @SYMFILE@%%"`
      finalize_command=`$ECHO "$finalize_command" | $SED "s% @SYMFILE@%%"`
    fi
}

# func_cygming_gnu_implib_p ARG
# Predicate: exit status 0 (TRUE) when ARG is a GNU/binutils-style
# import library, i.e. when the symbol list of ARG, as produced by $NM
# and filtered through $global_symbol_pipe, contains a '_head_*' or
# '*_iname' symbol.  Nonzero status (FALSE) otherwise.
func_cygming_gnu_implib_p ()
{
  $debug_cmd

  # Convert the path to the form the native tools expect.
  func_to_tool_file "$1" func_convert_file_msys_to_w32

  # Keep only the marker symbols; an empty result means "not GNU style".
  func_cygming_gnu_implib_tmp=`
    $NM "$func_to_tool_file_result" |
      eval "$global_symbol_pipe" |
      $EGREP ' (_head_[A-Za-z0-9_]+_[ad]l*|[A-Za-z0-9_]+_[ad]l*_iname)$'
  `

  if test -n "$func_cygming_gnu_implib_tmp"; then
    return 0
  fi
  return 1
}

# func_cygming_ms_implib_p ARG
# Predicate: exit status 0 (TRUE) when ARG is an MS-style import
# library, i.e. when the symbol list of ARG, as produced by $NM and
# filtered through $global_symbol_pipe, mentions
# '_NULL_IMPORT_DESCRIPTOR'.  Nonzero status (FALSE) otherwise.
func_cygming_ms_implib_p ()
{
  $debug_cmd

  # Convert the path to the form the native tools expect.
  func_to_tool_file "$1" func_convert_file_msys_to_w32

  # Keep only the marker symbol; an empty result means "not MS style".
  func_cygming_ms_implib_tmp=`
    $NM "$func_to_tool_file_result" |
      eval "$global_symbol_pipe" |
      $GREP '_NULL_IMPORT_DESCRIPTOR'
  `

  if test -n "$func_cygming_ms_implib_tmp"; then
    return 0
  fi
  return 1
}

# func_win32_libid arg
# Print the library type of file 'arg' on standard output.  The value is
# one of:
#   unknown              nothing below matched
#   x86 archive import   ar archive identified as an import library
#   x86 archive static   ar archive in a PE format without import symbols
#   x86 DLL              DLL, or an "MS Windows PE Intel" executable
#
# Need a lot of goo to handle *both* DLLs and import libs
# Has to be a shell function in order to 'eat' the argument
# that is supplied when $file_magic_command is called.
# Despite the name, also deal with 64 bit binaries.
func_win32_libid ()
{
  $debug_cmd

  win32_libid_type=unknown
  # The path is quoted so that a name containing whitespace or glob
  # characters reaches 'file' as a single argument.
  win32_fileres=`file -L "$1" 2>/dev/null`
  case $win32_fileres in
  *ar\ archive\ import\ library*) # definitely import
    win32_libid_type="x86 archive import"
    ;;
  *ar\ archive*) # could be an import, or static
    # Keep the egrep pattern in sync with the one in _LT_CHECK_MAGIC_METHOD.
    # The escaped quotes survive the eval, so $OBJDUMP also receives the
    # path as one word.
    # NOTE(review): the 2>/dev/null below applies to $SED, not $OBJDUMP.
    if eval $OBJDUMP -f \"\$1\" | $SED -e '10q' 2>/dev/null |
       $EGREP 'file format (pei*-i386(.*architecture: i386)?|pe-arm-wince|pe-x86-64)' >/dev/null; then
      case $nm_interface in
      "MS dumpbin")
	if func_cygming_ms_implib_p "$1" ||
	   func_cygming_gnu_implib_p "$1"
	then
	  win32_nmres=import
	else
	  win32_nmres=
	fi
	;;
      *)
	# Look for an 'I' (import) symbol within the first 100 lines of
	# the posix-format nm listing.
	func_to_tool_file "$1" func_convert_file_msys_to_w32
	win32_nmres=`eval $NM -f posix -A \"$func_to_tool_file_result\" |
	  $SED -n -e '
	    1,100{
		/ I /{
		    s|.*|import|
		    p
		    q
		}
	    }'`
	;;
      esac
      case $win32_nmres in
      import*)  win32_libid_type="x86 archive import";;
      *)        win32_libid_type="x86 archive static";;
      esac
    fi
    ;;
  *DLL*)
    win32_libid_type="x86 DLL"
    ;;
  *executable*) # but shell scripts are "executable" too...
    case $win32_fileres in
    *MS\ Windows\ PE\ Intel*)
      win32_libid_type="x86 DLL"
      ;;
    esac
    ;;
  esac
  $ECHO "$win32_libid_type"
}

# func_cygming_dll_for_implib ARG
#
# Platform-specific function to extract the
# name of the DLL associated with the specified
# import library ARG.
# Invoked by eval'ing the libtool variable
#    $sharedlib_from_linklib_cmd
# Result is available in the variable
#    $sharedlib_from_linklib_result
#
# The result is whatever '$DLLTOOL --identify-strict --identify ARG'
# writes to standard output.
func_cygming_dll_for_implib ()
{
  $debug_cmd

  sharedlib_from_linklib_result=`$DLLTOOL --identify-strict --identify "$1"`
}

# func_cygming_dll_for_implib_fallback_core SECTION_NAME LIBNAMEs
#
# This is the core of a fallback implementation of a
# platform-specific function to extract the name of the
# DLL associated with the specified import library LIBNAME.
#
# SECTION_NAME is either .idata$6 or .idata$7, depending
# on the platform and compiler that created the implib.
#
# Echoes the name of the DLL associated with the
# specified import library.
#
# Works on the text of '$OBJDUMP -s --section SECTION_NAME LIBNAME':
# the first sed marks the start of each member's section dump and keeps
# only the ASCII column (everything after the first 43 characters), the
# second sed joins each member's lines into one, and the third sed
# selects the first plausible entry.
func_cygming_dll_for_implib_fallback_core ()
{
  $debug_cmd

  # Escape regex metacharacters in SECTION_NAME (it contains '.' and '$').
  match_literal=`$ECHO "$1" | $SED "$sed_make_literal_regex"`
  # The interval below is spelled \{0,1\}: an interval with no lower
  # bound, \{,1\}, is not defined for POSIX basic regular expressions.
  $OBJDUMP -s --section "$1" "$2" 2>/dev/null |
    $SED '/^Contents of section '"$match_literal"':/{
      # Place marker at beginning of archive member dllname section
      s/.*/====MARK====/
      p
      d
    }
    # These lines can sometimes be longer than 43 characters, but
    # are always uninteresting
    /:[	 ]*file format pe[i]\{0,1\}-/d
    /^In archive [^:]*:/d
    # Ensure marker is printed
    /^====MARK====/p
    # Remove all lines with less than 43 characters
    /^.\{43\}/!d
    # From remaining lines, remove first 43 characters
    s/^.\{43\}//' |
    $SED -n '
      # Join marker and all lines until next marker into a single line
      /^====MARK====/ b para
      H
      $ b para
      b
      :para
      x
      s/\n//g
      # Remove the marker
      s/^====MARK====//
      # Remove trailing dots and whitespace
      s/[\. \t]*$//
      # Print
      /./p' |
    # we now have a list, one entry per line, of the stringified
    # contents of the appropriate section of all members of the
    # archive that possess that section. Heuristic: eliminate
    # all those that have a first or second character that is
    # a '.' (that is, objdump's representation of an unprintable
    # character.) This should work for all archives with less than
    # 0x302f exports -- but will fail for DLLs whose name actually
    # begins with a literal '.' or a single character followed by
    # a '.'.
    #
    # Of those that remain, print the first one.
    $SED -e '/^\./d;/^.\./d;q'
}

# func_cygming_dll_for_implib_fallback ARG
# Platform-specific function to extract the
# name of the DLL associated with the specified
# import library ARG.
#
# This fallback implementation is for use when $DLLTOOL
# does not support the --identify-strict option.
# Invoked by eval'ing the libtool variable
#    $sharedlib_from_linklib_cmd
# Result is available in the variable
#    $sharedlib_from_linklib_result
# which is left empty when ARG is neither a GNU/binutils-style nor an
# MS-style import library.
func_cygming_dll_for_implib_fallback ()
{
  $debug_cmd

  # Select the section to scan according to the flavour of the import
  # library; an empty value means the flavour was not recognised.
  if func_cygming_gnu_implib_p "$1"; then
    # binutils import library
    func_cygming_dll_for_implib_fallback_section='.idata$7'
  elif func_cygming_ms_implib_p "$1"; then
    # ms-generated import library
    func_cygming_dll_for_implib_fallback_section='.idata$6'
  else
    # unknown
    func_cygming_dll_for_implib_fallback_section=
  fi

  case $func_cygming_dll_for_implib_fallback_section in
  '')
    sharedlib_from_linklib_result=
    ;;
  *)
    sharedlib_from_linklib_result=`func_cygming_dll_for_implib_fallback_core "$func_cygming_dll_for_implib_fallback_section" "$1"`
    ;;
  esac
}


# func_extract_an_archive dir oldlib
# Extract every member of the archive OLDLIB into directory DIR with
# '$AR x'.  When $lock_old_archive_extraction is 'yes', the extraction
# is serialised through the lock file OLDLIB.lock.  Aborts with a fatal
# error when the archive's member list contains duplicate names.
func_extract_an_archive ()
{
    $debug_cmd

    f_ex_an_ar_dir=$1
    shift
    f_ex_an_ar_oldlib=$1

    # Take the lock: hard-link $progpath to the lock file, retrying every
    # two seconds until the link succeeds.  A dry run never waits.
    case $lock_old_archive_extraction in
    yes)
      lockfile=$f_ex_an_ar_oldlib.lock
      while :; do
	if $opt_dry_run || ln "$progpath" "$lockfile" 2>/dev/null; then
	  break
	fi
	func_echo "Waiting for $lockfile to be removed"
	sleep 2
      done
      ;;
    esac

    # Extract; on failure drop the lock before exiting with $AR's status.
    func_show_eval \
      "(cd \$f_ex_an_ar_dir && $AR x \"\$f_ex_an_ar_oldlib\")" \
      'stat=$?; rm -f "$lockfile"; exit $stat'

    # Release the lock.
    if test yes = "$lock_old_archive_extraction"; then
      if $opt_dry_run; then
	:
      else
	rm -f "$lockfile"
      fi
    fi

    # 'sort -uc' fails when two members share a name, in which case the
    # extraction above has overwritten one of them.
    ($AR t "$f_ex_an_ar_oldlib" | sort | sort -uc >/dev/null 2>&1) ||
      func_fatal_error "object name conflicts in archive: $f_ex_an_ar_dir/$f_ex_an_ar_oldlib"
}


# func_extract_archives gentop oldlib ...
# Extract each archive OLDLIB into its own subdirectory of GENTOP and
# collect the extracted object files.
#
# The subdirectory is named after the archive's basename; when that name
# is already listed in $extracted_archives, 'lt<serial>-<basename>' is
# used instead, with <serial> taken from $extracted_serial.
#
# Result: $func_extract_archives_result holds the space-separated list
# of '*.$objext' and '*.lo' files found under the extraction
# directories.
func_extract_archives ()
{
    $debug_cmd

    my_gentop=$1; shift
    my_oldlibs=${1+"$@"}
    my_oldobjs=
    my_xlib=
    my_xabs=
    my_xdir=

    for my_xlib in $my_oldlibs; do
      # Extract the objects.
      case $my_xlib in
	[\\/]* | [A-Za-z]:[\\/]*) my_xabs=$my_xlib ;;
	*) my_xabs=`pwd`"/$my_xlib" ;;
      esac
      func_basename "$my_xlib"
      my_xlib=$func_basename_result
      my_xlib_u=$my_xlib
      # Pick a directory name that has not been used by an earlier archive.
      while :; do
        case " $extracted_archives " in
	*" $my_xlib_u "*)
	  func_arith $extracted_serial + 1
	  extracted_serial=$func_arith_result
	  my_xlib_u=lt$extracted_serial-$my_xlib ;;
	*) break ;;
	esac
      done
      extracted_archives="$extracted_archives $my_xlib_u"
      my_xdir=$my_gentop/$my_xlib_u

      func_mkdir_p "$my_xdir"

      case $host in
      *-darwin*)
	func_verbose "Extracting $my_xabs"
	# Do not bother doing anything if just a dry run
	$opt_dry_run || {
	  darwin_orig_dir=`pwd`
	  # Directory names are quoted throughout so that whitespace or glob
	  # characters in them do not split or expand the argument.
	  cd "$my_xdir" || exit $?
	  darwin_archive=$my_xabs
	  darwin_curdir=`pwd`
	  func_basename "$darwin_archive"
	  darwin_base_archive=$func_basename_result
	  darwin_arches=`$LIPO -info "$darwin_archive" 2>/dev/null | $GREP Architectures 2>/dev/null || true`
	  if test -n "$darwin_arches"; then
	    # Fat archive: thin it per architecture, extract each thin copy,
	    # then recombine the per-architecture objects with lipo.
	    darwin_arches=`$ECHO "$darwin_arches" | $SED -e 's/.*are://'`
	    darwin_arch=
	    func_verbose "$darwin_base_archive has multiple architectures $darwin_arches"
	    for darwin_arch in  $darwin_arches; do
	      func_mkdir_p "unfat-$$/$darwin_base_archive-$darwin_arch"
	      $LIPO -thin $darwin_arch -output "unfat-$$/$darwin_base_archive-$darwin_arch/$darwin_base_archive" "$darwin_archive"
	      cd "unfat-$$/$darwin_base_archive-$darwin_arch"
	      func_extract_an_archive "`pwd`" "$darwin_base_archive"
	      cd "$darwin_curdir"
	      $RM "unfat-$$/$darwin_base_archive-$darwin_arch/$darwin_base_archive"
	    done # $darwin_arches
            ## Okay now we've a bunch of thin objects, gotta fatten them up :)
	    darwin_filelist=`find unfat-$$ -type f -name \*.o -print -o -name \*.lo -print | $SED -e "$sed_basename" | sort -u`
	    darwin_file=
	    darwin_files=
	    for darwin_file in $darwin_filelist; do
	      # Quoted so the shell cannot glob-expand the name before
	      # find sees it.
	      darwin_files=`find unfat-$$ -name "$darwin_file" -print | sort | $NL2SP`
	      $LIPO -create -output "$darwin_file" $darwin_files
	    done # $darwin_filelist
	    $RM -rf unfat-$$
	    cd "$darwin_orig_dir"
	  else
	    cd "$darwin_orig_dir"
	    func_extract_an_archive "$my_xdir" "$my_xabs"
	  fi # $darwin_arches
	} # !$opt_dry_run
	;;
      *)
        func_extract_an_archive "$my_xdir" "$my_xabs"
	;;
      esac
      my_oldobjs="$my_oldobjs "`find "$my_xdir" -name \*.$objext -print -o -name \*.lo -print | sort | $NL2SP`
    done

    func_extract_archives_result=$my_oldobjs
}


# func_emit_wrapper [arg=no]
#
# Emit a libtool wrapper script on stdout.
# Don't directly open a file because we may want to
# incorporate the script contents within a cygwin/mingw
# wrapper executable.  Must ONLY be called from within
# func_mode_link because it depends on a number of variables
# set therein.
#
# ARG is the value that the WRAPPER_SCRIPT_BELONGS_IN_OBJDIR
# variable will take.  If 'yes', then the emitted script
# will assume that the directory where it is stored is
# the $objdir directory.  This is a cygwin/mingw-specific
# behavior.
#
# The script is written as a sequence of $ECHO calls.  Inside their
# double-quoted text an unescaped $var is expanded now, while the
# wrapper is generated; an escaped \$var is written out literally and
# expanded later, when the wrapper runs.
func_emit_wrapper ()
{
	# ARG defaults to 'no' when the caller passes nothing.
	func_emit_wrapper_arg1=${1-no}

	# Prologue: shell-compatibility setup, the relink command and the
	# variables read in install mode.
	$ECHO "\
#! $SHELL

# $output - temporary wrapper script for $objdir/$outputname
# Generated by $PROGRAM (GNU $PACKAGE) $VERSION
#
# The $output program cannot be directly executed until all the libtool
# libraries that it depends on are installed.
#
# This wrapper script should never be moved out of the build directory.
# If it is, it will not operate correctly.

# Sed substitution that helps us do robust quoting.  It backslashifies
# metacharacters that are still active within double-quoted strings.
sed_quote_subst='$sed_quote_subst'

# Be Bourne compatible
if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then
  emulate sh
  NULLCMD=:
  # Zsh 3.x and 4.x performs word splitting on \${1+\"\$@\"}, which
  # is contrary to our usage.  Disable this feature.
  alias -g '\${1+\"\$@\"}'='\"\$@\"'
  setopt NO_GLOB_SUBST
else
  case \`(set -o) 2>/dev/null\` in *posix*) set -o posix;; esac
fi
BIN_SH=xpg4; export BIN_SH # for Tru64
DUALCASE=1; export DUALCASE # for MKS sh

# The HP-UX ksh and POSIX shell print the target directory to stdout
# if CDPATH is set.
(unset CDPATH) >/dev/null 2>&1 && unset CDPATH

relink_command=\"$relink_command\"

# This environment variable determines our operation mode.
if test \"\$libtool_install_magic\" = \"$magic\"; then
  # install mode needs the following variables:
  generated_by_libtool_version='$macro_version'
  notinst_deplibs='$notinst_deplibs'
else
  # When we are sourced in execute mode, \$file and \$ECHO are already set.
  if test \"\$libtool_execute_magic\" != \"$magic\"; then
    file=\"\$0\""

    # Escape the current value of $ECHO with $sed_quote_subst so that it
    # can be placed inside the double-quoted ECHO= assignment emitted below.
    qECHO=`$ECHO "$ECHO" | $SED "$sed_quote_subst"`
    $ECHO "\

# A function that is used when there is no print builtin or printf.
func_fallback_echo ()
{
  eval 'cat <<_LTECHO_EOF
\$1
_LTECHO_EOF'
}
    ECHO=\"$qECHO\"
  fi

# Very basic option parsing. These options are (a) specific to
# the libtool wrapper, (b) are identical between the wrapper
# /script/ and the wrapper /executable/ that is used only on
# windows platforms, and (c) all begin with the string "--lt-"
# (application programs are unlikely to have options that match
# this pattern).
#
# There are only two supported options: --lt-debug and
# --lt-dump-script. There is, deliberately, no --lt-help.
#
# The first argument to this parsing function should be the
# script's $0 value, followed by "$@".
lt_option_debug=
func_parse_lt_options ()
{
  lt_script_arg0=\$0
  shift
  for lt_opt
  do
    case \"\$lt_opt\" in
    --lt-debug) lt_option_debug=1 ;;
    --lt-dump-script)
        lt_dump_D=\`\$ECHO \"X\$lt_script_arg0\" | $SED -e 's/^X//' -e 's%/[^/]*$%%'\`
        test \"X\$lt_dump_D\" = \"X\$lt_script_arg0\" && lt_dump_D=.
        lt_dump_F=\`\$ECHO \"X\$lt_script_arg0\" | $SED -e 's/^X//' -e 's%^.*/%%'\`
        cat \"\$lt_dump_D/\$lt_dump_F\"
        exit 0
      ;;
    --lt-*)
        \$ECHO \"Unrecognized --lt- option: '\$lt_opt'\" 1>&2
        exit 1
      ;;
    esac
  done

  # Print the debug banner immediately:
  if test -n \"\$lt_option_debug\"; then
    echo \"$outputname:$output:\$LINENO: libtool wrapper (GNU $PACKAGE) $VERSION\" 1>&2
  fi
}

# Used when --lt-debug. Prints its arguments to stdout
# (redirection is the responsibility of the caller)
func_lt_dump_args ()
{
  lt_dump_args_N=1;
  for lt_arg
  do
    \$ECHO \"$outputname:$output:\$LINENO: newargv[\$lt_dump_args_N]: \$lt_arg\"
    lt_dump_args_N=\`expr \$lt_dump_args_N + 1\`
  done
}

# Core function for launching the target application
func_exec_program_core ()
{
"
  # The emitted exec line joins \$progdir and \$program with a backslash
  # on the hosts matched first and with a forward slash everywhere else.
  case $host in
  # Backslashes separate directories on plain windows
  *-*-mingw | *-*-os2* | *-cegcc*)
    $ECHO "\
      if test -n \"\$lt_option_debug\"; then
        \$ECHO \"$outputname:$output:\$LINENO: newargv[0]: \$progdir\\\\\$program\" 1>&2
        func_lt_dump_args \${1+\"\$@\"} 1>&2
      fi
      exec \"\$progdir\\\\\$program\" \${1+\"\$@\"}
"
    ;;

  *)
    $ECHO "\
      if test -n \"\$lt_option_debug\"; then
        \$ECHO \"$outputname:$output:\$LINENO: newargv[0]: \$progdir/\$program\" 1>&2
        func_lt_dump_args \${1+\"\$@\"} 1>&2
      fi
      exec \"\$progdir/\$program\" \${1+\"\$@\"}
"
    ;;
  esac
  # Tail of func_exec_program_core (the 'cannot exec' report that follows
  # the exec line), then func_exec_program and the code that resolves
  # \$thisdir.
  $ECHO "\
      \$ECHO \"\$0: cannot exec \$program \$*\" 1>&2
      exit 1
}

# A function to encapsulate launching the target application
# Strips options in the --lt-* namespace from \$@ and
# launches target application with the remaining arguments.
func_exec_program ()
{
  case \" \$* \" in
  *\\ --lt-*)
    for lt_wr_arg
    do
      case \$lt_wr_arg in
      --lt-*) ;;
      *) set x \"\$@\" \"\$lt_wr_arg\"; shift;;
      esac
      shift
    done ;;
  esac
  func_exec_program_core \${1+\"\$@\"}
}

  # Parse options
  func_parse_lt_options \"\$0\" \${1+\"\$@\"}

  # Find the directory that this script lives in.
  thisdir=\`\$ECHO \"\$file\" | $SED 's%/[^/]*$%%'\`
  test \"x\$thisdir\" = \"x\$file\" && thisdir=.

  # Follow symbolic links until we get to the real thisdir.
  file=\`ls -ld \"\$file\" | $SED -n 's/.*-> //p'\`
  while test -n \"\$file\"; do
    destdir=\`\$ECHO \"\$file\" | $SED 's%/[^/]*\$%%'\`

    # If there was a directory component, then change thisdir.
    if test \"x\$destdir\" != \"x\$file\"; then
      case \"\$destdir\" in
      [\\\\/]* | [A-Za-z]:[\\\\/]*) thisdir=\"\$destdir\" ;;
      *) thisdir=\"\$thisdir/\$destdir\" ;;
      esac
    fi

    file=\`\$ECHO \"\$file\" | $SED 's%^.*/%%'\`
    file=\`ls -ld \"\$thisdir/\$file\" | $SED -n 's/.*-> //p'\`
  done

  # Usually 'no', except on cygwin/mingw when embedded into
  # the cwrapper.
  WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=$func_emit_wrapper_arg1
  if test \"\$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR\" = \"yes\"; then
    # special case for '.'
    if test \"\$thisdir\" = \".\"; then
      thisdir=\`pwd\`
    fi
    # remove .libs from thisdir
    case \"\$thisdir\" in
    *[\\\\/]$objdir ) thisdir=\`\$ECHO \"\$thisdir\" | $SED 's%[\\\\/][^\\\\/]*$%%'\` ;;
    $objdir )   thisdir=. ;;
    esac
  fi

  # Try to get the absolute directory name.
  absdir=\`cd \"\$thisdir\" && pwd\`
  test -n \"\$absdir\" && thisdir=\"\$absdir\"
"

	# With fast_install the wrapper targets a separate
	# 'lt-<outputname>' binary in $objdir and emits code that relinks it
	# with \$relink_command when needed; otherwise it targets
	# '<outputname>' in $objdir directly.
	if test yes = "$fast_install"; then
	  $ECHO "\
  program=lt-'$outputname'$exeext
  progdir=\"\$thisdir/$objdir\"

  if test ! -f \"\$progdir/\$program\" ||
     { file=\`ls -1dt \"\$progdir/\$program\" \"\$progdir/../\$program\" 2>/dev/null | $SED 1q\`; \\
       test \"X\$file\" != \"X\$progdir/\$program\"; }; then

    file=\"\$\$-\$program\"

    if test ! -d \"\$progdir\"; then
      $MKDIR \"\$progdir\"
    else
      $RM \"\$progdir/\$file\"
    fi"

	  $ECHO "\

    # relink executable if necessary
    if test -n \"\$relink_command\"; then
      if relink_command_output=\`eval \$relink_command 2>&1\`; then :
      else
	\$ECHO \"\$relink_command_output\" >&2
	$RM \"\$progdir/\$file\"
	exit 1
      fi
    fi

    $MV \"\$progdir/\$file\" \"\$progdir/\$program\" 2>/dev/null ||
    { $RM \"\$progdir/\$program\";
      $MV \"\$progdir/\$file\" \"\$progdir/\$program\"; }
    $RM \"\$progdir/\$file\"
  fi"
	else
	  $ECHO "\
  program='$outputname'
  progdir=\"\$thisdir/$objdir\"
"
	fi

	# Everything emitted from here to the final 'fi' runs only when the
	# target program exists.
	$ECHO "\

  if test -f \"\$progdir/\$program\"; then"

	# Fix the DLL searchpath if we need to.  Do this before prepending
	# to shlibpath, because on Windows, both are PATH and uninstalled
	# libraries must come first.
	if test -n "$dllsearchpath"; then
	  $ECHO "\
    # Add the dll search path components to the executable PATH
    PATH=$dllsearchpath:\$PATH
"
	fi

	# Export our shlibpath_var if we have one.
	if test yes = "$shlibpath_overrides_runpath" && test -n "$shlibpath_var" && test -n "$temp_rpath"; then
	  $ECHO "\
    # Add our own library path to $shlibpath_var
    $shlibpath_var=\"$temp_rpath\$$shlibpath_var\"

    # Some systems cannot cope with colon-terminated $shlibpath_var
    # The second colon is a workaround for a bug in BeOS R4 sed
    $shlibpath_var=\`\$ECHO \"\$$shlibpath_var\" | $SED 's/::*\$//'\`

    export $shlibpath_var
"
	fi

	# Epilogue: run the program unless the wrapper was sourced in execute
	# mode, and report an error when the program is missing.
	$ECHO "\
    if test \"\$libtool_execute_magic\" != \"$magic\"; then
      # Run the actual program with our arguments.
      func_exec_program \${1+\"\$@\"}
    fi
  else
    # The program doesn't exist.
    \$ECHO \"\$0: error: '\$progdir/\$program' does not exist\" 1>&2
    \$ECHO \"This script is just a wrapper for \$program.\" 1>&2
    \$ECHO \"See the $PACKAGE documentation for more information.\" 1>&2
    exit 1
  fi
fi\
"
}


# func_emit_cwrapperexe_src
# emit the source code for a wrapper executable on stdout
# Must ONLY be called from within func_mode_link because
# it depends on a number of variable set therein.
func_emit_cwrapperexe_src ()
{
	cat <<EOF

/* $cwrappersource - temporary wrapper executable for $objdir/$outputname
   Generated by $PROGRAM (GNU $PACKAGE) $VERSION

   The $output program cannot be directly executed until all the libtool
   libraries that it depends on are installed.

   This wrapper executable should never be moved out of the build directory.
   If it is, it will not operate correctly.
*/
EOF
	    cat <<"EOF"
#ifdef _MSC_VER
# define _CRT_SECURE_NO_DEPRECATE 1
#endif
#include <stdio.h>
#include <stdlib.h>
#ifdef _MSC_VER
# include <direct.h>
# include <process.h>
# include <io.h>
#else
# include <unistd.h>
# include <stdint.h>
# ifdef __CYGWIN__
#  include <io.h>
# endif
#endif
#include <malloc.h>
#include <stdarg.h>
#include <assert.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/stat.h>

#define STREQ(s1, s2) (strcmp ((s1), (s2)) == 0)

/* declarations of non-ANSI functions */
#if defined __MINGW32__
# ifdef __STRICT_ANSI__
int _putenv (const char *);
# endif
#elif defined __CYGWIN__
# ifdef __STRICT_ANSI__
char *realpath (const char *, char *);
int putenv (char *);
int setenv (const char *, const char *, int);
# endif
/* #elif defined other_platform || defined ... */
#endif

/* portability defines, excluding path handling macros */
#if defined _MSC_VER
# define setmode _setmode
# define stat    _stat
# define chmod   _chmod
# define getcwd  _getcwd
# define putenv  _putenv
# define S_IXUSR _S_IEXEC
#elif defined __MINGW32__
# define setmode _setmode
# define stat    _stat
# define chmod   _chmod
# define getcwd  _getcwd
# define putenv  _putenv
#elif defined __CYGWIN__
# define HAVE_SETENV
# define FOPEN_WB "wb"
/* #elif defined other platforms ... */
#endif

#if defined PATH_MAX
# define LT_PATHMAX PATH_MAX
#elif defined MAXPATHLEN
# define LT_PATHMAX MAXPATHLEN
#else
# define LT_PATHMAX 1024
#endif

#ifndef S_IXOTH
# define S_IXOTH 0
#endif
#ifndef S_IXGRP
# define S_IXGRP 0
#endif

/* path handling portability macros */
#ifndef DIR_SEPARATOR
# define DIR_SEPARATOR '/'
# define PATH_SEPARATOR ':'
#endif

#if defined _WIN32 || defined __MSDOS__ || defined __DJGPP__ || \
  defined __OS2__
# define HAVE_DOS_BASED_FILE_SYSTEM
# define FOPEN_WB "wb"
# ifndef DIR_SEPARATOR_2
#  define DIR_SEPARATOR_2 '\\'
# endif
# ifndef PATH_SEPARATOR_2
#  define PATH_SEPARATOR_2 ';'
# endif
#endif

#ifndef DIR_SEPARATOR_2
# define IS_DIR_SEPARATOR(ch) ((ch) == DIR_SEPARATOR)
#else /* DIR_SEPARATOR_2 */
# define IS_DIR_SEPARATOR(ch) \
	(((ch) == DIR_SEPARATOR) || ((ch) == DIR_SEPARATOR_2))
#endif /* DIR_SEPARATOR_2 */

#ifndef PATH_SEPARATOR_2
# define IS_PATH_SEPARATOR(ch) ((ch) == PATH_SEPARATOR)
#else /* PATH_SEPARATOR_2 */
# define IS_PATH_SEPARATOR(ch) ((ch) == PATH_SEPARATOR_2)
#endif /* PATH_SEPARATOR_2 */

#ifndef FOPEN_WB
# define FOPEN_WB "w"
#endif
#ifndef _O_BINARY
# define _O_BINARY 0
#endif

#define XMALLOC(type, num)      ((type *) xmalloc ((num) * sizeof(type)))
#define XFREE(stale) do { \
  if (stale) { free (stale); stale = 0; } \
} while (0)

#if defined LT_DEBUGWRAPPER
static int lt_debug = 1;
#else
static int lt_debug = 0;
#endif

const char *program_name = "libtool-wrapper"; /* in case xstrdup fails */

void *xmalloc (size_t num);
char *xstrdup (const char *string);
const char *base_name (const char *name);
char *find_executable (const char *wrapper);
char *chase_symlinks (const char *pathspec);
int make_executable (const char *path);
int check_executable (const char *path);
char *strendzap (char *str, const char *pat);
void lt_debugprintf (const char *file, int line, const char *fmt, ...);
void lt_fatal (const char *file, int line, const char *message, ...);
static const char *nonnull (const char *s);
static const char *nonempty (const char *s);
void lt_setenv (const char *name, const char *value);
char *lt_extend_str (const char *orig_value, const char *add, int to_end);
void lt_update_exe_path (const char *name, const char *value);
void lt_update_lib_path (const char *name, const char *value);
char **prepare_spawn (char **argv);
void lt_dump_script (FILE *f);
EOF

	    cat <<EOF
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5)
# define externally_visible volatile
#else
# define externally_visible __attribute__((externally_visible)) volatile
#endif
externally_visible const char * MAGIC_EXE = "$magic_exe";
const char * LIB_PATH_VARNAME = "$shlibpath_var";
EOF

	    if test yes = "$shlibpath_overrides_runpath" && test -n "$shlibpath_var" && test -n "$temp_rpath"; then
              func_to_host_path "$temp_rpath"
	      cat <<EOF
const char * LIB_PATH_VALUE   = "$func_to_host_path_result";
EOF
	    else
	      cat <<"EOF"
const char * LIB_PATH_VALUE   = "";
EOF
	    fi

	    if test -n "$dllsearchpath"; then
              func_to_host_path "$dllsearchpath:"
	      cat <<EOF
const char * EXE_PATH_VARNAME = "PATH";
const char * EXE_PATH_VALUE   = "$func_to_host_path_result";
EOF
	    else
	      cat <<"EOF"
const char * EXE_PATH_VARNAME = "";
const char * EXE_PATH_VALUE   = "";
EOF
	    fi

	    if test yes = "$fast_install"; then
	      cat <<EOF
const char * TARGET_PROGRAM_NAME = "lt-$outputname"; /* hopefully, no .exe */
EOF
	    else
	      cat <<EOF
const char * TARGET_PROGRAM_NAME = "$outputname"; /* hopefully, no .exe */
EOF
	    fi


	    cat <<"EOF"

#define LTWRAPPER_OPTION_PREFIX         "--lt-"

static const char *ltwrapper_option_prefix = LTWRAPPER_OPTION_PREFIX;
static const char *dumpscript_opt       = LTWRAPPER_OPTION_PREFIX "dump-script";
static const char *debug_opt            = LTWRAPPER_OPTION_PREFIX "debug";

int
main (int argc, char *argv[])
{
  char **newargz;
  int  newargc;
  char *tmp_pathspec;
  char *actual_cwrapper_path;
  char *actual_cwrapper_name;
  char *target_name;
  char *lt_argv_zero;
  int rval = 127;

  int i;

  program_name = (char *) xstrdup (base_name (argv[0]));
  newargz = XMALLOC (char *, (size_t) argc + 1);

  /* very simple arg parsing; don't want to rely on getopt
   * also, copy all non cwrapper options to newargz, except
   * argz[0], which is handled differently
   */
  newargc=0;
  for (i = 1; i < argc; i++)
    {
      if (STREQ (argv[i], dumpscript_opt))
	{
EOF
	    case $host in
	      *mingw* | *cygwin* )
		# make stdout use "unix" line endings
		echo "          setmode(1,_O_BINARY);"
		;;
	      esac

	    cat <<"EOF"
	  lt_dump_script (stdout);
	  return 0;
	}
      if (STREQ (argv[i], debug_opt))
	{
          lt_debug = 1;
          continue;
	}
      if (STREQ (argv[i], ltwrapper_option_prefix))
        {
          /* however, if there is an option in the LTWRAPPER_OPTION_PREFIX
             namespace, but it is not one of the ones we know about and
             have already dealt with, above (including dump-script), then
             report an error. Otherwise, targets might begin to believe
             they are allowed to use options in the LTWRAPPER_OPTION_PREFIX
             namespace. The first time any user complains about this, we'll
             need to make LTWRAPPER_OPTION_PREFIX a configure-time option
             or a configure.ac-settable value.
           */
          lt_fatal (__FILE__, __LINE__,
		    "unrecognized %s option: '%s'",
                    ltwrapper_option_prefix, argv[i]);
        }
      /* otherwise ... */
      newargz[++newargc] = xstrdup (argv[i]);
    }
  newargz[++newargc] = NULL;

EOF
	    cat <<EOF
  /* The GNU banner must be the first non-error debug message */
  lt_debugprintf (__FILE__, __LINE__, "libtool wrapper (GNU $PACKAGE) $VERSION\n");
EOF
	    cat <<"EOF"
  lt_debugprintf (__FILE__, __LINE__, "(main) argv[0]: %s\n", argv[0]);
  lt_debugprintf (__FILE__, __LINE__, "(main) program_name: %s\n", program_name);

  tmp_pathspec = find_executable (argv[0]);
  if (tmp_pathspec == NULL)
    lt_fatal (__FILE__, __LINE__, "couldn't find %s", argv[0]);
  lt_debugprintf (__FILE__, __LINE__,
                  "(main) found exe (before symlink chase) at: %s\n",
		  tmp_pathspec);

  actual_cwrapper_path = chase_symlinks (tmp_pathspec);
  lt_debugprintf (__FILE__, __LINE__,
                  "(main) found exe (after symlink chase) at: %s\n",
		  actual_cwrapper_path);
  XFREE (tmp_pathspec);

  actual_cwrapper_name = xstrdup (base_name (actual_cwrapper_path));
  strendzap (actual_cwrapper_path, actual_cwrapper_name);

  /* wrapper name transforms */
  strendzap (actual_cwrapper_name, ".exe");
  tmp_pathspec = lt_extend_str (actual_cwrapper_name, ".exe", 1);
  XFREE (actual_cwrapper_name);
  actual_cwrapper_name = tmp_pathspec;
  tmp_pathspec = 0;

  /* target_name transforms -- use actual target program name; might have lt- prefix */
  target_name = xstrdup (base_name (TARGET_PROGRAM_NAME));
  strendzap (target_name, ".exe");
  tmp_pathspec = lt_extend_str (target_name, ".exe", 1);
  XFREE (target_name);
  target_name = tmp_pathspec;
  tmp_pathspec = 0;

  lt_debugprintf (__FILE__, __LINE__,
		  "(main) libtool target name: %s\n",
		  target_name);
EOF

	    cat <<EOF
  newargz[0] =
    XMALLOC (char, (strlen (actual_cwrapper_path) +
		    strlen ("$objdir") + 1 + strlen (actual_cwrapper_name) + 1));
  strcpy (newargz[0], actual_cwrapper_path);
  strcat (newargz[0], "$objdir");
  strcat (newargz[0], "/");
EOF

	    cat <<"EOF"
  /* stop here, and copy so we don't have to do this twice */
  tmp_pathspec = xstrdup (newargz[0]);

  /* do NOT want the lt- prefix here, so use actual_cwrapper_name */
  strcat (newargz[0], actual_cwrapper_name);

  /* DO want the lt- prefix here if it exists, so use target_name */
  lt_argv_zero = lt_extend_str (tmp_pathspec, target_name, 1);
  XFREE (tmp_pathspec);
  tmp_pathspec = NULL;
EOF

	    case $host_os in
	      mingw*)
	    cat <<"EOF"
  {
    char* p;
    while ((p = strchr (newargz[0], '\\')) != NULL)
      {
	*p = '/';
      }
    while ((p = strchr (lt_argv_zero, '\\')) != NULL)
      {
	*p = '/';
      }
  }
EOF
	    ;;
	    esac

	    cat <<"EOF"
  XFREE (target_name);
  XFREE (actual_cwrapper_path);
  XFREE (actual_cwrapper_name);

  lt_setenv ("BIN_SH", "xpg4"); /* for Tru64 */
  lt_setenv ("DUALCASE", "1");  /* for MSK sh */
  /* Update the DLL searchpath.  EXE_PATH_VALUE ($dllsearchpath) must
     be prepended before (that is, appear after) LIB_PATH_VALUE ($temp_rpath)
     because on Windows, both *_VARNAMEs are PATH but uninstalled
     libraries must come first. */
  lt_update_exe_path (EXE_PATH_VARNAME, EXE_PATH_VALUE);
  lt_update_lib_path (LIB_PATH_VARNAME, LIB_PATH_VALUE);

  lt_debugprintf (__FILE__, __LINE__, "(main) lt_argv_zero: %s\n",
		  nonnull (lt_argv_zero));
  for (i = 0; i < newargc; i++)
    {
      lt_debugprintf (__FILE__, __LINE__, "(main) newargz[%d]: %s\n",
		      i, nonnull (newargz[i]));
    }

EOF

	    case $host_os in
	      mingw*)
		cat <<"EOF"
  /* execv doesn't actually work on mingw as expected on unix */
  newargz = prepare_spawn (newargz);
  rval = (int) _spawnv (_P_WAIT, lt_argv_zero, (const char * const *) newargz);
  if (rval == -1)
    {
      /* failed to start process */
      lt_debugprintf (__FILE__, __LINE__,
		      "(main) failed to launch target \"%s\": %s\n",
		      lt_argv_zero, nonnull (strerror (errno)));
      return 127;
    }
  return rval;
EOF
		;;
	      *)
		cat <<"EOF"
  execv (lt_argv_zero, newargz);
  return rval; /* =127, but avoids unused variable warning */
EOF
		;;
	    esac

	    cat <<"EOF"
}

/* Allocate NUM bytes with malloc.  A NULL result from malloc is
   reported through lt_fatal ("memory exhausted") instead of being
   handed back to the caller.  */
void *
xmalloc (size_t num)
{
  void *mem = malloc (num);

  if (mem == NULL)
    lt_fatal (__FILE__, __LINE__, "memory exhausted");
  return mem;
}

/* Return a copy of STRING in storage obtained from xmalloc, or NULL
   when STRING itself is NULL.  */
char *
xstrdup (const char *string)
{
  char *copy;

  if (string == NULL)
    return NULL;

  copy = (char *) xmalloc (strlen (string) + 1);
  return strcpy (copy, string);
}

/* Return a pointer into NAME just past its last directory separator.
   When NAME has no separator the result is NAME itself (after any
   drive prefix on DOS-based file systems).  No storage is allocated.  */
const char *
base_name (const char *name)
{
  const char *last;
  const char *cursor;

#if defined HAVE_DOS_BASED_FILE_SYSTEM
  /* A leading "X:" drive specifier is not part of the base name. */
  if (isalpha ((unsigned char) name[0]) && name[1] == ':')
    name += 2;
#endif

  last = name;
  for (cursor = name; *cursor != '\0'; cursor++)
    {
      if (IS_DIR_SEPARATOR (*cursor))
	last = cursor + 1;
    }
  return last;
}

/* Return 1 when stat succeeds on PATH and the resulting mode has at
   least one of the user/group/other execute bits set; return 0
   otherwise, including for a NULL or empty PATH.  */
int
check_executable (const char *path)
{
  struct stat info;

  lt_debugprintf (__FILE__, __LINE__, "(check_executable): %s\n",
                  nonempty (path));

  if (path == NULL || *path == '\0')
    return 0;
  if (stat (path, &info) < 0)
    return 0;

  return (info.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) ? 1 : 0;
}

/* Add the user, group and other execute bits to the mode of PATH.
   Returns the result of chmod, or 0 when PATH is NULL or empty or
   when stat fails on it (in which case nothing is changed).  */
int
make_executable (const char *path)
{
  struct stat info;

  lt_debugprintf (__FILE__, __LINE__, "(make_executable): %s\n",
                  nonempty (path));

  if (path == NULL || *path == '\0')
    return 0;
  if (stat (path, &info) < 0)
    return 0;

  return chmod (path, info.st_mode | S_IXOTH | S_IXGRP | S_IXUSR);
}

/* Searches for the full path of the wrapper.  Returns
   newly allocated full path name if found, NULL otherwise
   Does not chase symlinks, even on platforms that support them.

   WRAPPER is the name to look up.  Candidates are tried in this order
   and the first one accepted by check_executable is returned:
     1. WRAPPER itself, when it is absolute (leading directory
        separator, or a drive-letter prefix on DOS-based file systems);
     2. each PATH entry joined with WRAPPER, but only when WRAPPER
        contains no '/' (an empty entry stands for the current
        directory);
     3. the current working directory joined with WRAPPER.
   A NULL or empty WRAPPER yields NULL.  A getcwd failure is reported
   through lt_fatal.  The returned string comes from xstrdup/XMALLOC.
*/
char *
find_executable (const char *wrapper)
{
  int has_slash = 0;
  const char *p;
  const char *p_next;
  /* static buffer for getcwd */
  char tmp[LT_PATHMAX + 1];
  size_t tmp_len;
  char *concat_name;

  lt_debugprintf (__FILE__, __LINE__, "(find_executable): %s\n",
                  nonempty (wrapper));

  if ((wrapper == NULL) || (*wrapper == '\0'))
    return NULL;

  /* Absolute path? */
  /* An absolute name that fails check_executable is not an error here;
     control simply falls through to the later attempts. */
#if defined HAVE_DOS_BASED_FILE_SYSTEM
  if (isalpha ((unsigned char) wrapper[0]) && wrapper[1] == ':')
    {
      concat_name = xstrdup (wrapper);
      if (check_executable (concat_name))
	return concat_name;
      XFREE (concat_name);
    }
  else
    {
#endif
      if (IS_DIR_SEPARATOR (wrapper[0]))
	{
	  concat_name = xstrdup (wrapper);
	  if (check_executable (concat_name))
	    return concat_name;
	  XFREE (concat_name);
	}
#if defined HAVE_DOS_BASED_FILE_SYSTEM
    }
#endif

  /* Only a literal '/' counts as a slash here; IS_DIR_SEPARATOR is not
     consulted for this test. */
  for (p = wrapper; *p; p++)
    if (*p == '/')
      {
	has_slash = 1;
	break;
      }
  if (!has_slash)
    {
      /* no slashes; search PATH */
      const char *path = getenv ("PATH");
      if (path != NULL)
	{
	  /* Walk PATH one entry at a time: [p, q) is the current entry
	     and p_next is where the following entry begins. */
	  for (p = path; *p; p = p_next)
	    {
	      const char *q;
	      size_t p_len;
	      for (q = p; *q; q++)
		if (IS_PATH_SEPARATOR (*q))
		  break;
	      p_len = (size_t) (q - p);
	      /* Step over the separator unless the string ended. */
	      p_next = (*q == '\0' ? q : q + 1);
	      if (p_len == 0)
		{
		  /* empty path: current directory */
		  if (getcwd (tmp, LT_PATHMAX) == NULL)
		    lt_fatal (__FILE__, __LINE__, "getcwd failed: %s",
                              nonnull (strerror (errno)));
		  tmp_len = strlen (tmp);
		  /* Room for the directory, '/', WRAPPER and the NUL. */
		  concat_name =
		    XMALLOC (char, tmp_len + 1 + strlen (wrapper) + 1);
		  memcpy (concat_name, tmp, tmp_len);
		  concat_name[tmp_len] = '/';
		  strcpy (concat_name + tmp_len + 1, wrapper);
		}
	      else
		{
		  /* Room for the PATH entry, '/', WRAPPER and the NUL. */
		  concat_name =
		    XMALLOC (char, p_len + 1 + strlen (wrapper) + 1);
		  memcpy (concat_name, p, p_len);
		  concat_name[p_len] = '/';
		  strcpy (concat_name + p_len + 1, wrapper);
		}
	      if (check_executable (concat_name))
		return concat_name;
	      XFREE (concat_name);
	    }
	}
      /* not found in PATH; assume curdir */
    }
  /* Relative path | not found in path: prepend cwd */
  if (getcwd (tmp, LT_PATHMAX) == NULL)
    lt_fatal (__FILE__, __LINE__, "getcwd failed: %s",
              nonnull (strerror (errno)));
  tmp_len = strlen (tmp);
  concat_name = XMALLOC (char, tmp_len + 1 + strlen (wrapper) + 1);
  memcpy (concat_name, tmp, tmp_len);
  concat_name[tmp_len] = '/';
  strcpy (concat_name + tmp_len + 1, wrapper);

  if (check_executable (concat_name))
    return concat_name;
  XFREE (concat_name);
  return NULL;
}

/* Return a newly allocated (xstrdup) copy of PATHSPEC with symbolic
   links resolved.

   When S_ISLNK is not defined the path is simply duplicated.
   Otherwise PATHSPEC and then each of its leading directory prefixes
   are examined with lstat, longest first.  If any of them is a symlink
   the whole of PATHSPEC is canonicalized with realpath; if none is,
   PATHSPEC is duplicated unchanged.  An lstat or realpath failure is
   reported through lt_fatal.  */
char *
chase_symlinks (const char *pathspec)
{
#ifndef S_ISLNK
  return xstrdup (pathspec);
#else
  /* Output buffer handed to realpath below. */
  char buf[LT_PATHMAX];
  struct stat s;
  /* Scratch copy that is shortened by one component per iteration. */
  char *tmp_pathspec = xstrdup (pathspec);
  char *p;
  int has_symlinks = 0;
  while (strlen (tmp_pathspec) && !has_symlinks)
    {
      lt_debugprintf (__FILE__, __LINE__,
		      "checking path component for symlinks: %s\n",
		      tmp_pathspec);
      if (lstat (tmp_pathspec, &s) == 0)
	{
	  if (S_ISLNK (s.st_mode) != 0)
	    {
	      has_symlinks = 1;
	      break;
	    }

	  /* search backwards for last DIR_SEPARATOR */
	  p = tmp_pathspec + strlen (tmp_pathspec) - 1;
	  while ((p > tmp_pathspec) && (!IS_DIR_SEPARATOR (*p)))
	    p--;
	  if ((p == tmp_pathspec) && (!IS_DIR_SEPARATOR (*p)))
	    {
	      /* no more DIR_SEPARATORS left */
	      break;
	    }
	  /* Cut the path at that separator so the next pass checks the
	     parent; cutting at index 0 leaves an empty string, which
	     ends the loop. */
	  *p = '\0';
	}
      else
	{
	  lt_fatal (__FILE__, __LINE__,
		    "error accessing file \"%s\": %s",
		    tmp_pathspec, nonnull (strerror (errno)));
	}
    }
  XFREE (tmp_pathspec);

  if (!has_symlinks)
    {
      return xstrdup (pathspec);
    }

  /* realpath writes into buf; the result is copied to the heap so the
     caller never sees a pointer into this stack frame. */
  tmp_pathspec = realpath (pathspec, buf);
  if (tmp_pathspec == 0)
    {
      lt_fatal (__FILE__, __LINE__,
		"could not follow symlinks for %s", pathspec);
    }
  return xstrdup (tmp_pathspec);
#endif
}

/* If STR ends with PAT, truncate STR in place so that the suffix is
   removed.  When PAT is longer than STR the return value is STR;
   otherwise it is the position inside STR where a trailing PAT would
   begin, whether or not anything was removed.  Both arguments must be
   non-NULL (asserted).  */
char *
strendzap (char *str, const char *pat)
{
  size_t str_len, pat_len;
  char *tail;

  assert (str != NULL);
  assert (pat != NULL);

  str_len = strlen (str);
  pat_len = strlen (pat);
  if (pat_len > str_len)
    return str;

  tail = str + (str_len - pat_len);
  if (STREQ (tail, pat))
    *tail = '\0';
  return tail;
}

/* printf-style debug output.  When lt_debug is nonzero, write
   "PROGRAM:FILE:LINE: " followed by the formatted message to stderr;
   when it is zero, do nothing.  */
void
lt_debugprintf (const char *file, int line, const char *fmt, ...)
{
  va_list ap;

  if (!lt_debug)
    return;

  (void) fprintf (stderr, "%s:%s:%d: ", program_name, file, line);
  va_start (ap, fmt);
  (void) vfprintf (stderr, fmt, ap);
  va_end (ap);
}

/* Write "PROGRAM:FILE:LINE: MODE: <formatted MESSAGE>." and a newline
   to stderr.  A non-negative EXIT_STATUS then terminates the process
   with that status; a negative one returns to the caller.  */
static void
lt_error_core (int exit_status, const char *file,
	       int line, const char *mode,
	       const char *message, va_list ap)
{
  fprintf (stderr, "%s:%s:%d: %s: ", program_name, file, line, mode);
  vfprintf (stderr, message, ap);
  fputs (".\n", stderr);

  if (exit_status < 0)
    return;
  exit (exit_status);
}

/* Report a printf-style MESSAGE tagged "FATAL" through lt_error_core,
   passing EXIT_FAILURE as the exit status.  */
void
lt_fatal (const char *file, int line, const char *message, ...)
{
  va_list args;

  va_start (args, message);
  lt_error_core (EXIT_FAILURE, file, line, "FATAL", message, args);
  va_end (args);
}

/* Return S, or the literal "(null)" when S is NULL, so that the result
   is always safe to print with %s.  */
static const char *
nonnull (const char *s)
{
  if (s == NULL)
    return "(null)";
  return s;
}

/* Like nonnull, but additionally map the empty string to the literal
   "(empty)".  */
static const char *
nonempty (const char *s)
{
  if (s != NULL && *s == '\0')
    return "(empty)";
  return nonnull (s);
}

/* Set environment variable NAME to VALUE, replacing any existing
   setting.  Callers are expected to pass non-NULL strings.  A failure
   of the underlying setenv/putenv call is not reported.  */
void
lt_setenv (const char *name, const char *value)
{
  lt_debugprintf (__FILE__, __LINE__,
		  "(lt_setenv) setting '%s' to '%s'\n",
                  nonnull (name), nonnull (value));
  {
#ifdef HAVE_SETENV
    /* setenv stores its own copy of both strings, so VALUE is passed
       straight through; duplicating it first only leaked the
       duplicate on every call. */
    (void) setenv (name, value, 1);
#else
    /* putenv keeps the pointer it is given, so the NAME=VALUE buffer
       has to stay allocated after a successful call; it is released
       only when putenv rejects it. */
    size_t len = strlen (name) + 1 + strlen (value) + 1;
    char *str = XMALLOC (char, len);
    sprintf (str, "%s=%s", name, value);
    if (putenv (str) != EXIT_SUCCESS)
      {
        XFREE (str);
      }
#endif
  }
}

/* Return a newly allocated string that joins ORIG_VALUE and ADD.
   ADD goes after ORIG_VALUE when TO_END is nonzero and before it
   otherwise.  A NULL or empty ORIG_VALUE yields a plain copy of ADD.  */
char *
lt_extend_str (const char *orig_value, const char *add, int to_end)
{
  size_t orig_len, add_len;
  char *joined;

  if (orig_value == NULL || *orig_value == '\0')
    return xstrdup (add);

  orig_len = strlen (orig_value);
  add_len = strlen (add);
  joined = XMALLOC (char, add_len + orig_len + 1);

  if (to_end)
    {
      memcpy (joined, orig_value, orig_len);
      /* The + 1 carries the terminating NUL of the second piece. */
      memcpy (joined + orig_len, add, add_len + 1);
    }
  else
    {
      memcpy (joined, add, add_len);
      memcpy (joined + add_len, orig_value, orig_len + 1);
    }
  return joined;
}

/* Prepend VALUE to the current contents of environment variable NAME,
   strip every trailing path separator from the result, and store it
   back with lt_setenv.  Nothing is changed when NAME or VALUE is NULL
   or empty.  */
void
lt_update_exe_path (const char *name, const char *value)
{
  char *new_value;
  size_t end;

  lt_debugprintf (__FILE__, __LINE__,
		  "(lt_update_exe_path) modifying '%s' by prepending '%s'\n",
                  nonnull (name), nonnull (value));

  if (!(name && *name && value && *value))
    return;

  new_value = lt_extend_str (getenv (name), value, 0);

  /* some systems cannot cope with a ':'-terminated path */
  for (end = strlen (new_value);
       end > 0 && IS_PATH_SEPARATOR (new_value[end - 1]);
       end--)
    new_value[end - 1] = '\0';

  lt_setenv (name, new_value);
  XFREE (new_value);
}

/* Prepend VALUE to the current contents of environment variable NAME
   and store the result back with lt_setenv.  Nothing is changed when
   NAME or VALUE is NULL or empty.  */
void
lt_update_lib_path (const char *name, const char *value)
{
  char *new_value;

  lt_debugprintf (__FILE__, __LINE__,
		  "(lt_update_lib_path) modifying '%s' by prepending '%s'\n",
                  nonnull (name), nonnull (value));

  if (!(name && *name && value && *value))
    return;

  new_value = lt_extend_str (getenv (name), value, 0);
  lt_setenv (name, new_value);
  XFREE (new_value);
}

EOF
	    case $host_os in
	      mingw*)
		cat <<"EOF"

/* Prepares an argument vector before calling spawn().
   Note that spawn() does not by itself call the command interpreter
     (getenv ("COMSPEC") != NULL ? getenv ("COMSPEC") :
      ({ OSVERSIONINFO v; v.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
         GetVersionEx(&v);
         v.dwPlatformId == VER_PLATFORM_WIN32_NT;
      }) ? "cmd.exe" : "command.com").
   Instead it simply concatenates the arguments, separated by ' ', and calls
   CreateProcess().  We must quote the arguments since Win32 CreateProcess()
   interprets characters like ' ', '\t', '\\', '"' (but not '<' and '>') in a
   special way:
   - Space and tab are interpreted as delimiters. They are not treated as
     delimiters if they are surrounded by double quotes: "...".
   - Unescaped double quotes are removed from the input. Their only effect is
     that within double quotes, space and tab are treated like normal
     characters.
   - Backslashes not followed by double quotes are not special.
   - But 2*n+1 backslashes followed by a double quote become
     n backslashes followed by a double quote (n >= 0):
       \" -> "
       \\\" -> \"
       \\\\\" -> \\"
 */
#define SHELL_SPECIAL_CHARS "\"\\ \001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037"
#define SHELL_SPACE_CHARS " \001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037"
/* Build and return a new NULL-terminated argument vector in which each
   element of ARGV has been quoted where necessary.

   An empty argument becomes the two-character string "".  An argument
   containing any character from SHELL_SPECIAL_CHARS is rewritten into
   a fresh buffer: every '"' receives a backslash plus one more
   backslash per backslash immediately before it, and if the argument
   contains a character from SHELL_SPACE_CHARS the whole string is
   wrapped in '"' with its trailing backslashes doubled.  Any other
   argument is not copied; the new vector points at the same string as
   ARGV.  ARGV itself is neither modified nor freed.  */
char **
prepare_spawn (char **argv)
{
  size_t argc;
  char **new_argv;
  size_t i;

  /* Count number of arguments.  */
  for (argc = 0; argv[argc] != NULL; argc++)
    ;

  /* Allocate new argument vector.  */
  new_argv = XMALLOC (char *, argc + 1);

  /* Put quoted arguments into the new argument vector.  */
  for (i = 0; i < argc; i++)
    {
      const char *string = argv[i];

      if (string[0] == '\0')
	new_argv[i] = xstrdup ("\"\"");
      else if (strpbrk (string, SHELL_SPECIAL_CHARS) != NULL)
	{
	  int quote_around = (strpbrk (string, SHELL_SPACE_CHARS) != NULL);
	  size_t length;
	  unsigned int backslashes;
	  const char *s;
	  char *quoted_string;
	  char *p;

	  /* First pass: compute the exact size of the quoted form.
	     `backslashes' is the length of the run of backslashes that
	     ends just before the current character.  */
	  length = 0;
	  backslashes = 0;
	  if (quote_around)
	    length++;
	  for (s = string; *s != '\0'; s++)
	    {
	      char c = *s;
	      if (c == '"')
		length += backslashes + 1;
	      length++;
	      if (c == '\\')
		backslashes++;
	      else
		backslashes = 0;
	    }
	  if (quote_around)
	    length += backslashes + 1;

	  quoted_string = XMALLOC (char, length + 1);

	  /* Second pass: emit the characters, mirroring the counting
	     done above.  */
	  p = quoted_string;
	  backslashes = 0;
	  if (quote_around)
	    *p++ = '"';
	  for (s = string; *s != '\0'; s++)
	    {
	      char c = *s;
	      if (c == '"')
		{
		  unsigned int j;
		  for (j = backslashes + 1; j > 0; j--)
		    *p++ = '\\';
		}
	      *p++ = c;
	      if (c == '\\')
		backslashes++;
	      else
		backslashes = 0;
	    }
	  if (quote_around)
	    {
	      /* Double the trailing backslash run so that it does not
		 escape the closing quote.  */
	      unsigned int j;
	      for (j = backslashes; j > 0; j--)
		*p++ = '\\';
	      *p++ = '"';
	    }
	  *p = '\0';

	  new_argv[i] = quoted_string;
	}
      else
	/* Nothing to quote: share the caller's string.  */
	new_argv[i] = (char *) string;
    }
  new_argv[argc] = NULL;

  return new_argv;
}
EOF
		;;
	    esac

            cat <<"EOF"
void lt_dump_script (FILE* f)
{
EOF
	    func_emit_wrapper yes |
	      $SED -n -e '
s/^\(.\{79\}\)\(..*\)/\1\
\2/
h
s/\([\\"]\)/\\\1/g
s/$/\\n/
s/\([^\n]*\).*/  fputs ("\1", f);/p
g
D'
            cat <<"EOF"
}
EOF
}
# end: func_emit_cwrapperexe_src

# func_win32_import_lib_p ARG
# Return success when the first ten lines that $file_magic_cmd prints
# for ARG contain the text "import", and failure otherwise.
func_win32_import_lib_p ()
{
    $debug_cmd

    case `eval $file_magic_cmd \"\$1\" 2>/dev/null | $SED -e 10q` in
    *import*) return 0 ;;
    esac
    return 1
}

# func_suncc_cstd_abi
# !!ONLY CALL THIS FOR SUN CC AFTER $compile_command IS FULLY EXPANDED!!
# Set suncc_use_cstd_abi to 'no' when $compile_command carries a flag
# that selects an ABI incompatible with the Cstd library, and to 'yes'
# in every other case.
func_suncc_cstd_abi ()
{
    $debug_cmd

    # Assume Cstd is usable unless one of the flags below is present.
    suncc_use_cstd_abi=yes
    case " $compile_command " in
    *" -compat=g "*|*\ -std=c++[0-9][0-9]\ *|*" -library=stdcxx4 "*|*" -library=stlport4 "*)
      suncc_use_cstd_abi=no
      ;;
    esac
}

# func_mode_link arg...
func_mode_link ()
{
    $debug_cmd

    case $host in
    *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-cegcc*)
      # It is impossible to link a dll without this setting, and
      # we shouldn't force the makefile maintainer to figure out
      # what system we are compiling for in order to pass an extra
      # flag for every libtool invocation.
      # allow_undefined=no

      # FIXME: Unfortunately, there are problems with the above when trying
      # to make a dll that has undefined symbols, in which case not
      # even a static library is built.  For now, we need to specify
      # -no-undefined on the libtool link line when we can be certain
      # that all symbols are satisfied, otherwise we get a static library.
      allow_undefined=yes
      ;;
    *)
      allow_undefined=yes
      ;;
    esac
    libtool_args=$nonopt
    base_compile="$nonopt $@"
    compile_command=$nonopt
    finalize_command=$nonopt

    compile_rpath=
    finalize_rpath=
    compile_shlibpath=
    finalize_shlibpath=
    convenience=
    old_convenience=
    deplibs=
    old_deplibs=
    compiler_flags=
    linker_flags=
    dllsearchpath=
    lib_search_path=`pwd`
    inst_prefix_dir=
    new_inherited_linker_flags=

    avoid_version=no
    bindir=
    dlfiles=
    dlprefiles=
    dlself=no
    export_dynamic=no
    export_symbols=
    export_symbols_regex=
    generated=
    libobjs=
    ltlibs=
    module=no
    no_install=no
    objs=
    os2dllname=
    non_pic_objects=
    precious_files_regex=
    prefer_static_libs=no
    preload=false
    prev=
    prevarg=
    release=
    rpath=
    xrpath=
    perm_rpath=
    temp_rpath=
    thread_safe=no
    vinfo=
    vinfo_number=no
    weak_libs=
    single_module=$wl-single_module
    func_infer_tag $base_compile

    # We need to know -static, to get the right output filenames.
    for arg
    do
      case $arg in
      -shared)
	test yes != "$build_libtool_libs" \
	  && func_fatal_configuration "cannot build a shared library"
	build_old_libs=no
	break
	;;
      -all-static | -static | -static-libtool-libs)
	case $arg in
	-all-static)
	  if test yes = "$build_libtool_libs" && test -z "$link_static_flag"; then
	    func_warning "complete static linking is impossible in this configuration"
	  fi
	  if test -n "$link_static_flag"; then
	    dlopen_self=$dlopen_self_static
	  fi
	  prefer_static_libs=yes
	  ;;
	-static)
	  if test -z "$pic_flag" && test -n "$link_static_flag"; then
	    dlopen_self=$dlopen_self_static
	  fi
	  prefer_static_libs=built
	  ;;
	-static-libtool-libs)
	  if test -z "$pic_flag" && test -n "$link_static_flag"; then
	    dlopen_self=$dlopen_self_static
	  fi
	  prefer_static_libs=yes
	  ;;
	esac
	build_libtool_libs=no
	build_old_libs=yes
	break
	;;
      esac
    done

    # See if our shared archives depend on static archives.
    test -n "$old_archive_from_new_cmds" && build_old_libs=yes

    # Go through the arguments, transforming them on the way.
    while test "$#" -gt 0; do
      arg=$1
      shift
      func_quote_for_eval "$arg"
      qarg=$func_quote_for_eval_unquoted_result
      func_append libtool_args " $func_quote_for_eval_result"

      # If the previous option needs an argument, assign it.
      if test -n "$prev"; then
	case $prev in
	output)
	  func_append compile_command " @OUTPUT@"
	  func_append finalize_command " @OUTPUT@"
	  ;;
	esac

	case $prev in
	bindir)
	  bindir=$arg
	  prev=
	  continue
	  ;;
	dlfiles|dlprefiles)
	  $preload || {
	    # Add the symbol object into the linking commands.
	    func_append compile_command " @SYMFILE@"
	    func_append finalize_command " @SYMFILE@"
	    preload=:
	  }
	  case $arg in
	  *.la | *.lo) ;;  # We handle these cases below.
	  force)
	    if test no = "$dlself"; then
	      dlself=needless
	      export_dynamic=yes
	    fi
	    prev=
	    continue
	    ;;
	  self)
	    if test dlprefiles = "$prev"; then
	      dlself=yes
	    elif test dlfiles = "$prev" && test yes != "$dlopen_self"; then
	      dlself=yes
	    else
	      dlself=needless
	      export_dynamic=yes
	    fi
	    prev=
	    continue
	    ;;
	  *)
	    if test dlfiles = "$prev"; then
	      func_append dlfiles " $arg"
	    else
	      func_append dlprefiles " $arg"
	    fi
	    prev=
	    continue
	    ;;
	  esac
	  ;;
	expsyms)
	  export_symbols=$arg
	  test -f "$arg" \
	    || func_fatal_error "symbol file '$arg' does not exist"
	  prev=
	  continue
	  ;;
	expsyms_regex)
	  export_symbols_regex=$arg
	  prev=
	  continue
	  ;;
	framework)
	  case $host in
	    *-*-darwin*)
	      case "$deplibs " in
		*" $qarg.ltframework "*) ;;
		*) func_append deplibs " $qarg.ltframework" # this is fixed later
		   ;;
	      esac
	      ;;
	  esac
	  prev=
	  continue
	  ;;
	inst_prefix)
	  inst_prefix_dir=$arg
	  prev=
	  continue
	  ;;
	mllvm)
	  # Clang does not use LLVM to link, so we can simply discard any
	  # '-mllvm $arg' options when doing the link step.
	  prev=
	  continue
	  ;;
	objectlist)
	  if test -f "$arg"; then
	    save_arg=$arg
	    moreargs=
	    for fil in `cat "$save_arg"`
	    do
#	      func_append moreargs " $fil"
	      arg=$fil
	      # A libtool-controlled object.

	      # Check to see that this really is a libtool object.
	      if func_lalib_unsafe_p "$arg"; then
		pic_object=
		non_pic_object=

		# Read the .lo file
		func_source "$arg"

		if test -z "$pic_object" ||
		   test -z "$non_pic_object" ||
		   test none = "$pic_object" &&
		   test none = "$non_pic_object"; then
		  func_fatal_error "cannot find name of object for '$arg'"
		fi

		# Extract subdirectory from the argument.
		func_dirname "$arg" "/" ""
		xdir=$func_dirname_result

		if test none != "$pic_object"; then
		  # Prepend the subdirectory the object is found in.
		  pic_object=$xdir$pic_object

		  if test dlfiles = "$prev"; then
		    if test yes = "$build_libtool_libs" && test yes = "$dlopen_support"; then
		      func_append dlfiles " $pic_object"
		      prev=
		      continue
		    else
		      # If libtool objects are unsupported, then we need to preload.
		      prev=dlprefiles
		    fi
		  fi

		  # CHECK ME:  I think I busted this.  -Ossama
		  if test dlprefiles = "$prev"; then
		    # Preload the old-style object.
		    func_append dlprefiles " $pic_object"
		    prev=
		  fi

		  # A PIC object.
		  func_append libobjs " $pic_object"
		  arg=$pic_object
		fi

		# Non-PIC object.
		if test none != "$non_pic_object"; then
		  # Prepend the subdirectory the object is found in.
		  non_pic_object=$xdir$non_pic_object

		  # A standard non-PIC object
		  func_append non_pic_objects " $non_pic_object"
		  if test -z "$pic_object" || test none = "$pic_object"; then
		    arg=$non_pic_object
		  fi
		else
		  # If the PIC object exists, use it instead.
		  # $xdir was prepended to $pic_object above.
		  non_pic_object=$pic_object
		  func_append non_pic_objects " $non_pic_object"
		fi
	      else
		# Only an error if not doing a dry-run.
		if $opt_dry_run; then
		  # Extract subdirectory from the argument.
		  func_dirname "$arg" "/" ""
		  xdir=$func_dirname_result

		  func_lo2o "$arg"
		  pic_object=$xdir$objdir/$func_lo2o_result
		  non_pic_object=$xdir$func_lo2o_result
		  func_append libobjs " $pic_object"
		  func_append non_pic_objects " $non_pic_object"
	        else
		  func_fatal_error "'$arg' is not a valid libtool object"
		fi
	      fi
	    done
	  else
	    func_fatal_error "link input file '$arg' does not exist"
	  fi
	  arg=$save_arg
	  prev=
	  continue
	  ;;
	os2dllname)
	  os2dllname=$arg
	  prev=
	  continue
	  ;;
	precious_regex)
	  precious_files_regex=$arg
	  prev=
	  continue
	  ;;
	release)
	  release=-$arg
	  prev=
	  continue
	  ;;
	rpath | xrpath)
	  # We need an absolute path.
	  case $arg in
	  [\\/]* | [A-Za-z]:[\\/]*) ;;
	  *)
	    func_fatal_error "only absolute run-paths are allowed"
	    ;;
	  esac
	  if test rpath = "$prev"; then
	    case "$rpath " in
	    *" $arg "*) ;;
	    *) func_append rpath " $arg" ;;
	    esac
	  else
	    case "$xrpath " in
	    *" $arg "*) ;;
	    *) func_append xrpath " $arg" ;;
	    esac
	  fi
	  prev=
	  continue
	  ;;
	shrext)
	  shrext_cmds=$arg
	  prev=
	  continue
	  ;;
	weak)
	  func_append weak_libs " $arg"
	  prev=
	  continue
	  ;;
	xcclinker)
	  func_append linker_flags " $qarg"
	  func_append compiler_flags " $qarg"
	  prev=
	  func_append compile_command " $qarg"
	  func_append finalize_command " $qarg"
	  continue
	  ;;
	xcompiler)
	  func_append compiler_flags " $qarg"
	  prev=
	  func_append compile_command " $qarg"
	  func_append finalize_command " $qarg"
	  continue
	  ;;
	xlinker)
	  func_append linker_flags " $qarg"
	  func_append compiler_flags " $wl$qarg"
	  prev=
	  func_append compile_command " $wl$qarg"
	  func_append finalize_command " $wl$qarg"
	  continue
	  ;;
	*)
	  eval "$prev=\"\$arg\""
	  prev=
	  continue
	  ;;
	esac
      fi # test -n "$prev"

      prevarg=$arg

      case $arg in
      -all-static)
	if test -n "$link_static_flag"; then
	  # See comment for -static flag below, for more details.
	  func_append compile_command " $link_static_flag"
	  func_append finalize_command " $link_static_flag"
	fi
	continue
	;;

      -allow-undefined)
	# FIXME: remove this flag sometime in the future.
	func_fatal_error "'-allow-undefined' must not be used because it is the default"
	;;

      -avoid-version)
	avoid_version=yes
	continue
	;;

      -bindir)
	prev=bindir
	continue
	;;

      -dlopen)
	prev=dlfiles
	continue
	;;

      -dlpreopen)
	prev=dlprefiles
	continue
	;;

      -export-dynamic)
	export_dynamic=yes
	continue
	;;

      -export-symbols | -export-symbols-regex)
	if test -n "$export_symbols" || test -n "$export_symbols_regex"; then
	  func_fatal_error "more than one -exported-symbols argument is not allowed"
	fi
	if test X-export-symbols = "X$arg"; then
	  prev=expsyms
	else
	  prev=expsyms_regex
	fi
	continue
	;;

      -framework)
	prev=framework
	continue
	;;

      -inst-prefix-dir)
	prev=inst_prefix
	continue
	;;

      # The native IRIX linker understands -LANG:*, -LIST:* and -LNO:*
      # so, if we see these flags be careful not to treat them like -L
      -L[A-Z][A-Z]*:*)
	case $with_gcc/$host in
	no/*-*-irix* | /*-*-irix*)
	  func_append compile_command " $arg"
	  func_append finalize_command " $arg"
	  ;;
	esac
	continue
	;;

      -L*)
	func_stripname "-L" '' "$arg"
	if test -z "$func_stripname_result"; then
	  if test "$#" -gt 0; then
	    func_fatal_error "require no space between '-L' and '$1'"
	  else
	    func_fatal_error "need path for '-L' option"
	  fi
	fi
	func_resolve_sysroot "$func_stripname_result"
	dir=$func_resolve_sysroot_result
	# We need an absolute path.
	case $dir in
	[\\/]* | [A-Za-z]:[\\/]*) ;;
	*)
	  absdir=`cd "$dir" && pwd`
	  test -z "$absdir" && \
	    func_fatal_error "cannot determine absolute directory name of '$dir'"
	  dir=$absdir
	  ;;
	esac
	case "$deplibs " in
	*" -L$dir "* | *" $arg "*)
	  # Will only happen for absolute or sysroot arguments
	  ;;
	*)
	  # Preserve sysroot, but never include relative directories
	  case $dir in
	    [\\/]* | [A-Za-z]:[\\/]* | =*) func_append deplibs " $arg" ;;
	    *) func_append deplibs " -L$dir" ;;
	  esac
	  func_append lib_search_path " $dir"
	  ;;
	esac
	case $host in
	*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-cegcc*)
	  testbindir=`$ECHO "$dir" | $SED 's*/lib$*/bin*'`
	  case :$dllsearchpath: in
	  *":$dir:"*) ;;
	  ::) dllsearchpath=$dir;;
	  *) func_append dllsearchpath ":$dir";;
	  esac
	  case :$dllsearchpath: in
	  *":$testbindir:"*) ;;
	  ::) dllsearchpath=$testbindir;;
	  *) func_append dllsearchpath ":$testbindir";;
	  esac
	  ;;
	esac
	continue
	;;

      -l*)
	if test X-lc = "X$arg" || test X-lm = "X$arg"; then
	  case $host in
	  *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-beos* | *-cegcc* | *-*-haiku*)
	    # These systems don't actually have a C or math library (as such)
	    continue
	    ;;
	  *-*-os2*)
	    # These systems don't actually have a C library (as such)
	    test X-lc = "X$arg" && continue
	    ;;
	  *-*-openbsd* | *-*-freebsd* | *-*-dragonfly* | *-*-bitrig*)
	    # Do not include libc due to us having libc/libc_r.
	    test X-lc = "X$arg" && continue
	    ;;
	  *-*-rhapsody* | *-*-darwin1.[012])
	    # Rhapsody C and math libraries are in the System framework
	    func_append deplibs " System.ltframework"
	    continue
	    ;;
	  *-*-sco3.2v5* | *-*-sco5v6*)
	    # Causes problems with __ctype
	    test X-lc = "X$arg" && continue
	    ;;
	  *-*-sysv4.2uw2* | *-*-sysv5* | *-*-unixware* | *-*-OpenUNIX*)
	    # Compiler inserts libc in the correct place for threads to work
	    test X-lc = "X$arg" && continue
	    ;;
	  esac
	elif test X-lc_r = "X$arg"; then
	 case $host in
	 *-*-openbsd* | *-*-freebsd* | *-*-dragonfly* | *-*-bitrig*)
	   # Do not include libc_r directly, use -pthread flag.
	   continue
	   ;;
	 esac
	fi
	func_append deplibs " $arg"
	continue
	;;

      -mllvm)
	prev=mllvm
	continue
	;;

      -module)
	module=yes
	continue
	;;

      # Tru64 UNIX uses -model [arg] to determine the layout of C++
      # classes, name mangling, and exception handling.
      # Darwin uses the -arch flag to determine output architecture.
      -model|-arch|-isysroot|--sysroot)
	func_append compiler_flags " $arg"
	func_append compile_command " $arg"
	func_append finalize_command " $arg"
	prev=xcompiler
	continue
	;;

      -mt|-mthreads|-kthread|-Kthread|-pthread|-pthreads|--thread-safe \
      |-threads|-fopenmp|-openmp|-mp|-xopenmp|-omp|-qsmp=*)
	func_append compiler_flags " $arg"
	func_append compile_command " $arg"
	func_append finalize_command " $arg"
	case "$new_inherited_linker_flags " in
	    *" $arg "*) ;;
	    * ) func_append new_inherited_linker_flags " $arg" ;;
	esac
	continue
	;;

      -multi_module)
	single_module=$wl-multi_module
	continue
	;;

      -no-fast-install)
	fast_install=no
	continue
	;;

      -no-install)
	case $host in
	*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-*-darwin* | *-cegcc*)
	  # The PATH hackery in wrapper scripts is required on Windows
	  # and Darwin in order for the loader to find any dlls it needs.
	  func_warning "'-no-install' is ignored for $host"
	  func_warning "assuming '-no-fast-install' instead"
	  fast_install=no
	  ;;
	*) no_install=yes ;;
	esac
	continue
	;;

      -no-undefined)
	allow_undefined=no
	continue
	;;

      -objectlist)
	prev=objectlist
	continue
	;;

      -os2dllname)
	prev=os2dllname
	continue
	;;

      -o) prev=output ;;

      -precious-files-regex)
	prev=precious_regex
	continue
	;;

      -release)
	prev=release
	continue
	;;

      -rpath)
	prev=rpath
	continue
	;;

      -R)
	prev=xrpath
	continue
	;;

      -R*)
	func_stripname '-R' '' "$arg"
	dir=$func_stripname_result
	# We need an absolute path.
	case $dir in
	[\\/]* | [A-Za-z]:[\\/]*) ;;
	=*)
	  func_stripname '=' '' "$dir"
	  dir=$lt_sysroot$func_stripname_result
	  ;;
	*)
	  func_fatal_error "only absolute run-paths are allowed"
	  ;;
	esac
	case "$xrpath " in
	*" $dir "*) ;;
	*) func_append xrpath " $dir" ;;
	esac
	continue
	;;

      -shared)
	# The effects of -shared are defined in a previous loop.
	continue
	;;

      -shrext)
	prev=shrext
	continue
	;;

      -static | -static-libtool-libs)
	# The effects of -static are defined in a previous loop.
	# We used to do the same as -all-static on platforms that
	# didn't have a PIC flag, but the assumption that the effects
	# would be equivalent was wrong.  It would break on at least
	# Digital Unix and AIX.
	continue
	;;

      -thread-safe)
	thread_safe=yes
	continue
	;;

      -version-info)
	prev=vinfo
	continue
	;;

      -version-number)
	prev=vinfo
	vinfo_number=yes
	continue
	;;

      -weak)
        prev=weak
	continue
	;;

      -Wc,*)
	func_stripname '-Wc,' '' "$arg"
	args=$func_stripname_result
	arg=
	save_ifs=$IFS; IFS=,
	for flag in $args; do
	  IFS=$save_ifs
          func_quote_for_eval "$flag"
	  func_append arg " $func_quote_for_eval_result"
	  func_append compiler_flags " $func_quote_for_eval_result"
	done
	IFS=$save_ifs
	func_stripname ' ' '' "$arg"
	arg=$func_stripname_result
	;;

      -Wl,*)
	func_stripname '-Wl,' '' "$arg"
	args=$func_stripname_result
	arg=
	save_ifs=$IFS; IFS=,
	for flag in $args; do
	  IFS=$save_ifs
          func_quote_for_eval "$flag"
	  func_append arg " $wl$func_quote_for_eval_result"
	  func_append compiler_flags " $wl$func_quote_for_eval_result"
	  func_append linker_flags " $func_quote_for_eval_result"
	done
	IFS=$save_ifs
	func_stripname ' ' '' "$arg"
	arg=$func_stripname_result
	;;

      -Xcompiler)
	prev=xcompiler
	continue
	;;

      -Xlinker)
	prev=xlinker
	continue
	;;

      -XCClinker)
	prev=xcclinker
	continue
	;;

      # -msg_* for osf cc
      -msg_*)
	func_quote_for_eval "$arg"
	arg=$func_quote_for_eval_result
	;;

      # Flags to be passed through unchanged, with rationale:
      # -64, -mips[0-9]      enable 64-bit mode for the SGI compiler
      # -r[0-9][0-9]*        specify processor for the SGI compiler
      # -xarch=*, -xtarget=* enable 64-bit mode for the Sun compiler
      # +DA*, +DD*           enable 64-bit mode for the HP compiler
      # -q*                  compiler args for the IBM compiler
      # -m*, -t[45]*, -txscale* architecture-specific flags for GCC
      # -F/path              path to uninstalled frameworks, gcc on darwin
      # -p, -pg, --coverage, -fprofile-*  profiling flags for GCC
      # -fstack-protector*   stack protector flags for GCC
      # @file                GCC response files
      # -tp=*                Portland pgcc target processor selection
      # --sysroot=*          for sysroot support
      # -O*, -g*, -flto*, -fwhopr*, -fuse-linker-plugin GCC link-time optimization
      # -specs=*             GCC specs files
      # -stdlib=*            select c++ std lib with clang
      # -fsanitize=*         Clang/GCC memory and address sanitizer
      -64|-mips[0-9]|-r[0-9][0-9]*|-xarch=*|-xtarget=*|+DA*|+DD*|-q*|-m*| \
      -t[45]*|-txscale*|-p|-pg|--coverage|-fprofile-*|-F*|@*|-tp=*|--sysroot=*| \
      -O*|-g*|-flto*|-fwhopr*|-fuse-linker-plugin|-fstack-protector*|-stdlib=*| \
      -specs=*|-fsanitize=*)
        func_quote_for_eval "$arg"
	arg=$func_quote_for_eval_result
        func_append compile_command " $arg"
        func_append finalize_command " $arg"
        func_append compiler_flags " $arg"
        continue
        ;;

      -Z*)
        if test os2 = "`expr $host : '.*\(os2\)'`"; then
          # OS/2 uses -Zxxx to specify OS/2-specific options
	  compiler_flags="$compiler_flags $arg"
	  func_append compile_command " $arg"
	  func_append finalize_command " $arg"
	  case $arg in
	  -Zlinker | -Zstack)
	    prev=xcompiler
	    ;;
	  esac
	  continue
        else
	  # Otherwise treat like 'Some other compiler flag' below
	  func_quote_for_eval "$arg"
	  arg=$func_quote_for_eval_result
        fi
	;;

      # Some other compiler flag.
      -* | +*)
        func_quote_for_eval "$arg"
	arg=$func_quote_for_eval_result
	;;

      *.$objext)
	# A standard object.
	func_append objs " $arg"
	;;

      *.lo)
	# A libtool-controlled object.

	# Check to see that this really is a libtool object.
	if func_lalib_unsafe_p "$arg"; then
	  pic_object=
	  non_pic_object=

	  # Read the .lo file
	  func_source "$arg"

	  if test -z "$pic_object" ||
	     test -z "$non_pic_object" ||
	     test none = "$pic_object" &&
	     test none = "$non_pic_object"; then
	    func_fatal_error "cannot find name of object for '$arg'"
	  fi

	  # Extract subdirectory from the argument.
	  func_dirname "$arg" "/" ""
	  xdir=$func_dirname_result

	  test none = "$pic_object" || {
	    # Prepend the subdirectory the object is found in.
	    pic_object=$xdir$pic_object

	    if test dlfiles = "$prev"; then
	      if test yes = "$build_libtool_libs" && test yes = "$dlopen_support"; then
		func_append dlfiles " $pic_object"
		prev=
		continue
	      else
		# If libtool objects are unsupported, then we need to preload.
		prev=dlprefiles
	      fi
	    fi

	    # CHECK ME:  I think I busted this.  -Ossama
	    if test dlprefiles = "$prev"; then
	      # Preload the old-style object.
	      func_append dlprefiles " $pic_object"
	      prev=
	    fi

	    # A PIC object.
	    func_append libobjs " $pic_object"
	    arg=$pic_object
	  }

	  # Non-PIC object.
	  if test none != "$non_pic_object"; then
	    # Prepend the subdirectory the object is found in.
	    non_pic_object=$xdir$non_pic_object

	    # A standard non-PIC object
	    func_append non_pic_objects " $non_pic_object"
	    if test -z "$pic_object" || test none = "$pic_object"; then
	      arg=$non_pic_object
	    fi
	  else
	    # If the PIC object exists, use it instead.
	    # $xdir was prepended to $pic_object above.
	    non_pic_object=$pic_object
	    func_append non_pic_objects " $non_pic_object"
	  fi
	else
	  # Only an error if not doing a dry-run.
	  if $opt_dry_run; then
	    # Extract subdirectory from the argument.
	    func_dirname "$arg" "/" ""
	    xdir=$func_dirname_result

	    func_lo2o "$arg"
	    pic_object=$xdir$objdir/$func_lo2o_result
	    non_pic_object=$xdir$func_lo2o_result
	    func_append libobjs " $pic_object"
	    func_append non_pic_objects " $non_pic_object"
	  else
	    func_fatal_error "'$arg' is not a valid libtool object"
	  fi
	fi
	;;

      *.$libext)
	# An archive.
	func_append deplibs " $arg"
	func_append old_deplibs " $arg"
	continue
	;;

      *.la)
	# A libtool-controlled library.

	func_resolve_sysroot "$arg"
	if test dlfiles = "$prev"; then
	  # This library was specified with -dlopen.
	  func_append dlfiles " $func_resolve_sysroot_result"
	  prev=
	elif test dlprefiles = "$prev"; then
	  # The library was specified with -dlpreopen.
	  func_append dlprefiles " $func_resolve_sysroot_result"
	  prev=
	else
	  func_append deplibs " $func_resolve_sysroot_result"
	fi
	continue
	;;

      # Some other compiler argument.
      *)
	# Unknown arguments in both finalize_command and compile_command need
	# to be aesthetically quoted because they are evaled later.
	func_quote_for_eval "$arg"
	arg=$func_quote_for_eval_result
	;;
      esac # arg

      # Now actually substitute the argument into the commands.
      if test -n "$arg"; then
	func_append compile_command " $arg"
	func_append finalize_command " $arg"
      fi
    done # argument parsing loop

    test -n "$prev" && \
      func_fatal_help "the '$prevarg' option requires an argument"

    if test yes = "$export_dynamic" && test -n "$export_dynamic_flag_spec"; then
      eval arg=\"$export_dynamic_flag_spec\"
      func_append compile_command " $arg"
      func_append finalize_command " $arg"
    fi

    oldlibs=
    # calculate the name of the file, without its directory
    func_basename "$output"
    outputname=$func_basename_result
    libobjs_save=$libobjs

    if test -n "$shlibpath_var"; then
      # get the directories listed in $shlibpath_var
      eval shlib_search_path=\`\$ECHO \"\$$shlibpath_var\" \| \$SED \'s/:/ /g\'\`
    else
      shlib_search_path=
    fi
    eval sys_lib_search_path=\"$sys_lib_search_path_spec\"
    eval sys_lib_dlsearch_path=\"$sys_lib_dlsearch_path_spec\"

    # Definition is injected by LT_CONFIG during libtool generation.
    func_munge_path_list sys_lib_dlsearch_path "$LT_SYS_LIBRARY_PATH"

    func_dirname "$output" "/" ""
    output_objdir=$func_dirname_result$objdir
    func_to_tool_file "$output_objdir/"
    tool_output_objdir=$func_to_tool_file_result
    # Create the object directory.
    func_mkdir_p "$output_objdir"

    # Determine the type of output
    case $output in
    "")
      func_fatal_help "you must specify an output file"
      ;;
    *.$libext) linkmode=oldlib ;;
    *.lo | *.$objext) linkmode=obj ;;
    *.la) linkmode=lib ;;
    *) linkmode=prog ;; # Anything else should be a program.
    esac

    specialdeplibs=

    libs=
    # Find all interdependent deplibs by searching for libraries
    # that are linked more than once (e.g. -la -lb -la)
    for deplib in $deplibs; do
      if $opt_preserve_dup_deps; then
	case "$libs " in
	*" $deplib "*) func_append specialdeplibs " $deplib" ;;
	esac
      fi
      func_append libs " $deplib"
    done

    if test lib = "$linkmode"; then
      libs="$predeps $libs $compiler_lib_search_path $postdeps"

      # Compute libraries that are listed more than once in $predeps
      # $postdeps and mark them as special (i.e., whose duplicates are
      # not to be eliminated).
      pre_post_deps=
      if $opt_duplicate_compiler_generated_deps; then
	for pre_post_dep in $predeps $postdeps; do
	  case "$pre_post_deps " in
	  *" $pre_post_dep "*) func_append specialdeplibs " $pre_post_deps" ;;
	  esac
	  func_append pre_post_deps " $pre_post_dep"
	done
      fi
      pre_post_deps=
    fi

    deplibs=
    newdependency_libs=
    newlib_search_path=
    need_relink=no # whether we're linking any uninstalled libtool libraries
    notinst_deplibs= # not-installed libtool libraries
    notinst_path= # paths that contain not-installed libtool libraries

    case $linkmode in
    lib)
	passes="conv dlpreopen link"
	for file in $dlfiles $dlprefiles; do
	  case $file in
	  *.la) ;;
	  *)
	    func_fatal_help "libraries can '-dlopen' only libtool libraries: $file"
	    ;;
	  esac
	done
	;;
    prog)
	compile_deplibs=
	finalize_deplibs=
	alldeplibs=false
	newdlfiles=
	newdlprefiles=
	passes="conv scan dlopen dlpreopen link"
	;;
    *)  passes="conv"
	;;
    esac

    for pass in $passes; do
      # The preopen pass in lib mode reverses $deplibs; put it back here
      # so that -L comes before libs that need it for instance...
      if test lib,link = "$linkmode,$pass"; then
	## FIXME: Find the place where the list is rebuilt in the wrong
	##        order, and fix it there properly
        tmp_deplibs=
	for deplib in $deplibs; do
	  tmp_deplibs="$deplib $tmp_deplibs"
	done
	deplibs=$tmp_deplibs
      fi

      if test lib,link = "$linkmode,$pass" ||
	 test prog,scan = "$linkmode,$pass"; then
	libs=$deplibs
	deplibs=
      fi
      if test prog = "$linkmode"; then
	case $pass in
	dlopen) libs=$dlfiles ;;
	dlpreopen) libs=$dlprefiles ;;
	link)
	  libs="$deplibs %DEPLIBS%"
	  test "X$link_all_deplibs" != Xno && libs="$libs $dependency_libs"
	  ;;
	esac
      fi
      if test lib,dlpreopen = "$linkmode,$pass"; then
	# Collect and forward deplibs of preopened libtool libs
	for lib in $dlprefiles; do
	  # Ignore non-libtool-libs
	  dependency_libs=
	  func_resolve_sysroot "$lib"
	  case $lib in
	  *.la)	func_source "$func_resolve_sysroot_result" ;;
	  esac

	  # Collect preopened libtool deplibs, except any this library
	  # has declared as weak libs
	  for deplib in $dependency_libs; do
	    func_basename "$deplib"
            deplib_base=$func_basename_result
	    case " $weak_libs " in
	    *" $deplib_base "*) ;;
	    *) func_append deplibs " $deplib" ;;
	    esac
	  done
	done
	libs=$dlprefiles
      fi
      if test dlopen = "$pass"; then
	# Collect dlpreopened libraries
	save_deplibs=$deplibs
	deplibs=
      fi

      for deplib in $libs; do
	lib=
	found=false
	case $deplib in
	-mt|-mthreads|-kthread|-Kthread|-pthread|-pthreads|--thread-safe \
        |-threads|-fopenmp|-openmp|-mp|-xopenmp|-omp|-qsmp=*)
	  if test prog,link = "$linkmode,$pass"; then
	    compile_deplibs="$deplib $compile_deplibs"
	    finalize_deplibs="$deplib $finalize_deplibs"
	  else
	    func_append compiler_flags " $deplib"
	    if test lib = "$linkmode"; then
		case "$new_inherited_linker_flags " in
		    *" $deplib "*) ;;
		    * ) func_append new_inherited_linker_flags " $deplib" ;;
		esac
	    fi
	  fi
	  continue
	  ;;
	-l*)
	  if test lib != "$linkmode" && test prog != "$linkmode"; then
	    func_warning "'-l' is ignored for archives/objects"
	    continue
	  fi
	  func_stripname '-l' '' "$deplib"
	  name=$func_stripname_result
	  if test lib = "$linkmode"; then
	    searchdirs="$newlib_search_path $lib_search_path $compiler_lib_search_dirs $sys_lib_search_path $shlib_search_path"
	  else
	    searchdirs="$newlib_search_path $lib_search_path $sys_lib_search_path $shlib_search_path"
	  fi
	  for searchdir in $searchdirs; do
	    for search_ext in .la $std_shrext .so .a; do
	      # Search the libtool library
	      lib=$searchdir/lib$name$search_ext
	      if test -f "$lib"; then
		if test .la = "$search_ext"; then
		  found=:
		else
		  found=false
		fi
		break 2
	      fi
	    done
	  done
	  if $found; then
	    # deplib is a libtool library
	    # If $allow_libtool_libs_with_static_runtimes && $deplib is a stdlib,
	    # We need to do some special things here, and not later.
	    if test yes = "$allow_libtool_libs_with_static_runtimes"; then
	      case " $predeps $postdeps " in
	      *" $deplib "*)
		if func_lalib_p "$lib"; then
		  library_names=
		  old_library=
		  func_source "$lib"
		  for l in $old_library $library_names; do
		    ll=$l
		  done
		  if test "X$ll" = "X$old_library"; then # only static version available
		    found=false
		    func_dirname "$lib" "" "."
		    ladir=$func_dirname_result
		    lib=$ladir/$old_library
		    if test prog,link = "$linkmode,$pass"; then
		      compile_deplibs="$deplib $compile_deplibs"
		      finalize_deplibs="$deplib $finalize_deplibs"
		    else
		      deplibs="$deplib $deplibs"
		      test lib = "$linkmode" && newdependency_libs="$deplib $newdependency_libs"
		    fi
		    continue
		  fi
		fi
		;;
	      *) ;;
	      esac
	    fi
	  else
	    # deplib doesn't seem to be a libtool library
	    if test prog,link = "$linkmode,$pass"; then
	      compile_deplibs="$deplib $compile_deplibs"
	      finalize_deplibs="$deplib $finalize_deplibs"
	    else
	      deplibs="$deplib $deplibs"
	      test lib = "$linkmode" && newdependency_libs="$deplib $newdependency_libs"
	    fi
	    continue
	  fi
	  ;; # -l
	*.ltframework)
	  if test prog,link = "$linkmode,$pass"; then
	    compile_deplibs="$deplib $compile_deplibs"
	    finalize_deplibs="$deplib $finalize_deplibs"
	  else
	    deplibs="$deplib $deplibs"
	    if test lib = "$linkmode"; then
		case "$new_inherited_linker_flags " in
		    *" $deplib "*) ;;
		    * ) func_append new_inherited_linker_flags " $deplib" ;;
		esac
	    fi
	  fi
	  continue
	  ;;
	-L*)
	  case $linkmode in
	  lib)
	    deplibs="$deplib $deplibs"
	    test conv = "$pass" && continue
	    newdependency_libs="$deplib $newdependency_libs"
	    func_stripname '-L' '' "$deplib"
	    func_resolve_sysroot "$func_stripname_result"
	    func_append newlib_search_path " $func_resolve_sysroot_result"
	    ;;
	  prog)
	    if test conv = "$pass"; then
	      deplibs="$deplib $deplibs"
	      continue
	    fi
	    if test scan = "$pass"; then
	      deplibs="$deplib $deplibs"
	    else
	      compile_deplibs="$deplib $compile_deplibs"
	      finalize_deplibs="$deplib $finalize_deplibs"
	    fi
	    func_stripname '-L' '' "$deplib"
	    func_resolve_sysroot "$func_stripname_result"
	    func_append newlib_search_path " $func_resolve_sysroot_result"
	    ;;
	  *)
	    func_warning "'-L' is ignored for archives/objects"
	    ;;
	  esac # linkmode
	  continue
	  ;; # -L
	-R*)
	  if test link = "$pass"; then
	    func_stripname '-R' '' "$deplib"
	    func_resolve_sysroot "$func_stripname_result"
	    dir=$func_resolve_sysroot_result
	    # Make sure the xrpath contains only unique directories.
	    case "$xrpath " in
	    *" $dir "*) ;;
	    *) func_append xrpath " $dir" ;;
	    esac
	  fi
	  deplibs="$deplib $deplibs"
	  continue
	  ;;
	*.la)
	  func_resolve_sysroot "$deplib"
	  lib=$func_resolve_sysroot_result
	  ;;
	*.$libext)
	  if test conv = "$pass"; then
	    deplibs="$deplib $deplibs"
	    continue
	  fi
	  case $linkmode in
	  lib)
	    # Linking convenience modules into shared libraries is allowed,
	    # but linking other static libraries is non-portable.
	    case " $dlpreconveniencelibs " in
	    *" $deplib "*) ;;
	    *)
	      valid_a_lib=false
	      case $deplibs_check_method in
		match_pattern*)
		  set dummy $deplibs_check_method; shift
		  match_pattern_regex=`expr "$deplibs_check_method" : "$1 \(.*\)"`
		  if eval "\$ECHO \"$deplib\"" 2>/dev/null | $SED 10q \
		    | $EGREP "$match_pattern_regex" > /dev/null; then
		    valid_a_lib=:
		  fi
		;;
		pass_all)
		  valid_a_lib=:
		;;
	      esac
	      if $valid_a_lib; then
		echo
		$ECHO "*** Warning: Linking the shared library $output against the"
		$ECHO "*** static library $deplib is not portable!"
		deplibs="$deplib $deplibs"
	      else
		echo
		$ECHO "*** Warning: Trying to link with static lib archive $deplib."
		echo "*** I have the capability to make that library automatically link in when"
		echo "*** you link to this library.  But I can only do this if you have a"
		echo "*** shared version of the library, which you do not appear to have"
		echo "*** because the file extensions .$libext of this argument makes me believe"
		echo "*** that it is just a static archive that I should not use here."
	      fi
	      ;;
	    esac
	    continue
	    ;;
	  prog)
	    if test link != "$pass"; then
	      deplibs="$deplib $deplibs"
	    else
	      compile_deplibs="$deplib $compile_deplibs"
	      finalize_deplibs="$deplib $finalize_deplibs"
	    fi
	    continue
	    ;;
	  esac # linkmode
	  ;; # *.$libext
	*.lo | *.$objext)
	  if test conv = "$pass"; then
	    deplibs="$deplib $deplibs"
	  elif test prog = "$linkmode"; then
	    if test dlpreopen = "$pass" || test yes != "$dlopen_support" || test no = "$build_libtool_libs"; then
	      # If there is no dlopen support or we're linking statically,
	      # we need to preload.
	      func_append newdlprefiles " $deplib"
	      compile_deplibs="$deplib $compile_deplibs"
	      finalize_deplibs="$deplib $finalize_deplibs"
	    else
	      func_append newdlfiles " $deplib"
	    fi
	  fi
	  continue
	  ;;
	%DEPLIBS%)
	  alldeplibs=:
	  continue
	  ;;
	esac # case $deplib

	$found || test -f "$lib" \
	  || func_fatal_error "cannot find the library '$lib' or unhandled argument '$deplib'"

	# Check to see that this really is a libtool archive.
	func_lalib_unsafe_p "$lib" \
	  || func_fatal_error "'$lib' is not a valid libtool archive"

	func_dirname "$lib" "" "."
	ladir=$func_dirname_result

	dlname=
	dlopen=
	dlpreopen=
	libdir=
	library_names=
	old_library=
	inherited_linker_flags=
	# If the library was installed with an old release of libtool, its
	# .la file will not set the variables 'installed' or 'shouldnotlink'.
	installed=yes
	shouldnotlink=no
	avoidtemprpath=


	# Read the .la file
	func_source "$lib"

	# Convert "-framework foo" to "foo.ltframework"
	if test -n "$inherited_linker_flags"; then
	  tmp_inherited_linker_flags=`$ECHO "$inherited_linker_flags" | $SED 's/-framework \([^ $]*\)/\1.ltframework/g'`
	  for tmp_inherited_linker_flag in $tmp_inherited_linker_flags; do
	    case " $new_inherited_linker_flags " in
	      *" $tmp_inherited_linker_flag "*) ;;
	      *) func_append new_inherited_linker_flags " $tmp_inherited_linker_flag";;
	    esac
	  done
	fi
	dependency_libs=`$ECHO " $dependency_libs" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'`
	if test lib,link = "$linkmode,$pass" ||
	   test prog,scan = "$linkmode,$pass" ||
	   { test prog != "$linkmode" && test lib != "$linkmode"; }; then
	  test -n "$dlopen" && func_append dlfiles " $dlopen"
	  test -n "$dlpreopen" && func_append dlprefiles " $dlpreopen"
	fi

	if test conv = "$pass"; then
	  # Only check for convenience libraries
	  deplibs="$lib $deplibs"
	  if test -z "$libdir"; then
	    if test -z "$old_library"; then
	      func_fatal_error "cannot find name of link library for '$lib'"
	    fi
	    # It is a libtool convenience library, so add in its objects.
	    func_append convenience " $ladir/$objdir/$old_library"
	    func_append old_convenience " $ladir/$objdir/$old_library"
	    tmp_libs=
	    for deplib in $dependency_libs; do
	      deplibs="$deplib $deplibs"
	      if $opt_preserve_dup_deps; then
		case "$tmp_libs " in
		*" $deplib "*) func_append specialdeplibs " $deplib" ;;
		esac
	      fi
	      func_append tmp_libs " $deplib"
	    done
	  elif test prog != "$linkmode" && test lib != "$linkmode"; then
	    func_fatal_error "'$lib' is not a convenience library"
	  fi
	  continue
	fi # $pass = conv


	# Get the name of the library we link against.
	linklib=
	if test -n "$old_library" &&
	   { test yes = "$prefer_static_libs" ||
	     test built,no = "$prefer_static_libs,$installed"; }; then
	  linklib=$old_library
	else
	  for l in $old_library $library_names; do
	    linklib=$l
	  done
	fi
	if test -z "$linklib"; then
	  func_fatal_error "cannot find name of link library for '$lib'"
	fi

	# This library was specified with -dlopen.
	if test dlopen = "$pass"; then
	  test -z "$libdir" \
	    && func_fatal_error "cannot -dlopen a convenience library: '$lib'"
	  if test -z "$dlname" ||
	     test yes != "$dlopen_support" ||
	     test no = "$build_libtool_libs"
	  then
	    # If there is no dlname, no dlopen support or we're linking
	    # statically, we need to preload.  We also need to preload any
	    # dependent libraries so libltdl's deplib preloader doesn't
	    # bomb out in the load deplibs phase.
	    func_append dlprefiles " $lib $dependency_libs"
	  else
	    func_append newdlfiles " $lib"
	  fi
	  continue
	fi # $pass = dlopen

	# We need an absolute path.
	case $ladir in
	[\\/]* | [A-Za-z]:[\\/]*) abs_ladir=$ladir ;;
	*)
	  abs_ladir=`cd "$ladir" && pwd`
	  if test -z "$abs_ladir"; then
	    func_warning "cannot determine absolute directory name of '$ladir'"
	    func_warning "passing it literally to the linker, although it might fail"
	    abs_ladir=$ladir
	  fi
	  ;;
	esac
	func_basename "$lib"
	laname=$func_basename_result

	# Find the relevant object directory and library name.
	if test yes = "$installed"; then
	  if test ! -f "$lt_sysroot$libdir/$linklib" && test -f "$abs_ladir/$linklib"; then
	    func_warning "library '$lib' was moved."
	    dir=$ladir
	    absdir=$abs_ladir
	    libdir=$abs_ladir
	  else
	    dir=$lt_sysroot$libdir
	    absdir=$lt_sysroot$libdir
	  fi
	  test yes = "$hardcode_automatic" && avoidtemprpath=yes
	else
	  if test ! -f "$ladir/$objdir/$linklib" && test -f "$abs_ladir/$linklib"; then
	    dir=$ladir
	    absdir=$abs_ladir
	    # Remove this search path later
	    func_append notinst_path " $abs_ladir"
	  else
	    dir=$ladir/$objdir
	    absdir=$abs_ladir/$objdir
	    # Remove this search path later
	    func_append notinst_path " $abs_ladir"
	  fi
	fi # $installed = yes
	func_stripname 'lib' '.la' "$laname"
	name=$func_stripname_result

	# This library was specified with -dlpreopen.
	if test dlpreopen = "$pass"; then
	  if test -z "$libdir" && test prog = "$linkmode"; then
	    func_fatal_error "only libraries may -dlpreopen a convenience library: '$lib'"
	  fi
	  case $host in
	    # special handling for platforms with PE-DLLs.
	    *cygwin* | *mingw* | *cegcc* )
	      # Linker will automatically link against shared library if both
	      # static and shared are present.  Therefore, ensure we extract
	      # symbols from the import library if a shared library is present
	      # (otherwise, the dlopen module name will be incorrect).  We do
	      # this by putting the import library name into $newdlprefiles.
	      # We recover the dlopen module name by 'saving' the la file
	      # name in a special purpose variable, and (later) extracting the
	      # dlname from the la file.
	      if test -n "$dlname"; then
	        func_tr_sh "$dir/$linklib"
	        eval "libfile_$func_tr_sh_result=\$abs_ladir/\$laname"
	        func_append newdlprefiles " $dir/$linklib"
	      else
	        func_append newdlprefiles " $dir/$old_library"
	        # Keep a list of preopened convenience libraries to check
	        # that they are being used correctly in the link pass.
	        test -z "$libdir" && \
	          func_append dlpreconveniencelibs " $dir/$old_library"
	      fi
	    ;;
	    * )
	      # Prefer using a static library (so that no silly _DYNAMIC symbols
	      # are required to link).
	      if test -n "$old_library"; then
	        func_append newdlprefiles " $dir/$old_library"
	        # Keep a list of preopened convenience libraries to check
	        # that they are being used correctly in the link pass.
	        test -z "$libdir" && \
	          func_append dlpreconveniencelibs " $dir/$old_library"
	      # Otherwise, use the dlname, so that lt_dlopen finds it.
	      elif test -n "$dlname"; then
	        func_append newdlprefiles " $dir/$dlname"
	      else
	        func_append newdlprefiles " $dir/$linklib"
	      fi
	    ;;
	  esac
	fi # $pass = dlpreopen

	if test -z "$libdir"; then
	  # Link the convenience library
	  if test lib = "$linkmode"; then
	    deplibs="$dir/$old_library $deplibs"
	  elif test prog,link = "$linkmode,$pass"; then
	    compile_deplibs="$dir/$old_library $compile_deplibs"
	    finalize_deplibs="$dir/$old_library $finalize_deplibs"
	  else
	    deplibs="$lib $deplibs" # used for prog,scan pass
	  fi
	  continue
	fi


	if test prog = "$linkmode" && test link != "$pass"; then
	  func_append newlib_search_path " $ladir"
	  deplibs="$lib $deplibs"

	  linkalldeplibs=false
	  if test no != "$link_all_deplibs" || test -z "$library_names" ||
	     test no = "$build_libtool_libs"; then
	    linkalldeplibs=:
	  fi

	  tmp_libs=
	  for deplib in $dependency_libs; do
	    case $deplib in
	    -L*) func_stripname '-L' '' "$deplib"
	         func_resolve_sysroot "$func_stripname_result"
	         func_append newlib_search_path " $func_resolve_sysroot_result"
		 ;;
	    esac
	    # Need to link against all dependency_libs?
	    if $linkalldeplibs; then
	      deplibs="$deplib $deplibs"
	    else
	      # Need to hardcode shared library paths
	      # or/and link against static libraries
	      newdependency_libs="$deplib $newdependency_libs"
	    fi
	    if $opt_preserve_dup_deps; then
	      case "$tmp_libs " in
	      *" $deplib "*) func_append specialdeplibs " $deplib" ;;
	      esac
	    fi
	    func_append tmp_libs " $deplib"
	  done # for deplib
	  continue
	fi # $linkmode = prog...

	# Program link pass: record the run-time search directories that this
	# library contributes, then decide whether the remaining per-library
	# processing can be skipped for this iteration.
	if test prog,link = "$linkmode,$pass"; then
	  # Taken when the library has shared names ($library_names) and
	  # either static libraries are not preferred, or only "built" static
	  # libraries are preferred and this one is installed, or there is no
	  # static archive ($old_library) available.
	  if test -n "$library_names" &&
	     { { test no = "$prefer_static_libs" ||
	         test built,yes = "$prefer_static_libs,$installed"; } ||
	       test -z "$old_library"; }; then
	    # We need to hardcode the library path
	    if test -n "$shlibpath_var" && test -z "$avoidtemprpath"; then
	      # Make sure the rpath contains only unique directories.
	      # temp_rpath is a colon-terminated list of directories.
	      case $temp_rpath: in
	      *"$absdir:"*) ;;
	      *) func_append temp_rpath "$absdir:" ;;
	      esac
	    fi

	    # Hardcode the library path.
	    # Skip directories that are in the system default run-time
	    # search path.
	    # compile_rpath collects $absdir; duplicates are not added.
	    case " $sys_lib_dlsearch_path " in
	    *" $absdir "*) ;;
	    *)
	      case "$compile_rpath " in
	      *" $absdir "*) ;;
	      *) func_append compile_rpath " $absdir" ;;
	      esac
	      ;;
	    esac
	    # finalize_rpath collects $libdir under the same two rules.
	    case " $sys_lib_dlsearch_path " in
	    *" $libdir "*) ;;
	    *)
	      case "$finalize_rpath " in
	      *" $libdir "*) ;;
	      *) func_append finalize_rpath " $libdir" ;;
	      esac
	      ;;
	    esac
	  fi # test -n "$library_names" && ...

	  # Move on to the next library in the enclosing loop when $alldeplibs
	  # is true and either every deplib is accepted (pass_all) or a libtool
	  # library with shared names is being built.
	  if $alldeplibs &&
	     { test pass_all = "$deplibs_check_method" ||
	       { test yes = "$build_libtool_libs" &&
		 test -n "$library_names"; }; }; then
	    # We only need to search for static libraries
	    continue
	  fi
	fi # $linkmode,$pass = prog,link

	link_static=no # Whether the deplib will be linked statically
	use_static_libs=$prefer_static_libs
	if test built = "$use_static_libs" && test yes = "$installed"; then
	  use_static_libs=no
	fi
	if test -n "$library_names" &&
	   { test no = "$use_static_libs" || test -z "$old_library"; }; then
	  case $host in
	  *cygwin* | *mingw* | *cegcc* | *os2*)
	      # No point in relinking DLLs because paths are not encoded
	      func_append notinst_deplibs " $lib"
	      need_relink=no
	    ;;
	  *)
	    if test no = "$installed"; then
	      func_append notinst_deplibs " $lib"
	      need_relink=yes
	    fi
	    ;;
	  esac
	  # This is a shared library

	  # Warn about portability, can't link against -module's on some
	  # systems (darwin).  Don't bleat about dlopened modules though!
	  dlopenmodule=
	  for dlpremoduletest in $dlprefiles; do
	    if test "X$dlpremoduletest" = "X$lib"; then
	      dlopenmodule=$dlpremoduletest
	      break
	    fi
	  done
	  if test -z "$dlopenmodule" && test yes = "$shouldnotlink" && test link = "$pass"; then
	    echo
	    if test prog = "$linkmode"; then
	      $ECHO "*** Warning: Linking the executable $output against the loadable module"
	    else
	      $ECHO "*** Warning: Linking the shared library $output against the loadable module"
	    fi
	    $ECHO "*** $linklib is not portable!"
	  fi
	  if test lib = "$linkmode" &&
	     test yes = "$hardcode_into_libs"; then
	    # Hardcode the library path.
	    # Skip directories that are in the system default run-time
	    # search path.
	    case " $sys_lib_dlsearch_path " in
	    *" $absdir "*) ;;
	    *)
	      case "$compile_rpath " in
	      *" $absdir "*) ;;
	      *) func_append compile_rpath " $absdir" ;;
	      esac
	      ;;
	    esac
	    case " $sys_lib_dlsearch_path " in
	    *" $libdir "*) ;;
	    *)
	      case "$finalize_rpath " in
	      *" $libdir "*) ;;
	      *) func_append finalize_rpath " $libdir" ;;
	      esac
	      ;;
	    esac
	  fi

	  # When $old_archive_from_expsyms_cmds is configured, link against a
	  # generated import archive instead of the shared library itself:
	  #   1. work out the library's soname,
	  #   2. derive the archive name libimp-<name>.a from it,
	  #   3. run $extract_expsyms_cmds unless $output_objdir/$soname-def
	  #      already exists,
	  #   4. run $old_archive_from_expsyms_cmds unless the archive exists,
	  #   5. repoint $dir and $linklib at the generated archive.
	  if test -n "$old_archive_from_expsyms_cmds"; then
	    # figure out the soname
	    # The first word of $library_names is taken as the real file name.
	    set dummy $library_names
	    shift
	    realname=$1
	    shift
	    libname=`eval "\\$ECHO \"$libname_spec\""`
	    # use dlname if we got it. it's perfectly good, no?
	    if test -n "$dlname"; then
	      soname=$dlname
	    elif test -n "$soname_spec"; then
	      # bleh windows
	      # $host is a canonical cpu-vendor-os triplet, so every
	      # alternative needs a leading wildcard to be able to match
	      # (a bare 'mingw*' could never match).
	      case $host in
	      *cygwin* | *mingw* | *cegcc* | *os2*)
	        func_arith $current - $age
		major=$func_arith_result
		versuffix=-$major
		;;
	      esac
	      eval soname=\"$soname_spec\"
	    else
	      soname=$realname
	    fi

	    # Make a new name for the extract_expsyms_cmds to use
	    # $soroot keeps the unstripped soname; $soname becomes its basename.
	    soroot=$soname
	    func_basename "$soroot"
	    soname=$func_basename_result
	    func_stripname 'lib' '.dll' "$soname"
	    newlib=libimp-$func_stripname_result.a

	    # If the library has no export list, then create one now
	    if test -f "$output_objdir/$soname-def"; then :
	    else
	      func_verbose "extracting exported symbol list from '$soname'"
	      func_execute_cmds "$extract_expsyms_cmds" 'exit $?'
	    fi

	    # Create $newlib
	    if test -f "$output_objdir/$newlib"; then :; else
	      func_verbose "generating import library for '$soname'"
	      func_execute_cmds "$old_archive_from_expsyms_cmds" 'exit $?'
	    fi
	    # make sure the library variables are pointing to the new library
	    dir=$output_objdir
	    linklib=$newlib
	  fi # test -n "$old_archive_from_expsyms_cmds"

	  if test prog = "$linkmode" || test relink != "$opt_mode"; then
	    add_shlibpath=
	    add_dir=
	    add=
	    lib_linked=yes
	    case $hardcode_action in
	    immediate | unsupported)
	      if test no = "$hardcode_direct"; then
		add=$dir/$linklib
		case $host in
		  *-*-sco3.2v5.0.[024]*) add_dir=-L$dir ;;
		  *-*-sysv4*uw2*) add_dir=-L$dir ;;
		  *-*-sysv5OpenUNIX* | *-*-sysv5UnixWare7.[01].[10]* | \
		    *-*-unixware7*) add_dir=-L$dir ;;
		  *-*-darwin* )
		    # if the lib is a (non-dlopened) module then we cannot
		    # link against it, someone is ignoring the earlier warnings
		    if /usr/bin/file -L $add 2> /dev/null |
			 $GREP ": [^:]* bundle" >/dev/null; then
		      if test "X$dlopenmodule" != "X$lib"; then
			$ECHO "*** Warning: lib $linklib is a module, not a shared library"
			if test -z "$old_library"; then
			  echo
			  echo "*** And there doesn't seem to be a static archive available"
			  echo "*** The link will probably fail, sorry"
			else
			  add=$dir/$old_library
			fi
		      elif test -n "$old_library"; then
			add=$dir/$old_library
		      fi
		    fi
		esac
	      elif test no = "$hardcode_minus_L"; then
		case $host in
		*-*-sunos*) add_shlibpath=$dir ;;
		esac
		add_dir=-L$dir
		add=-l$name
	      elif test no = "$hardcode_shlibpath_var"; then
		add_shlibpath=$dir
		add=-l$name
	      else
		lib_linked=no
	      fi
	      ;;
	    relink)
	      if test yes = "$hardcode_direct" &&
	         test no = "$hardcode_direct_absolute"; then
		add=$dir/$linklib
	      elif test yes = "$hardcode_minus_L"; then
		add_dir=-L$absdir
		# Try looking first in the location we're being installed to.
		if test -n "$inst_prefix_dir"; then
		  case $libdir in
		    [\\/]*)
		      func_append add_dir " -L$inst_prefix_dir$libdir"
		      ;;
		  esac
		fi
		add=-l$name
	      elif test yes = "$hardcode_shlibpath_var"; then
		add_shlibpath=$dir
		add=-l$name
	      else
		lib_linked=no
	      fi
	      ;;
	    *) lib_linked=no ;;
	    esac

	    if test yes != "$lib_linked"; then
	      func_fatal_configuration "unsupported hardcode properties"
	    fi

	    if test -n "$add_shlibpath"; then
	      case :$compile_shlibpath: in
	      *":$add_shlibpath:"*) ;;
	      *) func_append compile_shlibpath "$add_shlibpath:" ;;
	      esac
	    fi
	    if test prog = "$linkmode"; then
	      test -n "$add_dir" && compile_deplibs="$add_dir $compile_deplibs"
	      test -n "$add" && compile_deplibs="$add $compile_deplibs"
	    else
	      test -n "$add_dir" && deplibs="$add_dir $deplibs"
	      test -n "$add" && deplibs="$add $deplibs"
	      if test yes != "$hardcode_direct" &&
		 test yes != "$hardcode_minus_L" &&
		 test yes = "$hardcode_shlibpath_var"; then
		case :$finalize_shlibpath: in
		*":$libdir:"*) ;;
		*) func_append finalize_shlibpath "$libdir:" ;;
		esac
	      fi
	    fi
	  fi

	  if test prog = "$linkmode" || test relink = "$opt_mode"; then
	    add_shlibpath=
	    add_dir=
	    add=
	    # Finalize command for both is simple: just hardcode it.
	    if test yes = "$hardcode_direct" &&
	       test no = "$hardcode_direct_absolute"; then
	      add=$libdir/$linklib
	    elif test yes = "$hardcode_minus_L"; then
	      add_dir=-L$libdir
	      add=-l$name
	    elif test yes = "$hardcode_shlibpath_var"; then
	      case :$finalize_shlibpath: in
	      *":$libdir:"*) ;;
	      *) func_append finalize_shlibpath "$libdir:" ;;
	      esac
	      add=-l$name
	    elif test yes = "$hardcode_automatic"; then
	      if test -n "$inst_prefix_dir" &&
		 test -f "$inst_prefix_dir$libdir/$linklib"; then
		add=$inst_prefix_dir$libdir/$linklib
	      else
		add=$libdir/$linklib
	      fi
	    else
	      # We cannot seem to hardcode it, guess we'll fake it.
	      add_dir=-L$libdir
	      # Try looking first in the location we're being installed to.
	      if test -n "$inst_prefix_dir"; then
		case $libdir in
		  [\\/]*)
		    func_append add_dir " -L$inst_prefix_dir$libdir"
		    ;;
		esac
	      fi
	      add=-l$name
	    fi

	    if test prog = "$linkmode"; then
	      test -n "$add_dir" && finalize_deplibs="$add_dir $finalize_deplibs"
	      test -n "$add" && finalize_deplibs="$add $finalize_deplibs"
	    else
	      test -n "$add_dir" && deplibs="$add_dir $deplibs"
	      test -n "$add" && deplibs="$add $deplibs"
	    fi
	  fi
	elif test prog = "$linkmode"; then
	  # Here we assume that one of hardcode_direct or hardcode_minus_L
	  # is not unsupported.  This is valid on all known static and
	  # shared platforms.
	  if test unsupported != "$hardcode_direct"; then
	    test -n "$old_library" && linklib=$old_library
	    compile_deplibs="$dir/$linklib $compile_deplibs"
	    finalize_deplibs="$dir/$linklib $finalize_deplibs"
	  else
	    compile_deplibs="-l$name -L$dir $compile_deplibs"
	    finalize_deplibs="-l$name -L$dir $finalize_deplibs"
	  fi
	elif test yes = "$build_libtool_libs"; then
	  # Not a shared library
	  if test pass_all != "$deplibs_check_method"; then
	    # We're trying to link a shared library against a static one
	    # but the system doesn't support it.

	    # Just print a warning and add the library to dependency_libs so
	    # that the program can be linked against the static library.
	    echo
	    $ECHO "*** Warning: This system cannot link to static lib archive $lib."
	    echo "*** I have the capability to make that library automatically link in when"
	    echo "*** you link to this library.  But I can only do this if you have a"
	    echo "*** shared version of the library, which you do not appear to have."
	    if test yes = "$module"; then
	      echo "*** But as you try to build a module library, libtool will still create "
	      echo "*** a static module, that should work as long as the dlopening application"
	      echo "*** is linked with the -dlopen flag to resolve symbols at runtime."
	      if test -z "$global_symbol_pipe"; then
		echo
		echo "*** However, this would only work if libtool was able to extract symbol"
		echo "*** lists from a program, using 'nm' or equivalent, but libtool could"
		echo "*** not find such a program.  So, this module is probably useless."
		echo "*** 'nm' from GNU binutils and a full rebuild may help."
	      fi
	      if test no = "$build_old_libs"; then
		build_libtool_libs=module
		build_old_libs=yes
	      else
		build_libtool_libs=no
	      fi
	    fi
	  else
	    deplibs="$dir/$old_library $deplibs"
	    link_static=yes
	  fi
	fi # link shared/static library?

	if test lib = "$linkmode"; then
	  if test -n "$dependency_libs" &&
	     { test yes != "$hardcode_into_libs" ||
	       test yes = "$build_old_libs" ||
	       test yes = "$link_static"; }; then
	    # Extract -R from dependency_libs
	    temp_deplibs=
	    for libdir in $dependency_libs; do
	      case $libdir in
	      -R*) func_stripname '-R' '' "$libdir"
	           temp_xrpath=$func_stripname_result
		   case " $xrpath " in
		   *" $temp_xrpath "*) ;;
		   *) func_append xrpath " $temp_xrpath";;
		   esac;;
	      *) func_append temp_deplibs " $libdir";;
	      esac
	    done
	    dependency_libs=$temp_deplibs
	  fi

	  func_append newlib_search_path " $absdir"
	  # Link against this library
	  test no = "$link_static" && newdependency_libs="$abs_ladir/$laname $newdependency_libs"
	  # ... and its dependency_libs
	  tmp_libs=
	  for deplib in $dependency_libs; do
	    newdependency_libs="$deplib $newdependency_libs"
	    case $deplib in
              -L*) func_stripname '-L' '' "$deplib"
                   func_resolve_sysroot "$func_stripname_result";;
              *) func_resolve_sysroot "$deplib" ;;
            esac
	    if $opt_preserve_dup_deps; then
	      case "$tmp_libs " in
	      *" $func_resolve_sysroot_result "*)
                func_append specialdeplibs " $func_resolve_sysroot_result" ;;
	      esac
	    fi
	    func_append tmp_libs " $func_resolve_sysroot_result"
	  done

	  if test no != "$link_all_deplibs"; then
	    # Add the search paths of all dependency libraries
	    for deplib in $dependency_libs; do
	      path=
	      case $deplib in
	      -L*) path=$deplib ;;
	      *.la)
	        func_resolve_sysroot "$deplib"
	        deplib=$func_resolve_sysroot_result
	        func_dirname "$deplib" "" "."
		dir=$func_dirname_result
		# We need an absolute path.
		case $dir in
		[\\/]* | [A-Za-z]:[\\/]*) absdir=$dir ;;
		*)
		  absdir=`cd "$dir" && pwd`
		  if test -z "$absdir"; then
		    func_warning "cannot determine absolute directory name of '$dir'"
		    absdir=$dir
		  fi
		  ;;
		esac
		if $GREP "^installed=no" $deplib > /dev/null; then
		case $host in
		*-*-darwin*)
		  depdepl=
		  eval deplibrary_names=`$SED -n -e 's/^library_names=\(.*\)$/\1/p' $deplib`
		  if test -n "$deplibrary_names"; then
		    for tmp in $deplibrary_names; do
		      depdepl=$tmp
		    done
		    if test -f "$absdir/$objdir/$depdepl"; then
		      depdepl=$absdir/$objdir/$depdepl
		      darwin_install_name=`$OTOOL -L $depdepl | awk '{if (NR == 2) {print $1;exit}}'`
                      if test -z "$darwin_install_name"; then
                          darwin_install_name=`$OTOOL64 -L $depdepl  | awk '{if (NR == 2) {print $1;exit}}'`
                      fi
		      func_append compiler_flags " $wl-dylib_file $wl$darwin_install_name:$depdepl"
		      func_append linker_flags " -dylib_file $darwin_install_name:$depdepl"
		      path=
		    fi
		  fi
		  ;;
		*)
		  path=-L$absdir/$objdir
		  ;;
		esac
		else
		  eval libdir=`$SED -n -e 's/^libdir=\(.*\)$/\1/p' $deplib`
		  test -z "$libdir" && \
		    func_fatal_error "'$deplib' is not a valid libtool archive"
		  test "$absdir" != "$libdir" && \
		    func_warning "'$deplib' seems to be moved"

		  path=-L$absdir
		fi
		;;
	      esac
	      case " $deplibs " in
	      *" $path "*) ;;
	      *) deplibs="$path $deplibs" ;;
	      esac
	    done
	  fi # link_all_deplibs != no
	fi # linkmode = lib
      done # for deplib in $libs
      # In the link pass, apply the linker flags collected in
      # $new_inherited_linker_flags.
      if test link = "$pass"; then
	if test prog = "$linkmode"; then
	  # Programs: prepend the flags to both the compile-time and the
	  # finalize-time dependency lists.
	  compile_deplibs="$new_inherited_linker_flags $compile_deplibs"
	  finalize_deplibs="$new_inherited_linker_flags $finalize_deplibs"
	else
	  # Other link modes: append the flags to $compiler_flags, with the
	  # sed expression rewriting each ' NAME.ltframework' word into
	  # ' -framework NAME'.
	  compiler_flags="$compiler_flags "`$ECHO " $new_inherited_linker_flags" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'`
	fi
      fi
      dependency_libs=$newdependency_libs
      if test dlpreopen = "$pass"; then
	# Link the dlpreopened libraries before other libraries
	for deplib in $save_deplibs; do
	  deplibs="$deplib $deplibs"
	done
      fi
      if test dlopen != "$pass"; then
	test conv = "$pass" || {
	  # Make sure lib_search_path contains only unique directories.
	  lib_search_path=
	  for dir in $newlib_search_path; do
	    case "$lib_search_path " in
	    *" $dir "*) ;;
	    *) func_append lib_search_path " $dir" ;;
	    esac
	  done
	  newlib_search_path=
	}

	if test prog,link = "$linkmode,$pass"; then
	  vars="compile_deplibs finalize_deplibs"
	else
	  vars=deplibs
	fi
	for var in $vars dependency_libs; do
	  # Add libraries to $var in reverse order
	  eval tmp_libs=\"\$$var\"
	  new_libs=
	  for deplib in $tmp_libs; do
	    # FIXME: Pedantically, this is the right thing to do, so
	    #        that some nasty dependency loop isn't accidentally
	    #        broken:
	    #new_libs="$deplib $new_libs"
	    # Pragmatically, this seems to cause very few problems in
	    # practice:
	    case $deplib in
	    -L*) new_libs="$deplib $new_libs" ;;
	    -R*) ;;
	    *)
	      # And here is the reason: when a library appears more
	      # than once as an explicit dependence of a library, or
	      # is implicitly linked in more than once by the
	      # compiler, it is considered special, and multiple
	      # occurrences thereof are not removed.  Compare this
	      # with having the same library being listed as a
	      # dependency of multiple other libraries: in this case,
	      # we know (pedantically, we assume) the library does not
	      # need to be listed more than once, so we keep only the
	      # last copy.  This is not always right, but it is rare
	      # enough that we require users that really mean to play
	      # such unportable linking tricks to link the library
	      # using -Wl,-lname, so that libtool does not consider it
	      # for duplicate removal.
	      case " $specialdeplibs " in
	      *" $deplib "*) new_libs="$deplib $new_libs" ;;
	      *)
		case " $new_libs " in
		*" $deplib "*) ;;
		*) new_libs="$deplib $new_libs" ;;
		esac
		;;
	      esac
	      ;;
	    esac
	  done
	  tmp_libs=
	  for deplib in $new_libs; do
	    case $deplib in
	    -L*)
	      case " $tmp_libs " in
	      *" $deplib "*) ;;
	      *) func_append tmp_libs " $deplib" ;;
	      esac
	      ;;
	    *) func_append tmp_libs " $deplib" ;;
	    esac
	  done
	  eval $var=\"$tmp_libs\"
	done # for var
      fi

      # Add Sun CC postdeps if required:
      # Only applies when the active tag is CXX.  For a recognised Sun C++
      # compiler, func_suncc_cstd_abi is called and, unless it leaves
      # $suncc_use_cstd_abi set to 'no', '-library=Cstd -library=Crun' is
      # appended to $postdeps.
      test CXX = "$tagname" && {
        case $host_os in
        linux*)
          # On Linux the compiler is identified from the first five lines
          # of the '$CC -V' output.
          case `$CC -V 2>&1 | sed 5q` in
          *Sun\ C*) # Sun C++ 5.9
            func_suncc_cstd_abi

            if test no != "$suncc_use_cstd_abi"; then
              func_append postdeps ' -library=Cstd -library=Crun'
            fi
            ;;
          esac
          ;;

        solaris*)
          # On Solaris the compiler is identified from the result of
          # func_cc_basename (names starting with CC or sunCC).
          func_cc_basename "$CC"
          case $func_cc_basename_result in
          CC* | sunCC*)
            func_suncc_cstd_abi

            if test no != "$suncc_use_cstd_abi"; then
              func_append postdeps ' -library=Cstd -library=Crun'
            fi
            ;;
          esac
          ;;
        esac
      }

      # Last step: remove runtime libs from dependency_libs
      # (they stay in deplibs)
      # An entry is dropped when it appears as a whole word in $predeps,
      # $postdeps or $compiler_lib_search_path; all other entries are kept
      # in their original order.
      tmp_libs=
      for i in $dependency_libs; do
	case " $predeps $postdeps $compiler_lib_search_path " in
	*" $i "*)
	  # Clearing $i marks this entry as one to skip below.
	  i=
	  ;;
	esac
	if test -n "$i"; then
	  func_append tmp_libs " $i"
	fi
      done
      dependency_libs=$tmp_libs
    done # for pass
    if test prog = "$linkmode"; then
      dlfiles=$newdlfiles
    fi
    if test prog = "$linkmode" || test lib = "$linkmode"; then
      dlprefiles=$newdlprefiles
    fi

    case $linkmode in
    oldlib)
      if test -n "$dlfiles$dlprefiles" || test no != "$dlself"; then
	func_warning "'-dlopen' is ignored for archives"
      fi

      case " $deplibs" in
      *\ -l* | *\ -L*)
	func_warning "'-l' and '-L' are ignored for archives" ;;
      esac

      test -n "$rpath" && \
	func_warning "'-rpath' is ignored for archives"

      test -n "$xrpath" && \
	func_warning "'-R' is ignored for archives"

      test -n "$vinfo" && \
	func_warning "'-version-info/-version-number' is ignored for archives"

      test -n "$release" && \
	func_warning "'-release' is ignored for archives"

      test -n "$export_symbols$export_symbols_regex" && \
	func_warning "'-export-symbols' is ignored for archives"

      # Now set the variables for building old libraries.
      build_libtool_libs=no
      oldlibs=$output
      func_append objs "$old_deplibs"
      ;;

    lib)
      # Make sure we only generate libraries of the form 'libNAME.la'.
      case $outputname in
      lib*)
	func_stripname 'lib' '.la' "$outputname"
	name=$func_stripname_result
	eval shared_ext=\"$shrext_cmds\"
	eval libname=\"$libname_spec\"
	;;
      *)
	test no = "$module" \
	  && func_fatal_help "libtool library '$output' must begin with 'lib'"

	if test no != "$need_lib_prefix"; then
	  # Add the "lib" prefix for modules if required
	  func_stripname '' '.la' "$outputname"
	  name=$func_stripname_result
	  eval shared_ext=\"$shrext_cmds\"
	  eval libname=\"$libname_spec\"
	else
	  func_stripname '' '.la' "$outputname"
	  libname=$func_stripname_result
	fi
	;;
      esac

      if test -n "$objs"; then
	if test pass_all != "$deplibs_check_method"; then
	  func_fatal_error "cannot build libtool library '$output' from non-libtool objects on this host:$objs"
	else
	  echo
	  $ECHO "*** Warning: Linking the shared library $output against the non-libtool"
	  $ECHO "*** objects $objs is not portable!"
	  func_append libobjs " $objs"
	fi
      fi

      test no = "$dlself" \
	|| func_warning "'-dlopen self' is ignored for libtool libraries"

      set dummy $rpath
      shift
      test 1 -lt "$#" \
	&& func_warning "ignoring multiple '-rpath's for a libtool library"

      install_libdir=$1

      oldlibs=
      if test -z "$rpath"; then
	if test yes = "$build_libtool_libs"; then
	  # Building a libtool convenience library.
	  # Some compilers have problems with a '.al' extension so
	  # convenience libraries should have the same extension an
	  # archive normally would.
	  oldlibs="$output_objdir/$libname.$libext $oldlibs"
	  build_libtool_libs=convenience
	  build_old_libs=yes
	fi

	test -n "$vinfo" && \
	  func_warning "'-version-info/-version-number' is ignored for convenience libraries"

	test -n "$release" && \
	  func_warning "'-release' is ignored for convenience libraries"
      else

	# Parse the version information argument.
	save_ifs=$IFS; IFS=:
	set dummy $vinfo 0 0 0
	shift
	IFS=$save_ifs

	test -n "$7" && \
	  func_fatal_help "too many parameters to '-version-info'"

	# convert absolute version numbers to libtool ages
	# this retains compatibility with .la files and attempts
	# to make the code below a bit more comprehensible

	# Derive current/revision/age from the positional parameters $1..$3.
	# With vinfo_number=yes they are read as major/minor/revision numbers
	# and converted according to $version_type; with vinfo_number=no they
	# are already current/revision/age and are copied as they are.
	case $vinfo_number in
	yes)
	  number_major=$1
	  number_minor=$2
	  number_revision=$3
	  #
	  # There are really only two kinds -- those that
	  # use the current revision as the major version
	  # and those that subtract age and use age as
	  # a minor version.  But, then there is irix
	  # that has an extra 1 added just for fun
	  #
	  case $version_type in
	  # correct linux to gnu/linux during the next big refactor
	  darwin|freebsd-elf|linux|osf|windows|none)
	    # current = major + minor, age = minor, revision = revision.
	    func_arith $number_major + $number_minor
	    current=$func_arith_result
	    age=$number_minor
	    revision=$number_revision
	    ;;
	  freebsd-aout|qnx|sunos)
	    # current = major, revision = minor, age is always 0;
	    # $number_revision is not used.
	    current=$number_major
	    revision=$number_minor
	    age=0
	    ;;
	  irix|nonstopux)
	    # current = major + minor; both age and revision are set from
	    # the minor number, so $number_revision is not used here.
	    # lt_irix_increment is set to 'no' for later use.
	    func_arith $number_major + $number_minor
	    current=$func_arith_result
	    age=$number_minor
	    revision=$number_minor
	    lt_irix_increment=no
	    ;;
	  *)
	    func_fatal_configuration "$modename: unknown library version type '$version_type'"
	    ;;
	  esac
	  ;;
	no)
	  current=$1
	  revision=$2
	  age=$3
	  ;;
	esac

	# Check that each of the things are valid numbers.
	# Each pattern accepts 0 or a number of one to five digits with no
	# leading zero (so at most 99999).  Anything else is reported with
	# func_error and then func_fatal_error.
	case $current in
	0|[1-9]|[1-9][0-9]|[1-9][0-9][0-9]|[1-9][0-9][0-9][0-9]|[1-9][0-9][0-9][0-9][0-9]) ;;
	*)
	  func_error "CURRENT '$current' must be a nonnegative integer"
	  func_fatal_error "'$vinfo' is not valid version information"
	  ;;
	esac

	case $revision in
	0|[1-9]|[1-9][0-9]|[1-9][0-9][0-9]|[1-9][0-9][0-9][0-9]|[1-9][0-9][0-9][0-9][0-9]) ;;
	*)
	  func_error "REVISION '$revision' must be a nonnegative integer"
	  func_fatal_error "'$vinfo' is not valid version information"
	  ;;
	esac

	case $age in
	0|[1-9]|[1-9][0-9]|[1-9][0-9][0-9]|[1-9][0-9][0-9][0-9]|[1-9][0-9][0-9][0-9][0-9]) ;;
	*)
	  func_error "AGE '$age' must be a nonnegative integer"
	  func_fatal_error "'$vinfo' is not valid version information"
	  ;;
	esac

	# AGE must not be greater than CURRENT.
	if test "$age" -gt "$current"; then
	  func_error "AGE '$age' is greater than the current interface number '$current'"
	  func_fatal_error "'$vinfo' is not valid version information"
	fi

	# Calculate the version variables.
	major=
	versuffix=
	verstring=
	case $version_type in
	none) ;;

	darwin)
	  # Like Linux, but with the current version available in
	  # verstring for coding it into the library header
	  func_arith $current - $age
	  major=.$func_arith_result
	  versuffix=$major.$age.$revision
	  # Darwin ld doesn't like 0 for these options...
	  func_arith $current + 1
	  minor_current=$func_arith_result
	  xlcverstring="$wl-compatibility_version $wl$minor_current $wl-current_version $wl$minor_current.$revision"
	  verstring="-compatibility_version $minor_current -current_version $minor_current.$revision"
          # On Darwin other compilers
          case $CC in
              nagfor*)
                  verstring="$wl-compatibility_version $wl$minor_current $wl-current_version $wl$minor_current.$revision"
                  ;;
              *)
                  verstring="-compatibility_version $minor_current -current_version $minor_current.$revision"
                  ;;
          esac
	  ;;

	freebsd-aout)
	  major=.$current
	  versuffix=.$current.$revision
	  ;;

	freebsd-elf)
	  func_arith $current - $age
	  major=.$func_arith_result
	  versuffix=$major.$age.$revision
	  ;;

	irix | nonstopux)
	  if test no = "$lt_irix_increment"; then
	    func_arith $current - $age
	  else
	    func_arith $current - $age + 1
	  fi
	  major=$func_arith_result

	  case $version_type in
	    nonstopux) verstring_prefix=nonstopux ;;
	    *)         verstring_prefix=sgi ;;
	  esac
	  verstring=$verstring_prefix$major.$revision

	  # Add in all the interfaces that we are compatible with.
	  loop=$revision
	  while test 0 -ne "$loop"; do
	    func_arith $revision - $loop
	    iface=$func_arith_result
	    func_arith $loop - 1
	    loop=$func_arith_result
	    verstring=$verstring_prefix$major.$iface:$verstring
	  done

	  # Before this point, $major must not contain '.'.
	  major=.$major
	  versuffix=$major.$revision
	  ;;

	linux) # correct to gnu/linux during the next big refactor
	  func_arith $current - $age
	  major=.$func_arith_result
	  versuffix=$major.$age.$revision
	  ;;

	osf)
	  func_arith $current - $age
	  major=.$func_arith_result
	  versuffix=.$current.$age.$revision
	  verstring=$current.$age.$revision

	  # Add in all the interfaces that we are compatible with.
	  loop=$age
	  while test 0 -ne "$loop"; do
	    func_arith $current - $loop
	    iface=$func_arith_result
	    func_arith $loop - 1
	    loop=$func_arith_result
	    verstring=$verstring:$iface.0
	  done

	  # Make executables depend on our current version.
	  func_append verstring ":$current.0"
	  ;;

	qnx)
	  major=.$current
	  versuffix=.$current
	  ;;

	sco)
	  major=.$current
	  versuffix=.$current
	  ;;

	sunos)
	  major=.$current
	  versuffix=.$current.$revision
	  ;;

	windows)
	  # Use '-' rather than '.', since we only want one
	  # extension on DOS 8.3 file systems.
	  func_arith $current - $age
	  major=$func_arith_result
	  versuffix=-$major
	  ;;

	*)
	  func_fatal_configuration "unknown library version type '$version_type'"
	  ;;
	esac

	# Clear the version info if we defaulted, and they specified a release.
	if test -z "$vinfo" && test -n "$release"; then
	  major=
	  case $version_type in
	  darwin)
	    # we can't check for "0.0" in archive_cmds due to quoting
	    # problems, so we reset it completely
	    verstring=
	    ;;
	  *)
	    verstring=0.0
	    ;;
	  esac
	  if test no = "$need_version"; then
	    versuffix=
	  else
	    versuffix=.0.0
	  fi
	fi

	# Remove version info from name if versioning should be avoided
	if test yes,no = "$avoid_version,$need_version"; then
	  major=
	  versuffix=
	  verstring=
	fi

	# Check to see if the archive will have undefined symbols.
	if test yes = "$allow_undefined"; then
	  if test unsupported = "$allow_undefined_flag"; then
	    if test yes = "$build_old_libs"; then
	      func_warning "undefined symbols not allowed in $host shared libraries; building static only"
	      build_libtool_libs=no
	    else
	      func_fatal_error "can't build $host shared library unless -no-undefined is specified"
	    fi
	  fi
	else
	  # Don't allow undefined symbols.
	  allow_undefined_flag=$no_undefined_flag
	fi

      fi

      func_generate_dlsyms "$libname" "$libname" :
      func_append libobjs " $symfileobj"
      test " " = "$libobjs" && libobjs=

      # Unless relinking, delete outputs left in $output_objdir that belong
      # to this library.
      if test relink != "$opt_mode"; then
	# Remove our outputs, but don't remove object files since they
	# may have been created when compiling PIC objects.
	removelist=
	tempremovelist=`$ECHO "$output_objdir/*"`
	for p in $tempremovelist; do
	  case $p in
	    *.$objext | *.gcno)
	       # Object files and .gcno files are always kept.
	       ;;
	    $output_objdir/$outputname | $output_objdir/$libname.* | $output_objdir/$libname$release.*)
	       # Candidate for removal, unless its path matches
	       # $precious_files_regex, in which case it is skipped.
	       if test -n "$precious_files_regex"; then
		 if $ECHO "$p" | $EGREP -e "$precious_files_regex" >/dev/null 2>&1
		 then
		   continue
		 fi
	       fi
	       func_append removelist " $p"
	       ;;
	    *) ;;
	  esac
	done
	# Run the removal only when something was selected.
	test -n "$removelist" && \
	  func_show_eval "${RM}r \$removelist"
      fi

      # Now set the variables for building old libraries.
      if test yes = "$build_old_libs" && test convenience != "$build_libtool_libs"; then
	func_append oldlibs " $output_objdir/$libname.$libext"

	# Transform .lo files to .o files.
	oldobjs="$objs "`$ECHO "$libobjs" | $SP2NL | $SED "/\.$libext$/d; $lo2o" | $NL2SP`
      fi

      # Eliminate all temporary directories.
      #for path in $notinst_path; do
      #	lib_search_path=`$ECHO "$lib_search_path " | $SED "s% $path % %g"`
      #	deplibs=`$ECHO "$deplibs " | $SED "s% -L$path % %g"`
      #	dependency_libs=`$ECHO "$dependency_libs " | $SED "s% -L$path % %g"`
      #done

      # Apply the directories collected in $xrpath: build a '-R<dir>' list
      # from them and make sure each one is in $finalize_rpath.
      if test -n "$xrpath"; then
	# If the user specified any rpath flags, then add them.
	temp_xrpath=
	for libdir in $xrpath; do
	  # The -R entry uses the result of func_replace_sysroot, while
	  # $finalize_rpath receives the directory as given.
	  func_replace_sysroot "$libdir"
	  func_append temp_xrpath " -R$func_replace_sysroot_result"
	  case "$finalize_rpath " in
	  *" $libdir "*) ;;
	  *) func_append finalize_rpath " $libdir" ;;
	  esac
	done
	# The -R entries are prepended to $dependency_libs unless paths are
	# hardcoded into libraries and no old (static) library is built.
	if test yes != "$hardcode_into_libs" || test yes = "$build_old_libs"; then
	  dependency_libs="$temp_xrpath $dependency_libs"
	fi
      fi

      # Make sure dlfiles contains only unique files that won't be dlpreopened
      # An entry is dropped when it is already in $dlprefiles or was already
      # kept earlier in this loop; the order of first occurrences is preserved.
      old_dlfiles=$dlfiles
      dlfiles=
      for lib in $old_dlfiles; do
	case " $dlprefiles $dlfiles " in
	*" $lib "*) ;;
	*) func_append dlfiles " $lib" ;;
	esac
      done

      # Make sure dlprefiles contains only unique files
      # Same scheme: rebuild the list, keeping the first occurrence of each
      # entry.
      old_dlprefiles=$dlprefiles
      dlprefiles=
      for lib in $old_dlprefiles; do
	case "$dlprefiles " in
	*" $lib "*) ;;
	*) func_append dlprefiles " $lib" ;;
	esac
      done

      if test yes = "$build_libtool_libs"; then
	if test -n "$rpath"; then
	  case $host in
	  *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-*-beos* | *-cegcc* | *-*-haiku*)
	    # these systems don't actually have a c library (as such)!
	    ;;
	  *-*-rhapsody* | *-*-darwin1.[012])
	    # Rhapsody C library is in the System framework
	    func_append deplibs " System.ltframework"
	    ;;
	  *-*-netbsd*)
	    # Don't link with libc until the a.out ld.so is fixed.
	    ;;
	  *-*-openbsd* | *-*-freebsd* | *-*-dragonfly*)
	    # Do not include libc due to us having libc/libc_r.
	    ;;
	  *-*-sco3.2v5* | *-*-sco5v6*)
	    # Causes problems with __ctype
	    ;;
	  *-*-sysv4.2uw2* | *-*-sysv5* | *-*-unixware* | *-*-OpenUNIX*)
	    # Compiler inserts libc in the correct place for threads to work
	    ;;
	  *)
	    # Add libc to deplibs on all other systems if necessary.
	    if test yes = "$build_libtool_need_lc"; then
	      func_append deplibs " -lc"
	    fi
	    ;;
	  esac
	fi

	# Transform deplibs into only deplibs that can be linked in shared.
	name_save=$name
	libname_save=$libname
	release_save=$release
	versuffix_save=$versuffix
	major_save=$major
	# I'm not sure if I'm treating the release correctly.  I think
	# release should show up in the -l (ie -lgmp5) so we don't want to
	# add it in twice.  Is that correct?
	release=
	versuffix=
	major=
	newdeplibs=
	droppeddeps=no
	case $deplibs_check_method in
	pass_all)
	  # Don't check for shared/static.  Everything works.
	  # This might be a little naive.  We might want to check
	  # whether the library exists or not.  But this is on
	  # osf3 & osf4 and I'm not really sure... Just
	  # implementing what was already the behavior.
	  newdeplibs=$deplibs
	  ;;
	test_compile)
	  # This code stresses the "libraries are programs" paradigm to its
	  # limits. Maybe even breaks it.  We compile a program, linking it
	  # against the deplibs as a proxy for the library.  Then we can check
	  # whether they linked in statically or dynamically with ldd.
	  $opt_dry_run || $RM conftest.c
	  cat > conftest.c <<EOF
	  int main() { return 0; }
EOF
	  $opt_dry_run || $RM conftest
	  if $LTCC $LTCFLAGS -o conftest conftest.c $deplibs; then
	    ldd_output=`ldd conftest`
	    for i in $deplibs; do
	      case $i in
	      -l*)
		func_stripname -l '' "$i"
		name=$func_stripname_result
		if test yes = "$allow_libtool_libs_with_static_runtimes"; then
		  case " $predeps $postdeps " in
		  *" $i "*)
		    func_append newdeplibs " $i"
		    i=
		    ;;
		  esac
		fi
		if test -n "$i"; then
		  libname=`eval "\\$ECHO \"$libname_spec\""`
		  deplib_matches=`eval "\\$ECHO \"$library_names_spec\""`
		  set dummy $deplib_matches; shift
		  deplib_match=$1
		  if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0; then
		    func_append newdeplibs " $i"
		  else
		    droppeddeps=yes
		    echo
		    $ECHO "*** Warning: dynamic linker does not accept needed library $i."
		    echo "*** I have the capability to make that library automatically link in when"
		    echo "*** you link to this library.  But I can only do this if you have a"
		    echo "*** shared version of the library, which I believe you do not have"
		    echo "*** because a test_compile did reveal that the linker did not use it for"
		    echo "*** its dynamic dependency list that programs get resolved with at runtime."
		  fi
		fi
		;;
	      *)
		func_append newdeplibs " $i"
		;;
	      esac
	    done
	  else
	    # Error occurred in the first compile.  Let's try to salvage
	    # the situation: Compile a separate program for each library.
	    for i in $deplibs; do
	      case $i in
	      -l*)
		func_stripname -l '' "$i"
		name=$func_stripname_result
		$opt_dry_run || $RM conftest
		if $LTCC $LTCFLAGS -o conftest conftest.c $i; then
		  ldd_output=`ldd conftest`
		  if test yes = "$allow_libtool_libs_with_static_runtimes"; then
		    case " $predeps $postdeps " in
		    *" $i "*)
		      func_append newdeplibs " $i"
		      i=
		      ;;
		    esac
		  fi
		  if test -n "$i"; then
		    libname=`eval "\\$ECHO \"$libname_spec\""`
		    deplib_matches=`eval "\\$ECHO \"$library_names_spec\""`
		    set dummy $deplib_matches; shift
		    deplib_match=$1
		    if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0; then
		      func_append newdeplibs " $i"
		    else
		      droppeddeps=yes
		      echo
		      $ECHO "*** Warning: dynamic linker does not accept needed library $i."
		      echo "*** I have the capability to make that library automatically link in when"
		      echo "*** you link to this library.  But I can only do this if you have a"
		      echo "*** shared version of the library, which you do not appear to have"
		      echo "*** because a test_compile did reveal that the linker did not use this one"
		      echo "*** as a dynamic dependency that programs can get resolved with at runtime."
		    fi
		  fi
		else
		  droppeddeps=yes
		  echo
		  $ECHO "*** Warning!  Library $i is needed by this library but I was not able to"
		  echo "*** make it link in!  You will probably need to install it or some"
		  echo "*** library that it depends on before this library will be fully"
		  echo "*** functional.  Installing it before continuing would be even better."
		fi
		;;
	      *)
		func_append newdeplibs " $i"
		;;
	      esac
	    done
	  fi
	  ;;
	file_magic*)
	  set dummy $deplibs_check_method; shift
	  file_magic_regex=`expr "$deplibs_check_method" : "$1 \(.*\)"`
	  for a_deplib in $deplibs; do
	    case $a_deplib in
	    -l*)
	      func_stripname -l '' "$a_deplib"
	      name=$func_stripname_result
	      if test yes = "$allow_libtool_libs_with_static_runtimes"; then
		case " $predeps $postdeps " in
		*" $a_deplib "*)
		  func_append newdeplibs " $a_deplib"
		  a_deplib=
		  ;;
		esac
	      fi
	      if test -n "$a_deplib"; then
		libname=`eval "\\$ECHO \"$libname_spec\""`
		if test -n "$file_magic_glob"; then
		  libnameglob=`func_echo_all "$libname" | $SED -e $file_magic_glob`
		else
		  libnameglob=$libname
		fi
		test yes = "$want_nocaseglob" && nocaseglob=`shopt -p nocaseglob`
		for i in $lib_search_path $sys_lib_search_path $shlib_search_path; do
		  if test yes = "$want_nocaseglob"; then
		    shopt -s nocaseglob
		    potential_libs=`ls $i/$libnameglob[.-]* 2>/dev/null`
		    $nocaseglob
		  else
		    potential_libs=`ls $i/$libnameglob[.-]* 2>/dev/null`
		  fi
		  for potent_lib in $potential_libs; do
		      # Follow soft links.
		      if ls -lLd "$potent_lib" 2>/dev/null |
			 $GREP " -> " >/dev/null; then
			continue
		      fi
		      # The statement above tries to avoid entering an
		      # endless loop below, in case of cyclic links.
		      # We might still enter an endless loop, since a link
		      # loop can be closed while we follow links,
		      # but so what?
		      potlib=$potent_lib
		      while test -h "$potlib" 2>/dev/null; do
			potliblink=`ls -ld $potlib | $SED 's/.* -> //'`
			case $potliblink in
			[\\/]* | [A-Za-z]:[\\/]*) potlib=$potliblink;;
			*) potlib=`$ECHO "$potlib" | $SED 's|[^/]*$||'`"$potliblink";;
			esac
		      done
		      if eval $file_magic_cmd \"\$potlib\" 2>/dev/null |
			 $SED -e 10q |
			 $EGREP "$file_magic_regex" > /dev/null; then
			func_append newdeplibs " $a_deplib"
			a_deplib=
			break 2
		      fi
		  done
		done
	      fi
	      if test -n "$a_deplib"; then
		droppeddeps=yes
		echo
		$ECHO "*** Warning: linker path does not have real file for library $a_deplib."
		echo "*** I have the capability to make that library automatically link in when"
		echo "*** you link to this library.  But I can only do this if you have a"
		echo "*** shared version of the library, which you do not appear to have"
		echo "*** because I did check the linker path looking for a file starting"
		if test -z "$potlib"; then
		  $ECHO "*** with $libname but no candidates were found. (...for file magic test)"
		else
		  $ECHO "*** with $libname and none of the candidates passed a file format test"
		  $ECHO "*** using a file magic. Last file checked: $potlib"
		fi
	      fi
	      ;;
	    *)
	      # Add a -L argument.
	      func_append newdeplibs " $a_deplib"
	      ;;
	    esac
	  done # Gone through all deplibs.
	  ;;
	match_pattern*)
	  set dummy $deplibs_check_method; shift
	  match_pattern_regex=`expr "$deplibs_check_method" : "$1 \(.*\)"`
	  for a_deplib in $deplibs; do
	    case $a_deplib in
	    -l*)
	      func_stripname -l '' "$a_deplib"
	      name=$func_stripname_result
	      if test yes = "$allow_libtool_libs_with_static_runtimes"; then
		case " $predeps $postdeps " in
		*" $a_deplib "*)
		  func_append newdeplibs " $a_deplib"
		  a_deplib=
		  ;;
		esac
	      fi
	      if test -n "$a_deplib"; then
		libname=`eval "\\$ECHO \"$libname_spec\""`
		for i in $lib_search_path $sys_lib_search_path $shlib_search_path; do
		  potential_libs=`ls $i/$libname[.-]* 2>/dev/null`
		  for potent_lib in $potential_libs; do
		    potlib=$potent_lib # see symlink-check above in file_magic test
		    if eval "\$ECHO \"$potent_lib\"" 2>/dev/null | $SED 10q | \
		       $EGREP "$match_pattern_regex" > /dev/null; then
		      func_append newdeplibs " $a_deplib"
		      a_deplib=
		      break 2
		    fi
		  done
		done
	      fi
	      if test -n "$a_deplib"; then
		droppeddeps=yes
		echo
		$ECHO "*** Warning: linker path does not have real file for library $a_deplib."
		echo "*** I have the capability to make that library automatically link in when"
		echo "*** you link to this library.  But I can only do this if you have a"
		echo "*** shared version of the library, which you do not appear to have"
		echo "*** because I did check the linker path looking for a file starting"
		if test -z "$potlib"; then
		  $ECHO "*** with $libname but no candidates were found. (...for regex pattern test)"
		else
		  $ECHO "*** with $libname and none of the candidates passed a file format test"
		  $ECHO "*** using a regex pattern. Last file checked: $potlib"
		fi
	      fi
	      ;;
	    *)
	      # Add a -L argument.
	      func_append newdeplibs " $a_deplib"
	      ;;
	    esac
	  done # Gone through all deplibs.
	  ;;
	none | unknown | *)
	  newdeplibs=
	  tmp_deplibs=`$ECHO " $deplibs" | $SED 's/ -lc$//; s/ -[LR][^ ]*//g'`
	  if test yes = "$allow_libtool_libs_with_static_runtimes"; then
	    for i in $predeps $postdeps; do
	      # can't use Xsed below, because $i might contain '/'
	      tmp_deplibs=`$ECHO " $tmp_deplibs" | $SED "s|$i||"`
	    done
	  fi
	  case $tmp_deplibs in
	  *[!\	\ ]*)
	    echo
	    if test none = "$deplibs_check_method"; then
	      echo "*** Warning: inter-library dependencies are not supported in this platform."
	    else
	      echo "*** Warning: inter-library dependencies are not known to be supported."
	    fi
	    echo "*** All declared inter-library dependencies are being dropped."
	    droppeddeps=yes
	    ;;
	  esac
	  ;;
	esac
	versuffix=$versuffix_save
	major=$major_save
	release=$release_save
	libname=$libname_save
	name=$name_save

	case $host in
	*-*-rhapsody* | *-*-darwin1.[012])
	  # On Rhapsody replace the C library with the System framework
	  newdeplibs=`$ECHO " $newdeplibs" | $SED 's/ -lc / System.ltframework /'`
	  ;;
	esac

	if test yes = "$droppeddeps"; then
	  if test yes = "$module"; then
	    echo
	    echo "*** Warning: libtool could not satisfy all declared inter-library"
	    $ECHO "*** dependencies of module $libname.  Therefore, libtool will create"
	    echo "*** a static module, that should work as long as the dlopening"
	    echo "*** application is linked with the -dlopen flag."
	    if test -z "$global_symbol_pipe"; then
	      echo
	      echo "*** However, this would only work if libtool was able to extract symbol"
	      echo "*** lists from a program, using 'nm' or equivalent, but libtool could"
	      echo "*** not find such a program.  So, this module is probably useless."
	      echo "*** 'nm' from GNU binutils and a full rebuild may help."
	    fi
	    if test no = "$build_old_libs"; then
	      oldlibs=$output_objdir/$libname.$libext
	      build_libtool_libs=module
	      build_old_libs=yes
	    else
	      build_libtool_libs=no
	    fi
	  else
	    echo "*** The inter-library dependencies that have been dropped here will be"
	    echo "*** automatically added whenever a program is linked with this library"
	    echo "*** or is declared to -dlopen it."

	    if test no = "$allow_undefined"; then
	      echo
	      echo "*** Since this library must not contain undefined symbols,"
	      echo "*** because either the platform does not support them or"
	      echo "*** it was explicitly requested with -no-undefined,"
	      echo "*** libtool will only create a static version of it."
	      if test no = "$build_old_libs"; then
		oldlibs=$output_objdir/$libname.$libext
		build_libtool_libs=module
		build_old_libs=yes
	      else
		build_libtool_libs=no
	      fi
	    fi
	  fi
	fi
	# Done checking deplibs!
	deplibs=$newdeplibs
      fi
      # Time to change all our "foo.ltframework" stuff back to "-framework foo"
      case $host in
	*-*-darwin*)
	  newdeplibs=`$ECHO " $newdeplibs" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'`
	  new_inherited_linker_flags=`$ECHO " $new_inherited_linker_flags" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'`
	  deplibs=`$ECHO " $deplibs" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'`
	  ;;
      esac

      # move library search paths that coincide with paths to not yet
      # installed libraries to the beginning of the library search list
      new_libs=
      for path in $notinst_path; do
	case " $new_libs " in
	*" -L$path/$objdir "*) ;;
	*)
	  case " $deplibs " in
	  *" -L$path/$objdir "*)
	    func_append new_libs " -L$path/$objdir" ;;
	  esac
	  ;;
	esac
      done
      for deplib in $deplibs; do
	case $deplib in
	-L*)
	  case " $new_libs " in
	  *" $deplib "*) ;;
	  *) func_append new_libs " $deplib" ;;
	  esac
	  ;;
	*) func_append new_libs " $deplib" ;;
	esac
      done
      deplibs=$new_libs

      # All the library-specific variables (install_libdir is set above).
      library_names=
      old_library=
      dlname=

      # Test again, we may have decided not to build it any more
      if test yes = "$build_libtool_libs"; then
	# Remove $wl instances when linking with ld.
	# FIXME: should test the right _cmds variable.
	case $archive_cmds in
	  *\$LD\ *) wl= ;;
        esac
	if test yes = "$hardcode_into_libs"; then
	  # Hardcode the library paths
	  hardcode_libdirs=
	  dep_rpath=
	  rpath=$finalize_rpath
	  test relink = "$opt_mode" || rpath=$compile_rpath$rpath
	  for libdir in $rpath; do
	    if test -n "$hardcode_libdir_flag_spec"; then
	      if test -n "$hardcode_libdir_separator"; then
		func_replace_sysroot "$libdir"
		libdir=$func_replace_sysroot_result
		if test -z "$hardcode_libdirs"; then
		  hardcode_libdirs=$libdir
		else
		  # Just accumulate the unique libdirs.
		  case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
		  *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
		    ;;
		  *)
		    func_append hardcode_libdirs "$hardcode_libdir_separator$libdir"
		    ;;
		  esac
		fi
	      else
		eval flag=\"$hardcode_libdir_flag_spec\"
		func_append dep_rpath " $flag"
	      fi
	    elif test -n "$runpath_var"; then
	      case "$perm_rpath " in
	      *" $libdir "*) ;;
	      *) func_append perm_rpath " $libdir" ;;
	      esac
	    fi
	  done
	  # Substitute the hardcoded libdirs into the rpath.
	  if test -n "$hardcode_libdir_separator" &&
	     test -n "$hardcode_libdirs"; then
	    libdir=$hardcode_libdirs
	    eval "dep_rpath=\"$hardcode_libdir_flag_spec\""
	  fi
	  if test -n "$runpath_var" && test -n "$perm_rpath"; then
	    # We should set the runpath_var.
	    rpath=
	    for dir in $perm_rpath; do
	      func_append rpath "$dir:"
	    done
	    eval "$runpath_var='$rpath\$$runpath_var'; export $runpath_var"
	  fi
	  test -n "$dep_rpath" && deplibs="$dep_rpath $deplibs"
	fi

	shlibpath=$finalize_shlibpath
	test relink = "$opt_mode" || shlibpath=$compile_shlibpath$shlibpath
	if test -n "$shlibpath"; then
	  eval "$shlibpath_var='$shlibpath\$$shlibpath_var'; export $shlibpath_var"
	fi

	# Get the real and link names of the library.
	eval shared_ext=\"$shrext_cmds\"
	eval library_names=\"$library_names_spec\"
	set dummy $library_names
	shift
	realname=$1
	shift

	if test -n "$soname_spec"; then
	  eval soname=\"$soname_spec\"
	else
	  soname=$realname
	fi
	if test -z "$dlname"; then
	  dlname=$soname
	fi

	lib=$output_objdir/$realname
	linknames=
	for link
	do
	  func_append linknames " $link"
	done

	# Use standard objects if they are pic
	test -z "$pic_flag" && libobjs=`$ECHO "$libobjs" | $SP2NL | $SED "$lo2o" | $NL2SP`
	test "X$libobjs" = "X " && libobjs=

	delfiles=
	if test -n "$export_symbols" && test -n "$include_expsyms"; then
	  $opt_dry_run || cp "$export_symbols" "$output_objdir/$libname.uexp"
	  export_symbols=$output_objdir/$libname.uexp
	  func_append delfiles " $export_symbols"
	fi

	orig_export_symbols=
	case $host_os in
	cygwin* | mingw* | cegcc*)
	  if test -n "$export_symbols" && test -z "$export_symbols_regex"; then
	    # exporting using user supplied symfile
	    func_dll_def_p "$export_symbols" || {
	      # and it's NOT already a .def file. Must figure out
	      # which of the given symbols are data symbols and tag
	      # them as such. So, trigger use of export_symbols_cmds.
	      # export_symbols gets reassigned inside the "prepare
	      # the list of exported symbols" if statement, so the
	      # include_expsyms logic still works.
	      orig_export_symbols=$export_symbols
	      export_symbols=
	      always_export_symbols=yes
	    }
	  fi
	  ;;
	esac

	# Prepare the list of exported symbols
	if test -z "$export_symbols"; then
	  if test yes = "$always_export_symbols" || test -n "$export_symbols_regex"; then
	    func_verbose "generating symbol list for '$libname.la'"
	    export_symbols=$output_objdir/$libname.exp
	    $opt_dry_run || $RM $export_symbols
	    cmds=$export_symbols_cmds
	    save_ifs=$IFS; IFS='~'
	    for cmd1 in $cmds; do
	      IFS=$save_ifs
	      # Take the normal branch if the nm_file_list_spec branch
	      # doesn't work or if tool conversion is not needed.
	      case $nm_file_list_spec~$to_tool_file_cmd in
		*~func_convert_file_noop | *~func_convert_file_msys_to_w32 | ~*)
		  try_normal_branch=yes
		  eval cmd=\"$cmd1\"
		  func_len " $cmd"
		  len=$func_len_result
		  ;;
		*)
		  try_normal_branch=no
		  ;;
	      esac
	      if test yes = "$try_normal_branch" \
		 && { test "$len" -lt "$max_cmd_len" \
		      || test "$max_cmd_len" -le -1; }
	      then
		func_show_eval "$cmd" 'exit $?'
		skipped_export=false
	      elif test -n "$nm_file_list_spec"; then
		func_basename "$output"
		output_la=$func_basename_result
		save_libobjs=$libobjs
		save_output=$output
		output=$output_objdir/$output_la.nm
		func_to_tool_file "$output"
		libobjs=$nm_file_list_spec$func_to_tool_file_result
		func_append delfiles " $output"
		func_verbose "creating $NM input file list: $output"
		for obj in $save_libobjs; do
		  func_to_tool_file "$obj"
		  $ECHO "$func_to_tool_file_result"
		done > "$output"
		eval cmd=\"$cmd1\"
		func_show_eval "$cmd" 'exit $?'
		output=$save_output
		libobjs=$save_libobjs
		skipped_export=false
	      else
		# The command line is too long to execute in one step.
		func_verbose "using reloadable object file for export list..."
		skipped_export=:
		# Break out early, otherwise skipped_export may be
		# set to false by a later but shorter cmd.
		break
	      fi
	    done
	    IFS=$save_ifs
	    if test -n "$export_symbols_regex" && test : != "$skipped_export"; then
	      func_show_eval '$EGREP -e "$export_symbols_regex" "$export_symbols" > "${export_symbols}T"'
	      func_show_eval '$MV "${export_symbols}T" "$export_symbols"'
	    fi
	  fi
	fi

	if test -n "$export_symbols" && test -n "$include_expsyms"; then
	  tmp_export_symbols=$export_symbols
	  test -n "$orig_export_symbols" && tmp_export_symbols=$orig_export_symbols
	  $opt_dry_run || eval '$ECHO "$include_expsyms" | $SP2NL >> "$tmp_export_symbols"'
	fi

	if test : != "$skipped_export" && test -n "$orig_export_symbols"; then
	  # The given exports_symbols file has to be filtered, so filter it.
	  func_verbose "filter symbol list for '$libname.la' to tag DATA exports"
	  # FIXME: $output_objdir/$libname.filter potentially contains lots of
	  # 's' commands, which not all seds can handle. GNU sed should be fine
	  # though. Also, the filter scales superlinearly with the number of
	  # global variables. join(1) would be nice here, but unfortunately
	  # isn't a blessed tool.
	  $opt_dry_run || $SED -e '/[ ,]DATA/!d;s,\(.*\)\([ \,].*\),s|^\1$|\1\2|,' < $export_symbols > $output_objdir/$libname.filter
	  func_append delfiles " $export_symbols $output_objdir/$libname.filter"
	  export_symbols=$output_objdir/$libname.def
	  $opt_dry_run || $SED -f $output_objdir/$libname.filter < $orig_export_symbols > $export_symbols
	fi

	tmp_deplibs=
	for test_deplib in $deplibs; do
	  case " $convenience " in
	  *" $test_deplib "*) ;;
	  *)
	    func_append tmp_deplibs " $test_deplib"
	    ;;
	  esac
	done
	deplibs=$tmp_deplibs

	if test -n "$convenience"; then
	  if test -n "$whole_archive_flag_spec" &&
	    test yes = "$compiler_needs_object" &&
	    test -z "$libobjs"; then
	    # extract the archives, so we have objects to list.
	    # TODO: could optimize this to just extract one archive.
	    whole_archive_flag_spec=
	  fi
	  if test -n "$whole_archive_flag_spec"; then
	    save_libobjs=$libobjs
	    eval libobjs=\"\$libobjs $whole_archive_flag_spec\"
	    test "X$libobjs" = "X " && libobjs=
	  else
	    gentop=$output_objdir/${outputname}x
	    func_append generated " $gentop"

	    func_extract_archives $gentop $convenience
	    func_append libobjs " $func_extract_archives_result"
	    test "X$libobjs" = "X " && libobjs=
	  fi
	fi

	if test yes = "$thread_safe" && test -n "$thread_safe_flag_spec"; then
	  eval flag=\"$thread_safe_flag_spec\"
	  func_append linker_flags " $flag"
	fi

	# Make a backup of the uninstalled library when relinking
	if test relink = "$opt_mode"; then
	  $opt_dry_run || eval '(cd $output_objdir && $RM ${realname}U && $MV $realname ${realname}U)' || exit $?
	fi

	# Do each of the archive commands.
	if test yes = "$module" && test -n "$module_cmds"; then
	  if test -n "$export_symbols" && test -n "$module_expsym_cmds"; then
	    eval test_cmds=\"$module_expsym_cmds\"
	    cmds=$module_expsym_cmds
	  else
	    eval test_cmds=\"$module_cmds\"
	    cmds=$module_cmds
	  fi
	else
	  if test -n "$export_symbols" && test -n "$archive_expsym_cmds"; then
	    eval test_cmds=\"$archive_expsym_cmds\"
	    cmds=$archive_expsym_cmds
	  else
	    eval test_cmds=\"$archive_cmds\"
	    cmds=$archive_cmds
	  fi
	fi

	if test : != "$skipped_export" &&
	   func_len " $test_cmds" &&
	   len=$func_len_result &&
	   test "$len" -lt "$max_cmd_len" || test "$max_cmd_len" -le -1; then
	  :
	else
	  # The command line is too long to link in one step, link piecewise
	  # or, if using GNU ld and skipped_export is not :, use a linker
	  # script.

	  # Save the value of $output and $libobjs because we want to
	  # use them later.  If we have whole_archive_flag_spec, we
	  # want to use save_libobjs as it was before
	  # whole_archive_flag_spec was expanded, because we can't
	  # assume the linker understands whole_archive_flag_spec.
	  # This may have to be revisited, in case too many
	  # convenience libraries get linked in and end up exceeding
	  # the spec.
	  if test -z "$convenience" || test -z "$whole_archive_flag_spec"; then
	    save_libobjs=$libobjs
	  fi
	  save_output=$output
	  func_basename "$output"
	  output_la=$func_basename_result

	  # Clear the reloadable object creation command queue and
	  # initialize k to one.
	  test_cmds=
	  concat_cmds=
	  objlist=
	  last_robj=
	  k=1

	  if test -n "$save_libobjs" && test : != "$skipped_export" && test yes = "$with_gnu_ld"; then
	    output=$output_objdir/$output_la.lnkscript
	    func_verbose "creating GNU ld script: $output"
	    echo 'INPUT (' > $output
	    for obj in $save_libobjs
	    do
	      func_to_tool_file "$obj"
	      $ECHO "$func_to_tool_file_result" >> $output
	    done
	    echo ')' >> $output
	    func_append delfiles " $output"
	    func_to_tool_file "$output"
	    output=$func_to_tool_file_result
	  elif test -n "$save_libobjs" && test : != "$skipped_export" && test -n "$file_list_spec"; then
	    output=$output_objdir/$output_la.lnk
	    func_verbose "creating linker input file list: $output"
	    : > $output
	    set x $save_libobjs
	    shift
	    firstobj=
	    if test yes = "$compiler_needs_object"; then
	      firstobj="$1 "
	      shift
	    fi
	    for obj
	    do
	      func_to_tool_file "$obj"
	      $ECHO "$func_to_tool_file_result" >> $output
	    done
	    func_append delfiles " $output"
	    func_to_tool_file "$output"
	    output=$firstobj\"$file_list_spec$func_to_tool_file_result\"
	  else
	    if test -n "$save_libobjs"; then
	      func_verbose "creating reloadable object files..."
	      output=$output_objdir/$output_la-$k.$objext
	      eval test_cmds=\"$reload_cmds\"
	      func_len " $test_cmds"
	      len0=$func_len_result
	      len=$len0

	      # Loop over the list of objects to be linked.
	      for obj in $save_libobjs
	      do
		func_len " $obj"
		func_arith $len + $func_len_result
		len=$func_arith_result
		if test -z "$objlist" ||
		   test "$len" -lt "$max_cmd_len"; then
		  func_append objlist " $obj"
		else
		  # The command $test_cmds is almost too long, add a
		  # command to the queue.
		  if test 1 -eq "$k"; then
		    # The first file doesn't have a previous command to add.
		    reload_objs=$objlist
		    eval concat_cmds=\"$reload_cmds\"
		  else
		    # All subsequent reloadable object files will link in
		    # the last one created.
		    reload_objs="$objlist $last_robj"
		    eval concat_cmds=\"\$concat_cmds~$reload_cmds~\$RM $last_robj\"
		  fi
		  last_robj=$output_objdir/$output_la-$k.$objext
		  func_arith $k + 1
		  k=$func_arith_result
		  output=$output_objdir/$output_la-$k.$objext
		  objlist=" $obj"
		  func_len " $last_robj"
		  func_arith $len0 + $func_len_result
		  len=$func_arith_result
		fi
	      done
	      # Handle the remaining objects by creating one last
	      # reloadable object file.  All subsequent reloadable object
	      # files will link in the last one created.
	      test -z "$concat_cmds" || concat_cmds=$concat_cmds~
	      reload_objs="$objlist $last_robj"
	      eval concat_cmds=\"\$concat_cmds$reload_cmds\"
	      if test -n "$last_robj"; then
	        eval concat_cmds=\"\$concat_cmds~\$RM $last_robj\"
	      fi
	      func_append delfiles " $output"

	    else
	      output=
	    fi

	    ${skipped_export-false} && {
	      func_verbose "generating symbol list for '$libname.la'"
	      export_symbols=$output_objdir/$libname.exp
	      $opt_dry_run || $RM $export_symbols
	      libobjs=$output
	      # Append the command to create the export file.
	      test -z "$concat_cmds" || concat_cmds=$concat_cmds~
	      eval concat_cmds=\"\$concat_cmds$export_symbols_cmds\"
	      if test -n "$last_robj"; then
		eval concat_cmds=\"\$concat_cmds~\$RM $last_robj\"
	      fi
	    }

	    test -n "$save_libobjs" &&
	      func_verbose "creating a temporary reloadable object file: $output"

	    # Loop through the commands generated above and execute them.
	    save_ifs=$IFS; IFS='~'
	    for cmd in $concat_cmds; do
	      IFS=$save_ifs
	      $opt_quiet || {
		  func_quote_for_expand "$cmd"
		  eval "func_echo $func_quote_for_expand_result"
	      }
	      $opt_dry_run || eval "$cmd" || {
		lt_exit=$?

		# Restore the uninstalled library and exit
		if test relink = "$opt_mode"; then
		  ( cd "$output_objdir" && \
		    $RM "${realname}T" && \
		    $MV "${realname}U" "$realname" )
		fi

		exit $lt_exit
	      }
	    done
	    IFS=$save_ifs

	    if test -n "$export_symbols_regex" && ${skipped_export-false}; then
	      func_show_eval '$EGREP -e "$export_symbols_regex" "$export_symbols" > "${export_symbols}T"'
	      func_show_eval '$MV "${export_symbols}T" "$export_symbols"'
	    fi
	  fi

          ${skipped_export-false} && {
	    if test -n "$export_symbols" && test -n "$include_expsyms"; then
	      tmp_export_symbols=$export_symbols
	      test -n "$orig_export_symbols" && tmp_export_symbols=$orig_export_symbols
	      $opt_dry_run || eval '$ECHO "$include_expsyms" | $SP2NL >> "$tmp_export_symbols"'
	    fi

	    if test -n "$orig_export_symbols"; then
	      # The given exports_symbols file has to be filtered, so filter it.
	      func_verbose "filter symbol list for '$libname.la' to tag DATA exports"
	      # FIXME: $output_objdir/$libname.filter potentially contains lots of
	      # 's' commands, which not all seds can handle. GNU sed should be fine
	      # though. Also, the filter scales superlinearly with the number of
	      # global variables. join(1) would be nice here, but unfortunately
	      # isn't a blessed tool.
	      $opt_dry_run || $SED -e '/[ ,]DATA/!d;s,\(.*\)\([ \,].*\),s|^\1$|\1\2|,' < $export_symbols > $output_objdir/$libname.filter
	      func_append delfiles " $export_symbols $output_objdir/$libname.filter"
	      export_symbols=$output_objdir/$libname.def
	      $opt_dry_run || $SED -f $output_objdir/$libname.filter < $orig_export_symbols > $export_symbols
	    fi
	  }

	  libobjs=$output
	  # Restore the value of output.
	  output=$save_output

	  if test -n "$convenience" && test -n "$whole_archive_flag_spec"; then
	    eval libobjs=\"\$libobjs $whole_archive_flag_spec\"
	    test "X$libobjs" = "X " && libobjs=
	  fi
	  # Expand the library linking commands again to reset the
	  # value of $libobjs for piecewise linking.

	  # Do each of the archive commands.
	  if test yes = "$module" && test -n "$module_cmds"; then
	    if test -n "$export_symbols" && test -n "$module_expsym_cmds"; then
	      cmds=$module_expsym_cmds
	    else
	      cmds=$module_cmds
	    fi
	  else
	    if test -n "$export_symbols" && test -n "$archive_expsym_cmds"; then
	      cmds=$archive_expsym_cmds
	    else
	      cmds=$archive_cmds
	    fi
	  fi
	fi

	if test -n "$delfiles"; then
	  # Append the command to remove temporary files to $cmds.
	  eval cmds=\"\$cmds~\$RM $delfiles\"
	fi

	# Add any objects from preloaded convenience libraries
	if test -n "$dlprefiles"; then
	  gentop=$output_objdir/${outputname}x
	  func_append generated " $gentop"

	  func_extract_archives $gentop $dlprefiles
	  func_append libobjs " $func_extract_archives_result"
	  test "X$libobjs" = "X " && libobjs=
	fi

	save_ifs=$IFS; IFS='~'
	for cmd in $cmds; do
	  IFS=$sp$nl
	  eval cmd=\"$cmd\"
	  IFS=$save_ifs
	  $opt_quiet || {
	    func_quote_for_expand "$cmd"
	    eval "func_echo $func_quote_for_expand_result"
	  }
	  $opt_dry_run || eval "$cmd" || {
	    lt_exit=$?

	    # Restore the uninstalled library and exit
	    if test relink = "$opt_mode"; then
	      ( cd "$output_objdir" && \
	        $RM "${realname}T" && \
		$MV "${realname}U" "$realname" )
	    fi

	    exit $lt_exit
	  }
	done
	IFS=$save_ifs

	# Restore the uninstalled library and exit
	if test relink = "$opt_mode"; then
	  $opt_dry_run || eval '(cd $output_objdir && $RM ${realname}T && $MV $realname ${realname}T && $MV ${realname}U $realname)' || exit $?

	  if test -n "$convenience"; then
	    if test -z "$whole_archive_flag_spec"; then
	      func_show_eval '${RM}r "$gentop"'
	    fi
	  fi

	  exit $EXIT_SUCCESS
	fi

	# Create links to the real library.
	for linkname in $linknames; do
	  if test "$realname" != "$linkname"; then
	    func_show_eval '(cd "$output_objdir" && $RM "$linkname" && $LN_S "$realname" "$linkname")' 'exit $?'
	  fi
	done

	# If -module or -export-dynamic was specified, set the dlname.
	if test yes = "$module" || test yes = "$export_dynamic"; then
	  # On all known operating systems, these are identical.
	  dlname=$soname
	fi
      fi
      ;;

    obj)
      # Build an object file from the given inputs by running $reload_cmds.
      # $output is either a plain object or a '.lo' libtool object.  Options
      # that only make sense for libraries or programs are reported with a
      # warning and otherwise ignored.
      if test -n "$dlfiles$dlprefiles" || test no != "$dlself"; then
	func_warning "'-dlopen' is ignored for objects"
      fi

      case " $deplibs" in
      *\ -l* | *\ -L*)
	func_warning "'-l' and '-L' are ignored for objects" ;;
      esac

      test -n "$rpath" && \
	func_warning "'-rpath' is ignored for objects"

      test -n "$xrpath" && \
	func_warning "'-R' is ignored for objects"

      test -n "$vinfo" && \
	func_warning "'-version-info' is ignored for objects"

      test -n "$release" && \
	func_warning "'-release' is ignored for objects"

      # Decide which files to produce.  For a '.lo' output, $libobj is the
      # requested name and $obj is its func_lo2o translation; mixing in
      # non-libtool objects ($objs, $old_deplibs) is a fatal error in that
      # case.  For any other output only $obj is built and $libobj is empty.
      case $output in
      *.lo)
	test -n "$objs$old_deplibs" && \
	  func_fatal_error "cannot build library object '$output' from non-libtool objects"

	libobj=$output
	func_lo2o "$libobj"
	obj=$func_lo2o_result
	;;
      *)
	libobj=
	obj=$output
	;;
      esac

      # Delete the old objects.
      $opt_dry_run || $RM $obj $libobj

      # Objects from convenience libraries.  This assumes
      # single-version convenience libraries.  Whenever we create
      # different ones for PIC/non-PIC, then we'll have to duplicate
      # the extraction.
      reload_conv_objs=
      gentop=
      # if reload_cmds runs $LD directly, get rid of -Wl from
      # whole_archive_flag_spec and hope we can get by with turning comma
      # into space.
      case $reload_cmds in
        *\$LD[\ \$]*) wl= ;;
      esac
      # Convenience archives are pulled in either through the expanded
      # $whole_archive_flag_spec or, when that is empty, by extracting them
      # into the scratch directory $gentop (removed again further down).
      if test -n "$convenience"; then
	if test -n "$whole_archive_flag_spec"; then
	  eval tmp_whole_archive_flags=\"$whole_archive_flag_spec\"
	  test -n "$wl" || tmp_whole_archive_flags=`$ECHO "$tmp_whole_archive_flags" | $SED 's|,| |g'`
	  reload_conv_objs=$reload_objs\ $tmp_whole_archive_flags
	else
	  gentop=$output_objdir/${obj}x
	  func_append generated " $gentop"

	  func_extract_archives $gentop $convenience
	  reload_conv_objs="$reload_objs $func_extract_archives_result"
	fi
      fi

      # If we're not building shared, we need to use non_pic_objs
      test yes = "$build_libtool_libs" || libobjs=$non_pic_objects

      # Create the old-style object.
      # The input list is: $objs and $old_deplibs, then $libobjs with entries
      # ending in .$libext or .lib deleted and the $lo2o sed script applied,
      # then the convenience objects collected above.
      reload_objs=$objs$old_deplibs' '`$ECHO "$libobjs" | $SP2NL | $SED "/\.$libext$/d; /\.lib$/d; $lo2o" | $NL2SP`' '$reload_conv_objs

      output=$obj
      func_execute_cmds "$reload_cmds" 'exit $?'

      # Exit if we aren't doing a library object file.
      if test -z "$libobj"; then
	if test -n "$gentop"; then
	  func_show_eval '${RM}r "$gentop"'
	fi

	exit $EXIT_SUCCESS
      fi

      # When shared libraries are not being built, stop here: only the
      # old-style object created above is produced.
      test yes = "$build_libtool_libs" || {
	if test -n "$gentop"; then
	  func_show_eval '${RM}r "$gentop"'
	fi

	# Create an invalid libtool object if no PIC, so that we don't
	# accidentally link it into a program.
	# $show "echo timestamp > $libobj"
	# $opt_dry_run || eval "echo timestamp > $libobj" || exit $?
	exit $EXIT_SUCCESS
      }

      # Second pass: run $reload_cmds again with $libobj as the output and
      # the unmodified $libobjs (plus convenience objects) as the input.
      if test -n "$pic_flag" || test default != "$pic_mode"; then
	# Only do commands if we really have different PIC objects.
	reload_objs="$libobjs $reload_conv_objs"
	output=$libobj
	func_execute_cmds "$reload_cmds" 'exit $?'
      fi

      # Remove the scratch directory used for extracted convenience archives.
      if test -n "$gentop"; then
	func_show_eval '${RM}r "$gentop"'
      fi

      exit $EXIT_SUCCESS
      ;;

    prog)
      case $host in
	*cygwin*) func_stripname '' '.exe' "$output"
	          output=$func_stripname_result.exe;;
      esac
      test -n "$vinfo" && \
	func_warning "'-version-info' is ignored for programs"

      test -n "$release" && \
	func_warning "'-release' is ignored for programs"

      $preload \
	&& test unknown,unknown,unknown = "$dlopen_support,$dlopen_self,$dlopen_self_static" \
	&& func_warning "'LT_INIT([dlopen])' not used. Assuming no dlopen support."

      case $host in
      *-*-rhapsody* | *-*-darwin1.[012])
	# On Rhapsody replace the C library is the System framework
	compile_deplibs=`$ECHO " $compile_deplibs" | $SED 's/ -lc / System.ltframework /'`
	finalize_deplibs=`$ECHO " $finalize_deplibs" | $SED 's/ -lc / System.ltframework /'`
	;;
      esac

      case $host in
      *-*-darwin*)
	# Don't allow lazy linking, it breaks C++ global constructors
	# But is supposedly fixed on 10.4 or later (yay!).
	if test CXX = "$tagname"; then
	  case ${MACOSX_DEPLOYMENT_TARGET-10.0} in
	    10.[0123])
	      func_append compile_command " $wl-bind_at_load"
	      func_append finalize_command " $wl-bind_at_load"
	    ;;
	  esac
	fi
	# Time to change all our "foo.ltframework" stuff back to "-framework foo"
	compile_deplibs=`$ECHO " $compile_deplibs" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'`
	finalize_deplibs=`$ECHO " $finalize_deplibs" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'`
	;;
      esac


      # move library search paths that coincide with paths to not yet
      # installed libraries to the beginning of the library search list
      new_libs=
      for path in $notinst_path; do
	case " $new_libs " in
	*" -L$path/$objdir "*) ;;
	*)
	  case " $compile_deplibs " in
	  *" -L$path/$objdir "*)
	    func_append new_libs " -L$path/$objdir" ;;
	  esac
	  ;;
	esac
      done
      for deplib in $compile_deplibs; do
	case $deplib in
	-L*)
	  case " $new_libs " in
	  *" $deplib "*) ;;
	  *) func_append new_libs " $deplib" ;;
	  esac
	  ;;
	*) func_append new_libs " $deplib" ;;
	esac
      done
      compile_deplibs=$new_libs


      func_append compile_command " $compile_deplibs"
      func_append finalize_command " $finalize_deplibs"

      if test -n "$rpath$xrpath"; then
	# If the user specified any rpath flags, then add them.
	for libdir in $rpath $xrpath; do
	  # This is the magic to use -rpath.
	  case "$finalize_rpath " in
	  *" $libdir "*) ;;
	  *) func_append finalize_rpath " $libdir" ;;
	  esac
	done
      fi

      # Now hardcode the library paths
      rpath=
      hardcode_libdirs=
      for libdir in $compile_rpath $finalize_rpath; do
	if test -n "$hardcode_libdir_flag_spec"; then
	  if test -n "$hardcode_libdir_separator"; then
	    if test -z "$hardcode_libdirs"; then
	      hardcode_libdirs=$libdir
	    else
	      # Just accumulate the unique libdirs.
	      case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
	      *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
		;;
	      *)
		func_append hardcode_libdirs "$hardcode_libdir_separator$libdir"
		;;
	      esac
	    fi
	  else
	    eval flag=\"$hardcode_libdir_flag_spec\"
	    func_append rpath " $flag"
	  fi
	elif test -n "$runpath_var"; then
	  case "$perm_rpath " in
	  *" $libdir "*) ;;
	  *) func_append perm_rpath " $libdir" ;;
	  esac
	fi
	case $host in
	*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-cegcc*)
	  testbindir=`$ECHO "$libdir" | $SED -e 's*/lib$*/bin*'`
	  case :$dllsearchpath: in
	  *":$libdir:"*) ;;
	  ::) dllsearchpath=$libdir;;
	  *) func_append dllsearchpath ":$libdir";;
	  esac
	  case :$dllsearchpath: in
	  *":$testbindir:"*) ;;
	  ::) dllsearchpath=$testbindir;;
	  *) func_append dllsearchpath ":$testbindir";;
	  esac
	  ;;
	esac
      done
      # Substitute the hardcoded libdirs into the rpath.
      if test -n "$hardcode_libdir_separator" &&
	 test -n "$hardcode_libdirs"; then
	libdir=$hardcode_libdirs
	eval rpath=\" $hardcode_libdir_flag_spec\"
      fi
      compile_rpath=$rpath

      rpath=
      hardcode_libdirs=
      for libdir in $finalize_rpath; do
	if test -n "$hardcode_libdir_flag_spec"; then
	  if test -n "$hardcode_libdir_separator"; then
	    if test -z "$hardcode_libdirs"; then
	      hardcode_libdirs=$libdir
	    else
	      # Just accumulate the unique libdirs.
	      case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
	      *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
		;;
	      *)
		func_append hardcode_libdirs "$hardcode_libdir_separator$libdir"
		;;
	      esac
	    fi
	  else
	    eval flag=\"$hardcode_libdir_flag_spec\"
	    func_append rpath " $flag"
	  fi
	elif test -n "$runpath_var"; then
	  case "$finalize_perm_rpath " in
	  *" $libdir "*) ;;
	  *) func_append finalize_perm_rpath " $libdir" ;;
	  esac
	fi
      done
      # Substitute the hardcoded libdirs into the rpath.
      if test -n "$hardcode_libdir_separator" &&
	 test -n "$hardcode_libdirs"; then
	libdir=$hardcode_libdirs
	eval rpath=\" $hardcode_libdir_flag_spec\"
      fi
      finalize_rpath=$rpath

      if test -n "$libobjs" && test yes = "$build_old_libs"; then
	# Transform all the library objects into standard objects.
	compile_command=`$ECHO "$compile_command" | $SP2NL | $SED "$lo2o" | $NL2SP`
	finalize_command=`$ECHO "$finalize_command" | $SP2NL | $SED "$lo2o" | $NL2SP`
      fi

      func_generate_dlsyms "$outputname" "@PROGRAM@" false

      # template prelinking step
      if test -n "$prelink_cmds"; then
	func_execute_cmds "$prelink_cmds" 'exit $?'
      fi

      wrappers_required=:
      case $host in
      *cegcc* | *mingw32ce*)
        # Disable wrappers for cegcc and mingw32ce hosts, we are cross compiling anyway.
        wrappers_required=false
        ;;
      *cygwin* | *mingw* )
        test yes = "$build_libtool_libs" || wrappers_required=false
        ;;
      *)
        if test no = "$need_relink" || test yes != "$build_libtool_libs"; then
          wrappers_required=false
        fi
        ;;
      esac
      $wrappers_required || {
	# Replace the output file specification.
	compile_command=`$ECHO "$compile_command" | $SED 's%@OUTPUT@%'"$output"'%g'`
	link_command=$compile_command$compile_rpath

	# We have no uninstalled library dependencies, so finalize right now.
	exit_status=0
	func_show_eval "$link_command" 'exit_status=$?'

	if test -n "$postlink_cmds"; then
	  func_to_tool_file "$output"
	  postlink_cmds=`func_echo_all "$postlink_cmds" | $SED -e 's%@OUTPUT@%'"$output"'%g' -e 's%@TOOL_OUTPUT@%'"$func_to_tool_file_result"'%g'`
	  func_execute_cmds "$postlink_cmds" 'exit $?'
	fi

	# Delete the generated files.
	if test -f "$output_objdir/${outputname}S.$objext"; then
	  func_show_eval '$RM "$output_objdir/${outputname}S.$objext"'
	fi

	exit $exit_status
      }

      if test -n "$compile_shlibpath$finalize_shlibpath"; then
	compile_command="$shlibpath_var=\"$compile_shlibpath$finalize_shlibpath\$$shlibpath_var\" $compile_command"
      fi
      if test -n "$finalize_shlibpath"; then
	finalize_command="$shlibpath_var=\"$finalize_shlibpath\$$shlibpath_var\" $finalize_command"
      fi

      compile_var=
      finalize_var=
      if test -n "$runpath_var"; then
	if test -n "$perm_rpath"; then
	  # We should set the runpath_var.
	  rpath=
	  for dir in $perm_rpath; do
	    func_append rpath "$dir:"
	  done
	  compile_var="$runpath_var=\"$rpath\$$runpath_var\" "
	fi
	if test -n "$finalize_perm_rpath"; then
	  # We should set the runpath_var.
	  rpath=
	  for dir in $finalize_perm_rpath; do
	    func_append rpath "$dir:"
	  done
	  finalize_var="$runpath_var=\"$rpath\$$runpath_var\" "
	fi
      fi

      if test yes = "$no_install"; then
	# We don't need to create a wrapper script.
	link_command=$compile_var$compile_command$compile_rpath
	# Replace the output file specification.
	link_command=`$ECHO "$link_command" | $SED 's%@OUTPUT@%'"$output"'%g'`
	# Delete the old output file.
	$opt_dry_run || $RM $output
	# Link the executable and exit
	func_show_eval "$link_command" 'exit $?'

	if test -n "$postlink_cmds"; then
	  func_to_tool_file "$output"
	  postlink_cmds=`func_echo_all "$postlink_cmds" | $SED -e 's%@OUTPUT@%'"$output"'%g' -e 's%@TOOL_OUTPUT@%'"$func_to_tool_file_result"'%g'`
	  func_execute_cmds "$postlink_cmds" 'exit $?'
	fi

	exit $EXIT_SUCCESS
      fi

      case $hardcode_action,$fast_install in
        relink,*)
	  # Fast installation is not supported
	  link_command=$compile_var$compile_command$compile_rpath
	  relink_command=$finalize_var$finalize_command$finalize_rpath

	  func_warning "this platform does not like uninstalled shared libraries"
	  func_warning "'$output' will be relinked during installation"
	  ;;
        *,yes)
	  link_command=$finalize_var$compile_command$finalize_rpath
	  relink_command=`$ECHO "$compile_var$compile_command$compile_rpath" | $SED 's%@OUTPUT@%\$progdir/\$file%g'`
          ;;
	*,no)
	  link_command=$compile_var$compile_command$compile_rpath
	  relink_command=$finalize_var$finalize_command$finalize_rpath
          ;;
	*,needless)
	  link_command=$finalize_var$compile_command$finalize_rpath
	  relink_command=
          ;;
      esac

      # Replace the output file specification.
      link_command=`$ECHO "$link_command" | $SED 's%@OUTPUT@%'"$output_objdir/$outputname"'%g'`

      # Delete the old output files.
      $opt_dry_run || $RM $output $output_objdir/$outputname $output_objdir/lt-$outputname

      func_show_eval "$link_command" 'exit $?'

      if test -n "$postlink_cmds"; then
	func_to_tool_file "$output_objdir/$outputname"
	postlink_cmds=`func_echo_all "$postlink_cmds" | $SED -e 's%@OUTPUT@%'"$output_objdir/$outputname"'%g' -e 's%@TOOL_OUTPUT@%'"$func_to_tool_file_result"'%g'`
	func_execute_cmds "$postlink_cmds" 'exit $?'
      fi

      # Now create the wrapper script.
      func_verbose "creating $output"

      # Quote the relink command for shipping.
      if test -n "$relink_command"; then
	# Preserve any variables that may affect compiler behavior
	for var in $variables_saved_for_relink; do
	  if eval test -z \"\${$var+set}\"; then
	    relink_command="{ test -z \"\${$var+set}\" || $lt_unset $var || { $var=; export $var; }; }; $relink_command"
	  elif eval var_value=\$$var; test -z "$var_value"; then
	    relink_command="$var=; export $var; $relink_command"
	  else
	    func_quote_for_eval "$var_value"
	    relink_command="$var=$func_quote_for_eval_result; export $var; $relink_command"
	  fi
	done
	relink_command="(cd `pwd`; $relink_command)"
	relink_command=`$ECHO "$relink_command" | $SED "$sed_quote_subst"`
      fi

      # Only actually do things if not in dry run mode.
      $opt_dry_run || {
	# win32 will think the script is a binary if it has
	# a .exe suffix, so we strip it off here.
	case $output in
	  *.exe) func_stripname '' '.exe' "$output"
	         output=$func_stripname_result ;;
	esac
	# test for cygwin because mv fails w/o .exe extensions
	case $host in
	  *cygwin*)
	    exeext=.exe
	    func_stripname '' '.exe' "$outputname"
	    outputname=$func_stripname_result ;;
	  *) exeext= ;;
	esac
	case $host in
	  *cygwin* | *mingw* )
	    func_dirname_and_basename "$output" "" "."
	    output_name=$func_basename_result
	    output_path=$func_dirname_result
	    cwrappersource=$output_path/$objdir/lt-$output_name.c
	    cwrapper=$output_path/$output_name.exe
	    $RM $cwrappersource $cwrapper
	    trap "$RM $cwrappersource $cwrapper; exit $EXIT_FAILURE" 1 2 15

	    func_emit_cwrapperexe_src > $cwrappersource

	    # The wrapper executable is built using the $host compiler,
	    # because it contains $host paths and files. If cross-
	    # compiling, it, like the target executable, must be
	    # executed on the $host or under an emulation environment.
	    $opt_dry_run || {
	      $LTCC $LTCFLAGS -o $cwrapper $cwrappersource
	      $STRIP $cwrapper
	    }

	    # Now, create the wrapper script for func_source use:
	    func_ltwrapper_scriptname $cwrapper
	    $RM $func_ltwrapper_scriptname_result
	    trap "$RM $func_ltwrapper_scriptname_result; exit $EXIT_FAILURE" 1 2 15
	    $opt_dry_run || {
	      # note: this script will not be executed, so do not chmod.
	      if test "x$build" = "x$host"; then
		$cwrapper --lt-dump-script > $func_ltwrapper_scriptname_result
	      else
		func_emit_wrapper no > $func_ltwrapper_scriptname_result
	      fi
	    }
	  ;;
	  * )
	    $RM $output
	    trap "$RM $output; exit $EXIT_FAILURE" 1 2 15

	    func_emit_wrapper no > $output
	    chmod +x $output
	  ;;
	esac
      }
      exit $EXIT_SUCCESS
      ;;
    esac

    # See if we need to build an old-fashioned archive.
    for oldlib in $oldlibs; do

      case $build_libtool_libs in
        convenience)
	  oldobjs="$libobjs_save $symfileobj"
	  addlibs=$convenience
	  build_libtool_libs=no
	  ;;
	module)
	  oldobjs=$libobjs_save
	  addlibs=$old_convenience
	  build_libtool_libs=no
          ;;
	*)
	  oldobjs="$old_deplibs $non_pic_objects"
	  $preload && test -f "$symfileobj" \
	    && func_append oldobjs " $symfileobj"
	  addlibs=$old_convenience
	  ;;
      esac

      if test -n "$addlibs"; then
	gentop=$output_objdir/${outputname}x
	func_append generated " $gentop"

	func_extract_archives $gentop $addlibs
	func_append oldobjs " $func_extract_archives_result"
      fi

      # Do each command in the archive commands.
      if test -n "$old_archive_from_new_cmds" && test yes = "$build_libtool_libs"; then
	cmds=$old_archive_from_new_cmds
      else

	# Add any objects from preloaded convenience libraries
	if test -n "$dlprefiles"; then
	  gentop=$output_objdir/${outputname}x
	  func_append generated " $gentop"

	  func_extract_archives $gentop $dlprefiles
	  func_append oldobjs " $func_extract_archives_result"
	fi

	# POSIX demands no paths to be encoded in archives.  We have
	# to avoid creating archives with duplicate basenames if we
	# might have to extract them afterwards, e.g., when creating a
	# static archive out of a convenience library, or when linking
	# the entirety of a libtool archive into another (currently
	# not supported by libtool).
	if (for obj in $oldobjs
	    do
	      func_basename "$obj"
	      $ECHO "$func_basename_result"
	    done | sort | sort -uc >/dev/null 2>&1); then
	  :
	else
	  echo "copying selected object files to avoid basename conflicts..."
	  gentop=$output_objdir/${outputname}x
	  func_append generated " $gentop"
	  func_mkdir_p "$gentop"
	  save_oldobjs=$oldobjs
	  oldobjs=
	  counter=1
	  for obj in $save_oldobjs
	  do
	    func_basename "$obj"
	    objbase=$func_basename_result
	    case " $oldobjs " in
	    " ") oldobjs=$obj ;;
	    *[\ /]"$objbase "*)
	      while :; do
		# Make sure we don't pick an alternate name that also
		# overlaps.
		newobj=lt$counter-$objbase
		func_arith $counter + 1
		counter=$func_arith_result
		case " $oldobjs " in
		*[\ /]"$newobj "*) ;;
		*) if test ! -f "$gentop/$newobj"; then break; fi ;;
		esac
	      done
	      func_show_eval "ln $obj $gentop/$newobj || cp $obj $gentop/$newobj"
	      func_append oldobjs " $gentop/$newobj"
	      ;;
	    *) func_append oldobjs " $obj" ;;
	    esac
	  done
	fi
	func_to_tool_file "$oldlib" func_convert_file_msys_to_w32
	tool_oldlib=$func_to_tool_file_result
	eval cmds=\"$old_archive_cmds\"

	func_len " $cmds"
	len=$func_len_result
	if test "$len" -lt "$max_cmd_len" || test "$max_cmd_len" -le -1; then
	  cmds=$old_archive_cmds
	elif test -n "$archiver_list_spec"; then
	  func_verbose "using command file archive linking..."
	  for obj in $oldobjs
	  do
	    func_to_tool_file "$obj"
	    $ECHO "$func_to_tool_file_result"
	  done > $output_objdir/$libname.libcmd
	  func_to_tool_file "$output_objdir/$libname.libcmd"
	  oldobjs=" $archiver_list_spec$func_to_tool_file_result"
	  cmds=$old_archive_cmds
	else
	  # the command line is too long to link in one step, link in parts
	  func_verbose "using piecewise archive linking..."
	  save_RANLIB=$RANLIB
	  RANLIB=:
	  objlist=
	  concat_cmds=
	  save_oldobjs=$oldobjs
	  oldobjs=
	  # Is there a better way of finding the last object in the list?
	  for obj in $save_oldobjs
	  do
	    last_oldobj=$obj
	  done
	  eval test_cmds=\"$old_archive_cmds\"
	  func_len " $test_cmds"
	  len0=$func_len_result
	  len=$len0
	  for obj in $save_oldobjs
	  do
	    func_len " $obj"
	    func_arith $len + $func_len_result
	    len=$func_arith_result
	    func_append objlist " $obj"
	    if test "$len" -lt "$max_cmd_len"; then
	      :
	    else
	      # the above command should be used before it gets too long
	      oldobjs=$objlist
	      if test "$obj" = "$last_oldobj"; then
		RANLIB=$save_RANLIB
	      fi
	      test -z "$concat_cmds" || concat_cmds=$concat_cmds~
	      eval concat_cmds=\"\$concat_cmds$old_archive_cmds\"
	      objlist=
	      len=$len0
	    fi
	  done
	  RANLIB=$save_RANLIB
	  oldobjs=$objlist
	  if test -z "$oldobjs"; then
	    eval cmds=\"\$concat_cmds\"
	  else
	    eval cmds=\"\$concat_cmds~\$old_archive_cmds\"
	  fi
	fi
      fi
      func_execute_cmds "$cmds" 'exit $?'
    done

    test -n "$generated" && \
      func_show_eval "${RM}r$generated"

    # Now create the libtool archive.
    case $output in
    *.la)
      old_library=
      test yes = "$build_old_libs" && old_library=$libname.$libext
      func_verbose "creating $output"

      # Preserve any variables that may affect compiler behavior
      for var in $variables_saved_for_relink; do
	if eval test -z \"\${$var+set}\"; then
	  relink_command="{ test -z \"\${$var+set}\" || $lt_unset $var || { $var=; export $var; }; }; $relink_command"
	elif eval var_value=\$$var; test -z "$var_value"; then
	  relink_command="$var=; export $var; $relink_command"
	else
	  func_quote_for_eval "$var_value"
	  relink_command="$var=$func_quote_for_eval_result; export $var; $relink_command"
	fi
      done
      # Quote the link command for shipping.
      relink_command="(cd `pwd`; $SHELL \"$progpath\" $preserve_args --mode=relink $libtool_args @inst_prefix_dir@)"
      relink_command=`$ECHO "$relink_command" | $SED "$sed_quote_subst"`
      if test yes = "$hardcode_automatic"; then
	relink_command=
      fi

      # Only create the output if not a dry run.
      $opt_dry_run || {
	for installed in no yes; do
	  if test yes = "$installed"; then
	    if test -z "$install_libdir"; then
	      break
	    fi
	    output=$output_objdir/${outputname}i
	    # Replace all uninstalled libtool libraries with the installed ones
	    newdependency_libs=
	    for deplib in $dependency_libs; do
	      case $deplib in
	      *.la)
		func_basename "$deplib"
		name=$func_basename_result
		func_resolve_sysroot "$deplib"
		eval libdir=`$SED -n -e 's/^libdir=\(.*\)$/\1/p' $func_resolve_sysroot_result`
		test -z "$libdir" && \
		  func_fatal_error "'$deplib' is not a valid libtool archive"
		func_append newdependency_libs " ${lt_sysroot:+=}$libdir/$name"
		;;
	      -L*)
		func_stripname -L '' "$deplib"
		func_replace_sysroot "$func_stripname_result"
		func_append newdependency_libs " -L$func_replace_sysroot_result"
		;;
	      -R*)
		func_stripname -R '' "$deplib"
		func_replace_sysroot "$func_stripname_result"
		func_append newdependency_libs " -R$func_replace_sysroot_result"
		;;
	      *) func_append newdependency_libs " $deplib" ;;
	      esac
	    done
	    dependency_libs=$newdependency_libs
	    newdlfiles=

	    for lib in $dlfiles; do
	      case $lib in
	      *.la)
	        func_basename "$lib"
		name=$func_basename_result
		eval libdir=`$SED -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
		test -z "$libdir" && \
		  func_fatal_error "'$lib' is not a valid libtool archive"
		func_append newdlfiles " ${lt_sysroot:+=}$libdir/$name"
		;;
	      *) func_append newdlfiles " $lib" ;;
	      esac
	    done
	    dlfiles=$newdlfiles
	    newdlprefiles=
	    for lib in $dlprefiles; do
	      case $lib in
	      *.la)
		# Only pass preopened files to the pseudo-archive (for
		# eventual linking with the app. that links it) if we
		# didn't already link the preopened objects directly into
		# the library:
		func_basename "$lib"
		name=$func_basename_result
		eval libdir=`$SED -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
		test -z "$libdir" && \
		  func_fatal_error "'$lib' is not a valid libtool archive"
		func_append newdlprefiles " ${lt_sysroot:+=}$libdir/$name"
		;;
	      esac
	    done
	    dlprefiles=$newdlprefiles
	  else
	    newdlfiles=
	    for lib in $dlfiles; do
	      case $lib in
		[\\/]* | [A-Za-z]:[\\/]*) abs=$lib ;;
		*) abs=`pwd`"/$lib" ;;
	      esac
	      func_append newdlfiles " $abs"
	    done
	    dlfiles=$newdlfiles
	    newdlprefiles=
	    for lib in $dlprefiles; do
	      case $lib in
		[\\/]* | [A-Za-z]:[\\/]*) abs=$lib ;;
		*) abs=`pwd`"/$lib" ;;
	      esac
	      func_append newdlprefiles " $abs"
	    done
	    dlprefiles=$newdlprefiles
	  fi
	  $RM $output
	  # place dlname in correct position for cygwin
	  # In fact, it would be nice if we could use this code for all target
	  # systems that can't hard-code library paths into their executables
	  # and that have no shared library path variable independent of PATH,
	  # but it turns out we can't easily determine that from inspecting
	  # libtool variables, so we have to hard-code the OSs to which it
	  # applies here; at the moment, that means platforms that use the PE
	  # object format with DLL files.  See the long comment at the top of
	  # tests/bindir.at for full details.
	  tdlname=$dlname
	  case $host,$output,$installed,$module,$dlname in
	    *cygwin*,*lai,yes,no,*.dll | *mingw*,*lai,yes,no,*.dll | *cegcc*,*lai,yes,no,*.dll)
	      # If a -bindir argument was supplied, place the dll there.
	      if test -n "$bindir"; then
		func_relative_path "$install_libdir" "$bindir"
		tdlname=$func_relative_path_result/$dlname
	      else
		# Otherwise fall back on heuristic.
		tdlname=../bin/$dlname
	      fi
	      ;;
	  esac
	  $ECHO > $output "\
# $outputname - a libtool library file
# Generated by $PROGRAM (GNU $PACKAGE) $VERSION
#
# Please DO NOT delete this file!
# It is necessary for linking the library.

# The name that we can dlopen(3).
dlname='$tdlname'

# Names of this library.
library_names='$library_names'

# The name of the static archive.
old_library='$old_library'

# Linker flags that cannot go in dependency_libs.
inherited_linker_flags='$new_inherited_linker_flags'

# Libraries that this one depends upon.
dependency_libs='$dependency_libs'

# Names of additional weak libraries provided by this library
weak_library_names='$weak_libs'

# Version information for $libname.
current=$current
age=$age
revision=$revision

# Is this an already installed library?
installed=$installed

# Should we warn about portability when linking against -modules?
shouldnotlink=$module

# Files to dlopen/dlpreopen
dlopen='$dlfiles'
dlpreopen='$dlprefiles'

# Directory that this library needs to be installed in:
libdir='$install_libdir'"
	  if test no,yes = "$installed,$need_relink"; then
	    $ECHO >> $output "\
relink_command=\"$relink_command\""
	  fi
	done
      }

      # Do a symbolic link so that the libtool archive can be found in
      # LD_LIBRARY_PATH before the program is installed.
      func_show_eval '( cd "$output_objdir" && $RM "$outputname" && $LN_S "../$outputname" "$outputname" )' 'exit $?'
      ;;
    esac
    exit $EXIT_SUCCESS
}

# Hand the original argument list to the link driver for both the 'link'
# and 'relink' modes; every other mode falls through untouched.
case $opt_mode in
  link | relink)
    func_mode_link ${1+"$@"}
    ;;
esac


# func_mode_uninstall arg...
# Shared driver for the 'uninstall' and 'clean' modes (selected through
# $opt_mode).  $nonopt supplies the removal program; every ARG starting
# with '-' is passed on to that program and every other ARG is a file to
# remove.  For libtool archives (*.la), libtool objects (*.lo) and, in
# clean mode, libtool program wrappers, the companion files they describe
# are removed as well.  The function never returns: it ends by exiting
# with $exit_status, which is 0 unless a directory was given as a file or
# a command run through func_show_eval/func_execute_cmds triggered its
# failure action.
func_mode_uninstall ()
{
    $debug_cmd

    RM=$nonopt
    files=
    rmforce=false
    exit_status=0

    # This variable tells wrapper scripts just to set variables rather
    # than running their programs.
    libtool_install_magic=$magic

    # Split the arguments: options are appended to the removal command
    # (noting whether '-f' was given), everything else is a file operand.
    for arg
    do
      case $arg in
      -f) func_append RM " $arg"; rmforce=: ;;
      -*) func_append RM " $arg" ;;
      *) func_append files " $arg" ;;
      esac
    done

    test -z "$RM" && \
      func_fatal_help "you must specify an RM program"

    rmdirs=

    for file in $files; do
      # odir is where the companion files are looked for: the $objdir
      # subdirectory next to $file by default ...
      func_dirname "$file" "" "."
      dir=$func_dirname_result
      if test . = "$dir"; then
	odir=$objdir
      else
	odir=$dir/$objdir
      fi
      func_basename "$file"
      name=$func_basename_result
      # ... but the directory of $file itself in uninstall mode.
      test uninstall = "$opt_mode" && odir=$dir

      # Remember odir for removal later, being careful to avoid duplicates
      if test clean = "$opt_mode"; then
	case " $rmdirs " in
	  *" $odir "*) ;;
	  *) func_append rmdirs " $odir" ;;
	esac
      fi

      # Don't error if the file doesn't exist and rm -f was used.
      # Symlinks and regular files proceed; a directory is flagged as an
      # error and skipped; anything else is skipped silently under -f and
      # otherwise falls through to the removal command below.
      # NOTE(review): the brace groups with redirected output presumably
      # hide diagnostics from shells whose 'test' lacks -L/-h -- confirm.
      if { test -L "$file"; } >/dev/null 2>&1 ||
	 { test -h "$file"; } >/dev/null 2>&1 ||
	 test -f "$file"; then
	:
      elif test -d "$file"; then
	exit_status=1
	continue
      elif $rmforce; then
	continue
      fi

      # The file itself is always removed; the branches below only add to
      # this list.
      rmfiles=$file

      case $name in
      *.la)
	# Possibly a libtool archive, so verify it.
	if func_lalib_p "$file"; then
	  # Sourcing the archive is what provides $library_names,
	  # $old_library, $dlname and $libdir used below.
	  func_source $dir/$name

	  # Delete the libtool libraries and symlinks.
	  for n in $library_names; do
	    func_append rmfiles " $odir/$n"
	  done
	  test -n "$old_library" && func_append rmfiles " $odir/$old_library"

	  case $opt_mode in
	  clean)
	    # Add the dlname only if it is not already one of the library
	    # names queued above.
	    case " $library_names " in
	    *" $dlname "*) ;;
	    *) test -n "$dlname" && func_append rmfiles " $odir/$dlname" ;;
	    esac
	    # When the archive records a libdir, also remove the copies of
	    # the archive kept in odir ($name and ${name}i).
	    test -n "$libdir" && func_append rmfiles " $odir/$name $odir/${name}i"
	    ;;
	  uninstall)
	    # A failing hook marks the run as failed unless '-f' was given.
	    if test -n "$library_names"; then
	      # Do each command in the postuninstall commands.
	      func_execute_cmds "$postuninstall_cmds" '$rmforce || exit_status=1'
	    fi

	    if test -n "$old_library"; then
	      # Do each command in the old_postuninstall commands.
	      func_execute_cmds "$old_postuninstall_cmds" '$rmforce || exit_status=1'
	    fi
	    # FIXME: should reinstall the best remaining shared library.
	    ;;
	  esac
	fi
	;;

      *.lo)
	# Possibly a libtool object, so verify it.
	if func_lalib_p "$file"; then

	  # Read the .lo file
	  func_source $dir/$name

	  # Add PIC object to the list of files to remove.
	  if test -n "$pic_object" && test none != "$pic_object"; then
	    func_append rmfiles " $dir/$pic_object"
	  fi

	  # Add non-PIC object to the list of files to remove.
	  if test -n "$non_pic_object" && test none != "$non_pic_object"; then
	    func_append rmfiles " $dir/$non_pic_object"
	  fi
	fi
	;;

      *)
	# Any other name is only given extra treatment in clean mode, where
	# it may be a libtool-generated program wrapper.
	if test clean = "$opt_mode"; then
	  noexename=$name
	  case $file in
	  *.exe)
	    func_stripname '' '.exe' "$file"
	    file=$func_stripname_result
	    func_stripname '' '.exe' "$name"
	    noexename=$func_stripname_result
	    # $file with .exe has already been added to rmfiles,
	    # add $file without .exe
	    func_append rmfiles " $file"
	    ;;
	  esac
	  # Do a test to see if this is a libtool program.
	  if func_ltwrapper_p "$file"; then
	    # relink_command is cleared before sourcing so that the check
	    # further down only sees a value set by the sourced wrapper.
	    if func_ltwrapper_executable_p "$file"; then
	      func_ltwrapper_scriptname "$file"
	      relink_command=
	      func_source $func_ltwrapper_scriptname_result
	      func_append rmfiles " $func_ltwrapper_scriptname_result"
	    else
	      relink_command=
	      func_source $dir/$noexename
	    fi

	    # note $name still contains .exe if it was in $file originally
	    # as does the version of $file that was added into $rmfiles
	    func_append rmfiles " $odir/$name $odir/${name}S.$objext"
	    if test yes = "$fast_install" && test -n "$relink_command"; then
	      func_append rmfiles " $odir/lt-$name"
	    fi
	    # Only when a .exe suffix was stripped: also remove the
	    # lt-<name>.c file in odir.
	    if test "X$noexename" != "X$name"; then
	      func_append rmfiles " $odir/lt-$noexename.c"
	    fi
	  fi
	fi
	;;
      esac
      # Run the removal command on everything collected for this file.
      # NOTE(review): the second argument is presumably evaluated only when
      # the command fails -- confirm against func_show_eval.
      func_show_eval "$RM $rmfiles" 'exit_status=1'
    done

    # Try to remove the $objdir's in the directories where we deleted files
    # (rmdir output and errors are discarded).
    for dir in $rmdirs; do
      if test -d "$dir"; then
	func_show_eval "rmdir $dir >/dev/null 2>&1"
      fi
    done

    exit $exit_status
}

# The 'uninstall' and 'clean' modes share one driver, which receives the
# original argument list unchanged.
case $opt_mode in
  uninstall | clean)
    func_mode_uninstall ${1+"$@"}
    ;;
esac

# A mode is mandatory; without one, report the error using the generic
# help text.
if test -z "$opt_mode"; then
  help=$generic_help
  func_fatal_help "you must specify a MODE"
fi

# Nothing has been queued for execution (exec_cmd is presumably filled in
# by a mode handler elsewhere in the script), so the mode is rejected.
if test -z "$exec_cmd"; then
  func_fatal_help "invalid operation mode '$opt_mode'"
fi

# Replace this shell with the queued command; the exit after it is only
# reached when the exec did not take over the process.
if test -n "$exec_cmd"; then
  eval exec "$exec_cmd"
  exit $EXIT_FAILURE
fi

exit $exit_status


# The TAGs below are defined such that we never get into a situation
# where we disable both kinds of libraries.  Given conflicting
# choices, we go for a static library, that is the most portable,
# since we can't tell whether shared libraries were disabled because
# the user asked for that or because the platform doesn't support
# them.  This is particularly important on AIX, because we don't
# support having both static and shared libraries enabled at the same
# time on that platform, so we default to a shared-only configuration.
# If a disable-shared tag is given, we'll fallback to a static-only
# configuration.  But we'll never go from static-only to shared-only.

# Tag "disable-shared": turn libtool library builds off and force the
# old-style (static) archives on.
# ### BEGIN LIBTOOL TAG CONFIG: disable-shared
build_libtool_libs=no
build_old_libs=yes
# ### END LIBTOOL TAG CONFIG: disable-shared

# Tag "disable-static": old-style archives are switched off only while
# build_libtool_libs is "yes"; for any other value they stay on, so the two
# kinds of library are never both disabled.
# ### BEGIN LIBTOOL TAG CONFIG: disable-static
build_old_libs=`case $build_libtool_libs in yes) echo no;; *) echo yes;; esac`
# ### END LIBTOOL TAG CONFIG: disable-static

# Local Variables:
# mode:shell-script
# sh-indentation:2
# End:


================================================
FILE: requirements.txt
================================================
decorator>=4.3.0
joblib>=0.14.1
numpy>=1.18.2
pandas>=1.0.3
scipy>=1.4.1
# The PyPI distribution is named "scikit-learn"; the old "sklearn" alias is
# deprecated and its installation now fails on purpose.
scikit-learn
sympy>=1.4
xgboost>=0.81


================================================
FILE: src/ChangeLog
================================================
version: 0.08.3
date: Wed Nov 13 11:39:01 CET 2019
changes:
	- support recent versions of clang
	- fix OpenMP support when contraction is enabled
---
version: 0.08.2
date: Thu Mar 28 18:36:52 CET 2019
changes:
	- support recent versions of clang
---
version: 0.08.1
date: Mon Jul 30 23:05:04 CEST 2018
changes:
	- move some functionality to isl
---
version: 0.08
date: Sat Mar  3 15:31:38 CET 2018
changes:
	- minor fixes
---
version: 0.07
date: Tue Feb  7 17:23:22 CET 2017
changes:
	- support hybrid tiling
---
version: 0.06
date: Fri May  6 12:08:50 CEST 2016
changes:
	- use PPCG specific macro names in generated code
	- complete transition to schedule trees
	- maximize coincidence by default
	- map arrays with constant index expressions to private memory
	- optionally group chains of statements
---
version: 0.05
date: Fri Jan 15 09:30:23 CET 2016
changes:
	- fix live-out computation
	- optionally compute schedule for C target
	- optionally perform tiling for C target
	- create single kernel for non-permutable subtree
---
version: 0.04
date: Wed Jun 17 10:52:58 CEST 2015
changes:
	- use schedule trees
	- fix live-range reordering
	- improve generation of synchronization
	- exploit independences during dependence analysis


================================================
FILE: src/LICENSE
================================================
MIT License (MIT)

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: src/Makefile.am
================================================
# Each BUNDLED_* conditional, when true, names the in-tree copy of the
# library: MAYBE_* adds its directory to SUBDIRS and *_LA points at the
# libtool archive built there.  When a conditional is false these variables
# stay empty.
if BUNDLED_ISL
    MAYBE_ISL = isl
    ISL_LA = $(top_builddir)/isl/libisl.la
    LOCAL_ISL_LA = isl/libisl.la
endif
if BUNDLED_BARVINOK
    MAYBE_BARVINOK = barvinok
    BARVINOK_LA = $(top_builddir)/barvinok/libbarvinok.la
endif 
if BUNDLED_PET
    MAYBE_PET = pet
    PET_LA = $(top_builddir)/pet/libpet.la
endif

# Bundled libraries are listed ahead of "." so that they are visited before
# this directory.
SUBDIRS = $(MAYBE_ISL) $(MAYBE_BARVINOK) $(MAYBE_PET) .

# FORCE has neither prerequisites nor a recipe, so the targets depending on
# it are always considered out of date and the sub-make decides whether the
# bundled library really needs rebuilding.
# "cd dir && $(MAKE)" is used instead of "cd dir; $(MAKE)" so that a failed
# cd aborts the rule rather than running the sub-make in this directory.
FORCE:
isl/libisl.la: FORCE
	cd isl && $(MAKE) $(AM_MAKEFLAGS) libisl.la
barvinok/libbarvinok.la: FORCE
	cd barvinok && $(MAKE) $(AM_MAKEFLAGS) libbarvinok.la
pet/libpet.la: FORCE
	cd pet && $(MAKE) $(AM_MAKEFLAGS) libpet.la

# Extra macro search directory handed to aclocal.
ACLOCAL_AMFLAGS = -I m4

# Link inputs per library: the in-tree libtool archive (empty unless the
# library is bundled) followed by the libraries substituted by configure.
LIB_ISL = $(ISL_LA) @ISL_LIBS@
LIB_BARVINOK = $(BARVINOK_LA) @BARVINOK_LIBS@
LIB_PET = $(PET_LA) @PET_LIBS@

# Preprocessor flags, default link libraries and C++ dialect used for the
# programs built in this directory.
AM_CPPFLAGS = @ISL_CFLAGS@ @BARVINOK_CFLAGS@ @PET_CFLAGS@
LDADD = $(LIB_PET) $(LIB_ISL) $(LIB_BARVINOK)
AM_CXXFLAGS = -std=c++11
# The single installed executable and the sources (C and C++) it is built
# from.
bin_PROGRAMS = autosa
autosa_SOURCES = \
	cpu.c \
	cpu.h \
	grouping.c \
	grouping.h \
	hybrid.c \
	hybrid.h \
	schedule.c \
	schedule.h \
	ppcg_options.c \
	ppcg_options.h \
	ppcg.c \
	ppcg.h \
	print.c \
	print.h \
	util.c \
	util.h \
	main.cpp \
	cJSON/cJSON.c \
	autosa_codegen.cpp \
	autosa_comm.cpp \
	autosa_common.cpp \
	autosa_cpu.cpp \
	autosa_intel_opencl.cpp \
	autosa_print.cpp \
	autosa_schedule_tree.cpp \
	autosa_t2s.cpp \
	autosa_trans.cpp \
	autosa_utils.cpp \
	autosa_xilinx_hls_c.cpp  \
	autosa_catapult_hls_c.cpp \
	autosa_tapa_cpp.cpp \
	autosa_tuning.cpp \
	json.hpp

#TESTS = @extra_tests@
#EXTRA_TESTS = opencl_test.sh polybench_test.sh
#TEST_EXTENSIONS = .sh

#BUILT_SOURCES = gitversion.h

#CLEANFILES = gitversion.h

#EXTRA_DIST = \
#	examples \
#	ocl_utilities.c \
#	ocl_utilities.h \
#	tests

#dist-hook:
#	echo @GIT_HEAD_VERSION@ > $(distdir)/GIT_HEAD_ID
#
#gitversion.h: @GIT_HEAD@
#	$(AM_V_GEN)echo '#define GIT_HEAD_ID "'@GIT_HEAD_VERSION@'"' > $@
#
#cpu.c \
#cpu.h \
#cuda.c \
#cuda.h \
#opencl.c \
#opencl.h \
#cuda_common.h \
#cuda_common.c \
#gpu.c \
#gpu.h \
#gpu_array_tile.c \
#gpu_array_tile.h \
#gpu_group.c \
#gpu_group.h \
#gpu_hybrid.c \
#gpu_hybrid.h \
#gpu_print.c \
#gpu_print.h \
#gpu_tree.c \
#gpu_tree.h


================================================
FILE: src/README
================================================
Requirements:

- automake, autoconf, libtool
	(not needed when compiling a release)
- pkg-config (http://www.freedesktop.org/wiki/Software/pkg-config)
	(not needed when compiling a release using the included isl and pet)
- gmp (http://gmplib.org/)
- libyaml (http://pyyaml.org/wiki/LibYAML)
	(only needed if you want to compile the pet executable)
- LLVM/clang libraries, 2.9 or higher (http://clang.llvm.org/get_started.html)
	Unless you have some other reasons for wanting to use the svn version,
	it is best to install the latest release (3.9).
	For more details, see pet/README.

If you are installing on Ubuntu, then you can install the following packages:

automake autoconf libtool pkg-config libgmp3-dev libyaml-dev libclang-dev llvm

Note that you need at least version 3.2 of libclang-dev (ubuntu raring).
Older versions of this package did not include the required libraries.
If you are using an older version of ubuntu, then you need to compile and
install LLVM/clang from source.


Preparing:

Grab the latest release and extract it or get the source from
the git repository as follows.  This process requires autoconf,
automake, libtool and pkg-config.

	git clone git://repo.or.cz/ppcg.git
	cd ppcg
	./get_submodules.sh
	./autogen.sh


Compilation:

	./configure
	make
	make check

If you have installed any of the required libraries in a non-standard
location, then you may need to use the --with-gmp-prefix,
--with-libyaml-prefix and/or --with-clang-prefix options
when calling "./configure".


Using PPCG to generate CUDA or OpenCL code

To convert a fragment of a C program to CUDA, insert a line containing

	#pragma scop

before the fragment and add a line containing

	#pragma endscop

after the fragment.  To generate CUDA code run
	
	ppcg --target=cuda file.c

where file.c is the file containing the fragment.  The generated
code is stored in file_host.cu and file_kernel.cu.

To generate OpenCL code run

	ppcg --target=opencl file.c

where file.c is the file containing the fragment.  The generated code
is stored in file_host.c and file_kernel.cl.


Specifying tile, grid and block sizes

The iteration space tile size, grid size and block size can
be specified using the --sizes option.  The argument is a union map
in isl notation mapping kernels identified by their sequence number
in a "kernel" space to singleton sets in the "tile", "grid" and "block"
spaces.  The sizes are specified outermost to innermost.

The dimension of the "tile" space indicates the (maximal) number of loop
dimensions to tile.  The elements of the single integer tuple
specify the tile sizes in each dimension.
In case of hybrid tiling, the first element is half the size of
the tile in the time (sequential) dimension.  The second element
specifies the number of elements in the base of the hexagon.
The remaining elements specify the tile sizes in the remaining space
dimensions.

The dimension of the "grid" space indicates the (maximal) number of block
dimensions in the grid.  The elements of the single integer tuple
specify the number of blocks in each dimension.

The dimension of the "block" space indicates the (maximal) number of thread
dimensions in the block.  The elements of the single integer tuple
specify the number of threads in each dimension.

For example,

    { kernel[0] -> tile[64,64]; kernel[i] -> block[16] : i != 4 }

specifies that in kernel 0, two loops should be tiled with a tile
size of 64 in both dimensions and that all kernels except kernel 4
should be run using a block of 16 threads.

Since PPCG performs some scheduling, it can be difficult to predict
what exactly will end up in a kernel.  If you want to specify
tile, grid or block sizes, you may want to run PPCG first with the defaults,
examine the kernels and then run PPCG again with the desired sizes.
Instead of examining the kernels, you can also specify the option
--dump-sizes on the first run to obtain the effectively used default sizes.


Compiling the generated CUDA code with nvcc

To get optimal performance from nvcc, it is important to choose --arch
according to your target GPU.  Specifically, use the flag "--arch sm_20"
for fermi, "--arch sm_30" for GK10x Kepler and "--arch sm_35" for
GK110 Kepler.  We discourage the use of older cards as we have seen
correctness issues with compilation for older architectures.
Note that in the absence of any --arch flag, nvcc defaults to
"--arch sm_13". This will not only be slower, but can also cause
correctness issues.
If you want to obtain results that are identical to those obtained
by the original code, then you may need to disable some optimizations
by passing the "--fmad=false" option.


Compiling the generated OpenCL code with gcc

To compile the host code you need to link against the file
ocl_utilities.c which contains utility functions used by the generated
OpenCL host code.  To compile the host code with gcc, run

  gcc -std=c99 file_host.c ocl_utilities.c -lOpenCL

Note that we have experienced the generated OpenCL code freezing
on some inputs (e.g., the PolyBench symm benchmark) when using
at least some versions of the Nvidia OpenCL library, while the
corresponding CUDA code runs fine.
We have experienced no such freezes when using AMD, ARM or Intel
OpenCL libraries.

By default, the compiled executable will need the _kernel.cl file at
run time.  Alternatively, the option --opencl-embed-kernel-code may be
given to place the kernel code in a string literal.  The kernel code is
then compiled into the host binary, such that the _kernel.cl file is no
longer needed at run time.  Any kernel include files, in particular
those supplied using --opencl-include-file, will still be required at
run time.


Function calls

Function calls inside the analyzed fragment are reproduced
in the CUDA or OpenCL code, but for now it is left to the user
to make sure that the functions that are being called are
available from the generated kernels.

In the case of OpenCL code, the --opencl-include-file option
may be used to specify one or more files to be #include'd
from the generated code.  These files may then contain
the definitions of the functions being called from the
program fragment.  If the pathnames of the included files
are relative to the current directory, then you may need
to additionally specify the --opencl-compiler-options=-I.
to make sure that the files can be found by the OpenCL compiler.
The included files may contain definitions of types used by the
generated kernels.  By default, PPCG generates definitions for
types as needed, but these definitions may collide with those in
the included files, as PPCG does not consider the contents of the
included files.  The --no-opencl-print-kernel-types option will prevent
PPCG from generating type definitions.


GNU extensions

By default, PPCG may print out macro definitions that involve
GNU extensions such as __typeof__ and statement expressions.
Some compilers may not support these extensions.
In particular, OpenCL 1.2 beignet 1.1.1 (git-6de6918)
has been reported not to support __typeof__.
The use of these extensions can be turned off with the
--no-allow-gnu-extensions option.


Processing PolyBench

When processing a PolyBench/C 3.2 benchmark, you should always specify
-DPOLYBENCH_USE_C99_PROTO on the ppcg command line.  Otherwise, the source
files are inconsistent, having fixed size arrays but parametrically
bounded loops iterating over them.
However, you should not specify this define when compiling
the PPCG generated code using nvcc since CUDA does not support VLAs.


CUDA and function overloading

While CUDA supports function overloading based on the argument types,
no such function overloading exists in the input language C.  Since PPCG
simply prints out the same function name as in the original code, this
may result in a different function being called based on the types
of the arguments.  For example, if the original code contains a call
to the function sqrt() with a float argument, then the argument will
be promoted to a double and the sqrt() function will be called.
In the transformed (CUDA) code, however, overloading will cause the
function sqrtf() to be called.  Until this issue has been resolved in PPCG,
we recommend that users either explicitly call the function sqrtf() or
explicitly cast the argument to double in the input code.


Contact

For bug reports, feature requests and questions,
contact http://groups.google.com/group/isl-development

Whenever you report a bug, please mention the exact version of PPCG
that you are using (output of "./ppcg --version").  If you are unable
to compile PPCG, then report the git version (output of "git describe")
or the version number included in the name of the tarball.


Citing PPCG

If you use PPCG for your research, you are invited to cite
the following paper.

@article{Verdoolaege2013PPCG,
    author = {Verdoolaege, Sven and Juega, Juan Carlos and Cohen, Albert and
		G\'{o}mez, Jos{\'e} Ignacio and Tenllado, Christian and
		Catthoor, Francky},
    title = {Polyhedral parallel code generation for CUDA},
    journal = {ACM Trans. Archit. Code Optim.},
    issue_date = {January 2013},
    volume = {9},
    number = {4},
    month = jan,
    year = {2013},
    issn = {1544-3566},
    pages = {54:1--54:23},
    doi = {10.1145/2400682.2400713},
    acmid = {2400713},
    publisher = {ACM},
    address = {New York, NY, USA},
}


================================================
FILE: src/autogen.sh
================================================
#!/bin/sh
# Regenerate the autotools build files of this package and then run the
# autogen.sh of every bundled dependency that provides one.
autoreconf -i
for dep in isl barvinok pet; do
	if test -f "$dep/autogen.sh"; then
		# Use "&&" rather than ";": if the cd fails, ./autogen.sh must
		# not be run from the current directory, where it would be
		# this script again.
		(cd "$dep" && ./autogen.sh)
	fi
done


================================================
FILE: src/autosa_catapult_hls_c.cpp
================================================
#include <isl/ctx.h>

#include "autosa_catapult_hls_c.h"
#include "autosa_common.h"
#include "autosa_comm.h"
#include "autosa_print.h"
#include "autosa_trans.h"
#include "autosa_codegen.h"
#include "autosa_utils.h"

#include <set>

/* Bundle of the objects needed while printing host code.
 * NOTE(review): presumably passed as the "user" pointer to printing
 * callbacks; the users are outside this part of the file.
 */
struct print_host_user_data
{
  /* Output streams and code generation settings. */
  struct hls_info *hls;
  /* Program-level information. */
  struct autosa_prog *prog;
  /* Top-level hardware module (gives access to its kernel and sub-modules). */
  struct autosa_hw_top_module *top;
};

/* Bundle of the objects needed while printing a single hardware module.
 * NOTE(review): presumably passed as the "user" pointer to printing
 * callbacks; the users are outside this part of the file.
 */
struct print_hw_module_data
{
  /* Output streams and code generation settings. */
  struct hls_info *hls;
  /* Program-level information. */
  struct autosa_prog *prog;
  /* The hardware module being printed. */
  struct autosa_hw_module *module;
  /* Used for double buffer codegen. Modify the printed iterator prefix. */
  const char *iterator_prefix;
};

/* Open the host .cpp file and the kernel .h and .cpp files for writing.
 * Add the necessary includes.
 */
static void hls_open_files(struct hls_info *info, const char *input)
{
  char name[PATH_MAX];
  char dir[PATH_MAX];
  int len, len_dir;
  isl_printer *p_str;
  char *file_path;

  p_str = isl_printer_to_str(info->ctx);
  p_str = isl_printer_print_str(p_str, info->output_dir);
  p_str = isl_printer_print_str(p_str, "/src/");
  file_path = isl_printer_get_str(p_str);
  isl_printer_free(p_str);
  len = ppcg_extract_base_name(name, input);

  /* Store the prefix */
  strncpy(dir, name, len);
  dir[len] = '\0';
  p_str = isl_printer_to_str(info->ctx);
  p_str = isl_printer_print_str(p_str, dir);
  info->kernel_prefix = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  /* Add the prefix */
  sprintf(dir, "%s", file_path);
  len_dir = strlen(file_path);

  strcpy(name + len, "_host.cpp");
  strcpy(dir + len_dir, name);
  info->host_c = fopen(dir, "w");
  if (!info->host_c)
  {
    printf("[AutoSA] Error: Can't open the file: %s\n", dir);
    exit(1);
  }

  //if (!info->hls)
  //{
  //  /* OpenCL host */
  //  strcpy(name + len, "_host.hpp");
  //  strcpy(dir + len_dir, name);
  //  info->host_h = fopen(dir, "w");
  //  print_xilinx_host_header(info->host_h);
  //  fprintf(info->host_c, "#include \"%s\"\n", name);
  //}

  strcpy(name + len, "_directives.tcl");
  strcpy(dir + len_dir, name);
  info->tcl = fopen(dir, "w");
  if (!info->tcl) 
  {
    printf("[AutoSA] Error: Can't open the file: %s\n", dir);
    exit(1);
  }

  strcpy(name + len, "_kernel_modules.cpp");
  strcpy(dir + len_dir, name);
  info->kernel_c = fopen(dir, "w");
  if (!info->kernel_c)
  {
    printf("[AutoSA] Error: Can't open the file: %s\n", dir);
    exit(1);
  }

  strcpy(name + len, "_kernel.h");
  strcpy(dir + len_dir, name);
  info->kernel_h = fopen(dir, "w");
  if (!info->kernel_h)
  {
    printf("[AutoSA] Error: Can't open the file: %s\n", dir);
    exit(1);
  }

  //fprintf(info->host_c, "#include <assert.h>\n");
  //fprintf(info->host_c, "#include <stdio.h>\n");
  fprintf(info->host_c, "#include <vector>\n");
  fprintf(info->host_c, "#include <cstdlib>\n");
  if (info->hls)
    fprintf(info->host_c, "#include \"%s\"\n", name);

  if (info->hls) {
    fprintf(info->kernel_c, "#include \"%s\"\n", name);
    //fprintf(info->kernel_c, "#include <mc_scverify.h>\n");
  }

  if (info->hls) {
    strcpy(name + len, "_kernel_hw.h");
    fprintf(info->host_c, "#include \"%s\"\n", name);
    fprintf(info->host_c, "#include <mc_scverify.h>\n\n");
  }    

  strcpy(name + len, "_top_gen.cpp");
  strcpy(dir + len_dir, name);
  info->top_gen_c = fopen(dir, "w");

  strcpy(name + len, "_top_gen.h");
  strcpy(dir + len_dir, name);
  info->top_gen_h = fopen(dir, "w");

  fprintf(info->top_gen_c, "#include <isl/printer.h>\n");
  fprintf(info->top_gen_c, "#include \"%s\"\n", name);
  
  fprintf(info->kernel_h, "#ifndef _KERNEL_H_\n");
  fprintf(info->kernel_h, "#define _KERNEL_H_\n");
  fprintf(info->kernel_h, "#include <ac_int.h>\n");
  fprintf(info->kernel_h, "#include <ac_channel.h>\n");
  fprintf(info->kernel_h, "#include <ac_float.h>\n");
  fprintf(info->kernel_h, "#include <ac_std_float.h>\n");
  fprintf(info->kernel_h, "#include <ac_math.h>\n");
  fprintf(info->kernel_h, "\n");

  fprintf(info->kernel_h, "#define min(x,y) ((x < y) ? x : y)\n");
  fprintf(info->kernel_h, "#define max(x,y) ((x > y) ? x : y)\n");
  fprintf(info->kernel_h, "\n");

  free(file_path);
}

/* Close all output files and free the kernel prefix string.
 * The include guard of the kernel header is terminated first.
 * Finally, create an empty file "<output_dir>/src/completed".
 * A failure to create that file is reported but is not fatal.
 */
static void hls_close_files(struct hls_info *info)
{
  isl_printer *p_str;
  char *complete;
  FILE *f;

  fprintf(info->kernel_h, "#endif\n\n");

  fclose(info->kernel_c);
  fclose(info->kernel_h);
  fclose(info->host_c);
  /* hls_open_files() does not open host_h, so only close it
   * if some other code has set it.
   */
  if (!info->hls && info->host_h)
  {
    fclose(info->host_h);
  }
  fclose(info->top_gen_c);
  fclose(info->top_gen_h);
  fclose(info->tcl);
  free(info->kernel_prefix);

  p_str = isl_printer_to_str(info->ctx);
  p_str = isl_printer_print_str(p_str, info->output_dir);
  p_str = isl_printer_print_str(p_str, "/src/completed");
  complete = isl_printer_get_str(p_str);
  isl_printer_free(p_str);
  f = fopen(complete, "w");
  /* Calling fclose() on a NULL stream is undefined behavior. */
  if (f)
    fclose(f);
  else
    printf("[AutoSA] Error: Can't open the file: %s\n", complete);
  free(complete);
}

/* Insert "factor" into the list "data_pack_factors" of "*n_factor" elements,
 * unless it already appears in the list.
 * The list is assumed to be in ascending order and is kept that way.
 * In particular, a value smaller than all current elements is inserted
 * at the front of the list.
 * Return the (possibly reallocated) list and update "*n_factor".
 * Exit with an error message if the list cannot be extended.
 */
static int *insert_sorted_data_pack_factor(int *data_pack_factors,
                                           int *n_factor, int factor)
{
  int pos = 0;

  /* Find the first element that is not smaller than "factor". */
  while (pos < *n_factor && data_pack_factors[pos] < factor)
    pos++;
  if (pos < *n_factor && data_pack_factors[pos] == factor)
    return data_pack_factors;

  data_pack_factors = (int *)realloc(data_pack_factors,
                                     sizeof(int) * (*n_factor + 1));
  if (!data_pack_factors)
  {
    printf("[AutoSA] Error: Can't allocate memory for the data pack factors.\n");
    exit(1);
  }
  /* Shift the larger elements up by one position. */
  for (int j = *n_factor; j > pos; j--)
  {
    data_pack_factors[j] = data_pack_factors[j - 1];
  }
  data_pack_factors[pos] = factor;
  *n_factor = *n_factor + 1;

  return data_pack_factors;
}

/* Extract the data pack factors for each I/O buffer allocated for the current
 * I/O group.
 * Only insert the data pack factor that is not found in the current list
 * "data_pack_factors".
 * The list is in ascending order.
 *
 * The default packing factor of the group itself is only considered
 * if it is greater than one.
 * Return the (possibly reallocated) list and update "*n_factor".
 */
static int *extract_data_pack_factors(int *data_pack_factors,
                                      int *n_factor, struct autosa_array_ref_group *group)
{
  /* Test if the group default packing factor needs to be inserted */
  if (group->n_lane > 1)
  {
    data_pack_factors = insert_sorted_data_pack_factor(data_pack_factors,
                                                       n_factor, group->n_lane);
  }

  for (int i = 0; i < group->n_io_buffer; i++)
  {
    struct autosa_io_buffer *buf = group->io_buffers[i];

    data_pack_factors = insert_sorted_data_pack_factor(data_pack_factors,
                                                       n_factor, buf->n_lane);
  }

  return data_pack_factors;
}

/* Examine the local buffers of each array group. 
 * Extract the data pack factors and build the data types 
 * required by the program. 
 */
static isl_stat print_data_types_catapult(
  struct autosa_hw_top_module *top, struct hls_info *hls)
{
  isl_printer *p;
  struct autosa_kernel *kernel;

  kernel = top->kernel;
  p = isl_printer_to_file(kernel->ctx, hls->kernel_h);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  p = print_str_new_line(p, "/* Data Type */");
  
  /* Print the primitive data type. */
  for (int i = 0; i < kernel->n_array; i++) {
    struct autosa_local_array_info *local = &kernel->array[i];
    if (!strcmp(local->array->type, "float")) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "typedef ac_ieee_float<binary32> ");
      p = isl_printer_print_str(p, local->array->name);
      p = isl_printer_print_str(p, "_t1;");
      p = isl_printer_end_line(p);
    } else if (!strcmp(local->array->type, "unsigned short")) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "typedef ac_int<");
      p = isl_printer_print_int(p, local->array->size * 8);
      p = isl_printer_print_str(p, ",false> ");
      p = isl_printer_print_str(p, local->array->name);
      p = isl_printer_print_str(p, "_t1;");
      p = isl_printer_end_line(p);      
    } else if (!strcmp(local->array->type, "unsigned int")) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "typedef ac_int<");
      p = isl_printer_print_int(p, local->array->size * 8);
      p = isl_printer_print_str(p, ",false> ");
      p = isl_printer_print_str(p, local->array->name);
      p = isl_printer_print_str(p, "_t1;");
      p = isl_printer_end_line(p);      
    } else {
      printf("[AutoSA] Warning: The primitive data type is not converted to Catapult data type.\n");
      continue;
    }
  }

  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *local = &kernel->array[i];
    int *data_pack_factors = (int *)malloc(sizeof(int));
    int n_factor = 1;
    /* First insert the default data pack factor for the array. */
    data_pack_factors[0] = local->n_lane;    

    /* IO group */
    for (int n = 0; n < local->n_io_group; n++)
    {
      struct autosa_array_ref_group *group = local->io_groups[n];
      data_pack_factors = extract_data_pack_factors(data_pack_factors, &n_factor, group);
    }
    /* Drain group */
    if (local->drain_group)
      data_pack_factors = extract_data_pack_factors(data_pack_factors, &n_factor, local->drain_group);

    if (local->is_sparse) {
      std::set<int> tmp_lanes;
      for (int n = 0; n < n_factor; n++) {
        tmp_lanes.insert(data_pack_factors[n] * kernel->n_nzero);
        tmp_lanes.insert(data_pack_factors[n]);
      }
      for (auto it = tmp_lanes.begin(); it != tmp_lanes.end(); ++it) {
        int f = *it;
        if (local->array->size * 8 * f > 1024) {
          printf("[AutoSA] Warning: The data width %d is greater than 1024-bit. The type definition is not generated.\n", local->array->size * 8 * f);
          continue;
        }
        if (f > 1) {
          p = isl_printer_start_line(p);
          //p = isl_printer_print_str(p, "typedef ap_uint<");
          p = isl_printer_print_str(p, "typedef ac_int<");
          p = isl_printer_print_int(p, local->array->size * 8 * f);
          p = isl_printer_print_str(p, ",false");
          p = isl_printer_print_str(p, "> ");
          p = isl_printer_print_str(p, local->array->name);
          p = isl_printer_print_str(p, "_t");
          p = isl_printer_print_int(p, f);
          p = isl_printer_print_str(p, ";");
          p = isl_printer_end_line(p);
        }
      }

      for (int n = 0; n < n_factor; n++) {
        if (data_pack_factors[n] * kernel->n_nzero * local->array->size * 8 > 1024)
          continue;
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "typedef struct ");
        p = isl_printer_print_str(p, local->array->name);
        p = isl_printer_print_str(p, "_s_t");
        p = isl_printer_print_int(p, data_pack_factors[n]);
        p = isl_printer_print_str(p, " {");
        p = isl_printer_end_line(p);

        p = isl_printer_indent(p, 2);
        
        p = isl_printer_start_line(p);
        if (data_pack_factors[n] == 1 && kernel->n_nzero == 1) {
          p = isl_printer_print_str(p, local->array->type);
        } else {
          p = isl_printer_print_str(p, local->array->name);
          p = isl_printer_print_str(p, "_t");
          p = isl_printer_print_int(p, data_pack_factors[n] * kernel->n_nzero);
        }
        p = isl_printer_print_str(p, " d;");
        p = isl_printer_end_line(p);

        p = isl_printer_start_line(p);
        if (data_pack_factors[n] == 1 && kernel->n_nzero == 1) {
          p = isl_printer_print_str(p, "unsigned char");  
        } else {
          //p = isl_printer_print_str(p, "ap_uint<");
          p = isl_printer_print_str(p, "ac_int<");
          p = isl_printer_print_int(p, 8 * data_pack_factors[n]);
          p = isl_printer_print_str(p, ",false");
          p = isl_printer_print_str(p, ">");
        }
        p = isl_printer_print_str(p, " i;");
        p = isl_printer_end_line(p);

        p = isl_printer_indent(p, -2);
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "} ");
        p = isl_printer_print_str(p, local->array->name);
        p = isl_printer_print_str(p, "_s_t");
        p = isl_printer_print_int(p, data_pack_factors[n]);
        p = isl_printer_print_str(p, ";");
        p = isl_printer_end_line(p);
      }
    } else {
      for (int n = 0; n < n_factor; n++)
      {
        if (data_pack_factors[n] != 1)
        {
          int width;
          width = local->array->size * 8 * data_pack_factors[n];
          p = isl_printer_start_line(p);
          //p = isl_printer_print_str(p, "typedef ap_uint<");
          p = isl_printer_print_str(p, "typedef ac_int<");
          p = isl_printer_print_int(p, width);
          p = isl_printer_print_str(p, ",false");
          p = isl_printer_print_str(p, "> ");
          p = isl_printer_print_str(p, local->array->name);
          p = isl_printer_print_str(p, "_t");
          p = isl_printer_print_int(p, data_pack_factors[n]);
          p = isl_printer_print_str(p, ";");
          p = isl_printer_end_line(p);
        }
      }
    }
    free(data_pack_factors);    
  }
  p = print_str_new_line(p, "/* Data Type */");
  p = isl_printer_end_line(p);
  isl_printer_free(p);

  return isl_stat_ok;
}

static __isl_give isl_printer *declare_and_allocate_cpu_arrays_catapult(
  __isl_take isl_printer *p, struct autosa_prog *prog, 
  struct autosa_kernel *kernel, struct autosa_hw_top_module *top)
{
  p = print_str_new_line(p, "// Allocate memory in host memory");
  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *local_array = &kernel->array[i];
    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    if (local_array->n_mem_ports > 1 && local_array->array->copy_out)
    {
      /* Create multiple host buffers. */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "std::vector<");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, " *> ");
      p = isl_printer_print_str(p, "dev_");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->host_serialize) {
        p = isl_printer_print_str(p, "_unserialized");
      }
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "for (int i = 0; i < ");
      p = isl_printer_print_int(p, local_array->n_mem_ports);
      p = isl_printer_print_str(p, "; i++) {");
      p = isl_printer_end_line(p);
      p = isl_printer_indent(p, 2);

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, " *dev_");
      p = isl_printer_print_str(p, local_array->array->name);
      p = isl_printer_print_str(p, "_tmp");
      p = isl_printer_print_str(p, " = (");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, " *)malloc(");
      p = autosa_array_info_print_data_size(p, local_array->array);      
      p = isl_printer_print_str(p, " * sizeof(");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, "));");
      p = isl_printer_end_line(p);

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "dev_");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->host_serialize)
        p = isl_printer_print_str(p, "_unserialized");
      p = isl_printer_print_str(p, ".push_back(dev_");
      p = isl_printer_print_str(p, local_array->array->name);
      p = isl_printer_print_str(p, "_tmp);");
      p = isl_printer_end_line(p);

      p = isl_printer_indent(p, -2);
      p = print_str_new_line(p, "}");

      if (local_array->host_serialize) {
        /* Allocate additional serialize buffer. */
        /* Create multiple host buffers. */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "std::vector<");
        p = isl_printer_print_str(p, local_array->array->type);
        p = isl_printer_print_str(p, " *> ");
        p = isl_printer_print_str(p, "dev_");
        p = isl_printer_print_str(p, local_array->array->name);      
        p = isl_printer_print_str(p, ";");
        p = isl_printer_end_line(p);

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int i = 0; i < ");
        p = isl_printer_print_int(p, local_array->n_mem_ports);
        p = isl_printer_print_str(p, "; i++) {");
        p = isl_printer_end_line(p);
        p = isl_printer_indent(p, 2);

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, local_array->array->type);
        p = isl_printer_print_str(p, " *dev_");
        p = isl_printer_print_str(p, local_array->array->name);
        p = isl_printer_print_str(p, "_tmp");
        p = isl_printer_print_str(p, " = (");
        p = isl_printer_print_str(p, local_array->array->type);
        p = isl_printer_print_str(p, " *)malloc(");
        //p = autosa_array_info_print_data_size(p, local_array->array);
        p = isl_printer_print_pw_qpolynomial(p, local_array->serialize_bound);
        if (local_array->is_sparse) {
          p = isl_printer_print_str(p, " / ");
          p = isl_printer_print_double(p, (double)local_array->eff_compress_ratio);
        }
        p = isl_printer_print_str(p, " * sizeof(");
        p = isl_printer_print_str(p, local_array->array->type);
        p = isl_printer_print_str(p, "));");
        p = isl_printer_end_line(p);

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "dev_");
        p = isl_printer_print_str(p, local_array->array->name);
        p = isl_printer_print_str(p, ".push_back(dev_");
        p = isl_printer_print_str(p, local_array->array->name);
        p = isl_printer_print_str(p, "_tmp);");
        p = isl_printer_end_line(p);

        p = isl_printer_indent(p, -2);
        p = print_str_new_line(p, "}");
      }
    }
    else
    {
      /* Create a single host buffer. */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, " *dev_");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->host_serialize)
        p = isl_printer_print_str(p, "_unserialized");
      p = isl_printer_print_str(p, " = (");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, " *)malloc(");
      p = autosa_array_info_print_data_size(p, local_array->array);
      p = isl_printer_print_str(p, " * sizeof(");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, "));");
      p = isl_printer_end_line(p);

      if (local_array->host_serialize) {
        /* Create a single host buffer. */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, local_array->array->type);
        p = isl_printer_print_str(p, " *dev_");
        p = isl_printer_print_str(p, local_array->array->name);       
        p = isl_printer_print_str(p, " = (");
        p = isl_printer_print_str(p, local_array->array->type);
        p = isl_printer_print_str(p, " *)malloc(");        
        p = isl_printer_print_pw_qpolynomial(p, local_array->serialize_bound);
        if (local_array->is_sparse) {
          p = isl_printer_print_str(p, " / ");
          p = isl_printer_print_double(p, (double)local_array->eff_compress_ratio);
        }
        p = isl_printer_print_str(p, " * sizeof(");
        p = isl_printer_print_str(p, local_array->array->type);
        p = isl_printer_print_str(p, "));");
        p = isl_printer_end_line(p);
      }
    }    
  }
  p = isl_printer_end_line(p);

  /* Initialize buffer. */
  p = print_str_new_line(p, "// Initialize host buffers");
  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *local_array = &kernel->array[i];
    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    if (local_array->n_mem_ports > 1 && local_array->array->copy_out)
    {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "for (int i = 0; i < ");
      p = isl_printer_print_int(p, local_array->n_mem_ports);
      p = isl_printer_print_str(p, "; i++) {");
      p = isl_printer_end_line(p);
      p = isl_printer_indent(p, 2);

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "memcpy(dev_");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->host_serialize)
        p = isl_printer_print_str(p, "_unserialized");
      p = isl_printer_print_str(p, "[i]");      
      p = isl_printer_print_str(p, ", ");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->is_sparse)
        p = isl_printer_print_str(p, "_s");
      p = isl_printer_print_str(p, ", ");
      p = autosa_array_info_print_data_size(p, local_array->array);
      p = isl_printer_print_str(p, " * sizeof(");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, "));");
      p = isl_printer_end_line(p);

      p = isl_printer_indent(p, -2);
      p = print_str_new_line(p, "}");
    }
    else
    {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "memcpy(dev_");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->host_serialize)
        p = isl_printer_print_str(p, "_unserialized");
      p = isl_printer_print_str(p, ", ");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->is_sparse)
        p = isl_printer_print_str(p, "_s");
      p = isl_printer_print_str(p, ", ");
      p = autosa_array_info_print_data_size(p, local_array->array);
      p = isl_printer_print_str(p, " * sizeof(");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, "));");
      p = isl_printer_end_line(p);
    }
  }
  
  /* Perform data serialization if needed. */
  for (int i = 0; i < top->n_hw_modules; i++) {
    struct autosa_hw_module *module = top->hw_modules[i];
    if (module->serialize_tree && module->in) {
      struct autosa_array_ref_group *group = module->io_groups[0];
      struct autosa_local_array_info *local_array = group->local_array;
      if (local_array->n_mem_ports > 1 && local_array->array->copy_out)
      {
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int i = 0; i < ");
        p = isl_printer_print_int(p, local_array->n_mem_ports);
        p = isl_printer_print_str(p, "; i++) {");
        p = isl_printer_end_line(p);
        p = isl_printer_indent(p, 2);
  
        p = isl_printer_start_line(p);        
        p = isl_printer_print_str(p, module->in? "host_serialize_" : "host_deserialize_");
        p = isl_printer_print_str(p, local_array->array->name);            
        p = isl_printer_print_str(p, "(");
        p = print_host_serialize_arguments(p, kernel, group, module, 0, 0);  // TODO: add hbm support later.
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);
  
        p = isl_printer_indent(p, -2);
        p = print_str_new_line(p, "}");
      } else 
      {
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, module->in? "host_serialize_" : "host_deserialize_");
        p = isl_printer_print_str(p, local_array->array->name);
        p = isl_printer_print_str(p, "(");
        p = print_host_serialize_arguments(p, kernel, group, module, 0, 0);
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);
      }
    }
  }  
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "// Allocate buffers in device memory");
  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *local_array = &kernel->array[i];
    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "std::vector<");
    p = autosa_print_array_type(p, local_array->array);
    p = isl_printer_print_str(p, " *> buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
  }

  for (int i = 0; i < kernel->n_array; i++)
  {
    int indent1, indent2;
    struct autosa_local_array_info *local_array = &kernel->array[i];
    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "for (int i = 0; i < ");
    p = isl_printer_print_int(p, local_array->n_mem_ports);
    p = isl_printer_print_str(p, "; i++) {");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, 2);

    p = isl_printer_start_line(p);
    p = autosa_print_array_type(p, local_array->array);
    p = isl_printer_print_str(p, " *buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, "_tmp = (");
    p = autosa_print_array_type(p, local_array->array);
    p = isl_printer_print_str(p, " *)malloc(");
    if (local_array->host_serialize) {
      p = autosa_array_info_print_serialize_data_size(p, local_array->array);
    } else {
      p = autosa_array_info_print_data_size(p, local_array->array);
    }
    p = isl_printer_print_str(p, " / ");
    p = isl_printer_print_int(p, local_array->array->n_lane);
    p = isl_printer_print_str(p, " * sizeof(");
    p = autosa_print_array_type(p, local_array->array);
    p = isl_printer_print_str(p, "));");
    p = isl_printer_end_line(p);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, ".push_back(buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, "_tmp);");
    p = isl_printer_end_line(p);

    p = isl_printer_indent(p, -2);
    p = print_str_new_line(p, "}");
  }
  p = isl_printer_end_line(p);

  return p;
}

/* Emit the host-side initialization code for the Catapult flow.
 *
 * The local declarations of "prog" are printed first through
 * autosa_print_local_declarations(), followed by the CPU-side array
 * declarations and allocations produced by
 * declare_and_allocate_cpu_arrays_catapult().
 *
 * "hls" is not consulted: the CPU-array path is the only one generated.
 */
static __isl_give isl_printer *init_device_catapult(__isl_take isl_printer *p,
                                                    struct autosa_prog *prog, 
                                                    struct autosa_kernel *kernel, 
                                                    int hls,
                                                    struct autosa_hw_top_module *top)
{
  isl_printer *out = autosa_print_local_declarations(p, prog);

  return declare_and_allocate_cpu_arrays_catapult(out, prog, kernel, top);
}

/* Print "for (int i = 0; i < <n_ports>; i++) {" on its own line and
 * increase the printer indentation by two.
 */
static __isl_give isl_printer *free_cpu_arrays_catapult_open_loop(
  __isl_take isl_printer *p, int n_ports)
{
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int i = 0; i < ");
  p = isl_printer_print_int(p, n_ports);
  p = isl_printer_print_str(p, "; i++) {");
  p = isl_printer_end_line(p);
  return isl_printer_indent(p, 2);
}

/* Decrease the printer indentation by two and print the closing "}". */
static __isl_give isl_printer *free_cpu_arrays_catapult_close_loop(
  __isl_take isl_printer *p)
{
  p = isl_printer_indent(p, -2);
  return print_str_new_line(p, "}");
}

/* Print a single line "<call><name>[_unserialized]([i]);" where
 * "call" is the already-prefixed opening (e.g. "free(dev_"),
 * "unserialized" selects the "_unserialized" suffix and
 * "indexed" selects the "[i]" subscript.
 */
static __isl_give isl_printer *free_cpu_arrays_catapult_print_free(
  __isl_take isl_printer *p, const char *call, const char *name,
  int unserialized, int indexed)
{
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, call);
  p = isl_printer_print_str(p, name);
  if (unserialized)
    p = isl_printer_print_str(p, "_unserialized");
  p = isl_printer_print_str(p, indexed ? "[i]);" : ");");
  return isl_printer_end_line(p);
}

/* Print the host code that releases every CPU buffer of "kernel".
 *
 * Only arrays for which autosa_array_requires_device_allocation() holds
 * are considered.  Two passes are printed:
 *  1. a loop over the memory ports freeing each "buffer_<name>[i]";
 *  2. the release of "dev_<name>" and, for host-serialized arrays,
 *     "dev_<name>_unserialized".  These are freed per port (inside a loop,
 *     subscripted with "[i]") when the array has more than one memory port
 *     and is copied out; otherwise they are freed as single pointers.
 */
static __isl_give isl_printer *autosa_free_cpu_arrays_catapult(
  __isl_take isl_printer *p, struct autosa_prog *prog, struct autosa_kernel *kernel)
{
  int idx;

  p = print_str_new_line(p, "// Clean up resources");

  /* Pass 1: the per-port "buffer_<name>" entries. */
  for (idx = 0; idx < kernel->n_array; idx++)
  {
    struct autosa_local_array_info *info = &kernel->array[idx];

    if (!autosa_array_requires_device_allocation(info->array))
      continue;

    p = free_cpu_arrays_catapult_open_loop(p, info->n_mem_ports);
    p = free_cpu_arrays_catapult_print_free(p, "free(buffer_",
                                            info->array->name, 0, 1);
    p = free_cpu_arrays_catapult_close_loop(p);
  }

  /* Pass 2: the "dev_<name>" host copies. */
  for (idx = 0; idx < kernel->n_array; idx++)
  {
    struct autosa_local_array_info *info = &kernel->array[idx];
    int per_port;

    if (!autosa_array_requires_device_allocation(info->array))
      continue;

    per_port = info->n_mem_ports > 1 && info->array->copy_out;

    if (per_port)
      p = free_cpu_arrays_catapult_open_loop(p, info->n_mem_ports);

    p = free_cpu_arrays_catapult_print_free(p, "free(dev_",
                                            info->array->name, 0, per_port);
    if (info->host_serialize)
      p = free_cpu_arrays_catapult_print_free(p, "free(dev_",
                                              info->array->name, 1, per_port);

    if (per_port)
      p = free_cpu_arrays_catapult_close_loop(p);
  }

  return p;
}

/* Print one line
 *   memcpy(<name>, dev_<name>[_unserialized][[0]], <size>);
 * copying the host-side device image of "array" back into the user array.
 * The "_unserialized" suffix is used for host-serialized arrays and the
 * "[0]" subscript when the array has more than one memory port.
 */
static __isl_give isl_printer *clear_device_catapult_print_restore(
  __isl_take isl_printer *p, struct autosa_array_info *array)
{
  struct autosa_local_array_info *info = array->local_array;

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "memcpy(");
  p = isl_printer_print_str(p, array->name);
  p = isl_printer_print_str(p, ", dev_");
  p = isl_printer_print_str(p, array->name);
  if (info->host_serialize)
    p = isl_printer_print_str(p, "_unserialized");
  if (info->n_mem_ports > 1)
    p = isl_printer_print_str(p, "[0]");
  p = isl_printer_print_str(p, ", ");
  p = autosa_array_info_print_size(p, array);
  p = isl_printer_print_str(p, ");");
  return isl_printer_end_line(p);
}

/* Print the host code that runs after the kernel has executed.
 *
 * First, for every hardware module of "top" that has a serialize tree and
 * is an output module, a call "host_deserialize_<array>(...)" is printed.
 *
 * Then, only when "hls" is set:
 *  - every array of "prog" that requires device allocation and is copied
 *    out is restored into the user array with a memcpy;
 *  - the CPU buffers are released through autosa_free_cpu_arrays_catapult().
 * When "hls" is zero nothing beyond the deserialization calls is printed.
 */
static __isl_give isl_printer *clear_device_catapult(__isl_take isl_printer *p,
                                                   struct autosa_prog *prog, 
                                                   struct autosa_kernel *kernel, 
                                                   int hls,
                                                   struct autosa_hw_top_module *top)
{
  int idx;

  /* Deserialize the output buffers where needed. */
  for (idx = 0; idx < top->n_hw_modules; idx++) {
    struct autosa_hw_module *module = top->hw_modules[idx];
    struct autosa_array_ref_group *group;

    if (!module->serialize_tree || module->in)
      continue;

    group = module->io_groups[0];
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "host_deserialize_");
    p = isl_printer_print_str(p, group->local_array->array->name);
    p = isl_printer_print_str(p, "(");
    /* TODO: add hbm support later. */
    p = print_host_serialize_arguments(p, top->kernel, group, module, 0, 0);
    p = isl_printer_print_str(p, ");");
    p = isl_printer_end_line(p);
  }

  if (!hls)
    return p;

  /* Copy results back into the user arrays. */
  p = print_str_new_line(p, "// Restore data from host buffers");
  for (idx = 0; idx < prog->n_array; idx++)
  {
    struct autosa_array_info *array = &prog->array[idx];

    if (!autosa_array_requires_device_allocation(array) || !array->copy_out)
      continue;

    p = clear_device_catapult_print_restore(p, array);
  }
  p = isl_printer_end_line(p);

  return autosa_free_cpu_arrays_catapult(p, prog, kernel);
}

/* Print the host loop that merges the drained results of "func".
 *
 * The emitted code has the shape
 *
 *   // Merge results
 *   for (int idx = <first>; idx < <first + n>; idx++) {
 *     <group prefix>_drain_merge(<arguments>);
 *   }
 *
 * where <first> is the group's mem_port_id and <n> its n_mem_ports.
 * The arguments are produced by print_drain_merge_arguments().
 */
static __isl_give isl_printer *drain_merge_catapult(
  __isl_take isl_printer *p, struct autosa_prog *prog,
  struct autosa_drain_merge_func *func,
  int hls)
{
  struct autosa_array_ref_group *io_group = func->group;
  int first_port = io_group->mem_port_id;
  int port_end = io_group->mem_port_id + io_group->n_mem_ports;

  p = print_str_new_line(p, "// Merge results");

  /* Loop header over the memory ports of the group. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int idx = ");
  p = isl_printer_print_int(p, first_port);
  p = isl_printer_print_str(p, "; idx < ");
  p = isl_printer_print_int(p, port_end);
  p = isl_printer_print_str(p, "; idx++) {");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, 2);

  /* Loop body: the call to the merge function. */
  p = isl_printer_start_line(p);
  p = autosa_array_ref_group_print_prefix(io_group, p);
  p = isl_printer_print_str(p, "_drain_merge(");
  p = print_drain_merge_arguments(p, func->kernel, io_group, func, 0, hls);
  p = isl_printer_print_str(p, ");");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, -2);

  p = print_str_new_line(p, "}");
  return isl_printer_end_line(p);
}

/* Print the loop bound "<data size> / <n_lane>" for "array".
 * The serialized data size is used for host-serialized arrays.
 */
static __isl_give isl_printer *to_device_catapult_print_word_count(
  __isl_take isl_printer *p, struct autosa_array_info *array)
{
  if (array->local_array->host_serialize)
    p = autosa_array_info_print_serialize_data_size(p, array);
  else
    p = autosa_array_info_print_data_size(p, array);
  p = isl_printer_print_str(p, " / ");
  return isl_printer_print_int(p, array->n_lane);
}

/* Print code to "p" that packs the host image "dev_<name>" of "array"
 * into the per-port "buffer_<name>" words.
 *
 * The emitted code has the shape
 *
 *   for (int i = 0; i < <n_mem_ports>; i++) {
 *     for (int c0 = 0; c0 < <data size> / <n_lane>; c0++) {
 *       <array type> tmp;
 *       for (int c1 = 0; c1 < <n_lane>; c1++) {
 *         tmp.set_slc(c1 * <bits>, (<name>_t1)dev_<name>[i][c0 * <n_lane> + c1]);
 *       }
 *       buffer_<name>[i][c0] = tmp;
 *     }
 *   }
 *
 * <bits> is array->size * 8.  The "[i]" subscript on "dev_<name>" is only
 * printed when the array has more than one memory port and is copied out.
 * "hls" is not consulted.
 */
static __isl_give isl_printer *copy_array_to_device_catapult(
  __isl_take isl_printer *p,
  struct autosa_array_info *array, int hls)
{
  struct autosa_local_array_info *info = array->local_array;
  const int per_port = info->n_mem_ports > 1 && info->array->copy_out;
  const int lanes = array->n_lane;
  const int elem_bits = array->size * 8;

  /* Outer loop over the memory ports. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int i = 0; i < ");
  p = isl_printer_print_int(p, info->n_mem_ports);
  p = isl_printer_print_str(p, "; i++) {");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, 2);

  /* Loop over the packed words. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int c0 = 0; c0 < ");
  p = to_device_catapult_print_word_count(p, array);
  p = isl_printer_print_str(p, "; c0++) {");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, 2);

  /* Temporary holding one packed word. */
  p = isl_printer_start_line(p);
  p = autosa_print_array_type(p, array);
  p = isl_printer_print_str(p, " tmp;");
  p = isl_printer_end_line(p);

  /* Loop over the lanes of one word. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int c1 = 0; c1 < ");
  p = isl_printer_print_int(p, lanes);
  p = isl_printer_print_str(p, "; c1++) {");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, 2);

  /* Insert one element into its slice of the word. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "tmp.set_slc(c1 * ");
  p = isl_printer_print_int(p, elem_bits);
  p = isl_printer_print_str(p, ", (");
  p = isl_printer_print_str(p, array->name);
  p = isl_printer_print_str(p, "_t1)dev_");
  p = isl_printer_print_str(p, array->name);
  if (per_port)
    p = isl_printer_print_str(p, "[i]");
  p = isl_printer_print_str(p, "[c0 * ");
  p = isl_printer_print_int(p, lanes);
  p = isl_printer_print_str(p, " + c1]);");
  p = isl_printer_end_line(p);

  /* Close the lane loop. */
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  /* Store the packed word. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "buffer_");
  p = isl_printer_print_str(p, array->name);
  p = isl_printer_print_str(p, "[i][c0] = tmp;");
  p = isl_printer_end_line(p);

  /* Close the word loop and the port loop. */
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  return isl_printer_end_line(p);
}

/* Print the loop bound "<data size> / <n_lane>" for "array".
 * The serialized data size is used for host-serialized arrays.
 */
static __isl_give isl_printer *from_device_catapult_print_word_count(
  __isl_take isl_printer *p, struct autosa_array_info *array)
{
  if (array->local_array->host_serialize)
    p = autosa_array_info_print_serialize_data_size(p, array);
  else
    p = autosa_array_info_print_data_size(p, array);
  p = isl_printer_print_str(p, " / ");
  return isl_printer_print_int(p, array->n_lane);
}

/* Print code to "p" that unpacks the per-port "buffer_<name>" words of
 * "array" back into the host image "dev_<name>".
 *
 * The emitted code has the shape
 *
 *   for (int i = 0; i < <n_mem_ports>; i++) {
 *     for (int c0 = 0; c0 < <data size> / <n_lane>; c0++) {
 *       <array type> tmp = buffer_<name>[i][c0];
 *       for (int c1 = 0; c1 < <n_lane>; c1++) {
 *         dev_<name>[i][c0 * <n_lane> + c1] = (<type>)tmp.slc<<bits>>(<bits> * c1);
 *       }
 *     }
 *   }
 *
 * <bits> is array->size * 8.  The "[i]" subscript on "dev_<name>" is only
 * printed when the array has more than one memory port and is copied out.
 * "hls" is not consulted.
 */
static __isl_give isl_printer *copy_array_from_device_catapult(
  __isl_take isl_printer *p, struct autosa_array_info *array, int hls)
{
  struct autosa_local_array_info *info = array->local_array;
  const int per_port = info->n_mem_ports > 1 && info->array->copy_out;
  const int lanes = array->n_lane;
  const int elem_bits = array->size * 8;

  /* Outer loop over the memory ports. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int i = 0; i < ");
  p = isl_printer_print_int(p, info->n_mem_ports);
  p = isl_printer_print_str(p, "; i++) {");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, 2);

  /* Loop over the packed words. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int c0 = 0; c0 < ");
  p = from_device_catapult_print_word_count(p, array);
  p = isl_printer_print_str(p, "; c0++) {");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, 2);

  /* Load one packed word. */
  p = isl_printer_start_line(p);
  p = autosa_print_array_type(p, array);
  p = isl_printer_print_str(p, " tmp = buffer_");
  p = isl_printer_print_str(p, array->name);
  p = isl_printer_print_str(p, "[i][c0];");
  p = isl_printer_end_line(p);

  /* Loop over the lanes of one word. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int c1 = 0; c1 < ");
  p = isl_printer_print_int(p, lanes);
  p = isl_printer_print_str(p, "; c1++) {");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, 2);

  /* Extract one element from its slice of the word. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "dev_");
  p = isl_printer_print_str(p, array->name);
  if (per_port)
    p = isl_printer_print_str(p, "[i]");
  p = isl_printer_print_str(p, "[c0 * ");
  p = isl_printer_print_int(p, lanes);
  p = isl_printer_print_str(p, " + c1] = (");
  p = isl_printer_print_str(p, array->type);
  p = isl_printer_print_str(p, ")tmp.slc<");
  p = isl_printer_print_int(p, elem_bits);
  p = isl_printer_print_str(p, ">(");
  p = isl_printer_print_int(p, elem_bits);
  p = isl_printer_print_str(p, " * c1);");
  p = isl_printer_end_line(p);

  /* Close the lane loop, the word loop and the port loop. */
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  return isl_printer_end_line(p);
}

/* Print a statement for copying an array to or from the device,
 * for merging drained results, or for initializing or clearing the device.
 *
 * The statement is identified by the name of the identifier found in the
 * first argument of the user expression of "node":
 *  - "init_device" / "clear_device": the user pointer is the autosa_kernel;
 *  - "drain_merge": the user pointer is the autosa_drain_merge_func;
 *  - "to_device_<array name>" / "from_device_<array name>": the user
 *    pointer is the autosa_array_info of the array to copy.
 *
 * The user pointer is fetched untyped and only interpreted once the name
 * is known.  If the identifier has no name, or a copy statement carries
 * no array, the printer is freed and NULL is returned.
 */
static __isl_give isl_printer *print_device_node_catapult(__isl_take isl_printer *p,
                                                          __isl_keep isl_ast_node *node, 
                                                          struct autosa_prog *prog, 
                                                          int hls,
                                                          struct autosa_hw_top_module *top)
{
  isl_ast_expr *expr, *arg;
  isl_id *id;
  const char *name;
  void *user;

  expr = isl_ast_node_user_get_expr(node);
  arg = isl_ast_expr_get_op_arg(expr, 0);
  id = isl_ast_expr_get_id(arg);
  name = isl_id_get_name(id);
  user = isl_id_get_user(id);
  /* Only the local references are dropped here; "node" is kept by the
   * caller, so "name" is still used below as in the original code.
   */
  isl_id_free(id);
  isl_ast_expr_free(arg);
  isl_ast_expr_free(expr);

  /* The name must be validated before any string comparison. */
  if (!name)
    return isl_printer_free(p);

  if (!strcmp(name, "init_device"))
    return init_device_catapult(p, prog, (struct autosa_kernel *)user, hls, top);
  if (!strcmp(name, "clear_device"))
    return clear_device_catapult(p, prog, (struct autosa_kernel *)user, hls, top);
  if (!strcmp(name, "drain_merge"))
    return drain_merge_catapult(p, prog,
                                (struct autosa_drain_merge_func *)user, hls);

  /* Everything else is a copy statement and needs an array. */
  if (!user)
    return isl_printer_free(p);

  if (!prefixcmp(name, "to_device"))
    return copy_array_to_device_catapult(p, (struct autosa_array_info *)user, hls);

  return copy_array_from_device_catapult(p, (struct autosa_array_info *)user, hls);
}

/* Print the user statement "node" of the host code to "p".
 *
 * "user" points to a print_host_user_data structure.  Three kinds of
 * statements are distinguished through the annotation of "node":
 *  - no annotation: a device statement (copy, init, clear, ...), which is
 *    forwarded to print_device_node_catapult();
 *  - annotation named "user": an original user statement, whose
 *    autosa_kernel_stmt is printed by autosa_kernel_print_domain();
 *  - any other annotation: a kernel launch, whose user pointer is the
 *    autosa_kernel.  A block is printed that instantiates "kernel0" and
 *    calls its run() method with the kernel arguments.
 *
 * "print_options" is released on entry.
 */
static __isl_give isl_printer *print_host_user_catapult(__isl_take isl_printer *p,
                                                        __isl_take isl_ast_print_options *print_options,
                                                        __isl_keep isl_ast_node *node, void *user)
{
  struct print_host_user_data *ctx = (struct print_host_user_data *)user;
  struct hls_info *hls = ctx->hls;
  struct autosa_hw_top_module *top = ctx->top;
  isl_id *annotation;
  void *payload;
  int is_user;

  isl_ast_print_options_free(print_options);

  annotation = isl_ast_node_get_annotation(node);
  if (!annotation)
    return print_device_node_catapult(p, node, ctx->prog, hls->hls, top);

  is_user = strcmp(isl_id_get_name(annotation), "user") == 0;
  payload = isl_id_get_user(annotation);
  isl_id_free(annotation);

  if (is_user)
    return autosa_kernel_print_domain(p, (struct autosa_kernel_stmt *)payload);

  /* Kernel launch: print the HLS host block. */
  p = ppcg_start_block(p);

  p = print_str_new_line(p, "// Launch the kernel");
  p = print_str_new_line(p, "kernel0 kernel0_inst;");

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "kernel");
  p = isl_printer_print_int(p, 0);
  p = isl_printer_print_str(p, "_inst.run(");
  p = print_kernel_arguments(p, ctx->prog, (struct autosa_kernel *)payload, 0, hls);
  p = isl_printer_print_str(p, ");");
  p = isl_printer_end_line(p);

  return ppcg_end_block(p);
}

/* Print the "CCS_BLOCK(run)(...)" header of "module".
 *
 * With "types" set, the declaration form is printed on a single line:
 *   void CCS_BLOCK(run)(<typed arguments>)
 * Without "types", the call form is printed, with the arguments on the
 * following line, indented by two:
 *   CCS_BLOCK(run)(
 *     <arguments>)
 * The arguments themselves come from print_module_arguments().
 * No line terminator is printed after the closing parenthesis.
 */
static __isl_give isl_printer *print_module_core_header_catapult(
  __isl_take isl_printer *p,
  struct autosa_prog *prog, struct autosa_hw_module *module,
  int inter, int boundary, int serialize, int types)
{
  const int call_form = !types;

  p = isl_printer_start_line(p);
  if (!call_form)
    p = isl_printer_print_str(p, "void ");
  p = isl_printer_print_str(p, "CCS_BLOCK(run)");
  p = isl_printer_print_str(p, "(");

  if (call_form) {
    /* Move the argument list to its own, indented line. */
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, 2);
    p = isl_printer_start_line(p);
  }

  p = print_module_arguments(p, prog, module->kernel, module, types,
                             CATAPULT_HW, inter, -1, boundary, serialize);
  p = isl_printer_print_str(p, ")");

  if (call_form)
    p = isl_printer_indent(p, -2);

  return p;
}

/* Print the element type used for the local buffer "var" of "module":
 * "<array>_s_t<n_lane>" for sparse arrays outside PE modules and
 * "<array>_t<n_lane>" otherwise.
 */
static __isl_give isl_printer *print_module_var_type_catapult(
    __isl_take isl_printer *p,
    struct autosa_kernel_var *var,
    struct autosa_hw_module *module)
{
  int sparse = var->array->local_array->is_sparse && module->type != PE_MODULE;

  p = isl_printer_print_str(p, var->array->name);
  p = isl_printer_print_str(p, sparse ? "_s_t" : "_t");
  return isl_printer_print_int(p, var->n_lane);
}

/* Print one declaration line
 *   <type> <var name>[<suffix>][d0][d1]...;
 * where the dimensions are the elements of var->size.
 * "suffix" may be NULL, in which case no suffix is printed.
 */
static __isl_give isl_printer *print_module_var_decl_catapult(
    __isl_take isl_printer *p,
    struct autosa_kernel_var *var,
    struct autosa_hw_module *module,
    const char *suffix)
{
  int j;

  p = isl_printer_start_line(p);
  p = print_module_var_type_catapult(p, var, module);
  p = isl_printer_print_str(p, " ");
  p = isl_printer_print_str(p, var->name);
  if (suffix)
    p = isl_printer_print_str(p, suffix);
  for (j = 0; j < isl_vec_size(var->size); ++j)
  {
    isl_val *v;

    p = isl_printer_print_str(p, "[");
    v = isl_vec_get_element_val(var->size, j);
    p = isl_printer_print_val(p, v);
    isl_val_free(v);
    p = isl_printer_print_str(p, "]");
  }
  p = isl_printer_print_str(p, ";");
  return isl_printer_end_line(p);
}

/* Print the declaration of the local variable "var" of "module" for the
 * Catapult back end.
 *
 * Without "double_buffer" a single declaration of "var" is printed.
 * With "double_buffer" two declarations are printed, suffixed "_ping" and
 * "_pong".  Both use the same element type, so the two halves of a double
 * buffer are always interchangeable (previously the "_pong" declaration
 * selected its type with different rules from the "_ping" one).
 */
static __isl_give isl_printer *print_module_var_catapult(
    __isl_take isl_printer *p,
    struct autosa_kernel_var *var, int double_buffer,
    struct autosa_hw_module *module)
{
  /* 0: FF 1: LUTRAM 2: BRAM 3: URAM.
   * The result is not used for the generated declaration; the call is
   * kept because this block cannot tell whether it has side effects.
   */
  int use_memory = extract_memory_type(module, var, module->options->autosa->uram);
  (void)use_memory;

  if (!double_buffer)
    return print_module_var_decl_catapult(p, var, module, NULL);

  p = print_module_var_decl_catapult(p, var, module, "_ping");
  return print_module_var_decl_catapult(p, var, module, "_pong");
}

/* Print the declarations of all local variables of "module".
 *
 * Declarations are only printed when "inter" equals -1; for any other
 * value the printer is returned untouched.  Each variable is printed by
 * print_module_var_catapult(), using the module's double_buffer setting.
 */
static __isl_give isl_printer *print_module_vars_catapult(
  __isl_take isl_printer *p, struct autosa_hw_module *module, int inter)
{
  int idx;

  if (inter != -1)
    return p;

  for (idx = 0; idx < module->n_var; ++idx)
    p = print_module_var_catapult(p, &module->var[idx], module->double_buffer, module);

  return p;
}

/* Print the for-node "node" preceded by a line holding the pragma
 * "#pragma hls_pipeline_init_interval 1".
 * The loop itself is printed by isl_ast_node_for_print(), which receives
 * "print_options".
 */
static __isl_give isl_printer *print_for_with_pipeline(
  __isl_keep isl_ast_node *node, __isl_take isl_printer *p,
  __isl_take isl_ast_print_options *print_options)
{
  isl_printer *out;

  out = isl_printer_start_line(p);
  out = isl_printer_print_str(out, "#pragma hls_pipeline_init_interval 1");
  out = isl_printer_end_line(out);

  return isl_ast_node_for_print(node, out, print_options);
}

/* Print the for-node "node" preceded by a line holding the pragma
 * "#pragma unroll yes".
 * The loop itself is printed by isl_ast_node_for_print(), which receives
 * "print_options".
 */
static __isl_give isl_printer *print_for_with_unroll(
  __isl_keep isl_ast_node *node, __isl_take isl_printer *p,
  __isl_take isl_ast_print_options *print_options)
{
  isl_printer *out;

  out = isl_printer_start_line(p);
  out = isl_printer_print_str(out, "#pragma unroll yes");
  out = isl_printer_end_line(out);

  return isl_ast_node_for_print(node, out, print_options);
}

/* Print the for loop "node" together with the "__SYNTHESIS__" guard used
 * in the Catapult HLS modules.
 *
 * If "guard_start" is set, an "#ifndef __SYNTHESIS__" line and a
 * placeholder comment for the fifo check are printed first.
 * If "pipeline" and/or "unroll" are set, the corresponding pragma lines
 * are printed next.
 * If "guard_end" is not set, the loop itself is then printed by isl.
 * Otherwise the loop header is printed by hand and followed by "#endif",
 * so that only the header ends up inside the guarded region,
 * and the loop body is printed inside an explicit block.
 * When "inter" is 0 or 1, that block also declares a temporary
 * "<buf_name>_tmp" of type "<module_name>_<buf_name>". The temporary is
 * read from "buf_name" before the body if "read" is set and written
 * back to "buf_name" after the body otherwise.
 *
 * "fifo_names", "bounds", "n_fifo" and "double_buffer" are not used here.
 * NOTE(review): "node" is annotated __isl_take but is never freed in this
 * function; the visible caller passes a node it only keeps.
 */
static __isl_give isl_printer *print_for_with_guard(
  __isl_take isl_ast_node *node, __isl_take isl_printer *p,
  __isl_take isl_ast_print_options *print_options,
  int pipeline, int unroll,
  int guard_start, int guard_end,
  char **fifo_names, isl_pw_qpolynomial **bounds, int n_fifo,
  int double_buffer, int inter, int read,
  char *module_name, char *buf_name
  )
{
  if (guard_start) {
    /* The preprocessor line is printed without isl_printer_start_line. */
    p = isl_printer_print_str(p, "#ifndef __SYNTHESIS__");
    p = isl_printer_end_line(p);

    p = print_str_new_line(p, "// while () // Please add the fifo check for C sim.");
  }

  if (pipeline) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "#pragma hls_pipeline_init_interval 1");
    p = isl_printer_end_line(p);
  }
  if (unroll) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "#pragma unroll yes");
    p = isl_printer_end_line(p);
  }

  /* Without a closing guard, isl prints the complete loop. */
  if (!guard_end)
    return isl_ast_node_for_print(node, p, print_options);

  const int uses_buffer = (inter == 0 || inter == 1);
  isl_ast_expr *loop_iter = isl_ast_node_for_get_iterator(node);
  isl_ast_expr *loop_init = isl_ast_node_for_get_init(node);
  isl_ast_expr *loop_cond = isl_ast_node_for_get_cond(node);
  isl_ast_expr *loop_inc = isl_ast_node_for_get_inc(node);
  isl_ast_node *loop_body = isl_ast_node_for_get_body(node);
  const char *iter_type =
    isl_options_get_ast_iterator_type(isl_ast_node_get_ctx(node));

  /* Hand-written loop header: "for (<type> <it> = <init>; <cond>; <it> += <inc>)". */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (");
  p = isl_printer_print_str(p, iter_type);
  p = isl_printer_print_str(p, " ");
  p = isl_printer_print_ast_expr(p, loop_iter);
  p = isl_printer_print_str(p, " = ");
  p = isl_printer_print_ast_expr(p, loop_init);
  p = isl_printer_print_str(p, "; ");
  p = isl_printer_print_ast_expr(p, loop_cond);
  p = isl_printer_print_str(p, "; ");
  p = isl_printer_print_ast_expr(p, loop_iter);
  p = isl_printer_print_str(p, " += ");
  p = isl_printer_print_ast_expr(p, loop_inc);
  p = isl_printer_print_str(p, ")");
  p = isl_printer_end_line(p);

  /* Close the guard right after the header. */
  p = isl_printer_print_str(p, "#endif");
  p = isl_printer_end_line(p);

  p = ppcg_start_block(p);

  /* Double buffer logic: declare the temporary and, for readers, fill it. */
  if (uses_buffer) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, module_name);
    p = isl_printer_print_str(p, "_");
    p = isl_printer_print_str(p, buf_name);
    p = isl_printer_print_str(p, " ");
    p = isl_printer_print_str(p, buf_name);
    p = isl_printer_print_str(p, "_tmp;");
    p = isl_printer_end_line(p);

    if (read) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, buf_name);
      p = isl_printer_print_str(p, "_tmp = ");
      p = isl_printer_print_str(p, buf_name);
      p = isl_printer_print_str(p, ".read();");
      p = isl_printer_end_line(p);
    }
  }

  p = isl_ast_node_print(loop_body, p, print_options);

  /* Writers push the temporary back once the body has been printed. */
  if (uses_buffer && !read) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, buf_name);
    p = isl_printer_print_str(p, ".write(");
    p = isl_printer_print_str(p, buf_name);
    p = isl_printer_print_str(p, "_tmp);");
    p = isl_printer_end_line(p);
  }

  p = ppcg_end_block(p);

  isl_ast_expr_free(loop_iter);
  isl_ast_expr_free(loop_init);
  isl_ast_expr_free(loop_cond);
  isl_ast_expr_free(loop_inc);
  isl_ast_node_free(loop_body);

  return p;
}

/* Callback for printing a for node in a Catapult HLS module.
 *
 * The annotation of "node", if any, carries a
 * struct autosa_ast_node_userinfo that tells whether the loop is
 * pipelined, unrolled and/or marks the start or end of a guarded region.
 * Guarded loops are handed to print_for_with_guard, which also receives
 * the buffer information of the annotation when the loop ends a guard and
 * the "inter" field is non-negative.  Otherwise the loop is printed with
 * a pipeline pragma, with an unroll pragma, or plainly, in that order
 * of precedence.
 * "user" is not used.
 */
static __isl_give isl_printer *print_for_catapult(__isl_take isl_printer *p,
                                                  __isl_take isl_ast_print_options *print_options,
                                                  __isl_keep isl_ast_node *node, void *user)
{
  struct autosa_ast_node_userinfo *info = NULL;
  int pipeline = 0, unroll = 0;
  int guard_start = 0, guard_end = 0;
  /* for catapult fifos; never filled in at the moment */
  int n_fifo = 0;
  char **fifo_names = NULL;
  isl_pw_qpolynomial **bounds = NULL;
  /* double buffer information, only meaningful for guard-end loops */
  int double_buffer = 0, inter = -1, read = -1;
  char *module_name = NULL, *buf_name = NULL;
  isl_id *id = isl_ast_node_get_annotation(node);

  if (id)
    info = (struct autosa_ast_node_userinfo *)isl_id_get_user(id);

  if (info) {
    pipeline = info->is_pipeline ? 1 : 0;
    unroll = info->is_unroll ? 1 : 0;
    guard_start = info->is_guard_start ? 1 : 0;
    guard_end = info->is_guard_end ? 1 : 0;

    if (guard_end && info->inter >= 0) {
      double_buffer = info->double_buffer;
      inter = info->inter;
      read = info->read;
      module_name = info->module_name;
      buf_name = info->buf_name;
    }
  }

  if (guard_start || guard_end) {
    p = print_for_with_guard(
            node, p, print_options, pipeline, unroll,
            guard_start, guard_end,
            fifo_names, bounds, n_fifo,
            double_buffer, inter, read, module_name, buf_name);
  } else if (pipeline) {
    p = print_for_with_pipeline(node, p, print_options);
  } else if (unroll) {
    p = print_for_with_unroll(node, p, print_options);
  } else {
    p = isl_ast_node_for_print(node, p, print_options);
  }

  isl_id_free(id);

  return p;
}

/* Close the member function and the class of a Catapult HLS module.
 *
 * The closing "}" of the function is printed first.
 * If the module is printed as a whole ("inter" is -1) and is both a filter
 * and a buffer, a "private:" section follows that declares an instance
 * of the inter_trans class (with a "_boundary" suffix if "boundary" is
 * set), an instance of the intra_trans class and one ac_channel per local
 * variable of the module.
 * Finally, the indentation is reduced by two levels of 2 and the class is
 * closed with "};".
 *
 * "prog", "hls", "serialize" and "types" are not used.
 */
static __isl_give isl_printer *print_module_fields_catapult(
  __isl_take isl_printer *p, struct autosa_prog *prog,
  struct autosa_hw_module *module, struct hls_info *hls,
  int inter, int boundary, int serialize, int types) 
{
  const int has_trans_members =
    (inter == -1 && module->is_filter && module->is_buffer);

  p = print_str_new_line(p, "}");

  // TODO: More to be printed out for other functions
  if (has_trans_members) {
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, -2);
    p = print_str_new_line(p, "private:");
    p = isl_printer_indent(p, 2);

    /* inter trans module: "<class> <class>_inst;" */
    p = isl_printer_start_line(p);
    for (int pass = 0; pass < 2; ++pass) {
      if (pass == 1)
        p = isl_printer_print_str(p, " ");
      p = isl_printer_print_str(p, module->name);
      p = isl_printer_print_str(p, "_inter_trans");
      if (boundary)
        p = isl_printer_print_str(p, "_boundary");
    }
    p = isl_printer_print_str(p, "_inst;");
    p = isl_printer_end_line(p);

    /* intra trans module */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, module->name);
    p = isl_printer_print_str(p, "_intra_trans ");
    p = isl_printer_print_str(p, module->name);
    p = isl_printer_print_str(p, "_intra_trans_inst;");
    p = isl_printer_end_line(p);

    /* one channel per local buffer */
    for (int k = 0; k < module->n_var; ++k) {
      const char *var_name = module->var[k].name;

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "ac_channel<");
      p = isl_printer_print_str(p, module->name);
      p = isl_printer_print_str(p, "_");
      p = isl_printer_print_str(p, var_name);
      p = isl_printer_print_str(p, "> ");
      p = isl_printer_print_str(p, module->name);
      p = isl_printer_print_str(p, "_");
      p = isl_printer_print_str(p, var_name);
      p = isl_printer_print_str(p, "_inst;");
      p = isl_printer_end_line(p);
    }
  }

  /* Leave both the function body and the "public:" section. */
  p = isl_printer_indent(p, -2);
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "};");

  return p;
}

/* Print the name of the Catapult class generated for "module":
 * the module name followed by "_intra_trans" ("inter" is 0) or
 * "_inter_trans" ("inter" is 1), "_boundary" (if "boundary" is set) and
 * "_serialize" (if "serialize" is set).
 */
static __isl_give isl_printer *print_module_class_name_catapult(
  __isl_take isl_printer *p, struct autosa_hw_module *module,
  int inter, int boundary, int serialize)
{
  p = isl_printer_print_str(p, module->name);
  if (inter == 0)
    p = isl_printer_print_str(p, "_intra_trans");
  if (inter == 1)
    p = isl_printer_print_str(p, "_inter_trans");
  if (boundary)
    p = isl_printer_print_str(p, "_boundary");
  if (serialize)
    p = isl_printer_print_str(p, "_serialize");

  return p;
}

/* Print the opening part of the Catapult class of "module": the class
 * header, the "public:" label, an empty default constructor, the
 * "hls_design interface" pragma, an optional pipeline pragma and the
 * header of the member function followed by " {".
 * The indentation is increased by 2 after the class header and by another 2
 * after "public:".
 *
 * The pipeline pragma is printed if the pipeline flag matching the
 * variant selected by "inter" (and, for the plain module, by
 * "serialize" and the filter property) is set.
 * Nothing is printed if "types" is not set.
 * "hls" is not used.
 */
static __isl_give isl_printer *print_module_core_headers_catapult(
  __isl_take isl_printer *p, struct autosa_prog *prog, 
  struct autosa_hw_module *module, struct hls_info *hls,
  int inter, int boundary, int serialize, int types)
{
  int pipelined;

  if (!types) {
    // TODO
    return p;
  }

  /* class <name> { */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "class ");
  p = print_module_class_name_catapult(p, module, inter, boundary, serialize);
  p = isl_printer_print_str(p, " {");
  p = isl_printer_end_line(p);

  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "public:");

  /* <name>() {} */
  p = isl_printer_indent(p, 2);
  p = isl_printer_start_line(p);
  p = print_module_class_name_catapult(p, module, inter, boundary, serialize);
  p = isl_printer_print_str(p, "() {}");
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "#pragma hls_design interface");

  switch (inter) {
  case -1:
    if (module->is_filter)
      pipelined = module->pipeline_at_filter_func[0] ? 1 : 0;
    else
      pipelined = (module->pipeline_at_default_func && !serialize) ? 1 : 0;
    break;
  case 0:
    pipelined = module->pipeline_at_filter_func[1] ? 1 : 0;
    break;
  case 1:
    pipelined = module->pipeline_at_filter_func[2] ? 1 : 0;
    break;
  default:
    pipelined = 0;
    break;
  }
  if (pipelined)
    p = print_str_new_line(p, "#pragma hls_pipeline_init_interval 1");

  p = print_module_core_header_catapult(p, prog, module, inter, boundary, serialize, 1);
  p = isl_printer_print_str(p, " {");
  p = isl_printer_end_line(p);

  return p;
}

/* Print the serialization module that connects the external memory to the
 * top-level I/O module.
 *
 * The class header and trailer are only printed for the Catapult target.
 * The module iterators are declared unless C++ templates are used.
 * The body is produced by print_module_serialize_body and is preceded by
 * a "__SYNTHESIS__"-guarded placeholder comment for the fifo check.
 */
static __isl_give isl_printer *autosa_print_serialize_module(
  __isl_take isl_printer *p,
  struct autosa_hw_module *module, struct autosa_prog *prog,
  struct hls_info *hls, int boundary)
{
  const int is_catapult = (hls->target == CATAPULT_HW);

  /* Print core. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  if (is_catapult)
    p = print_module_core_headers_catapult(p, prog, module, hls, -1, boundary, 1, 1); // TODO

  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  if (!prog->scop->options->autosa->use_cplusplus_template)
    p = print_module_iterators(p, hls->kernel_c, module);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  /* The preprocessor lines are printed without isl_printer_start_line. */
  p = isl_printer_print_str(p, "#ifndef __SYNTHESIS__");
  p = isl_printer_end_line(p);
  p = print_str_new_line(p, "// while () // Please add the fifo check for C sim.");
  p = isl_printer_print_str(p, "#endif");
  p = isl_printer_end_line(p);

  p = print_module_serialize_body(p, module, hls);
  p = isl_printer_indent(p, -2);
  if (is_catapult)
    p = print_module_fields_catapult(p, prog, module, hls, -1, boundary, 1, 1);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = isl_printer_end_line(p);

  return p;
}

/* Print the default (non inter/intra trans) definition of "module".
 *
 * The AST that is printed is the boundary tree if "boundary" is set and
 * the device tree otherwise; nothing is printed if that tree is missing.
 * The class header and trailer are only printed for the Catapult target.
 * With block sparsity enabled, a "fifo_data_<array>" variable is declared
 * for every I/O group that accesses an external array.
 * The local variables of the module are only declared for PE modules.
 *
 * If the module accesses the memory and host serialization is enabled,
 * the serialization module is printed afterwards as well.
 */
static __isl_give isl_printer *autosa_print_default_module(
  __isl_take isl_printer *p,
  struct autosa_hw_module *module, struct autosa_prog *prog,
  struct hls_info *hls, int boundary)
{
  isl_ast_node *tree = boundary ? module->boundary_tree : module->device_tree;

  if (!tree)
    return p;

  struct print_hw_module_data hw_data = {hls, prog, module, NULL};
  isl_ast_print_options *print_options;
  isl_ctx *ctx = isl_printer_get_ctx(p);
  const int is_catapult = (hls->target == CATAPULT_HW);

  /* Print core. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  if (is_catapult)
    p = print_module_core_headers_catapult(p, prog, module, hls, -1, boundary, 0, 1);

  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = print_module_iterators(p, hls->kernel_c, module);

  if (prog->scop->options->autosa->block_sparse) {
    for (int g = 0; g < module->n_io_group; ++g) {
      struct autosa_array_ref_group *group = module->io_groups[g];

      if (group->local_array->array_type != AUTOSA_EXT_ARRAY)
        continue;

      int n_lane = get_io_group_n_lane(module, NULL, group);
      const char *type_suffix = group->local_array->is_sparse ? "_s_t" : "_t";

      /* "<array><suffix><n_lane> fifo_data_<array>;" */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, group->array->name);
      p = isl_printer_print_str(p, type_suffix);
      p = isl_printer_print_int(p, n_lane);
      p = isl_printer_print_str(p, " fifo_data_");
      p = isl_printer_print_str(p, group->array->name);
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);
    }
  }

  if (module->type == PE_MODULE)
    p = print_module_vars_catapult(p, module, -1);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_user(print_options,
                                                       &print_module_stmt, &hw_data);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &print_for_catapult, &hw_data);

  p = isl_ast_node_print(tree, p, print_options);
  p = isl_printer_indent(p, -2);

  if (is_catapult)
    p = print_module_fields_catapult(p, prog, module, hls, -1, boundary, 0, 1);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = isl_printer_end_line(p);

  /* If the module serialization is enabled, we will print out an extra module
   * for serializing the data. */
  if (module->to_mem && module->options->autosa->host_serialize)
    p = autosa_print_serialize_module(p, module, prog, hls, boundary);

  return p;
}

/* Print the inter_trans class of "module".
 *
 * The boundary inter tree is printed if "boundary" is set and the regular
 * inter tree otherwise; nothing is printed if that tree is missing.
 * The module iterators are declared unless C++ templates are used.
 */
static __isl_give isl_printer *autosa_print_inter_trans_module(
  __isl_take isl_printer *p,
  struct autosa_hw_module *module, struct autosa_prog *prog,
  struct hls_info *hls, int boundary)
{
  struct print_hw_module_data hw_data = {hls, prog, module, NULL};
  isl_ctx *ctx = isl_printer_get_ctx(p);
  isl_ast_node *tree =
    (boundary == 0) ? module->inter_tree : module->boundary_inter_tree;
  isl_ast_print_options *print_options;

  if (!tree)
    return p;

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = print_module_core_headers_catapult(p, prog, module, hls, 1, boundary, 0, 1);

  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  if (!prog->scop->options->autosa->use_cplusplus_template)
    p = print_module_iterators(p, hls->kernel_c, module);
  p = print_module_vars_catapult(p, module, 1);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_user(print_options,
                                                       &print_module_stmt, &hw_data);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &print_for_catapult, &hw_data);

  p = isl_ast_node_print(tree, p, print_options);
  p = isl_printer_indent(p, -2);

  p = print_module_fields_catapult(p, prog, module, hls, 1, boundary, 0, 1);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = isl_printer_end_line(p);

  return p;
}

/* Print the intra_trans class of "module".
 *
 * Nothing is printed if the module has no intra tree.
 * The module iterators are declared unless C++ templates are used.
 * Variables that require it (local reduce) get their initialization
 * printed ahead of the AST.
 * "boundary" is only forwarded to the header and trailer printers.
 */
static __isl_give isl_printer *autosa_print_intra_trans_module(
  __isl_take isl_printer *p,
  struct autosa_hw_module *module, struct autosa_prog *prog,
  struct hls_info *hls, int boundary)
{
  struct print_hw_module_data hw_data = {hls, prog, module, NULL};
  isl_ctx *ctx = isl_printer_get_ctx(p);
  isl_ast_print_options *print_options;

  if (!module->intra_tree)
    return p;

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = print_module_core_headers_catapult(p, prog, module, hls, 0, boundary, 0, 1);

  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  if (!prog->scop->options->autosa->use_cplusplus_template)
    p = print_module_iterators(p, hls->kernel_c, module);
  p = print_module_vars_catapult(p, module, 1);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  /* For local reduce, print the buffer initialization. */
  for (int k = 0; k < module->n_var; ++k) {
    if (!module->var[k].init_required)
      continue;
    p = autosa_print_var_initialization(p, &module->var[k], hls->target);
  }
  p = isl_printer_end_line(p);

  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_user(print_options,
                                                       &print_module_stmt, &hw_data);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &print_for_catapult, &hw_data);

  p = isl_ast_node_print(module->intra_tree, p, print_options);
  p = isl_printer_indent(p, -2);

  p = print_module_fields_catapult(p, prog, module, hls, 0, boundary, 0, 1);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = isl_printer_end_line(p);

  return p;
}

/* Print the definition of the struct "<module>_<var>" that wraps the
 * local buffer "var" of "module" in a single "data" member.
 *
 * The element type is "<array>_s_t<n_lane>" if the array is sparse and
 * the module is not a PE module, and "<array>_t<n_lane>" otherwise.
 * One "[size]" suffix is printed per element of var->size.
 */
static __isl_give isl_printer *print_local_array_struct(
  __isl_take isl_printer *p,
  struct autosa_hw_module *module,
  struct autosa_kernel_var *var)
{
  const int n_dim = isl_vec_size(var->size);
  const char *type_suffix;

  if (var->array->local_array->is_sparse && module->type != PE_MODULE)
    type_suffix = "_s_t";
  else
    type_suffix = "_t";

  /* struct <module>_<var> { */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "struct ");
  p = isl_printer_print_str(p, module->name);
  p = isl_printer_print_str(p, "_");
  p = isl_printer_print_str(p, var->name);
  p = isl_printer_print_str(p, " {");
  p = isl_printer_end_line(p);

  /* <type> data[..]...; */
  p = isl_printer_indent(p, 2);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, var->array->name);
  p = isl_printer_print_str(p, type_suffix);
  p = isl_printer_print_int(p, var->n_lane);
  p = isl_printer_print_str(p, " data");
  for (int dim = 0; dim < n_dim; ++dim) {
    isl_val *extent = isl_vec_get_element_val(var->size, dim);

    p = isl_printer_print_str(p, "[");
    p = isl_printer_print_val(p, extent);
    p = isl_printer_print_str(p, "]");
    isl_val_free(extent);
  }
  p = isl_printer_print_str(p, ";");
  p = isl_printer_end_line(p);

  /* }; */
  p = isl_printer_indent(p, -2);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "};");
  p = isl_printer_end_line(p);

  return p;
}

/* Print the host code for the Catapult HLS target, together with the
 * definitions of the hardware modules.
 *
 * The data pack types, the sparse macros (only with block sparsity), the
 * drain merge functions and the host serialization functions are printed
 * through their own helpers.  The host AST "tree" is printed to "p".
 * The module definitions are printed to hls->kernel_c through a separate
 * printer: first one struct per local buffer of every non-PE module,
 * then the include of mc_scverify.h, then, for each module, the
 * intra/inter trans classes (filter modules with a buffer only), the
 * default module and, if the module has a boundary variant, the boundary
 * versions.
 *
 * Returns the printer "p".
 */
static __isl_give isl_printer *autosa_print_host_code(__isl_take isl_printer *p,
                                                      struct autosa_prog *prog, __isl_keep isl_ast_node *tree,
                                                      struct autosa_hw_module **modules, int n_modules,
                                                      struct autosa_hw_top_module *top,
                                                      struct autosa_drain_merge_func **drain_merge_funcs, int n_drain_merge_funcs,
                                                      struct hls_info *hls)
{
  isl_ast_print_options *print_options;
  isl_ctx *ctx = isl_ast_node_get_ctx(tree);
  struct print_host_user_data data = {hls, prog, top};
  isl_printer *p_module;

  /* Print the data pack types in the program. */
  print_data_types_catapult(top, hls);

  /* Print the macros for sparse data structure */
  if (prog->scop->options->autosa->block_sparse) {
    print_sparse_macros(top->kernel, hls);
  }

  /* Print the helper functions in the program. */
  print_drain_merge_funcs(top->kernel, drain_merge_funcs, n_drain_merge_funcs, hls);

  /* Print the host data serialization function. */
  print_host_serialize_funcs(top->kernel, modules, n_modules, hls);

  /* Print the default AST. */
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_user(print_options,
                                                       &print_host_user_catapult, &data);

  /* Print the macros definitions in the program. */
  p = autosa_print_macros(p, tree);
  p = isl_ast_node_print(tree, p, print_options);

  /* Print the hw module ASTs. */
  p_module = isl_printer_to_file(ctx, hls->kernel_c);
  p_module = isl_printer_set_output_format(p_module, ISL_FORMAT_C);

  /* Print the local buffer definition */
  p_module = isl_printer_end_line(p_module);
  for (int i = 0; i < n_modules; i++) {
    if (modules[i]->type == PE_MODULE)
      continue;
    if (modules[i]->n_var > 0) {
      for (int j = 0; j < modules[i]->n_var; j++) {
        p_module = print_local_array_struct(p_module, modules[i], &modules[i]->var[j]);
      }
      /* A single blank line after all the structs of this module.
       * This statement has always been outside the loop over the
       * variables; the braces make that explicit.
       */
      p_module = isl_printer_end_line(p_module);
    }
  }
  p_module = print_str_new_line(p_module, "#include <mc_scverify.h>");
  p_module = isl_printer_end_line(p_module);

  for (int i = 0; i < n_modules; i++)
  {
    if (modules[i]->is_filter && modules[i]->is_buffer)
    {
      /* Print out the definitions for inter_trans and intra_trans function calls. */
      /* Intra transfer function */
      p_module = autosa_print_intra_trans_module(p_module, modules[i], prog, hls, 0); // todo
 
      /* Inter transfer function */
      p_module = autosa_print_inter_trans_module(p_module, modules[i], prog, hls, 0); // todo
      if (modules[i]->boundary)
        p_module = autosa_print_inter_trans_module(p_module, modules[i], prog, hls, 1); // todo
    }

    p_module = autosa_print_default_module(p_module, modules[i], prog, hls, 0);
 
    if (modules[i]->boundary)
    {
      /* Print out the definitions for boundary trans function calls. */
      p_module = autosa_print_default_module(p_module, modules[i], prog, hls, 1);
    }      
  }
  isl_printer_free(p_module);

  return p;
}

/* Print the code that, when run by the generated top-level generator,
 * prints the header of the Catapult top-level class: the
 * "hls_design top" pragma, the "kernel0" class with its "public:" label
 * and empty constructor, the "hls_design interface" pragma and the
 * signature of the "run" member function followed by "{".
 *
 * Every line printed here is itself an isl printer call; only the
 * kernel arguments are expanded at this point.
 */
static __isl_give isl_printer *print_top_module_headers_catapult(
  __isl_take isl_printer *p,
  struct autosa_prog *prog, struct autosa_hw_top_module *top, struct hls_info *hls)
{
  /* Generator statements emitted ahead of the "run" signature. */
  static const char *const before_signature[] = {
    "p = isl_printer_start_line(p);",
    "p = isl_printer_print_str(p, \"#pragma hls_design top\");",
    "p = isl_printer_end_line(p);",
    "p = isl_printer_start_line(p);",
    "p = isl_printer_print_str(p, \"class kernel0 {\");",
    "p = isl_printer_end_line(p);",
    "p = isl_printer_indent(p, 2);",
    "p = isl_printer_start_line(p);",
    "p = isl_printer_print_str(p, \"public:\");",
    "p = isl_printer_end_line(p);",
    "p = isl_printer_indent(p, 2);",
    "p = isl_printer_start_line(p);",
    "p = isl_printer_print_str(p, \"kernel0() {}\");",
    "p = isl_printer_end_line(p);",
    "p = isl_printer_start_line(p);",
    "p = isl_printer_print_str(p, \"#pragma hls_design interface\");",
    "p = isl_printer_end_line(p);",
    "p = isl_printer_start_line(p);",
  };
  /* Generator statements emitted after the "run" signature. */
  static const char *const after_signature[] = {
    "p = isl_printer_end_line(p);",
    "p = isl_printer_start_line(p);",
    "p = isl_printer_print_str(p, \"{\");",
    "p = isl_printer_end_line(p);",
  };
  const int n_before = sizeof(before_signature) / sizeof(before_signature[0]);
  const int n_after = sizeof(after_signature) / sizeof(after_signature[0]);

  for (int k = 0; k < n_before; ++k)
    p = print_str_new_line(p, before_signature[k]);

  /* The signature line embeds the kernel arguments. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"void CCS_BLOCK(run)(");
  p = print_kernel_arguments(p, prog, top->kernel, 1, hls); // todo
  p = isl_printer_print_str(p, ")\");");
  p = isl_printer_end_line(p);

  for (int k = 0; k < n_after; ++k)
    p = print_str_new_line(p, after_signature[k]);

  return p;
}

/* Callback for printing a user node of the top module.
 *
 * The statement is taken from the annotation of "node".  Module call
 * statements are printed by autosa_kernel_print_module_call; any other
 * statement type leaves the printer unchanged.
 * "user" points to a struct print_hw_module_data.
 */
static __isl_give isl_printer *print_top_module_call_stmt(
  __isl_take isl_printer *p,
  __isl_take isl_ast_print_options *print_options,
  __isl_keep isl_ast_node *node, void *user)
{
  struct print_hw_module_data *data = (struct print_hw_module_data *)(user);
  isl_id *annotation = isl_ast_node_get_annotation(node);
  struct autosa_kernel_stmt *stmt =
    (struct autosa_kernel_stmt *)isl_id_get_user(annotation);

  isl_id_free(annotation);
  isl_ast_print_options_free(print_options);

  if (stmt->type == AUTOSA_KERNEL_STMT_MODULE_CALL)
    return autosa_kernel_print_module_call(p, stmt, data->prog, data->hls->target);

  return p;
}

/* Callback for printing a user node of the top module.
 *
 * The statement is taken from the annotation of "node".  Module call
 * statements are printed by autosa_kernel_print_module_call_inst; any
 * other statement type leaves the printer unchanged.
 * "user" points to a struct print_hw_module_data.
 */
static __isl_give isl_printer *print_top_module_call_inst(
  __isl_take isl_printer *p,
  __isl_take isl_ast_print_options *print_options,
  __isl_keep isl_ast_node *node, void *user)
{
  struct print_hw_module_data *data = (struct print_hw_module_data *)(user);
  isl_id *annotation = isl_ast_node_get_annotation(node);
  struct autosa_kernel_stmt *stmt =
    (struct autosa_kernel_stmt *)isl_id_get_user(annotation);

  isl_id_free(annotation);
  isl_ast_print_options_free(print_options);

  if (stmt->type == AUTOSA_KERNEL_STMT_MODULE_CALL)
    return autosa_kernel_print_module_call_inst(p, stmt, data->prog, data->hls->target);

  return p;
}

/* Return a copy of the part of "fifo_decl_name" that precedes the first
 * '.', or of the whole string if it contains no '.'.
 * The result is the string handed out by isl_printer_get_str.
 */
static char *extract_fifo_name_from_fifo_decl_name(isl_ctx *ctx, char *fifo_decl_name)
{
  isl_printer *collector = isl_printer_to_str(ctx);
  char *fifo_name;

  for (const char *cur = fifo_decl_name; *cur != '\0' && *cur != '.'; ++cur) {
    /* Append one character at a time as a one-character string. */
    char one[2] = {*cur, '\0'};

    collector = isl_printer_print_str(collector, one);
  }

  fifo_name = isl_printer_get_str(collector);
  isl_printer_free(collector);

  return fifo_name;
}

/* Return a copy of the part of "fifo_decl_name" that follows the first
 * '.', i.e., the width in a "<name>.<width>" declaration name.
 * If the string contains no '.', an empty string is returned.
 * The result is the string handed out by isl_printer_get_str.
 */
static char *extract_fifo_width_from_fifo_decl_name(isl_ctx *ctx, char *fifo_decl_name)
{
  const char *rest = fifo_decl_name;
  isl_printer *p_str = isl_printer_to_str(ctx);
  char *name = NULL;

  /* Locate the separator (or the end of the string). */
  while (*rest != '\0' && *rest != '.')
    rest++;

  /* Only step over the separator if there is one.  Advancing
   * unconditionally would move past the terminating '\0' of a name
   * without '.' and read beyond the end of the string.
   */
  if (*rest == '.')
    rest++;

  p_str = isl_printer_print_str(p_str, rest);

  name = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  return name;
}

/* Callback for printing a user node of the top module.
 *
 * The statement is taken from the annotation of "node".  Fifo
 * declaration statements are printed by autosa_kernel_print_fifo_decl;
 * any other statement type leaves the printer unchanged.
 * "user" points to a struct print_hw_module_data.
 */
static __isl_give isl_printer *print_top_module_fifo_stmt(__isl_take isl_printer *p,
                                                          __isl_take isl_ast_print_options *print_options,
                                                          __isl_keep isl_ast_node *node, void *user)
{
  struct print_hw_module_data *data = (struct print_hw_module_data *)(user);
  isl_id *annotation = isl_ast_node_get_annotation(node);
  struct autosa_kernel_stmt *stmt =
    (struct autosa_kernel_stmt *)isl_id_get_user(annotation);

  isl_id_free(annotation);
  isl_ast_print_options_free(print_options);

  if (stmt->type == AUTOSA_KERNEL_STMT_FIFO_DECL)
    return autosa_kernel_print_fifo_decl(p, stmt, data->prog, data->hls);

  return p;
}

/* This function prints the code that prints out the top function that 
 * calls the hardware modules and declares the fifos.
 */
static void print_top_gen_host_code(
  struct autosa_prog *prog, __isl_keep isl_ast_node *node,
  struct autosa_hw_top_module *top, struct hls_info *hls)
{
  isl_ast_print_options *print_options;
  isl_ctx *ctx = isl_ast_node_get_ctx(node);
  isl_printer *p;
  int fifo_depth = prog->scop->options->autosa->fifo_depth;
  struct print_hw_module_data hw_data = {hls, prog, NULL};

  /* Print the top module ASTs. */
  p = isl_printer_to_file(ctx, hls->top_gen_c);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);

  print_top_gen_headers(prog, top, hls);
  fprintf(hls->top_gen_c, " {\n");
  p = isl_printer_indent(p, 2);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "FILE *fd = fopen(\"");
  p = isl_printer_print_str(p, hls->output_dir);
  p = isl_printer_print_str(p, "/resource_est/design_info.dat\", \"w\");");
  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "int fifo_cnt;");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "isl_ctx *ctx = isl_ctx_alloc();");
  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "isl_printer *p = isl_printer_to_file(ctx, f);");
  p = isl_printer_end_line(p);
  p = isl_printer_end_line(p);

  if (hls->target == CATAPULT_HW)
    p = print_top_module_headers_catapult(p, prog, top, hls);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_indent(p, 2);");
  p = isl_printer_end_line(p);

  int n_module_names = 0;
  char **module_names = NULL;
  for (int i = 0; i < top->n_hw_modules; i++)
  {
    /* Generate module call counter. */
    struct autosa_hw_module *module = top->hw_modules[i];
    char *module_name;

    if (module->is_filter && module->is_buffer)
    {
      module_name = concat(ctx, module->name, "intra_trans");

      n_module_names++;
      module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
      module_names[n_module_names - 1] = module_name;

      module_name = concat(ctx, module->name, "inter_trans");

      n_module_names++;
      module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
      module_names[n_module_names - 1] = module_name;

      if (module->boundary)
      {
        module_name = concat(ctx, module->name, "inter_trans_boundary");

        n_module_names++;
        module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
        module_names[n_module_names - 1] = module_name;
      }
    }

    module_name = strdup(module->name);

    n_module_names++;
    module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
    module_names[n_module_names - 1] = module_name;

    if (module->boundary)
    {
      module_name = concat(ctx, module->name, "boundary");

      n_module_names++;
      module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
      module_names[n_module_names - 1] = module_name;
    }

    if (module->n_pe_dummy_modules > 0)
    {
      for (int j = 0; j < module->n_pe_dummy_modules; j++)
      {
        struct autosa_pe_dummy_module *dummy_module = module->pe_dummy_modules[j];
        struct autosa_array_ref_group *group = dummy_module->io_group;
        isl_printer *p_str = isl_printer_to_str(ctx);
        p_str = autosa_array_ref_group_print_prefix(group, p_str);
        p_str = isl_printer_print_str(p_str, "_PE_dummy");
        p_str = isl_printer_print_str(p_str, dummy_module->in? "_in" : "_out");
        module_name = isl_printer_get_str(p_str);
        isl_printer_free(p_str);

        n_module_names++;
        module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
        module_names[n_module_names - 1] = module_name;
      }
    }

    if (module->is_serialized) { 
      if (module->boundary)      
        module_name = concat(ctx, module->name, "boundary_serialize");
      else
        module_name = concat(ctx, module->name, "serialize");
      
      n_module_names++;
      module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
      module_names[n_module_names - 1] = module_name;
    }
  }
  for (int i = 0; i < n_module_names; i++)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "int ");
    p = isl_printer_print_str(p, module_names[i]);
    p = isl_printer_print_str(p, "_cnt = 0;");
    p = isl_printer_end_line(p);
  }

  /* Print module calls. */
  for (int i = 0; i < top->n_module_calls; i++)
  {
    /* Print AST */
    print_options = isl_ast_print_options_alloc(ctx);
    print_options = isl_ast_print_options_set_print_user(print_options,
                                                         &print_top_module_call_stmt, &hw_data);

    p = isl_ast_node_print(top->module_call_wrapped_trees[i],
                           p, print_options);
  }

  /* module:module_name:module_cnt. */
  for (int i = 0; i < n_module_names; i++)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "fprintf(fd, \"module:");
    p = isl_printer_print_str(p, module_names[i]);
    p = isl_printer_print_str(p, ":\%d\\n\", ");
    p = isl_printer_print_str(p, module_names[i]);
    p = isl_printer_print_str(p, "_cnt);");
    p = isl_printer_end_line(p);
  }
  p = isl_printer_end_line(p);

  for (int i = 0; i < n_module_names; i++)
  {
    free(module_names[i]);
  }
  free(module_names);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_indent(p, -2);");
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = print_str_new_line(p, "p = isl_printer_print_str(p, \"}\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");  

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_indent(p, -2);");
  p = isl_printer_end_line(p);

  /* Print the private fields */
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = print_str_new_line(p, "p = isl_printer_print_str(p, \"private:\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  p = print_str_new_line(p, "p = isl_printer_indent(p, 2);");

  /* Print the function calls */
  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = print_str_new_line(p, "p = isl_printer_print_str(p, \"/* Module Declaration */\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");
  for (int i = 0; i < top->n_module_calls; i++) {
    print_options = isl_ast_print_options_alloc(ctx);
    print_options = isl_ast_print_options_set_print_user(print_options,
                                                         &print_top_module_call_inst, &hw_data);
    p = isl_ast_node_print(top->module_call_wrapped_trees[i],
                           p, print_options);
  }
  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = print_str_new_line(p, "p = isl_printer_print_str(p, \"/* Module Declaration */\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  /* Print the fifo decls */
  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = print_str_new_line(p, "p = isl_printer_print_str(p, \"/* FIFO Declaration */\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");
  
  /* Print the serialize fifos if existing. */
  for (int i = 0; i < top->n_hw_modules; i++) {
    struct autosa_hw_module *module = top->hw_modules[i];
    struct autosa_array_ref_group *group = module->io_groups[0];
    if (module->is_serialized) {
      /* Generate fifo decl counter. */
      char *fifo_name;
      int fifo_w;  // bytes
      fifo_w = module->data_pack_inter * group->array->size;
      isl_printer *p_str;
      p_str = isl_printer_to_str(ctx);
      p_str = autosa_array_ref_group_print_fifo_name(group, p_str);
      p_str = isl_printer_print_str(p_str, "_");
      p_str = isl_printer_print_str(p_str, module->name);
      p_str = isl_printer_print_str(p_str, "_serialize");
      fifo_name = isl_printer_get_str(p_str);
      isl_printer_free(p_str);

      p = print_str_new_line(p, "fifo_cnt = 1;");
      p = print_str_new_line(p, "p = isl_printer_start_line(p);");
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* ");
      p = isl_printer_print_str(p, module->name);
      p = isl_printer_print_str(p, "_serialize fifo */ ");      
      p = print_fifo_type_catapult(p, group, module->data_pack_inter);
      p = isl_printer_print_str(p, " ");
      p = isl_printer_print_str(p, fifo_name);      
      p = isl_printer_print_str(p, ";\");");
      p = isl_printer_end_line(p);
      p = print_str_new_line(p, "p = isl_printer_end_line(p);");      

      /* fifo:fifo_name:fifo_cnt:fifo_width */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "fprintf(fd, \"fifo:");
      p = isl_printer_print_str(p, fifo_name);
      p = isl_printer_print_str(p, ":\%d:");
      p = isl_printer_print_int(p, fifo_w);
      p = isl_printer_print_str(p, "\\n\", fifo_cnt);");
      p = isl_printer_end_line(p);

      p = isl_printer_end_line(p);      
      free(fifo_name);
    }
  }

  for (int i = 0; i < top->n_fifo_decls; i++) {
    /* Generate fifo decl counter. */
    char *fifo_decl_name = top->fifo_decl_names[i];
    char *fifo_name = extract_fifo_name_from_fifo_decl_name(ctx, fifo_decl_name);
    char *fifo_w = extract_fifo_width_from_fifo_decl_name(ctx, fifo_decl_name);
    p = print_str_new_line(p, "fifo_cnt = 0;");

    /* Print AST */
    print_options = isl_ast_print_options_alloc(ctx);
    print_options = isl_ast_print_options_set_print_user(print_options,
                                                         &print_top_module_fifo_stmt, &hw_data); 

    p = isl_ast_node_print(top->fifo_decl_wrapped_trees[i],
                           p, print_options);

    /* fifo:fifo_name:fifo_cnt:fifo_width */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "fprintf(fd, \"fifo:");
    p = isl_printer_print_str(p, fifo_name);
    p = isl_printer_print_str(p, ":\%d:");
    p = isl_printer_print_str(p, fifo_w);
    p = isl_printer_print_str(p, "\\n\", fifo_cnt);");
    p = isl_printer_end_line(p);

    p = isl_printer_end_line(p);

    free(fifo_name);
    free(fifo_w);
  }

  p = print_str_new_line(p, "p = isl_printer_start_line(p);");    
  p = print_str_new_line(p, "p = isl_printer_print_str(p, \"/* FIFO Declaration */\");");  
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  p = print_str_new_line(p, "p = isl_printer_indent(p, -2);");
  p = print_str_new_line(p, "p = isl_printer_indent(p, -2);");

  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = print_str_new_line(p, "p = isl_printer_print_str(p, \"};\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");
  //if (hls->target == XILINX_HW)
  //{
  //  if (!hls->hls)
  //  {
  //    p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  //    p = print_str_new_line(p, "p = isl_printer_print_str(p, \"}\");");
  //    p = print_str_new_line(p, "p = isl_printer_end_line(p);");
  //  }
  //}

  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "fclose(fd);");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "isl_printer_free(p);");
  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "isl_ctx_free(ctx);");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, -2);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "}");
  p = isl_printer_end_line(p);
  p = isl_printer_end_line(p);

  /* For internal testing only. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "int main()");
  p = isl_printer_end_line(p);

  p = ppcg_start_block(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "FILE *f = fopen(\"");
  p = isl_printer_print_str(p, hls->output_dir);
  p = isl_printer_print_str(p, "/src/top.cpp\", \"w\");");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "top_generate(f);");
  p = isl_printer_end_line(p);

  p = ppcg_end_block(p);
  p = isl_printer_free(p);

  return;  
}

/* This function prints the tcl file for the catapult HLS project. */
static void print_tcl_code(
  struct autosa_prog *prog, 
  struct autosa_hw_module **modules,
  int n_modules,
  struct hls_info *hls)
{
  isl_ctx *ctx = prog->ctx;
  isl_printer *p;
  
  p = isl_printer_to_file(ctx, hls->tcl);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);

  p = print_str_new_line(p, "solution new -state initial");
  p = print_str_new_line(p, "solution options defaults");
  p = print_str_new_line(p, "solution options set /Input/CppStandard c++11");
  p = print_str_new_line(p, "solution options set /Output/GenerateCycleNetlist false");
  p = print_str_new_line(p, "solution options set /Flows/SCVerify/USE_CCS_BLOCK true");

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "solution file add ./");
  p = isl_printer_print_str(p, hls->kernel_prefix);
  p = isl_printer_print_str(p, "_kernel.h -type CHEADER");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "solution file add ./");
  p = isl_printer_print_str(p, hls->kernel_prefix);
  p = isl_printer_print_str(p, "_kernel_hw.h -type CHEADER");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "solution file add ./");
  p = isl_printer_print_str(p, hls->kernel_prefix);
  p = isl_printer_print_str(p, ".h -type CHEADER");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "solution file add ./");
  p = isl_printer_print_str(p, hls->kernel_prefix);
  p = isl_printer_print_str(p, "_host.cpp -type C++");
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "directive set -PIPELINE_RAMP_UP true");
  p = print_str_new_line(p, "directive set -PROTOTYPING_ENGINE oasys");
  p = print_str_new_line(p, "directive set -CLUSTER_TYPE combinational");
  p = print_str_new_line(p, "directive set -CLUSTER_FAST_MODE false");
  p = print_str_new_line(p, "directive set -CLUSTER_RTL_SYN false");
  p = print_str_new_line(p, "directive set -CLUSTER_OPT_CONSTANT_INPUTS true");
  p = print_str_new_line(p, "directive set -CLUSTER_ADDTREE_IN_COUNT_THRESHOLD 0");
  p = print_str_new_line(p, "directive set -CLUSTER_ADDTREE_IN_WIDTH_THRESHOLD 0");
  p = print_str_new_line(p, "directive set -ROM_THRESHOLD 64");
  p = print_str_new_line(p, "directive set -PROTOTYPE_ROM true");
  p = print_str_new_line(p, "directive set -CHARACTERIZE_ROM false");
  p = print_str_new_line(p, "directive set -OPT_CONST_MULTS use_library");
  p = print_str_new_line(p, "directive set -CLOCK_OVERHEAD 20.000000");
  p = print_str_new_line(p, "directive set -RESET_CLEARS_ALL_REGS use_library");
  p = print_str_new_line(p, "directive set -START_FLAG {}");
  p = print_str_new_line(p, "directive set -READY_FLAG {}");
  p = print_str_new_line(p, "directive set -DONE_FLAG {}");
  p = print_str_new_line(p, "directive set -TRANSACTION_DONE_SIGNAL true");
  p = print_str_new_line(p, "directive set -STALL_FLAG false");
  p = print_str_new_line(p, "directive set -IDLE_SIGNAL {}");
  p = print_str_new_line(p, "directive set -REGISTER_IDLE_SIGNAL false");
  p = print_str_new_line(p, "directive set -ARRAY_SIZE 1024");
  p = print_str_new_line(p, "directive set -CHAN_IO_PROTOCOL use_library");
  p = print_str_new_line(p, "directive set -IO_MODE super");
  p = print_str_new_line(p, "directive set -UNROLL no");
  p = print_str_new_line(p, "directive set -REALLOC true");
  p = print_str_new_line(p, "directive set -MUXPATH true");
  p = print_str_new_line(p, "directive set -TIMING_CHECKS true");
  p = print_str_new_line(p, "directive set -ASSIGN_OVERHEAD 0");
  p = print_str_new_line(p, "directive set -REGISTER_SHARING_LIMIT 0");
  p = print_str_new_line(p, "directive set -REGISTER_SHARING_MAX_WIDTH_DIFFERENCE 8");
  p = print_str_new_line(p, "directive set -SAFE_FSM false");
  p = print_str_new_line(p, "directive set -NO_X_ASSIGNMENTS true");
  p = print_str_new_line(p, "directive set -REG_MAX_FANOUT 0");
  p = print_str_new_line(p, "directive set -FSM_BINARY_ENCODING_THRESHOLD 64");
  p = print_str_new_line(p, "directive set -FSM_ENCODING none");
  p = print_str_new_line(p, "directive set -LOGIC_OPT false");
  p = print_str_new_line(p, "directive set -MEM_MAP_THRESHOLD 32");
  p = print_str_new_line(p, "directive set -REGISTER_THRESHOLD 256");
  p = print_str_new_line(p, "directive set -MERGEABLE true");
  p = print_str_new_line(p, "directive set -SPECULATE true");
  p = print_str_new_line(p, "directive set -DESIGN_GOAL area");

  p = print_str_new_line(p, "go new");
  p = print_str_new_line(p, "solution library add mgc_Xilinx-VIRTEX-uplus-2LV_beh -- -rtlsyntool Vivado -manufacturer Xilinx -family VIRTEX-uplus -speed -2LV -part xcvu11p-flga2577-2LV-e");
  p = print_str_new_line(p, "solution library add Xilinx_RAMS");
  p = print_str_new_line(p, "solution library add Xilinx_ROMS");
  p = print_str_new_line(p, "solution library add amba");
  p = print_str_new_line(p, "solution library add ccs_fpga_hic");
  p = print_str_new_line(p, "solution library add Xilinx_FIFO");

  p = print_str_new_line(p, "go libraries");
  p = print_str_new_line(p, "directive set -CLOCKS {clk {-CLOCK_PERIOD 5.0 -CLOCK_EDGE rising -CLOCK_UNCERTAINTY 0.0 -CLOCK_HIGH_TIME 2.5 -RESET_SYNC_NAME rst -RESET_ASYNC_NAME arst_n -RESET_KIND sync -RESET_SYNC_ACTIVE high -RESET_ASYNC_ACTIVE low -ENABLE_ACTIVE high}}");

  p = print_str_new_line(p, "go assembly");
  p = print_str_new_line(p, "directive set -FIFO_DEPTH 1");

  /* Set all modules with identifiers to direct input. */
  const char *dims[] = {"idx", "idy", "idz"};
  for (int i = 0; i < n_modules; i++) {
    int n = isl_id_list_n_id(modules[i]->inst_ids);
    if (modules[i]->is_filter && modules[i]->is_buffer) {
      /* Intra transfer function */      
      if (n > 0) {
        for (int j = 0; j < n; j++) {
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "directive set /kernel0/");
          p = isl_printer_print_str(p, modules[i]->name);
          p = isl_printer_print_str(p, "_intra_trans/");
          p = isl_printer_print_str(p, dims[j]);
          p = isl_printer_print_str(p, ":rsc -MAP_TO_MODULE {[DirectInput]}");
          p = isl_printer_end_line(p);
        }
      }

      /* Inter transfer function */
      if (n > 0) {
        for (int j = 0; j < n; j++) {
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "directive set /kernel0/");
          p = isl_printer_print_str(p, modules[i]->name);
          p = isl_printer_print_str(p, "_inter_trans/");
          p = isl_printer_print_str(p, dims[j]);
          p = isl_printer_print_str(p, ":rsc -MAP_TO_MODULE {[DirectInput]}");
          p = isl_printer_end_line(p);
        }
      }

      if (modules[i]->boundary) {
        if (n > 0) {
          for (int j = 0; j < n; j++) {
            p = isl_printer_start_line(p);
            p = isl_printer_print_str(p, "directive set /kernel0/");
            p = isl_printer_print_str(p, modules[i]->name);
            p = isl_printer_print_str(p, "_inter_trans_boundary/");
            p = isl_printer_print_str(p, dims[j]);
            p = isl_printer_print_str(p, ":rsc -MAP_TO_MODULE {[DirectInput]}");
            p = isl_printer_end_line(p);
          }
        }
      }
    }

    /* Default module */
    if (n > 0) {
      for (int j = 0; j < n; j++) {
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "directive set /kernel0/");
        p = isl_printer_print_str(p, modules[i]->name);
        p = isl_printer_print_str(p, "/");
        p = isl_printer_print_str(p, dims[j]);
        p = isl_printer_print_str(p, ":rsc -MAP_TO_MODULE {[DirectInput]}");
        p = isl_printer_end_line(p);
      }

      /* Serialize */
      if (modules[i]->to_mem && modules[i]->options->autosa->host_serialize) {
        for (int j = 0; j < n; j++) {
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "directive set /kernel0/");
          p = isl_printer_print_str(p, modules[i]->name);
          p = isl_printer_print_str(p, "_serialize/");
          p = isl_printer_print_str(p, dims[j]);
          p = isl_printer_print_str(p, ":rsc -MAP_TO_MODULE {[DirectInput]}");
          p = isl_printer_end_line(p);
        }
      }

      if (modules[i]->boundary) {
        for (int j = 0; j < n; j++) {
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "directive set /kernel0/");
          p = isl_printer_print_str(p, modules[i]->name);
          p = isl_printer_print_str(p, "_boundary/");
          p = isl_printer_print_str(p, dims[j]);
          p = isl_printer_print_str(p, ":rsc -MAP_TO_MODULE {[DirectInput]}");
          p = isl_printer_end_line(p);
        }

        /* Serialize */
        if (modules[i]->to_mem && modules[i]->options->autosa->host_serialize) {
          for (int j = 0; j < n; j++) {
            p = isl_printer_start_line(p);
            p = isl_printer_print_str(p, "directive set /kernel0/");
            p = isl_printer_print_str(p, modules[i]->name);
            p = isl_printer_print_str(p, "_boundary_serialize/");
            p = isl_printer_print_str(p, dims[j]);
            p = isl_printer_print_str(p, ":rsc -MAP_TO_MODULE {[DirectInput]}");
            p = isl_printer_end_line(p);
          }
        } 
      }
    }
  }

  /* Set local buffer properties. */
  for (int i = 0; i < n_modules; i++) {
    if (modules[i]->type == PE_MODULE)
      continue;
    for (int j = 0; j < modules[i]->n_var; j++) {
      struct autosa_kernel_var *var;
      var = (struct autosa_kernel_var *)&modules[i]->var[j];
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "directive set /kernel0/");
      p = isl_printer_print_str(p, modules[i]->name);
      p = isl_printer_print_str(p, "/");
      p = isl_printer_print_str(p, modules[i]->name);
      p = isl_printer_print_str(p, "_");
      p = isl_printer_print_str(p, var->name);
      p = isl_printer_print_str(p, "_inst:cns -MAP_TO_MODULE Xilinx_RAMS.BLOCK_1R1W_RBW_DUAL");
      p = isl_printer_end_line(p);

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "directive set /kernel0/");
      p = isl_printer_print_str(p, modules[i]->name);
      p = isl_printer_print_str(p, "/");
      p = isl_printer_print_str(p, modules[i]->name);
      p = isl_printer_print_str(p, "_");
      p = isl_printer_print_str(p, var->name);
      if (modules[i]->double_buffer)
        p = isl_printer_print_str(p, "_inst:cns -STAGE_REPLICATION 2");
      else
        p = isl_printer_print_str(p, "_inst:cns -STAGE_REPLICATION 1");
      p = isl_printer_end_line(p);

      /* word width */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "directive set /kernel0/");
      p = isl_printer_print_str(p, modules[i]->name);
      p = isl_printer_print_str(p, "/");
      p = isl_printer_print_str(p, modules[i]->name);
      p = isl_printer_print_str(p, "_");
      p = isl_printer_print_str(p, var->name);
      p = isl_printer_print_str(p, "_inst -WORD_WIDTH ");
      p = isl_printer_print_int(p, var->array->size * 8 * var->n_lane);
      p = isl_printer_end_line(p);

      if (modules[i]->boundary) {
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "directive set /kernel0/");
        p = isl_printer_print_str(p, modules[i]->name);
        p = isl_printer_print_str(p, "_boundary");
        p = isl_printer_print_str(p, "/");
        p = isl_printer_print_str(p, modules[i]->name);
        //p = isl_printer_print_str(p, "_boundary_");
        p = isl_printer_print_str(p, "_");
        p = isl_printer_print_str(p, var->name);
        p = isl_printer_print_str(p, "_inst:cns -MAP_TO_MODULE Xilinx_RAMS.BLOCK_1R1W_RBW_DUAL");
        p = isl_printer_end_line(p);

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "directive set /kernel0/");
        p = isl_printer_print_str(p, modules[i]->name);
        p = isl_printer_print_str(p, "_boundary");
        p = isl_printer_print_str(p, "/");
        p = isl_printer_print_str(p, modules[i]->name);
        //p = isl_printer_print_str(p, "_boundary_");
        p = isl_printer_print_str(p, "_");
        p = isl_printer_print_str(p, var->name);
        if (modules[i]->double_buffer)
          p = isl_printer_print_str(p, "_inst:cns -STAGE_REPLICATION 2");
        else
          p = isl_printer_print_str(p, "_inst:cns -STAGE_REPLICATION 1");
        p = isl_printer_end_line(p);

        /* word width */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "directive set /kernel0/");
        p = isl_printer_print_str(p, modules[i]->name);
        p = isl_printer_print_str(p, "_boundary");
        p = isl_printer_print_str(p, "/");
        p = isl_printer_print_str(p, modules[i]->name);
        //p = isl_printer_print_str(p, "_boundary_");
        p = isl_printer_print_str(p, "_");
        p = isl_printer_print_str(p, var->name);
        p = isl_printer_print_str(p, "_inst -WORD_WIDTH ");
        p = isl_printer_print_int(p, var->array->size * 8 * var->n_lane);
        p = isl_printer_end_line(p);
      }
    }
  }

  p = print_str_new_line(p, "go architect");
  p = print_str_new_line(p, "// Insert directives for dependence if necessary");
  p = print_str_new_line(p, "// Example: directive set /kernel0/PE/run/for:read_mem(local_C:rsc.@) -IGNORE_DEPENDENCY_FROM {for:write_mem(local_C:rsc.@) for:write_mem(local_C:rsc.@)}");
  
  p = print_str_new_line(p, "go allocate");
  p = print_str_new_line(p, "go extract");

  p = isl_printer_free(p);

  return;
}

/* Given an autosa_prog "prog" and the corresponding transformed AST
 * "tree", print all the generated code.
 * The type definitions collected in "types" are written to the kernel
 * file, the host code is printed to "p", and the top module generator
 * and the TCL script are written to their own files.
 * "user" points to the struct hls_info describing the output files.
 */
static __isl_give isl_printer *print_hw(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, __isl_keep isl_ast_node *tree,
    struct autosa_hw_module **modules, int n_modules,
    struct autosa_hw_top_module *top_module,
    struct autosa_drain_merge_func **drain_merge_funcs, int n_drain_merge_funcs,
    struct autosa_types *types, void *user)
{
  struct hls_info *hls_ctx = (struct hls_info *)user;
  isl_printer *types_printer;

  /* Type definitions go to the kernel file through a short-lived printer. */
  types_printer = isl_printer_to_file(isl_printer_get_ctx(p), hls_ctx->kernel_c);
  types_printer = isl_printer_set_output_format(types_printer, ISL_FORMAT_C);
  types_printer = autosa_print_types(types_printer, types, prog);
  isl_printer_free(types_printer);

  /* Host and kernel functions. */
  p = autosa_print_host_code(p, prog, tree, modules, n_modules, top_module,
                             drain_merge_funcs, n_drain_merge_funcs, hls_ctx);

  /* Separate top module code generation function. */
  print_top_gen_host_code(prog, tree, top_module, hls_ctx);

  /* Separate TCL file. */
  print_tcl_code(prog, modules, n_modules, hls_ctx);

  return p;
}

/* Generate systolic arrays using Catapult HLS C for the program in "input".
 * The output files are opened before and closed after the generation;
 * the value returned by generate_sa() is passed on to the caller.
 */
int generate_autosa_catapult_hls_c(isl_ctx *ctx, struct ppcg_options *options,
                                   const char *input)
{
  struct hls_info hls;

  /* Describe the target before the output files are opened. */
  hls.ctx = ctx;
  hls.target = CATAPULT_HW;
  hls.hls = 1;
  hls.hcl = options->autosa->hcl;
  hls.output_dir = options->autosa->output_dir;
  hls_open_files(&hls, input);

  const int status = generate_sa(ctx, input, hls.host_c, options, &print_hw, &hls);

  hls_close_files(&hls);

  return status;
}

================================================
FILE: src/autosa_catapult_hls_c.h
================================================
#ifndef _AUTOSA_CATAPULT_HLS_C_H
#define _AUTOSA_CATAPULT_HLS_C_H

#include <pet.h>
#include "ppcg_options.h"
#include "ppcg.h"

#ifdef __cplusplus
extern "C"
{
#endif

/* Generate systolic arrays using Catapult HLS C for the program in "input",
 * configured by "options".
 * Returns the status reported by the code generation.
 */
int generate_autosa_catapult_hls_c(isl_ctx *ctx, struct ppcg_options *options,
																	 const char *input);

#ifdef __cplusplus
}
#endif

#endif

================================================
FILE: src/autosa_codegen.cpp
================================================
#include <isl/aff.h>

#include <barvinok/isl.h>

#include "autosa_codegen.h"
#include "autosa_utils.h"
#include "autosa_print.h"
#include "autosa_schedule_tree.h"
#include "autosa_comm.h"

/* Build the name of the I/O module of "group" at I/O level "level":
 *
 *   <array name>[_<group nr> | _drain]_IO_L<level>_(in|out)
 *
 * The group number is only included for I/O groups of arrays with more
 * than one I/O group; "_drain" is added for drain groups.
 * The name ends in "_in" if "read" is set and in "_out" otherwise.
 * The caller frees the result.
 */
static char *generate_io_module_name(isl_ctx *ctx,
                                     struct autosa_array_ref_group *group, int level, int read)
{
  isl_printer *out = isl_printer_to_str(ctx);
  char *module_name;

  out = isl_printer_print_str(out, group->array->name);

  switch (group->group_type)
  {
  case AUTOSA_IO_GROUP:
    if (group->local_array->n_io_group > 1)
    {
      out = isl_printer_print_str(out, "_");
      out = isl_printer_print_int(out, group->nr);
    }
    break;
  case AUTOSA_DRAIN_GROUP:
    out = isl_printer_print_str(out, "_drain");
    break;
  default:
    break;
  }

  out = isl_printer_print_str(out, "_IO_L");
  out = isl_printer_print_int(out, level);
  out = isl_printer_print_str(out, read ? "_in" : "_out");

  module_name = isl_printer_get_str(out);
  isl_printer_free(out);

  return module_name;
}

/* Return an isl_multi_aff, with as elements the parameters in "space"
 * that have the names specified by the elements in "names".
 * If (some of) these parameters do not already appear in "space",
 * then they are added first.
 *
 * "space" is consumed.  If "names" is NULL, "space" is freed and the
 * isl calls below operate on NULL.
 */
static __isl_give isl_multi_aff *parameter_vector(__isl_take isl_space *space,
                                                  __isl_keep isl_id_list *names)
{
  int i, n;
  isl_local_space *ls;
  isl_multi_aff *ma;
  isl_space *lookup;

  if (!names)
    space = isl_space_free(space);

  /* Make sure every name is a parameter of "space". */
  n = isl_id_list_n_id(names);
  for (i = 0; i < n; ++i)
  {
    int pos;
    isl_id *id;

    id = isl_id_list_get_id(names, i);
    pos = isl_space_find_dim_by_id(space, isl_dim_param, id);
    if (pos >= 0)
    {
      isl_id_free(id);
      continue;
    }
    pos = isl_space_dim(space, isl_dim_param);
    space = isl_space_add_dims(space, isl_dim_param, 1);
    space = isl_space_set_dim_id(space, isl_dim_param, pos, id);
  }

  /* "space" is handed over to isl_space_domain() below and must not be
   * used afterwards.  Keep a separate reference for looking up the
   * parameter positions.
   */
  lookup = isl_space_copy(space);
  ma = isl_multi_aff_zero(isl_space_copy(space));
  ls = isl_local_space_from_space(isl_space_domain(space));

  /* Element i of the result is the parameter called names[i]. */
  for (i = 0; i < n; ++i)
  {
    int pos;
    isl_id *id;
    isl_aff *aff;

    id = isl_id_list_get_id(names, i);
    pos = isl_space_find_dim_by_id(lookup, isl_dim_param, id);
    isl_id_free(id);
    aff = isl_aff_var_on_domain(isl_local_space_copy(ls),
                                isl_dim_param, pos);
    ma = isl_multi_aff_set_aff(ma, i, aff);
  }
  isl_local_space_free(ls);
  isl_space_free(lookup);

  return ma;
}

/* Return the elements of the domain of "node" for which the partial
 * schedule of the band node "node" is greater than or equal to the
 * sequence of parameters called "names".
 * The number of members of the band should be smaller than or equal to
 * the number of elements in "names".  If it is smaller, the schedule is
 * first extended with leading zero coordinates, so that the first
 * elements of "names" are compared against zero.
 * If "names" is empty, the universe domain of "node" is returned.
 */
static __isl_give isl_union_set *set_schedule_ge(
    __isl_keep isl_schedule_node *node, __isl_keep isl_id_list *names)
{
  if (!node)
    return NULL;

  const int n_names = isl_id_list_n_id(names);
  if (n_names == 0)
    return isl_schedule_node_get_universe_domain(node);

  /* Number of leading names without a matching band member. */
  const int n_pad = n_names - isl_schedule_node_band_n_member(node);

  isl_multi_union_pw_aff *sched = isl_schedule_node_band_get_partial_schedule(node);

  /* A set space with the parameters of the schedule and "n_pad" dimensions. */
  isl_space *pad_space = isl_multi_union_pw_aff_get_space(sched);
  pad_space = isl_space_params(pad_space);
  pad_space = isl_space_set_from_params(pad_space);
  pad_space = isl_space_add_dims(pad_space, isl_dim_set, n_pad);
  isl_multi_aff *zeros = isl_multi_aff_zero(pad_space);

  isl_union_set *domain = isl_schedule_node_get_universe_domain(node);

  /* Zero-valued coordinates, defined over the same domain as the
   * partial schedule, placed in front of the schedule itself. */
  isl_multi_union_pw_aff *padding = isl_multi_union_pw_aff_multi_aff_on_domain(
      isl_union_set_copy(domain), zeros);
  sched = isl_multi_union_pw_aff_range_product(padding, sched);

  /* The parameters called "names", again over the same domain. */
  isl_space *sched_space = isl_multi_union_pw_aff_get_space(sched);
  isl_multi_aff *params = parameter_vector(sched_space, names);
  isl_multi_union_pw_aff *bound =
      isl_multi_union_pw_aff_multi_aff_on_domain(domain, params);

  /* schedule - parameters >= 0 */
  sched = isl_multi_union_pw_aff_sub(sched, bound);

  return isl_multi_union_pw_aff_nonneg_union_set(sched);
}

/* Return constraints on the domain elements that require the partial
 * schedule of "node" to be less than or equal to a sequence of
 * parameters called "names".
 * The number of members of the band node "node" should be smaller
 * than or equal to the number of elements in "names".
 * If it is smaller, the schedule is padded in front with zeros, so the
 * first elements of "names" are compared against zero.
 *
 * If "names" is empty, the universe domain of "node" is returned.
 */
static __isl_give isl_union_set *set_schedule_le(
    __isl_keep isl_schedule_node *node, __isl_keep isl_id_list *names)
{
  int n_names, n_pad;
  isl_space *pad_space, *sched_space;
  isl_multi_aff *zero_ma, *param_ma;
  isl_union_set *universe;
  isl_multi_union_pw_aff *sched, *zeros, *params;

  if (!node)
    return NULL;
  n_names = isl_id_list_n_id(names);
  if (n_names == 0)
    return isl_schedule_node_get_universe_domain(node);
  n_pad = n_names - isl_schedule_node_band_n_member(node);

  sched = isl_schedule_node_band_get_partial_schedule(node);

  /* A set space with only the parameters of the schedule and "n_pad"
   * set dimensions; the zero function on it provides the padding. */
  pad_space = isl_multi_union_pw_aff_get_space(sched);
  pad_space = isl_space_set_from_params(isl_space_params(pad_space));
  pad_space = isl_space_add_dims(pad_space, isl_dim_set, n_pad);
  zero_ma = isl_multi_aff_zero(pad_space);

  universe = isl_schedule_node_get_universe_domain(node);
  zeros = isl_multi_union_pw_aff_multi_aff_on_domain(
      isl_union_set_copy(universe), zero_ma);

  /* [0, ..., 0, partial schedule] */
  sched = isl_multi_union_pw_aff_range_product(zeros, sched);

  /* The named parameters, expressed on the same domain. */
  sched_space = isl_multi_union_pw_aff_get_space(sched);
  param_ma = parameter_vector(sched_space, names);
  params = isl_multi_union_pw_aff_multi_aff_on_domain(universe, param_ma);

  /* parameters - schedule >= 0 */
  sched = isl_multi_union_pw_aff_sub(params, sched);

  return isl_multi_union_pw_aff_nonneg_union_set(sched);
}

/* Build the isl_multi_val of tile sizes used to tile the band "node",
 * taking the values from the integer array "tile_size" and the space
 * from the band itself.
 * Return NULL if "node" is NULL.
 */
static __isl_give isl_multi_val *construct_band_tiles_sizes(
    __isl_keep isl_schedule_node *node, int *tile_size)
{
  if (!node)
    return NULL;

  return ppcg_multi_val_from_int_list(
      isl_schedule_node_band_get_space(node), tile_size);
}

/* Return constraints on the domain elements that equate a sequence of
 * parameters called "names" to the partial schedule of "node" taken
 * modulo the integers in "size".
 * The number of elements in the array "size" should be equal
 * to the number of elements in "names".
 * The number of members of the band node "node" should be smaller
 * than or equal to this number.  If it is smaller, the schedule is padded
 * in front with zeros, so the first elements of "names" are equated to
 * zero and only the trailing entries of "size" are used.
 *
 * If "names" is empty, the universe domain of "node" is returned.
 */
static __isl_give isl_union_set *set_schedule_modulo(
    __isl_keep isl_schedule_node *node, __isl_keep isl_id_list *names,
    int *size)
{
  int n_names, n_pad;
  isl_space *pad_space, *sched_space;
  isl_multi_aff *zero_ma, *param_ma;
  isl_multi_val *sizes;
  isl_union_set *universe;
  isl_multi_union_pw_aff *sched, *zeros, *params;

  if (!node)
    return NULL;
  n_names = isl_id_list_n_id(names);
  if (n_names == 0)
    return isl_schedule_node_get_universe_domain(node);
  n_pad = n_names - isl_schedule_node_band_n_member(node);

  /* Partial schedule modulo the sizes that belong to the band members. */
  sched = isl_schedule_node_band_get_partial_schedule(node);
  sizes = construct_band_tiles_sizes(node, size + n_pad);
  sched = isl_multi_union_pw_aff_mod_multi_val(sched, sizes);

  /* Zero function providing the "n_pad" leading entries. */
  pad_space = isl_multi_union_pw_aff_get_space(sched);
  pad_space = isl_space_set_from_params(isl_space_params(pad_space));
  pad_space = isl_space_add_dims(pad_space, isl_dim_set, n_pad);
  zero_ma = isl_multi_aff_zero(pad_space);

  universe = isl_schedule_node_get_universe_domain(node);
  zeros = isl_multi_union_pw_aff_multi_aff_on_domain(
      isl_union_set_copy(universe), zero_ma);
  sched = isl_multi_union_pw_aff_range_product(zeros, sched);

  /* The named parameters, expressed on the same domain. */
  sched_space = isl_multi_union_pw_aff_get_space(sched);
  param_ma = parameter_vector(sched_space, names);
  params = isl_multi_union_pw_aff_multi_aff_on_domain(universe, param_ma);

  /* (schedule mod size) - parameters == 0 */
  sched = isl_multi_union_pw_aff_sub(sched, params);

  return isl_multi_union_pw_aff_zero_union_set(sched);
}

/* Generate two prefixes: fifo_prefix and buffer_prefix
 * fifo_prefix: fifo_A_0
 * buffer_prefix: local_A_0
 */
static void init_suffix(struct autosa_hw_module *module,
                        struct autosa_array_ref_group *group, char **fifo_suffix, char **buf_suffix)
{
  isl_ctx *ctx = isl_map_get_ctx(group->access);

  isl_printer *p = isl_printer_to_str(ctx);
  p = autosa_array_ref_group_print_fifo_name(group, p);
  *fifo_suffix = isl_printer_get_str(p);
  isl_printer_free(p);

  p = isl_printer_to_str(ctx);
  p = isl_printer_print_str(p, "local_");
  p = isl_printer_print_str(p, group->array->name);
  if ((group->group_type == AUTOSA_IO_GROUP && group->local_array->n_io_group > 1) ||
      (group->group_type == AUTOSA_PE_GROUP && group->local_array->n_pe_group > 1))
  {
    p = isl_printer_print_str(p, "_");
    p = isl_printer_print_int(p, group->nr);
  }
  if (group->group_type == AUTOSA_DRAIN_GROUP)
  {
    p = isl_printer_print_str(p, "_");
    p = isl_printer_print_str(p, "drain");
  }  
  *buf_suffix = isl_printer_get_str(p);
  isl_printer_free(p);
}

///* Return constraints on the domain elements that equate the partial schedule
// * of "node" to the lower bound of partial schedule. 
// */
//static __isl_give isl_union_set *schedule_eq_lb(
//    __isl_keep isl_schedule_node *node)
//{
//  int n, n_zero;
//  isl_multi_union_pw_aff *mupa, *mupa2;
//  isl_multi_aff *ma;
//  isl_space *space;
//  isl_union_set *domain;
//  isl_union_map *umap;
//  isl_union_set *uset;
//  isl_schedule_node *node2;
//  isl_bool under_extension = isl_bool_false;
//
//  if (!node)
//    return NULL;
//
//  /* Test if it is under extension node */
//  node2 = isl_schedule_node_copy(node);
//  while (node2)
//  {
//    if (isl_schedule_node_get_type(node2) == isl_schedule_node_extension)
//    {
//      under_extension = isl_bool_true;
//      break;
//    }
//    if (isl_schedule_node_has_parent(node2))
//      node2 = isl_schedule_node_parent(node2);
//    else
//      break;
//  }
//  isl_schedule_node_free(node2);
//
//  umap = isl_schedule_node_band_get_partial_schedule_union_map(node);
//  if (!under_extension)
//  {
//    domain = isl_schedule_node_get_domain(node);
//    umap = isl_union_map_intersect_domain(umap, domain);
//  }
//  uset = isl_union_map_range(isl_union_map_copy(umap));
//  uset = isl_union_set_lexmin(uset);
//  umap = isl_union_map_reverse(umap);
//  uset = isl_union_set_apply(uset, umap);
//
//  return uset;
//}
/* Return the domain elements that are scheduled at the lower bound of the
 * partial schedule of the band node "node".
 *
 * If "node" (or one of its ancestors) is an extension node, the lower bound
 * is the lexicographic minimum of the range of the band's partial schedule
 * and the result is its preimage under that schedule.
 *
 * Otherwise the prefix schedule at the child of "node" is used:
 * each domain element is related to all schedule points that agree with its
 * own on the dimensions outside the band and are lexicographically less
 * than or equal to it.  The element is kept if the lexicographic minimum of
 * those points is its own schedule point.
 */
static __isl_give isl_union_set *schedule_eq_lb(
  __isl_keep isl_schedule_node *node)
{
  isl_schedule_node *child;
  isl_union_map *prefix, *prefix_ge;
  int depth1;
  isl_set *prefix_range;
  isl_map *ge;
  isl_schedule_node *node_tmp;
  isl_bool under_extension = isl_bool_false;

  if (!node)
    return NULL;

  /* Test if "node" is under extension node */
  node_tmp = isl_schedule_node_copy(node);
  while (node_tmp) {
    if (isl_schedule_node_get_type(node_tmp) == isl_schedule_node_extension) {
      under_extension = isl_bool_true;
      break;
    }
    if (isl_schedule_node_has_parent(node_tmp)) 
      node_tmp = isl_schedule_node_parent(node_tmp);
    else
      break;
  }
  isl_schedule_node_free(node_tmp);

  if (under_extension) {
    /* Currently all the extension nodes are inserted with rectangular schedule domains.
     * Therefore, we will safely call a routine that handles the rectangular 
     * domains to get the lower bound. 
     */
    isl_union_map *umap;
    isl_union_set *uset;
    umap = isl_schedule_node_band_get_partial_schedule_union_map(node);
    uset = isl_union_map_range(isl_union_map_copy(umap));
    uset = isl_union_set_lexmin(uset);
    umap = isl_union_map_reverse(umap);
    uset = isl_union_set_apply(uset, umap);

    return uset;
  }

  /* Schedule depth above the band: these dimensions are held fixed. */
  depth1 = isl_schedule_node_get_schedule_depth(node);
  child = isl_schedule_node_child(isl_schedule_node_copy(node), 0);
  /* Prefix schedule at the child, i.e. including the band itself. */
  prefix = isl_schedule_node_get_prefix_schedule_relation(child);
  isl_schedule_node_free(child);  
  prefix_range = isl_set_from_union_set(isl_union_map_range(isl_union_map_copy(prefix)));
  ge = isl_map_lex_ge(isl_set_get_space(prefix_range));
  /* Set the outer dims equal */
  for (int i = 0; i < depth1; i++) {
    ge = isl_map_equate(ge, isl_dim_in, i, isl_dim_out, i);
  }
  /* Only relate schedule points that actually occur. */
  ge = isl_map_intersect_domain(ge, isl_set_copy(prefix_range));
  ge = isl_map_intersect_range(ge, prefix_range);
  prefix_ge = isl_union_map_apply_range(isl_union_map_copy(prefix), isl_union_map_from_map(ge));
  prefix_ge = isl_union_map_lexmin(prefix_ge);
  /* Keep the elements whose own schedule point is that minimum. */
  prefix = isl_union_map_intersect(prefix, prefix_ge);

  return isl_union_map_domain(prefix);
}

/* Return the domain elements of the partial schedule of the band node
 * "node" that are not scheduled at the lower bound of that partial
 * schedule, i.e. the domain of the partial schedule minus the result
 * of schedule_eq_lb().
 */
static __isl_give isl_union_set *schedule_neq_lb(
    __isl_keep isl_schedule_node *node)
{
  isl_union_set *at_lb, *scheduled;

  if (!node)
    return NULL;

  at_lb = schedule_eq_lb(node);
  scheduled = isl_union_map_domain(
      isl_schedule_node_band_get_partial_schedule_union_map(node));

  return isl_union_set_subtract(scheduled, at_lb);
}

/* Return the domain elements that are scheduled at the upper bound of the
 * partial schedule of the band node "node".
 *
 * The prefix schedule at the child of "node" is used:
 * each domain element is related to all schedule points that agree with its
 * own on the dimensions outside the band and are lexicographically greater
 * than or equal to it.  The element is kept if the lexicographic maximum of
 * those points is its own schedule point.
 */
static __isl_give isl_union_set *schedule_eq_ub(
    __isl_keep isl_schedule_node *node)
{
  isl_schedule_node *child;
  isl_union_map *prefix, *prefix_le;
  int depth1;
  isl_set *prefix_range;
  isl_map *le;

  if (!node)
    return NULL;

  /* Schedule depth above the band: these dimensions are held fixed. */
  depth1 = isl_schedule_node_get_schedule_depth(node);
  child = isl_schedule_node_child(isl_schedule_node_copy(node), 0);
  /* Prefix schedule at the child, i.e. including the band itself. */
  prefix = isl_schedule_node_get_prefix_schedule_relation(child);
  isl_schedule_node_free(child);
  prefix_range = isl_set_from_union_set(isl_union_map_range(isl_union_map_copy(prefix)));   
  le = isl_map_lex_le(isl_set_get_space(prefix_range));  
  /* Set the outer dims equal */
  for (int i = 0; i < depth1; i++) {
    le = isl_map_equate(le, isl_dim_in, i, isl_dim_out, i);
  }
  /* Only relate schedule points that actually occur. */
  le = isl_map_intersect_domain(le, isl_set_copy(prefix_range));
  le = isl_map_intersect_range(le, prefix_range);
  prefix_le = isl_union_map_apply_range(isl_union_map_copy(prefix), isl_union_map_from_map(le));
  prefix_le = isl_union_map_lexmax(prefix_le);
  /* Keep the elements whose own schedule point is that maximum. */
  prefix = isl_union_map_intersect(prefix, prefix_le);

  return isl_union_map_domain(prefix);
}

/* Return the domain elements of the band node "node" that are not
 * scheduled at the upper bound of its partial schedule.
 * The partial schedule is first restricted to the domain of "node";
 * the result is the domain of that restricted schedule minus the result
 * of schedule_eq_ub().
 */
static __isl_give isl_union_set *schedule_neq_ub(
    __isl_keep isl_schedule_node *node)
{
  isl_union_set *at_ub, *node_domain, *scheduled;
  isl_union_map *band_sched;

  if (!node)
    return NULL;

  at_ub = schedule_eq_ub(node);

  node_domain = isl_schedule_node_get_domain(node);
  band_sched = isl_schedule_node_band_get_partial_schedule_union_map(node);
  band_sched = isl_union_map_intersect_domain(band_sched, node_domain);
  scheduled = isl_union_map_domain(band_sched);

  return isl_union_set_subtract(scheduled, at_ub);
}

/* Internal struct used for add_io_copies_stmt_acc. */
struct add_io_copies_stmt_acc_data
{
  struct autosa_kernel *kernel;
  struct autosa_array_ref_group *group;
  struct autosa_stmt_access *ref;
  struct autosa_array_tile *local_tile; /* Local buffer tile */
  int n_lane;
  int read;
  char *stmt_name;
  int insert_dependence;
  struct autosa_hw_module *module;
  int module_type; // 0 default 1 intra 1 inter
};

/* Create an IO statement.
 *
 * "local_group" and "io_group" are stored in the user data attached to the
 * statement id; "io_group" also provides the array space and decides the
 * SIMD depth.
 * "tile" is the tile that the current IO stmt accesses.
 * "depth" is the schedule depth that the current stmt is inserted at.
 * "stmt_name" is the name of the statement; it is only read.
 *
 * The result is the identity isl_multi_aff on the space
 *
 *   stmt_name[[depth dims] -> array] -> [[depth dims] -> array]
 *
 * The id of the input tuple carries a newly allocated
 * autosa_array_ref_group_pair as user pointer, released through
 * free_group_pair when the id is freed.  Its simd_depth is "depth" if
 * "io_group" has more than one lane and its local array is of type
 * AUTOSA_INT_ARRAY, and -1 otherwise.
 *
 * Return NULL if the pair or the id cannot be allocated.
 */
static __isl_give isl_multi_aff *autosa_create_io_access_stmt(
    isl_ctx *ctx,
    struct autosa_array_ref_group *local_group,
    struct autosa_array_ref_group *io_group,
    struct autosa_array_tile *tile,
    int depth,
    __isl_keep char *stmt_name)
{
  isl_space *space;
  isl_id *id;
  struct autosa_array_ref_group_pair *pair =
      (struct autosa_array_ref_group_pair *)malloc(
          sizeof(struct autosa_array_ref_group_pair));

  if (!pair)
    return NULL;

  pair->local_group = local_group;
  pair->io_group = io_group;
  pair->local_tile = tile;
  pair->in_use = 0;  
  if (io_group->n_lane > 1 && io_group->local_array->array_type == AUTOSA_INT_ARRAY) {    
    pair->simd_depth = depth;
  } else {    
    pair->simd_depth = -1;
  }

  /* The id keeps its own copy of the name, so "stmt_name" is passed
   * directly instead of going through a fixed-size buffer that a long
   * statement name could overflow.
   */
  id = isl_id_alloc(ctx, stmt_name, pair);
  if (!id)
  {
    /* No id took ownership of "pair"; all its members are borrowed. */
    free(pair);
    return NULL;
  }
  id = isl_id_set_free_user(id, &free_group_pair);

  /* [depth dims] -> array, wrapped and turned into a map onto itself. */
  space = isl_space_copy(io_group->array->space);
  space = isl_space_from_range(space);
  space = isl_space_add_dims(space, isl_dim_in, depth);
  space = isl_space_wrap(space);
  space = isl_space_map_from_set(space);
  space = isl_space_set_tuple_id(space, isl_dim_in, id);

  return isl_multi_aff_identity(space);
}

/* Test if the array access "ref" is stride-one in its last index under the
 * prefix schedule of the schedule node "node", as decided by
 * access_is_stride_one().
 * A scalar access (no index) is always reported as isl_bool_true.
 */
static isl_bool is_acc_stride_one_at_node(
    __isl_keep isl_schedule_node *node, struct autosa_stmt_access *ref)
{
  isl_union_map *prefix;
  isl_map *acc;
  isl_bool is_one;

  /* Scalar access.  Checked before the prefix schedule is computed so that
   * nothing has to be released on this path. */
  if (ref->n_index == 0)
    return isl_bool_true;

  prefix = isl_schedule_node_get_prefix_schedule_union_map(node);

  /* Transform the domain of access function to scheduling domains. */
  acc = isl_map_copy(ref->access);
  acc = isl_map_from_union_map(
      isl_union_map_apply_domain(isl_union_map_from_map(acc), prefix));
  is_one = access_is_stride_one(acc, ref->n_index - 1);

  isl_map_free(acc);  
  return is_one;
}

/* Insert the copy statement at the statement level.
 *
 * This is the callback of isl_schedule_node_map_descendant_bottom_up()
 * used by add_io_copies_stmt_acc(); "user" points to a
 * struct add_io_copies_stmt_acc_data.
 *
 * Only leaf nodes are handled.  A leaf whose statement is not the one
 * accessed by data->ref gets an empty filter inserted above it.
 * For a matching leaf with a non-empty communicated access, an extension
 * with the copy statement (scheduled by a register tiling and carrying a
 * "hls_pipeline" mark) is grafted before the leaf and the leaf itself is
 * filtered out.
 *
 * data->stmt_name is only borrowed here: it is shared by all the leaves
 * visited and is released by add_io_copies_stmt_acc().  A name derived
 * from it for this leaf is owned and freed locally.
 */
static __isl_give isl_schedule_node *add_io_copies_stmt_acc_single(
    __isl_take isl_schedule_node *node, void *user)
{
  struct add_io_copies_stmt_acc_data *data =
      (struct add_io_copies_stmt_acc_data *)(user);
  struct autosa_array_ref_group *group = data->group;
  struct autosa_stmt_access *ref = data->ref;
  /* Name used for the statement of this leaf (borrowed or derived_name). */
  char *stmt_name = data->stmt_name;
  /* Name extended with coalescing information, owned by this call. */
  char *derived_name = NULL;
  int read = data->read;
  isl_union_set *uset, *empty_filter, *domain;
  isl_set *set;
  isl_space *space;
  isl_id *id, *id2;
  isl_ctx *ctx;
  isl_union_map *access;
  int empty;
  struct autosa_array_tile *tile;
  isl_multi_aff *ma, *from_access;
  isl_multi_pw_aff *mpa;
  isl_multi_union_pw_aff *mupa;
  isl_schedule_node *graft;
  int n_lane = data->n_lane;
  int is_simd;
  isl_id *hls_id;
  isl_bool stride_one;
  isl_bool insert_dependence = isl_bool_false;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf)
    return node;  

  /* Examine if the statement contains the access. */
  uset = isl_schedule_node_get_domain(node);
  if (isl_union_set_is_empty(uset)) {
    isl_union_set_free(uset);
    return node;
  }

  /* Compare the statement id of the leaf with that of the access. */
  set = isl_set_from_union_set(isl_union_set_copy(uset));
  space = isl_set_get_space(set);
  isl_set_free(set);
  id = isl_space_get_tuple_id(space, isl_dim_set);
  isl_space_free(space);
  space = isl_map_get_space(ref->access);
  id2 = isl_space_get_tuple_id(space, isl_dim_in);
  empty_filter = isl_union_set_empty(isl_union_set_get_space(uset));
  isl_union_set_free(uset);
  isl_space_free(space);

  if (id != id2)
  {
    /* Different statement: filter it out. */
    isl_id_free(id);
    isl_id_free(id2);
    node = isl_schedule_node_insert_filter(node, empty_filter);
    return node;
  }
  isl_id_free(id);
  isl_id_free(id2);
  ctx = isl_schedule_node_get_ctx(node);
  is_simd = is_node_under_simd(node);

  /* S -> [D -> A] */
  access = io_comm_access_ref(data->kernel, node, group, ref, read);

  empty = isl_union_map_is_empty(access);
  if (empty < 0 || empty)
  {
    isl_union_map_free(access);
    isl_union_set_free(empty_filter);
    if (empty < 0)
      return isl_schedule_node_free(node);
    return node;
  }

  /* Update the stmt_name. */
  if (data->insert_dependence)
  {
    isl_schedule_node *node2;

    node2 = isl_schedule_node_copy(node);
    if (n_lane >= 1 && is_simd)
    {
      /* Move to the child of the enclosing "simd" mark. */
      while (!is_marked(node2, "simd")) {
        node2 = isl_schedule_node_parent(node2);
      }
      node2 = isl_schedule_node_child(node2, 0);
    }
    /* Test if the access is stride one at the current loop. */
    stride_one = is_acc_stride_one_at_node(node2, ref);
    if (stride_one)
    {
      /* Test if the loop bound/n_lane > 1. 
       * If so, insert a hls_dep mark.
       * Only do this when there is a single access in the group.
       */
      int *ubs = NULL;
      isl_schedule_node *node_copy = isl_schedule_node_copy(node2);
      if (is_simd) {
        while (node_copy && isl_schedule_node_has_parent(node_copy)) {
          if (is_marked(node_copy, "simd")) 
            break;
          node_copy = isl_schedule_node_parent(node_copy);
        }
      }
      /* Find the closest enclosing band. */
      while (node_copy && isl_schedule_node_has_parent(node_copy))
      {
        if (isl_schedule_node_get_type(node_copy) == isl_schedule_node_band)
          break;
        node_copy = isl_schedule_node_parent(node_copy);
      }
      if (isl_schedule_node_get_type(node_copy) == isl_schedule_node_band)
      {
        int n = isl_schedule_node_band_n_member(node_copy);     
        ubs = extract_band_upper_bounds(node_copy);
        if (ubs[n - 1] / n_lane > 1)
        {
          insert_dependence = isl_bool_true;
          /* Update the stmt_name to
           * <base name>.<coalesce_depth>.<coalesce_bound>. */
          int coalesce_depth;
          int coalesce_bound;

          node_copy = isl_schedule_node_child(node_copy, 0);
          coalesce_depth = isl_schedule_node_get_schedule_depth(node_copy) - 1;
          coalesce_bound = ubs[n - 1] / n_lane;

          isl_printer *p_str = isl_printer_to_str(ctx);
          p_str = isl_printer_print_str(p_str, stmt_name);
          p_str = isl_printer_print_str(p_str, ".");
          p_str = isl_printer_print_int(p_str, coalesce_depth);
          p_str = isl_printer_print_str(p_str, ".");
          p_str = isl_printer_print_int(p_str, coalesce_bound);
          /* The shared base name is left untouched; only the derived
           * name belongs to this call. */
          derived_name = isl_printer_get_str(p_str);
          stmt_name = derived_name;
          isl_printer_free(p_str);
        }
      }
      free(ubs);
      isl_schedule_node_free(node_copy);
    }
    isl_schedule_node_free(node2);
  }

  from_access = autosa_create_io_access_stmt(
      ctx, group, group, data->local_tile,
      isl_schedule_node_get_schedule_depth(node), stmt_name);
  free(derived_name);

  /* Create a register tiling. */
  tile = create_register_tiling(node, group, ref);
  ma = isl_multi_aff_copy(tile->tiling);
  ma = isl_multi_aff_pullback_multi_aff(ma,
                                        isl_multi_aff_copy(from_access));
  mpa = isl_multi_pw_aff_from_multi_aff(ma);
  mupa = isl_multi_union_pw_aff_from_multi_pw_aff(mpa);

  /* [D -> A] */
  domain = isl_union_map_range(access);
  /* Only for read, we extend the access to a rectangular hull which helps to 
   * improve the memory coalescing. 
   */
  if (read && !autosa_array_is_scalar(group->array))
  {
    isl_map *map;
    isl_set *set;
    set = isl_map_domain(isl_map_from_union_map(isl_union_set_unwrap(domain)));
    map = group_tile_buffer(group, tile);
    map = isl_map_intersect_domain(map, set);
    domain = isl_union_set_from_set(isl_map_wrap(map));
  }

  /* read.fifoX[D -> A] */
  domain = isl_union_set_preimage_multi_aff(domain, from_access);
  /* read.fifoX[D -> A] -> D */
  access = isl_union_set_wrapped_domain_map(domain);
  /* D -> read.fifoX[D -> A] */
  access = isl_union_map_reverse(access);
  access = isl_union_map_coalesce(access);

  graft = isl_schedule_node_from_extension(access);
  graft = isl_schedule_node_child(graft, 0);
  graft = isl_schedule_node_insert_partial_schedule(graft, mupa);

  /* If the current statement is under the SIMD loop, we will add a filter 
   * to only transfer the data at one loop since we will later insert a 
   * statement to handle the data transfer of the entire SIMD loop.
   */
  if (data->kernel->options->autosa->isl_sink) {
    if (n_lane >= 1 && is_simd)
    {
      /* The loop above is the SIMD loop.
       * Check the node is below the simd mark. 
       */
      isl_union_set *filter;
      /* Create a filter. */    
      node = isl_schedule_node_parent(node);
      if (data->read)
        filter = schedule_eq_lb(node);
      else
        filter = schedule_eq_ub(node);
      node = isl_schedule_node_insert_filter(node, filter);
      /* Move back down to the leaf. */
      node = isl_schedule_node_child(node, 0);
      node = isl_schedule_node_child(node, 0);
    }
  }

  /* Insert a "pipeline" mark under the band node. */
  hls_id = isl_id_alloc(ctx, "hls_pipeline", NULL);
  graft = isl_schedule_node_child(graft, 0);
  graft = isl_schedule_node_insert_mark(graft, hls_id);
  graft = isl_schedule_node_parent(graft);

  if (insert_dependence)
  {
    char *mark_name;
    isl_id *id;
    isl_printer *p_str = isl_printer_to_str(ctx);
    p_str = isl_printer_print_str(p_str, "hls_dependence.");
    p_str = autosa_array_ref_group_print_name(group, p_str);
    mark_name = isl_printer_get_str(p_str);
    isl_printer_free(p_str);
    id = isl_id_alloc(ctx, mark_name, NULL);
    graft = isl_schedule_node_child(graft, 0);
    graft = isl_schedule_node_child(graft, 0);
    graft = isl_schedule_node_insert_mark(graft, id);
    free(mark_name);
  }

  /* Move back to the root of the graft. */
  while (graft && isl_schedule_node_has_parent(graft))
    graft = isl_schedule_node_parent(graft);

  node = isl_schedule_node_graft_before(node, graft);
  /* The original statement is replaced by the copy statement. */
  node = isl_schedule_node_insert_filter(node, empty_filter);
  node = isl_schedule_node_parent(node);
  node = isl_schedule_node_parent(node);
  node = isl_schedule_node_parent(node);  

  autosa_array_tile_free(tile);

  return node;
}

/* Callback of isl_schedule_node_map_descendant_bottom_up() used by
 * add_io_copies_stmt_acc(); "user" points to a
 * struct add_io_copies_stmt_acc_data.
 *
 * For each "simd" mark (when data->n_lane >= 1) whose child has a
 * non-empty domain, insert a filter above that child that keeps only the
 * lower bound (read) or the upper bound (write) of its schedule.
 * The returned node is at the same position as the node passed in.
 */
static __isl_give isl_schedule_node *modify_simd_loop(
  __isl_take isl_schedule_node *node, void *user)
{
  struct add_io_copies_stmt_acc_data *data =
      (struct add_io_copies_stmt_acc_data *)(user);
  if (data->n_lane >= 1 && is_marked(node, "simd")) {
    isl_union_set *filter;
    isl_union_set *domain;

    node = isl_schedule_node_child(node, 0);
    /* Test if the domain is empty. */
    domain = isl_schedule_node_get_domain(node);
    if (isl_union_set_is_empty(domain)) {
      isl_union_set_free(domain);
      node = isl_schedule_node_parent(node);
      return node;  
    }
    isl_union_set_free(domain);

    if (data->read)
      filter = schedule_eq_lb(node);
    else
      filter = schedule_eq_ub(node);
    node = isl_schedule_node_insert_filter(node, filter);
    node = isl_schedule_node_parent(node);
  }
  return node;
}

/* Add copies at the stmt level for each array reference in the "group" 
 * in the I/O modules.
 * 
 * "group" is an I/O group.
 * "tile" is the local buffer tile recorded in the copy statements.
 * "n_lane" is the number of data lanes.
 * "read" denotes if copy-in or copy-out from/to the external memory;
 * only the references of "group" with the matching direction are handled.
 * "stmt_name" is the base name of the copy statements.  Ownership is
 * taken: it is freed exactly once before returning, no matter how many
 * statements were created from it.
 * "insert_dependence" determines if it is necessary to insert a hls dependence mark;
 * it is only honored when the group has a single reference.
 * "before" is not used by this function.
 * "module" and "module_type" are only recorded in the callback data.
 */
__isl_give isl_schedule_node *add_io_copies_stmt_acc(
  struct autosa_kernel *kernel,
  struct autosa_array_ref_group *group,
  __isl_take isl_schedule_node *node,
  struct autosa_array_tile *tile, /* local tile */
  int n_lane,
  int read,
  __isl_take char *stmt_name,
  int before,
  int insert_dependence,
  struct autosa_hw_module *module,
  int module_type)
{
  struct add_io_copies_stmt_acc_data data = {
      kernel, group, NULL, tile, n_lane, read, stmt_name,
      insert_dependence && group->n_ref == 1, module, module_type};

  for (int i = 0; i < group->n_ref; i++)
  {
    struct autosa_stmt_access *ref = group->refs[i];
    data.ref = ref;
    if ((read && ref->read) || (!read && ref->write)) {
      node = isl_schedule_node_map_descendant_bottom_up(
          node, &add_io_copies_stmt_acc_single, &data);
    }
  }
  /* Modify the SIMD loop.
   * If the current statement is under the SIMD loop, we will add a filter 
   * to only transfer the data at one loop since we will later insert a 
   * statement to handle the data transfer of the entire SIMD loop.   
   */
  if (!kernel->options->autosa->isl_sink) {
    node = isl_schedule_node_map_descendant_bottom_up(node, &modify_simd_loop, &data);
  }

  /* The callbacks only borrow the name; release it here, once. */
  free(stmt_name);

  return node;
}

/* Insert the copy statement at the node level to transfer the entire tile.
 *
 * An extension node with the copy statement "stmt_name" is built and
 * grafted before ("before" set) or after "node".  The statement is
 * scheduled by the tiling of "tile"; its last schedule dimension is split
 * off and marked "access_coalesce".  If the (effective) number of lanes is
 * greater than one, that dimension is tiled by it and the point loop is
 * filtered to its lower bound, so that data is transferred once per pack.
 * A "hls_pipeline" mark is inserted below the innermost remaining loop.
 *
 * If "is_buffer" and "insert_dependence" are set and this is a write
 * ("read" is zero), a "hls_dependence.<group name>" mark is added as well.
 * This is only for Xilinx platform.
 *
 * "stmt_name" is taken: it is freed on every path.
 * If the communicated access is empty, "node" is returned unchanged.
 */
static __isl_give isl_schedule_node *add_io_copies_stmt_tile(
  struct autosa_kernel *kernel,
  struct autosa_array_ref_group *group,
  __isl_take isl_schedule_node *node,
  struct autosa_array_tile *local_tile, /* Local buffer */
  struct autosa_array_tile *tile,       /* The tile to be copied */  
  int n_lane,
  int read,
  __isl_take char *stmt_name,
  int before, int is_buffer,
  /* If it is proper to insert a hls_dependence mark (Xilinx platforms). */
  int insert_dependence,
  /* If needs to insert a access_serialize mark. */
  int insert_serialize,
  struct autosa_hw_module *module,
  int module_type,
  TPArrayTile *tuning_tile
) {
  isl_union_map *access = NULL;
  int empty;
  isl_multi_aff *from_access;
  isl_multi_aff *ma;
  isl_multi_pw_aff *mpa;
  isl_multi_union_pw_aff *mupa;
  isl_union_set *domain;
  isl_schedule_node *graft;
  int n;
  isl_id *id;
  isl_ctx *ctx = kernel->ctx;
  
  access = io_comm_access(kernel, node, group, read);

  empty = isl_union_map_is_empty(access);
  if (empty < 0 || empty)
  {
    isl_union_map_free(access);
    /* The name was handed over to us; do not leak it on this path. */
    free(stmt_name);
    if (empty < 0)
      return isl_schedule_node_free(node);
    return node;
  }

  from_access = autosa_create_io_access_stmt(kernel->ctx, group, group,
                                             local_tile, isl_schedule_node_get_schedule_depth(node), stmt_name);
  /* The statement id keeps its own copy of the name. */
  free(stmt_name);

  ma = isl_multi_aff_copy(tile->tiling);  
  ma = isl_multi_aff_pullback_multi_aff(ma,
                                        isl_multi_aff_copy(from_access));
  mpa = isl_multi_pw_aff_from_multi_aff(ma);
  mupa = isl_multi_union_pw_aff_from_multi_pw_aff(mpa);

  domain = isl_union_map_range(access);
  /* Restrain the buffer to the local tile size. */
  if (!autosa_array_is_scalar(group->array))
  {
    isl_map *map;
    isl_set *set;
    set = isl_map_domain(isl_map_from_union_map(isl_union_set_unwrap(domain)));
    map = group_tile_buffer(group, tile);
    map = isl_map_intersect_domain(map, set);
    domain = isl_union_set_from_set(isl_map_wrap(map));
  }

  /* stmt[D -> A], then D -> stmt[D -> A] as the extension. */
  domain = isl_union_set_preimage_multi_aff(domain, from_access);
  access = isl_union_set_wrapped_domain_map(domain);
  access = isl_union_map_reverse(access);
  access = isl_union_map_coalesce(access);

  graft = isl_schedule_node_from_extension(access);
  graft = isl_schedule_node_child(graft, 0);
  graft = isl_schedule_node_insert_partial_schedule(graft, mupa);

  /* Split off the last dimension. */
  n = isl_schedule_node_band_n_member(graft);
  if (n > 1)
  {
    graft = isl_schedule_node_band_split(graft, n - 1);
    graft = isl_schedule_node_child(graft, 0);
  }

  /* Insert a coalesce mark indicating the loop below could be used for
   * memory coalescing.
   */
  id = isl_id_alloc(ctx, "access_coalesce", NULL);
  graft = isl_schedule_node_insert_mark(graft, id);
  graft = isl_schedule_node_child(graft, 0);

  if (insert_serialize) {
    id = isl_id_alloc(ctx, "access_serialize", NULL);
    graft = isl_schedule_node_insert_mark(graft, id);
    graft = isl_schedule_node_child(graft, 0);
  }

  if (kernel->options->autosa->tuning_method == 1) {
    /* Insert the buffer information */
    id = isl_id_alloc(ctx, "tuning_array_tile", tuning_tile);
    graft = isl_schedule_node_insert_mark(graft, id);
    graft = isl_schedule_node_child(graft, 0);
  }

  if (group->local_array->is_sparse) {
    n_lane *= (kernel->n_nzero * kernel->compress_ratio);
  }

  if (n_lane > 1) {
    /* Perform data packing. 
     * We will tile the last dimension by the factor of data packing.
     * Then we insert a filter to transfer data only once.
     */
    int tile_size[1];
    isl_union_set *filter;

    /* Tile the last dimension. */
    tile_size[0] = n_lane;
    graft = autosa_tile_band(graft, tile_size);
    graft = isl_schedule_node_child(graft, 0);
    /* Create a filter. */
    filter = schedule_eq_lb(graft);
    graft = isl_schedule_node_insert_filter(graft, filter);
    /* Move to the tile loop */
    graft = isl_schedule_node_parent(graft);
  }

  /* Insert a "pipeline" mark inside the band node. */
  id = isl_id_alloc(ctx, "hls_pipeline", NULL);

  graft = isl_schedule_node_child(graft, 0);
  graft = isl_schedule_node_insert_mark(graft, id);
  graft = isl_schedule_node_parent(graft);

  if (is_buffer && !read && insert_dependence)
  {
    // TODO: should not be inter_trans or intra_trans.
    // TODO: only add this pragma for io_transfer statement which requires data packing.
    /* Insert a "dependence" mark. 
     * This is not safe. Currently only insert the mark when there is at least 
     * one level of coalesce loop (coalesce_bound > 1) and
     * when data_pack does not equal to the nxt_data_pack. 
     */
    char *mark_name;
    isl_printer *p_str = isl_printer_to_str(ctx);
    p_str = isl_printer_print_str(p_str, "hls_dependence.");
    p_str = autosa_array_ref_group_print_name(group, p_str);
    mark_name = isl_printer_get_str(p_str);
    isl_printer_free(p_str);
    id = isl_id_alloc(ctx, mark_name, NULL);
    graft = isl_schedule_node_child(graft, 0);
    graft = isl_schedule_node_child(graft, 0);
    graft = isl_schedule_node_insert_mark(graft, id);
    free(mark_name);
  }

  /* Move back to the root of the graft. */
  while (graft && isl_schedule_node_has_parent(graft))
    graft = isl_schedule_node_parent(graft);

  if (before)
  {
    node = isl_schedule_node_graft_before(node, graft);
  }
  else
  {
    node = isl_schedule_node_graft_after(node, graft);
  }

  return node;
}

/* Insert filters that tie the I/O loops of a module to its identifiers.
 *
 * Starting from "node", the I/O marks at levels
 * io_level + n_io_ids - 1 down to io_level are visited in that order.
 * Whenever the parent of such a mark is a band, the next identifier from
 * "io_ids" is consumed and a filter is inserted that equates the band
 * schedule to that identifier.  For the last identifier
 * (index n_io_ids - 1) of a filter module ("is_filter" set), the schedule
 * is instead required to be greater than or equal to the identifier.
 *
 * If the module is connected to the PEs ("to_pe" set) and io_level > 1,
 * the tree is descended to the child of the level-2 I/O mark; if that is a
 * band, a filter restricting it to its lower bound ("read" set) or upper
 * bound is inserted and the child of the filter is returned.
 *
 * The node should point to the "array" mark.
 */
static __isl_give isl_schedule_node *add_io_ids_filter(
  __isl_take isl_schedule_node *node, 
  __isl_keep isl_id_list *io_ids,  
  int io_level, int n_io_ids, int is_filter, int to_pe, int read)
{
  isl_union_set *core;
  int next_id = 0;
  int level;

  core = isl_union_set_universe(isl_schedule_node_get_domain(node));

  for (level = io_level + n_io_ids - 1; level >= io_level; level--) {
    isl_id_list *ids;
    isl_union_set *filter;
    int is_last;

    node = autosa_tree_move_down_to_io_mark(node, core, level);
    node = isl_schedule_node_parent(node);
    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
      continue;

    ids = isl_id_list_from_id(isl_id_list_get_id(io_ids, next_id));
    is_last = (next_id == n_io_ids - 1);
    /* Only the last identifier of a filter module is a lower bound;
     * every other identifier must match exactly. */
    if (is_last && is_filter)
      filter = set_schedule_ge(node, ids);
    else
      filter = set_schedule_eq(node, ids);
    next_id++;
    node = isl_schedule_node_insert_filter(node, filter);
    isl_id_list_free(ids);
  }

  if (to_pe && io_level > 1) {
    /* Add filter to only send data to boundary PEs. */
    while (!isl_schedule_node_is_io_mark(node, 2))
      node = isl_schedule_node_child(node, 0);
    node = isl_schedule_node_child(node, 0);

    if (isl_schedule_node_get_type(node) == isl_schedule_node_band) {
      isl_union_set *boundary;

      boundary = read ? schedule_eq_lb(node) : schedule_eq_ub(node);
      node = isl_schedule_node_insert_filter(node, boundary);
      node = isl_schedule_node_child(node, 0);
    }
  }

  isl_union_set_free(core);

  return node; 
}

/* Print the prefix of an io statement name to "p" in the format of:
 * in/out[_dummy][_reduce].[fifo_name].[cur_data_pack].[next_data_pack]
 *
 * The fifo name is the array name, prefixed with "fifo_" unless the group
 * is a PE group. For an io group, "_<group->nr>" is appended when the
 * local array has more than one io group; for a drain group, "_drain" is
 * appended. cur_data_pack is group->n_lane and next_data_pack is always 1.
 */
static __isl_give isl_printer *print_io_stmt_prefix(
  __isl_take isl_printer *p,
  int read, int dummy, int reduce,
  struct autosa_array_ref_group *group)
{
  const char *direction = read ? "in" : "out";

  /* io_type */
  p = isl_printer_print_str(p, direction);
  if (dummy)
    p = isl_printer_print_str(p, "_dummy");
  if (reduce)
    p = isl_printer_print_str(p, "_reduce");

  /* fifo_name */
  p = isl_printer_print_str(p, ".");
  if (!(group->group_type == AUTOSA_PE_GROUP))
    p = isl_printer_print_str(p, "fifo_");
  p = isl_printer_print_str(p, group->array->name);

  if (group->group_type == AUTOSA_DRAIN_GROUP) {
    p = isl_printer_print_str(p, "_");
    p = isl_printer_print_str(p, "drain");
  } else if (group->group_type == AUTOSA_IO_GROUP &&
             group->local_array->n_io_group > 1) {
    /* Disambiguate between several io groups of the same array. */
    p = isl_printer_print_str(p, "_");
    p = isl_printer_print_int(p, group->nr);
  }

  /* cur_data_pack */
  p = isl_printer_print_str(p, ".");
  p = isl_printer_print_int(p, group->n_lane);

  /* next_data_pack */
  p = isl_printer_print_str(p, ".1");

  return p;
}

/* Print the leading part of an io transfer statement name to "p".
 * The full statement name has the format of:
 * in/out_trans[_dram]/[_dram_serialize]/[_boundary]/[_reduce_[op]].
 * [in_fifo_name].[out_fifo_name].[is_buffer].[cur_pack_lane].[nxt_pack_lane].
 * [coalesce_depth].[coalesce_bound].[if_branch_depth]
 *
 * This function prints everything up to and including [cur_pack_lane];
 * the remaining fields are not printed here.
 *
 * Both fifo names are "fifo_suffix"; "_local" is appended to the input
 * (resp. output) name when "in_local" (resp. "out_local") is set.
 * "_serialize" is only printed together with "_dram", i.e. when "to_mem"
 * is set. "reduce_op" is only read when "reduce" is set.
 */
static __isl_give isl_printer *print_io_trans_stmt_prefix(
  __isl_take isl_printer *p, 
  int read, int to_mem, int serialize, int boundary, int reduce,
  char *reduce_op,
  int in_local, int out_local,
  int is_buffer,
  char *fifo_suffix, int n_lane) 
{
  int local_flags[2];
  int k;

  /* io_trans_type */
  if (read)
    p = isl_printer_print_str(p, "in_trans");
  else
    p = isl_printer_print_str(p, "out_trans");
  if (to_mem)
    p = isl_printer_print_str(p, "_dram");
  if (to_mem && serialize)
    p = isl_printer_print_str(p, "_serialize");
  if (boundary)
    p = isl_printer_print_str(p, "_boundary");
  if (reduce) {
    p = isl_printer_print_str(p, "_reduce_");
    p = isl_printer_print_str(p, reduce_op);
  }

  /* in_fifo_name, followed by out_fifo_name */
  local_flags[0] = in_local;
  local_flags[1] = out_local;
  for (k = 0; k < 2; k++) {
    p = isl_printer_print_str(p, ".");
    p = isl_printer_print_str(p, fifo_suffix);
    if (local_flags[k])
      p = isl_printer_print_str(p, "_local");
  }

  /* is_buffer */
  p = isl_printer_print_str(p, is_buffer ? ".1" : ".0");

  /* cur_pack_lane */
  p = isl_printer_print_str(p, ".");
  p = isl_printer_print_int(p, n_lane);

  return p;
}

/* Append ".[coalesce_depth].[coalesce_bound]" to the statement name in "p"
 * and store the coalesce bound in "*coalesce_bound".
 *
 * The coalesce depth is the schedule depth at "node" plus the number of
 * tile dimensions of "buf" minus one. It is printed as -1 when the
 * coalesce bound is not greater than one.
 *
 * If host serialization is enabled on the buffer, the bound covers the
 * entire buffer (product of all tile sizes); otherwise only the last tile
 * dimension is considered. The element count is divided by "n_lane"
 * (and additionally by buf->vec_len for a sparse, serialized buffer).
 */
static __isl_give isl_printer *print_trans_stmt_coalesce(
    __isl_take isl_printer *p,
    __isl_keep isl_schedule_node *node,
    struct autosa_io_buffer *buf,
    int *coalesce_bound,
    int n_lane
    ) 
{
  int last_dim = buf->tile->n - 1;
  int depth = isl_schedule_node_get_schedule_depth(node) + last_dim;
  int bound;

  if (!buf->serialize) {
    /* The size is only borrowed from the tile, so it is not freed here. */
    bound = isl_val_get_num_si(buf->tile->bound[last_dim].size) / n_lane;
  } else {
    isl_val *n_elem = isl_val_copy(buf->tile->bound[last_dim].size);
    int d;

    for (d = 0; d < last_dim; d++)
      n_elem = isl_val_mul(isl_val_copy(buf->tile->bound[d].size), n_elem);

    if (buf->sparse)
      bound = isl_val_get_num_si(n_elem) / (n_lane * buf->vec_len);
    else
      bound = isl_val_get_num_si(n_elem) / n_lane;
    isl_val_free(n_elem);
  }
  *coalesce_bound = bound;

  /* No coalescing loop is needed for a bound of at most one. */
  if (bound <= 1)
    depth = -1;

  p = isl_printer_print_str(p, ".");
  p = isl_printer_print_int(p, depth);
  p = isl_printer_print_str(p, ".");
  p = isl_printer_print_int(p, bound);

  return p;
}

/* Compute the iteration domain accessed by the io group "group".
 *
 * The access relation of the group is obtained for the requested
 * direction ("read" selects reads, otherwise writes). If the kernel has
 * array partitioning loops (kernel->array_part_w > 0) and the array is an
 * AUTOSA_INT_ARRAY, the local accesses below the array level are removed
 * using the prefix schedule at the "array" mark.
 * The coalesced domain of the resulting relation is returned.
 *
 * "node" is only used to locate the kernel; it is not modified.
 */
static __isl_give isl_union_set *compute_io_group_access_domain(
  __isl_keep isl_schedule_node *node,
  struct autosa_array_ref_group *group,
  struct autosa_kernel *kernel,
  int read
){
  isl_schedule_node *cursor;
  isl_union_map *accesses;

  cursor = autosa_tree_move_up_to_kernel(isl_schedule_node_copy(node));
  accesses = autosa_io_group_access_relation(group, kernel, read, !read);    

  if (kernel->array_part_w > 0) {
    isl_union_map *outer;

    /* Remove the local accesses below the array level. */
    cursor = autosa_tree_move_down_to_array(cursor, kernel->core);
    outer = isl_schedule_node_get_prefix_schedule_relation(cursor);
    outer = isl_union_map_preimage_domain_union_pw_multi_aff(
                outer, isl_union_pw_multi_aff_copy(kernel->contraction));
    if (group->local_array->array_type == AUTOSA_INT_ARRAY) {
      accesses = remove_local_accesses_group_flow(kernel, group, accesses,
                                                  outer, read);
    }
    isl_union_map_free(outer);
  }
  isl_schedule_node_free(cursor);

  return isl_union_set_coalesce(isl_union_map_domain(accesses));
}

/* Insert a filter at "node" that keeps only the iteration domain accessed
 * by the io group "group" in the direction selected by "read".
 * The returned node points to the inserted filter.
 */
static __isl_give isl_schedule_node *insert_io_group_access_domain(
  __isl_take isl_schedule_node *node, 
  struct autosa_array_ref_group *group,
  struct autosa_kernel *kernel,
  int read)
{
  isl_union_set *accessed;

  accessed = compute_io_group_access_domain(node, group, kernel, read);

  return isl_schedule_node_insert_filter(node, accessed);
}

/* Compute the iteration domain accessed by an io group and/or its attached
 * drain group (used when local reduction is enabled).
 *
 * If "io_group" is set, the accesses of "group" itself are included; for
 * an AUTOSA_INT_ARRAY the local accesses below the array level are removed
 * using the prefix schedule at the "array" mark.
 * If "drain_group" is set, the accesses of group->attached_drain_group are
 * included as well (the caller must make sure it is not NULL).
 * The coalesced domain of the union of these accesses is returned.
 *
 * "node" is only used to locate the kernel; it is not modified.
 */
static __isl_give isl_union_set *compute_io_group_access_domain_local_reduce(
  __isl_keep isl_schedule_node *node,
  struct autosa_array_ref_group *group,
  struct autosa_kernel *kernel,
  int read, int io_group, int drain_group)
{
  isl_schedule_node *cursor = isl_schedule_node_copy(node);
  isl_union_map *accesses;

  accesses = isl_union_map_empty(isl_map_get_space(group->access));

  if (io_group) {
    isl_union_map *outer;

    accesses = isl_union_map_union(
        accesses,
        autosa_io_group_access_relation(group, kernel, read, !read));

    /* Remove the local accesses below the array level. */
    cursor = autosa_tree_move_up_to_kernel(cursor);
    cursor = autosa_tree_move_down_to_array(cursor, kernel->core);
    outer = isl_schedule_node_get_prefix_schedule_relation(cursor);
    outer = isl_union_map_preimage_domain_union_pw_multi_aff(
                outer, isl_union_pw_multi_aff_copy(kernel->contraction));
    if (group->local_array->array_type == AUTOSA_INT_ARRAY) {
      accesses = remove_local_accesses_group_flow(kernel, group, accesses,
                                                  outer, read);
    }
    isl_union_map_free(outer);
  }

  if (drain_group) {
    accesses = isl_union_map_union(
        accesses,
        autosa_io_group_access_relation(group->attached_drain_group, kernel,
                                        read, !read));
  }

  isl_schedule_node_free(cursor);

  return isl_union_set_coalesce(isl_union_map_domain(accesses));
}

/* Insert a filter at "node" that keeps only the iteration domain accessed
 * under local reduction.
 * If "io_group" is one, the domain of the io group itself is considered.
 * If "drain_group" is one, the domain of the attached drain group is
 * considered.
 * The returned node points to the inserted filter.
 */
static __isl_give isl_schedule_node *insert_io_group_access_domain_local_reduce(
  __isl_take isl_schedule_node *node, 
  struct autosa_array_ref_group *group,
  struct autosa_kernel *kernel,
  int read, int io_group, int drain_group)
{
  isl_union_set *accessed;

  accessed = compute_io_group_access_domain_local_reduce(
                 node, group, kernel, read, io_group, drain_group);

  return isl_schedule_node_insert_filter(node, accessed);
}

/* Insert a filter node restricting the schedule to the valid access domain
 * of the io group "group".
 *
 * "node" should point to the "kernel" mark. The filter is inserted at the
 * first child of the kernel mark (the context node) and the node is moved
 * back up, so the returned node points to the "kernel" mark again.
 *
 * When local reduction is enabled and the group has an attached drain
 * group, only the drain group domain is used for the filter.
 */
__isl_give isl_schedule_node *insert_io_group_domain(
  __isl_take isl_schedule_node *node, 
  struct autosa_array_ref_group *group,
  struct autosa_kernel *kernel,
  struct autosa_gen *gen,
  int read)
{
  int use_local_reduce = gen->options->autosa->local_reduce &&
                         group->attached_drain_group;

  node = isl_schedule_node_child(node, 0); // context
  if (!use_local_reduce) {
    node = insert_io_group_access_domain(node, group, kernel, read);
  } else {
    node = insert_io_group_access_domain_local_reduce(node, group, kernel,
                                                      read, 0, 1);
  }

  return autosa_tree_move_up_to_kernel(node);
}

/* Compute the iteration domain accessed by the io group "group" without
 * modifying the schedule tree.
 *
 * A copy of "node" is moved down to the kernel mark and used for the
 * computation. When local reduction is enabled and the group has an
 * attached drain group, both the io group and the drain group domains
 * are included.
 */
static __isl_give isl_union_set *compute_io_group_domain(
  __isl_keep isl_schedule_node *node, 
  struct autosa_array_ref_group *group,
  struct autosa_kernel *kernel,
  struct autosa_gen *gen,
  int read)
{
  isl_schedule_node *kernel_node;
  isl_union_set *accessed;

  kernel_node = autosa_tree_move_down_to_kernel(isl_schedule_node_copy(node));

  if (gen->options->autosa->local_reduce && group->attached_drain_group) {
    accessed = compute_io_group_access_domain_local_reduce(
                   kernel_node, group, kernel, read, 1, 1);
  } else {
    accessed = compute_io_group_access_domain(kernel_node, group, kernel, read);
  }

  isl_schedule_node_free(kernel_node);

  return accessed;
}

/* Compute the minimal group domain to filter the elements at the io_level
 * "level".
 *
 * Working on a copy of "node" moved down to the io mark of "level":
 *  1. the prefix schedule at the io mark is recorded,
 *  2. "group_domain" is inserted as a filter and the prefix schedule below
 *     that filter is taken; its range gives the schedule points used by
 *     the group,
 *  3. these schedule points are mapped back through the reverse of the
 *     unfiltered prefix schedule to obtain all domain elements scheduled
 *     at those points.
 *
 * Neither "group_domain" nor "node" is modified.
 */
static __isl_give isl_union_set *compute_io_group_domain_at_level(
  __isl_keep isl_union_set *group_domain,
  __isl_keep isl_schedule_node *node,
  struct autosa_array_ref_group *group,
  struct autosa_kernel *kernel,
  int level
){
  isl_schedule_node *cursor;
  isl_union_map *full_sched, *group_sched;
  isl_union_set *used_points;

  cursor = isl_schedule_node_copy(node);
  cursor = autosa_tree_move_down_to_io_mark(cursor, kernel->core, level);
  full_sched = isl_schedule_node_get_prefix_schedule_relation(cursor);

  /* Prefix schedule restricted to the group domain. */
  cursor = isl_schedule_node_insert_filter(cursor, isl_union_set_copy(group_domain));
  cursor = isl_schedule_node_child(cursor, 0);
  group_sched = isl_schedule_node_get_prefix_schedule_relation(cursor);
  isl_schedule_node_free(cursor);

  used_points = isl_union_map_range(group_sched);

  return isl_union_set_apply(used_points, isl_union_map_reverse(full_sched));
}

/* Extend the group domain so that the domain sets include elements that are
 * lexicographically less or equal to the IO band at the io_level "level".
 *
 * The group domain is mapped to schedule space using the prefix schedule
 * at the io mark of "level". Every schedule point is then extended to all
 * points of the full schedule range that agree on all but the last
 * dimension and are not greater in the last one. The extended set is
 * finally mapped back to the iteration domain.
 *
 * "group_domain" is consumed; "node" is not modified.
 */
static __isl_give isl_union_set *extend_io_group_domain(
  __isl_take isl_union_set *group_domain,
  __isl_keep isl_schedule_node *node,
  struct autosa_array_ref_group *group,
  struct autosa_kernel *kernel,
  int level
){
  isl_schedule_node *io_node;
  isl_union_map *sched;
  isl_set *full_range, *reached;
  isl_map *ge;

  /* Prefix schedule at the io mark and the full range it covers. */
  io_node = autosa_tree_move_down_to_io_mark(isl_schedule_node_copy(node),
                                             kernel->core, level);  
  sched = isl_schedule_node_get_prefix_schedule_relation(io_node);
  isl_schedule_node_free(io_node);
  full_range = isl_set_from_union_set(
                   isl_union_map_range(isl_union_map_copy(sched)));

  /* Schedule points touched by the group. */
  reached = isl_set_from_union_set(
                isl_union_set_apply(group_domain, isl_union_map_copy(sched)));

  ge = isl_map_lex_ge(isl_set_get_space(reached));
  /* Set the dimensions except the last one as equal */
  for (int d = 0; d < isl_set_dim(reached, isl_dim_set) - 1; d++)
    ge = isl_map_equate(ge, isl_dim_in, d, isl_dim_out, d);
  /* Stay within the schedule points that actually exist. */
  ge = isl_map_intersect_domain(ge, isl_set_copy(full_range));
  ge = isl_map_intersect_range(ge, full_range);

  reached = isl_set_coalesce(isl_set_apply(reached, ge));

  /* Back to the iteration domain. */
  return isl_union_set_apply(isl_union_set_from_set(reached),
                             isl_union_map_reverse(sched));
} 

/* Complete the io statement name in "p" with ".[nxt_data_pack]" and insert
 * the corresponding access-based copy statements at "node" through
 * add_io_copies_stmt_acc().
 *
 * The printer "p" is consumed. The statement is grafted before the node
 * for reads and after it for writes. An HLS dependence mark is requested
 * only for buffered writes whose lane count differs from "nxt_data_pack"
 * and when the insert_hls_dependence option is set.
 *
 * NOTE(review): the statement name string is handed to
 * add_io_copies_stmt_acc() and not freed here; presumably the callee
 * releases it -- confirm.
 */
static __isl_give isl_schedule_node *insert_io_stmts_acc(
  __isl_take isl_schedule_node *node,
  int nxt_data_pack,
  __isl_take isl_printer *p,
  struct autosa_kernel *kernel, 
  struct autosa_array_ref_group *group,
  struct autosa_io_buffer *buf, /* Local buffer */
  int read, int is_buffer, 
  struct autosa_hw_module *module,
  int module_type
)
{
  char *name;
  int need_hls_dep;
  int graft_before = read ? 1 : 0;

  p = isl_printer_print_str(p, ".");
  p = isl_printer_print_int(p, nxt_data_pack);
  name = isl_printer_get_str(p);
  isl_printer_free(p);

  need_hls_dep = is_buffer && !read && 
                 buf->n_lane != nxt_data_pack && 
                 kernel->options->autosa->insert_hls_dependence;

  return add_io_copies_stmt_acc(kernel, group, node,
                                buf->tile, nxt_data_pack, read, name,
                                graft_before, need_hls_dep, module,
                                module_type);
}

/* Complete the io transfer statement name in "p" with
 * ".[nxt_data_pack].[coalesce_depth].[coalesce_bound][.if_depth]" and
 * insert the corresponding tile-based copy statements at "node" through
 * add_io_copies_stmt_tile().
 *
 * The coalesce information is derived from "copy_buffer" and the computed
 * bound is also recorded in module->coalesce_bound. The if-branch depth is
 * only printed when "if_depth" is not -1.
 * The printer "p" is consumed. The statement is grafted before the node
 * for reads and after it for writes. An HLS dependence mark is requested
 * only when the coalesce bound exceeds one, the lane count of
 * "copy_buffer" differs from "nxt_data_pack" and the
 * insert_hls_dependence option is set.
 *
 * If "cut" is set, the subtree below the resulting node is removed and an
 * empty filter is inserted in its place.
 *
 * NOTE(review): the statement name string is handed to
 * add_io_copies_stmt_tile() and not freed here; presumably the callee
 * releases it -- confirm.
 */
static __isl_give isl_schedule_node *insert_io_stmts_tile(
    __isl_take isl_schedule_node *node,    
    int nxt_data_pack,
    __isl_take isl_printer *p,
    struct autosa_kernel *kernel, 
    struct autosa_array_ref_group *group,
    struct autosa_io_buffer *local_buffer,      /* local buffer */
    struct autosa_io_buffer *copy_buffer,       /* buffer to be transferred */
    int read, int is_buffer,
    struct autosa_hw_module *module,
    int cut, /* If to cut the sub tree */
    int module_type,
    int if_depth /* If branch sched depth */
)
{
  char *name;
  int bound;
  int need_hls_dep;
  int graft_before = read ? 1 : 0;

  p = isl_printer_print_str(p, ".");
  p = isl_printer_print_int(p, nxt_data_pack);  

  p = print_trans_stmt_coalesce(p, node, copy_buffer, &bound, nxt_data_pack);   
  module->coalesce_bound = bound;

  if (if_depth != -1) {
    p = isl_printer_print_str(p, ".");
    p = isl_printer_print_int(p, if_depth);
  }

  name = isl_printer_get_str(p);
  isl_printer_free(p);

  need_hls_dep = bound > 1 && 
                 copy_buffer->n_lane != nxt_data_pack && 
                 kernel->options->autosa->insert_hls_dependence;  

  node = add_io_copies_stmt_tile(kernel, group, node,
                                 local_buffer->tile, copy_buffer->tile, 
                                 nxt_data_pack,
                                 read, name, graft_before,
                                 /* Always evaluates to 0: the is_buffer
                                  * argument is masked out on purpose. */
                                 is_buffer & 0,
                                 need_hls_dep,
                                 module->is_serialized,
                                 module, module_type, copy_buffer->tuning_tile);

  if (!cut)
    return node;

  node = isl_schedule_node_cut(node);
  /* Insert empty filter. */
  node = isl_schedule_node_insert_filter(
             node,
             isl_union_set_from_set(isl_set_empty(
                 isl_set_get_space(kernel->context))));

  return node;
}

/* Insert the io transfer statements of a (filter) io module into the
 * schedule tree. "node" should point to a position from which
 * autosa_tree_move_down_to_io_mark() can reach the io marks of the group
 * (the callers pass the "kernel" mark).
 *
 * Three cases are handled:
 *  - "io_id_level" < 0: the module is the highest-level one and no filter
 *    is needed. For a non-boundary module nothing is generated: the tree
 *    is freed and NULL is returned. For a boundary module a single
 *    statement is inserted below the io mark of buf->level.
 *  - the lower_if_branch option is set: no sequence is built; the schedule
 *    depth at the io mark of "io_level" (minus one) is recorded as the
 *    if-branch depth and encoded into the single inserted statement.
 *  - otherwise: the subtree below the io mark of "io_level" is split by
 *    isl_schedule_node_order_before() on the filter "io loop == module
 *    id". One branch holds the statement that passes on the data that
 *    does not belong to this module (or an empty filter for a boundary
 *    module), the other branch holds the statement that keeps the data
 *    that belongs to this module.
 *
 * "io_ids" and "group_core" are only read. The statement kind (access
 * based or tile based) depends on whether "buf" has a tile.
 */
static __isl_give isl_schedule_node *insert_filter_trans_stmts(
  __isl_take isl_schedule_node *node,
  isl_id_list *io_ids,
  int io_id_level,
  int io_level,
  int read,
  struct autosa_io_buffer *buf,
  struct autosa_hw_module *module,
  struct autosa_kernel *kernel,
  struct autosa_gen *gen,
  int boundary, int is_lower,
  int is_buffer,
  char *fifo_suffix,
  struct autosa_array_ref_group *group,
  __isl_keep isl_union_set *group_core,
  int module_type
)
{
  isl_id_list *ids;
  isl_union_set *eq_filter, *neq_filter;
  isl_ctx *ctx;
  isl_printer *p;
  int upper_io_level;
  int lower_if = gen->options->autosa->lower_if_branch;
  /* -1 means "no if-branch depth"; it is then not printed in the name. */
  int if_depth = -1;
  
  ctx = isl_schedule_node_get_ctx(node);
  if (io_id_level < 0) {
    /* This is the highest-level module that also connects to the DRAM.
     * Filter node is not required, since all data belongs to this module.
     */
    if (boundary == 0) {
      /* Nothing to generate; the NULL result is checked by the caller. */
      return isl_schedule_node_free(node);
    } else {
      node = autosa_tree_move_down_to_io_mark(node, group_core, buf->level);
      node = isl_schedule_node_child(node, 0);
      goto INSERT_STMT;
    }
  }

  if (lower_if) {
    /* Lower the if branch inside the user statement. */
    node = autosa_tree_move_down_to_io_mark(node, group_core, io_level);
    if_depth = isl_schedule_node_get_schedule_depth(node) -  1;

    node = autosa_tree_move_down_to_io_mark(node, group_core, buf->level);
    node = isl_schedule_node_child(node, 0);
    goto INSERT_STMT;
  }

  /* Build the filter "io loop == module id" from the node right above the
   * io mark of "io_level".
   */
  node = autosa_tree_move_down_to_io_mark(node, group_core, io_level);
  node = isl_schedule_node_parent(node);
  ids = isl_id_list_from_id(isl_id_list_get_id(io_ids, io_id_level));
  eq_filter = set_schedule_eq(node, ids);  
  isl_id_list_free(ids);
  
  upper_io_level = io_level + 1;
  /* Split the subtree below the io mark into two branches. */
  node = autosa_tree_move_down_to_io_mark(node, group_core, io_level);  
  node = isl_schedule_node_child(node, 0);  
  node = isl_schedule_node_order_before(node, eq_filter); // point to the second tree.    

  /* Pass the data not filtered */  
  if (boundary) {
    /* A boundary module has nobody to pass data to: drop this branch. */
    isl_union_set *empty_filter = isl_union_set_from_set(isl_set_empty(isl_set_get_space(kernel->context)));
    node = isl_schedule_node_cut(node);
    node = isl_schedule_node_insert_filter(node, empty_filter);
  } else {
    if (io_level != buf->level) {
      node = autosa_tree_move_down_to_io_mark(node, group_core, buf->level);
      node = isl_schedule_node_child(node, 0);
    }
    /* Pass-through statement: no "_local" fifo and is_buffer printed as 0. */
    p = isl_printer_to_str(ctx);
    p = print_io_trans_stmt_prefix(
          p, read, module->to_mem, gen->options->autosa->host_serialize, boundary, 0, NULL,
          0, 0, 0, fifo_suffix, buf->n_lane);    
    if (!buf->tile) {
      node = insert_io_stmts_acc(node, buf->n_lane, p, kernel, group, buf, read, is_buffer, module, module_type);   
    } else {
      node = insert_io_stmts_tile(node, buf->n_lane, p, kernel, group, buf, buf, read, is_buffer, module, 1, module_type, -1);
    }
  }

  /* Keep the data filtered */
  node = autosa_tree_move_up_to_kernel(node);  
  node = autosa_tree_move_down_to_io_mark(node, group_core, io_level);
  node = isl_schedule_node_child(node, 0); // sequence
  node = isl_schedule_node_child(node, 0); // filter  
  node = isl_schedule_node_child(node, 0); // filter  

  if (io_level != buf->level) {
    node = autosa_tree_move_down_to_io_mark(node, group_core, buf->level);
    node = isl_schedule_node_child(node, 0);
  }  

INSERT_STMT:    
  /* Statement for the data of this module. With "is_lower" set, the
   * "_local" fifo is the output side for reads and the input side for
   * writes.
   */
  p = isl_printer_to_str(ctx);
  p = print_io_trans_stmt_prefix(
        p, read, module->to_mem, gen->options->autosa->host_serialize, boundary, 0, NULL,
        !read && is_lower ? 1 : 0, read && is_lower? 1 : 0, is_buffer, fifo_suffix, buf->n_lane);

  if (!buf->tile)  {
    node = insert_io_stmts_acc(node, buf->n_lane, p, kernel, group, buf, read, is_buffer, module, module_type);   
  } else {
    node = insert_io_stmts_tile(node, buf->n_lane, p, kernel, group, buf, buf, read, is_buffer, module, 1, module_type, if_depth);
  }  

  return node;
}

/* Return the schedule depth at which the io module marks are placed when
 * local reduction is enabled.
 *
 * The node points to the "kernel" mark and is consumed. It is first moved
 * down to the "array" mark. If the kernel has array partitioning loops
 * (kernel->array_part_w > 0), the band above the "array" mark is searched
 * from its last member backwards for a coincident member; the band is
 * split right after that member (unless it is already the last one) and
 * the depth below the kept part is returned. Otherwise the depth at the
 * "array" mark is returned.
 *
 * The split is only performed on the node passed in and is discarded with
 * it; only the depth is reported.
 */
static int get_local_reduce_sched_depth(
  __isl_take isl_schedule_node *node,
  struct autosa_kernel *kernel)
{
  int depth;

  node = autosa_tree_move_down_to_array(node, kernel->core);

  if (kernel->array_part_w > 0) {
    int n_member;
    int last_coincident;

    node = isl_schedule_node_parent(node);
    n_member = isl_schedule_node_band_n_member(node);

    last_coincident = n_member - 1;
    while (last_coincident >= 0 &&
           !isl_schedule_node_band_member_get_coincident(node, last_coincident))
      last_coincident--;

    /* Separate the trailing non-coincident members, if any. */
    if (last_coincident != n_member - 1)
      node = isl_schedule_node_band_split(node, last_coincident + 1);
    node = isl_schedule_node_child(node, 0);
  }

  depth = isl_schedule_node_get_schedule_depth(node);
  isl_schedule_node_free(node);

  return depth;
}

/* Generate the inter_trans module for the I/O group.
 * We will add data transfer statements into the schedule tree, 
 * filters that restrain the space loops to the current module,
 * and add the module and function type mark above the tree.
 *
 * "sched" is duplicated and left untouched; the returned schedule is a
 * new object. NULL is returned when no inter_trans module is needed:
 * either "io_level" exceeds "space_dim" for a non-boundary module, or
 * insert_filter_trans_stmts() produced no tree.
 * As a side effect, module->data_pack_inter is set to the lane count of
 * the selected buffer, and "module" is attached as user pointer to the
 * inserted "module" mark.
 */
static __isl_give isl_schedule *generate_io_module_inter_trans(
  __isl_keep isl_schedule *sched, struct autosa_hw_module *module,
  struct autosa_array_ref_group *group,
  struct autosa_kernel *kernel, struct autosa_gen *gen,
  int io_level, int space_dim, int read, int boundary)
{
  isl_schedule *new_sched;
  isl_ctx *ctx;
  isl_printer *p;  
  int n_io_ids;
  isl_id_list *io_ids;
  isl_id *id;
  char *fifo_suffix, *buf_suffix;
  isl_union_set *empty_filter = NULL;  
  char *stmt_name;
  struct autosa_io_buffer *buf = NULL;  
  isl_schedule_node *node;
  int upper_io_level = io_level + 1;
  int is_filter = 1;
  int is_buffer = 1;
  int i;
  isl_union_set *group_core = NULL;

  if (io_level > space_dim && boundary == 0) {
    return NULL;
  }

  /* Work on a private copy of the schedule; only the root node of the
   * copy is kept.
   */
  new_sched = isl_schedule_dup(sched);
  //DBGSCHD(stdout, new_sched, gen->ctx);
  node = isl_schedule_get_root(new_sched);
  isl_schedule_free(new_sched);
  ctx = isl_schedule_node_get_ctx(node);
  
  /* Compute the union of domains of all the array references in the group. */
  node = autosa_tree_move_down_to_kernel(node);
  node = isl_schedule_node_child(node, 0); // context
  node = isl_schedule_node_child(node, 0);
  if (gen->options->autosa->local_reduce && group->attached_drain_group)
    node = insert_io_group_access_domain_local_reduce(node, group, kernel, read, 0, 1);
  else
    node = insert_io_group_access_domain(node, group, kernel, read);
  /* "group_core" is the universe of the domain remaining below the
   * inserted filter; it is used to navigate to the io marks later on.
   */
  node = isl_schedule_node_child(node, 0);
  group_core = isl_union_set_universe(isl_schedule_node_get_domain(node));
  node = autosa_tree_move_up_to_kernel(node);
  
  /* Add the filters. */
  n_io_ids = space_dim - io_level + 1;
  io_ids = ppcg_scop_generate_names(gen->prog->scop, n_io_ids, "p");
  n_io_ids = 0;  
  node = autosa_tree_move_down_to_array(node, kernel->core);
  node = add_io_ids_filter(node, io_ids, io_level, space_dim - io_level + 1, is_filter, 0, read);
  node = autosa_tree_move_up_to_kernel(node);
  //DBGSCHDNODE(stdout, node, ctx);

  /* Locate the buffer. */
  /* Search from "io_level" downwards for the first level whose buffer has
   * a tile. If none has one, "buf" ends up as the level-1 buffer and "i"
   * as 0.
   */
  for (i = io_level; i >= 1; i--)
  {
    buf = group->io_buffers[i - 1];
    if (buf->tile != NULL)
      break;
  }
  if (is_buffer)
  {
    if (i != io_level)
    {
      /* IO buffer is optimized out. */
      is_buffer = 0;
    }
  }

  if (buf->tile && buf->hoist_depth != -1) {
    /* This buffer has been hoisted. */    
    /* Additionally restrict the tree to the hoist domain and recompute
     * "group_core" for the restricted domain.
     */
    node = isl_schedule_node_child(node, 0); // context
    node = isl_schedule_node_child(node, 0); // last inserted filter
    node = isl_schedule_node_child(node, 0);
    node = isl_schedule_node_insert_filter(node, isl_union_set_copy(buf->hoist_domain));
    node = isl_schedule_node_child(node, 0);
    isl_union_set_free(group_core);
    group_core = isl_union_set_universe(isl_schedule_node_get_domain(node));    
    node = autosa_tree_move_up_to_kernel(node);
  }
  
  init_suffix(module, group, &fifo_suffix, &buf_suffix);
  /* The module id used for filtering is the one at position
   * "space_dim" - "io_level" in "io_ids"; it is negative for the
   * highest-level module.
   */
  node = insert_filter_trans_stmts(node, io_ids, space_dim - io_level, io_level, read,
      buf, module, kernel, gen, boundary, 0, is_buffer, fifo_suffix, group, group_core, 2);

  free(fifo_suffix);
  free(buf_suffix);      
  isl_id_list_free(io_ids);
  if (!node) {
    /* No transfer statements were generated for this module. */
    isl_union_set_free(group_core);
    return NULL;  
  }

  module->data_pack_inter = buf->n_lane;
  /* Insert the "io_module.inter_trans" function mark. */
  node = autosa_tree_move_up_to_kernel(node);  
  if (gen->options->autosa->local_reduce && group->attached_drain_group) {
    /* The depth is computed on a copy of the node, so the band split
     * performed inside the helper does not affect this tree.
     */
    node = autosa_tree_move_down_to_depth(
              node, 
              get_local_reduce_sched_depth(isl_schedule_node_copy(node), kernel), 
              kernel->core);    
  } else {
    if (io_level > space_dim) {
      node = autosa_tree_move_down_to_array(node, kernel->core);
      node = isl_schedule_node_child(node, 0);  
    } else {      
      node = autosa_tree_move_down_to_io_mark(node, group_core, io_level);
      node = isl_schedule_node_parent(node);
      node = isl_schedule_node_parent(node);
    }    
  }
  
  if (gen->options->target == AUTOSA_TARGET_CATAPULT_HLS_C) {
    /* For the Catapult HLS target a "synth" mark is inserted at this
     * position, and the function mark below is then placed right under
     * the kernel mark instead.
     */
    id = isl_id_alloc(ctx, "synth", NULL);
    node = isl_schedule_node_insert_mark(node, id);
    node = autosa_tree_move_up_to_kernel(node);
    node = isl_schedule_node_child(node, 0);
  }
  
  id = isl_id_alloc(ctx, "io_module.inter_trans", NULL);
  node = isl_schedule_node_insert_mark(node, id);

  /* Add the module mark. */
  id = isl_id_alloc(ctx, "module", module);
  node = autosa_tree_move_up_to_kernel(node);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_mark(node, id);

  new_sched = isl_schedule_node_get_schedule(node);
  isl_schedule_node_free(node);  
  isl_union_set_free(group_core);

  return new_sched;
}

/* Insert a guard on the module identifiers into the schedule tree.
 *
 * The "node" points to the kernel mark and is returned at the kernel mark.
 * This function should be called before inserting module ids into the
 * schedule.
 *
 * The guard is derived on a copy of the tree: the io loops of levels
 * 1.."n_io_ids" are set equal to freshly generated identifiers (prefix
 * "p"), and the parameter constraints of the resulting domain form the
 * guard. It is inserted three levels below the kernel mark (below the
 * context and the filter node).
 */
static __isl_give isl_schedule_node *insert_io_group_guard(
  __isl_take isl_schedule_node *node, 
  struct autosa_gen *gen,
  struct autosa_kernel *kernel,
  int n_io_ids)
{
  isl_schedule_node *probe;
  isl_id_list *names;
  isl_set *guard;
  int hop;

  /* Derive the guard on a scratch copy of the tree. */
  probe = isl_schedule_node_copy(node);
  names = ppcg_scop_generate_names(gen->prog->scop, n_io_ids, "p");
  probe = add_io_ids_filter(probe, names, 1, n_io_ids, 0, 0, 0);  
  guard = isl_set_from_params(
              isl_union_set_params(isl_schedule_node_get_domain(probe)));
  isl_schedule_node_free(probe);
  isl_id_list_free(names);

  /* Step over the context and the filter to the insertion point. */
  for (hop = 0; hop < 3; hop++)
    node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_guard(node, guard);

  return autosa_tree_move_up_to_kernel(node);
}

/* Compute the guard on the module identifiers without modifying the tree.
 *
 * On a copy of "node", the io loops of levels 1.."n_io_ids" are set equal
 * to freshly generated identifiers (prefix "p"). The parameter constraints
 * of the resulting domain are returned as a set.
 */
static __isl_give isl_set *get_io_group_guard(
  __isl_keep isl_schedule_node *node,
  struct autosa_gen *gen,
  struct autosa_kernel *kernel,
  int n_io_ids)
{
  isl_schedule_node *probe;
  isl_id_list *names;
  isl_set *params;

  probe = isl_schedule_node_copy(node);
  names = ppcg_scop_generate_names(gen->prog->scop, n_io_ids, "p");  
  probe = add_io_ids_filter(probe, names, 1, n_io_ids, 0, 0, 0);  
  isl_id_list_free(names);

  params = isl_union_set_params(isl_schedule_node_get_domain(probe));
  isl_schedule_node_free(probe);

  return isl_set_from_params(params);
}

/* Generate the intra_trans module for the I/O group.
 * We will add data transfer statements into the schedule tree that 
 * transfer data to/from the lower-level modules,
 * filters that restrain the space loops to the current module,
 * and add the module and function type mark above the tree.
 *
 * "sched" is only duplicated, never modified; the returned schedule is
 * extracted from the transformed copy.
 * "io_level" and "space_dim" determine how many "p" identifiers are
 * generated (space_dim - io_level + 1) and where the filters are placed.
 * "read" selects the transfer direction that is forwarded to the helpers.
 * "is_buffer" is cleared locally when the closest buffer with a tile found
 * at or below "io_level" does not sit at "io_level" itself.
 * As a side effect, module->data_pack_intra is set.
 */
static __isl_give isl_schedule *generate_io_module_intra_trans(
  __isl_keep isl_schedule *sched, struct autosa_hw_module *module,
  struct autosa_array_ref_group *group,
  struct autosa_kernel *kernel, struct autosa_gen *gen,
  int io_level, int space_dim, int read, int is_buffer)
{
  isl_ctx *ctx;
  isl_printer *p;  
  int n_io_ids;
  isl_id_list *io_ids;  
  isl_id *id;    
  char *fifo_suffix, *buf_suffix;
  /* NOTE(review): empty_filter, stmt_name, guard, node_tmp, group_domain and
   * upper_io_level below are never read in this function.
   */
  isl_union_set *empty_filter = NULL;    
  char *stmt_name;
  struct autosa_io_buffer *buf = NULL;    
  isl_schedule *new_sched;
  isl_schedule_node *node;  
  int i;
  isl_set *guard;
  isl_schedule_node *node_tmp;
  isl_union_set *group_core = NULL;
  isl_union_set *group_domain;

  /* Work on a private copy of the schedule, positioned at the kernel mark. */
  new_sched = isl_schedule_dup(sched);
  node = isl_schedule_get_root(new_sched);  
  node = autosa_tree_move_down_to_kernel(node);
  isl_schedule_free(new_sched);
  ctx = isl_schedule_node_get_ctx(node);
  n_io_ids = space_dim - io_level + 1;
  io_ids = ppcg_scop_generate_names(gen->prog->scop, n_io_ids, "p");  
  int upper_io_level = io_level + 1;

  /* Insert the group domain. */   
  node = isl_schedule_node_child(node, 0); // context
  node = isl_schedule_node_child(node, 0);
  if (gen->options->autosa->local_reduce && group->attached_drain_group)
    node = insert_io_group_access_domain_local_reduce(node, group, kernel, read, 1, 1);
  else
    node = insert_io_group_access_domain(node, group, kernel, read);  
  node = isl_schedule_node_child(node, 0);
  /* Universe of the domain below the inserted node; used further down as the
   * "core" argument when searching for io marks.
   */
  group_core = isl_union_set_universe(isl_schedule_node_get_domain(node)); 
  node = autosa_tree_move_up_to_kernel(node);

  /* Add the filters. */
  node = autosa_tree_move_down_to_array(node, kernel->core);
  node = add_io_ids_filter(node, io_ids, io_level, space_dim - io_level + 1, 0, module->to_pe, read);
  node = autosa_tree_move_up_to_kernel(node);  

  /* Add the data transfer statements. */
  init_suffix(module, group, &fifo_suffix, &buf_suffix);

  /* Locate the current buffer. */
  /* Walk from "io_level" down to level 1 and stop at the first buffer that
   * owns a tile.  If none is found, the loop ends with i == 0 and "buf"
   * pointing at io_buffers[0].
   * NOTE(review): "buf" stays NULL if io_level < 1; it is dereferenced below.
   */
  for (i = io_level; i >= 1; i--)
  {
    buf = group->io_buffers[i - 1];
    if (buf->tile != NULL)
      break;
  }  
  if (is_buffer)
  {
    if (i != io_level)
    {
      /* IO buffer is optimized out. */
      is_buffer = 0;
    }
  }

  /* Insert the extra transfer statement. */
  /* The statement name prefix is accumulated in a string printer that is
   * handed to insert_io_stmts_acc()/insert_io_stmts_tile() below.
   */
  p = isl_printer_to_str(ctx);
  p = print_io_trans_stmt_prefix(p, !read, 0, 0, 0, 
                                 gen->options->autosa->local_reduce && group->attached_drain_group,
                                 gen->options->autosa->reduce_op,
                                 !read, read, is_buffer, fifo_suffix, buf->n_lane);

  /* Locate the next buffer after the current buffer. */
  int cur_level = buf->level;
  struct autosa_io_buffer *cur_buf = buf;
  for (int i = cur_level - 1; i >= 1; i--)
  {
    buf = group->io_buffers[i - 1];
    if (buf->tile != NULL)
      break;
  }

  if (cur_level == 1 || !buf->tile)
  {
    /* No lower-level buffer with a tile: transfer per access,
     * packed by the group's n_lane.
     */
    node = insert_io_stmts_acc(node, group->n_lane, p, kernel, group, cur_buf, read, is_buffer, module, 1);
    module->data_pack_intra = group->n_lane;                                  
  }
  else
  {
    /* Move the schedule node to the level of the next buffer. */
    node = autosa_tree_move_down_to_io_mark(node, group_core, buf->level);
    node = isl_schedule_node_child(node, 0);    
    node = insert_io_stmts_tile(
                node, buf->n_lane, p, kernel, group, 
                cur_buf, buf, !read, is_buffer, module, 1, 1, -1);
    module->data_pack_intra = buf->n_lane;    
  }

  free(fifo_suffix);
  free(buf_suffix);

  /* Insert the function mark. */    
  /* First pick the position at which the mark is inserted. */
  node = autosa_tree_move_up_to_kernel(node);
  
  if (gen->options->autosa->local_reduce && group->attached_drain_group) {
    node = autosa_tree_move_down_to_depth(
              node, 
              get_local_reduce_sched_depth(isl_schedule_node_copy(node), kernel), 
              kernel->core);    
  } else {
    if (io_level > space_dim) {
      node = autosa_tree_move_down_to_array(node, kernel->core);      
      node = isl_schedule_node_child(node, 0);  
    } else {
      if (cur_buf->tile && cur_buf->hoist_depth != -1) {
        /* This buffer has been hoisted. */        
        node = autosa_tree_move_down_to_depth(node, cur_buf->hoist_depth, kernel->core);
      } else {
        node = autosa_tree_move_down_to_io_mark(node, group_core, io_level);
        node = isl_schedule_node_parent(node);
        node = isl_schedule_node_parent(node);
      }
    }    
  }  

  if (gen->options->target == AUTOSA_TARGET_CATAPULT_HLS_C) {
    /* For the Catapult target a "synth" mark is inserted at the chosen
     * position and the cursor is reset to the child of the kernel mark.
     */
    id = isl_id_alloc(ctx, "synth", NULL);
    node = isl_schedule_node_insert_mark(node, id);
    node = autosa_tree_move_up_to_kernel(node);
    node = isl_schedule_node_child(node, 0);
  }

  id = isl_id_alloc(ctx, "io_module.intra_trans", NULL);
  if (kernel->array_part_w == 0 && isl_schedule_node_get_schedule_depth(node) < group->io_level) {
    /* In this case the mark is placed directly below the kernel mark instead. */
    node = autosa_tree_move_up_to_kernel(node);
    node = isl_schedule_node_child(node, 0);
    node = isl_schedule_node_insert_mark(node, id);  
  } else {
    node = isl_schedule_node_insert_mark(node, id);  
  }  

  /* Add the module mark. */
  /* The mark id carries "module" as its user pointer. */
  id = isl_id_alloc(ctx, "module", module);
  node = autosa_tree_move_up_to_kernel(node);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_mark(node, id);

  /* Make the node atomic */
  node = autosa_tree_move_down_to_pe(node, kernel->core);
  node = autosa_atomic_ancestors(node);
  new_sched = isl_schedule_node_get_schedule(node);

  isl_schedule_node_free(node);
  isl_id_list_free(io_ids);
  isl_union_set_free(group_core);

  return new_sched;
}

/* Fill in "var", the local storage declared inside an I/O module for "group".
 * When "tile" is NULL the variable is a register (a single element of
 * size 1).  Otherwise it is a local array whose extents are copied from the
 * tile bounds; the innermost extent is divided by the data packing factor
 * "n_lane" (times the vector length for sparse arrays).
 */
static void create_io_module_var(isl_ctx *ctx,
                                 struct autosa_array_ref_group *group,
                                 struct autosa_array_tile *tile, struct autosa_kernel_var *var, int n_lane)
{
  isl_printer *name_printer;

  var->array = group->array;
  var->type = autosa_array_ref_group_type(group);
  var->n_lane = n_lane;
  var->n_part = 1;

  name_printer = autosa_array_ref_group_print_name(group, isl_printer_to_str(ctx));
  var->name = isl_printer_get_str(name_printer);
  isl_printer_free(name_printer);

  if (!tile)
  {
    /* Register: one element. */
    var->size = isl_vec_set_element_si(isl_vec_alloc(ctx, 1), 0, 1);
    return;
  }

  /* Local array: one extent per array index. */
  var->size = isl_vec_alloc(ctx, group->array->n_index);
  for (int dim = 0; dim < group->array->n_index; ++dim)
  {
    isl_val *extent = isl_val_copy(tile->bound[dim].size);

    if (dim == group->array->n_index - 1)
    {
      /* Only the last dimension is affected by data packing. */
      if (group->local_array->is_sparse)
        extent = isl_val_div(extent,
                             isl_val_int_from_si(ctx, n_lane * group->local_array->vec_len));
      else if (n_lane > 1)
        extent = isl_val_div(extent, isl_val_int_from_si(ctx, n_lane));
    }
    var->size = isl_vec_set_element_val(var->size, dim, extent);
  }
}

/* Create the local buffers inside the I/O modules. */
static isl_stat create_io_module_vars(
    struct autosa_hw_module *module, struct autosa_kernel *kernel,
    struct autosa_array_tile *tile, int init_required)
{
  module->var = isl_calloc_array(kernel->ctx, struct autosa_kernel_var, 1);
  if (!module->var)
    return isl_stat_error;
  module->n_var = 1;
  module->var[0].init_required = init_required;

  create_io_module_var(kernel->ctx, module->io_groups[0],
                       tile, &module->var[0], module->data_pack_inter);

  return isl_stat_ok;
}

/* Generate the io_module for the outer loops that contain the 
 * inter_trans and intra_trans modules.
 *
 * "sched" is only duplicated, never modified.
 * Returns NULL when io_level > space_dim and "boundary" is 0.
 * Otherwise the subtree at the selected position is cut and replaced by
 * zero-dimensional statements named "io_module.inter_intra.<b>.<d>" (read)
 * or "io_module.intra_inter.<b>.<d>" (write), where <b> is 0 or 1 depending
 * on "boundary" and <d> is 0 or 1 depending on module->double_buffer,
 * optionally followed by "io_module.state_handle" and a trailing
 * intra_trans/inter_trans call for double-buffered modules.
 * Unless this is a boundary module with io_level <= space_dim, the fields
 * of "module" (type, level, io_groups, inst_ids, var, ...) are updated too.
 */
static __isl_give isl_schedule *generate_io_module_outer(
    __isl_keep isl_schedule *sched, struct autosa_hw_module *module,
    struct autosa_array_ref_group *group,
    struct autosa_kernel *kernel, struct autosa_gen *gen,
    int io_level, int space_dim, int read, int boundary)
{
  isl_ctx *ctx;
  int n_io_ids;
  isl_id_list *io_ids;
  isl_id *id;
  isl_union_set *empty_filter = NULL;
  const char *stmt_name1, *stmt_name2, *stmt_name5;  
  char *stmt_name3, *stmt_name4;
  isl_schedule_node *node, *graft1, *graft2, *graft3, *graft4, *graft5;
  isl_schedule *new_sched;
  int upper_io_level;
  isl_space *space;
  isl_union_set *domain;
  struct autosa_io_buffer *buf;
  isl_union_set *group_core = NULL;

  /* Nothing to generate for a non-boundary module above the space loops. */
  if (io_level > space_dim && boundary == 0) {
    return NULL;
  }

  /* Work on a private copy of the schedule. */
  new_sched = isl_schedule_dup(sched);
  node = isl_schedule_get_root(new_sched);
  isl_schedule_free(new_sched);
  ctx = isl_schedule_node_get_ctx(node);
  n_io_ids = space_dim - io_level + 1;

  /* Compute the union of domains of all the array references in the group. */
  node = autosa_tree_move_down_to_kernel(node);
  node = isl_schedule_node_child(node, 0); // context
  node = isl_schedule_node_child(node, 0);
  if (gen->options->autosa->local_reduce && group->attached_drain_group)
    node = insert_io_group_access_domain_local_reduce(node, group, kernel, read, 1, 1);
  else
    node = insert_io_group_access_domain(node, group, kernel, read);
  node = isl_schedule_node_child(node, 0);
  /* Universe of the domain below the inserted node; used as the "core"
   * argument when searching for io marks.
   */
  group_core = isl_union_set_universe(isl_schedule_node_get_domain(node));
  node = autosa_tree_move_up_to_kernel(node);

  io_ids = ppcg_scop_generate_names(gen->prog->scop, n_io_ids, "p");
  /* From here on n_io_ids counts how many of the ids have been consumed. */
  n_io_ids = 0;
  
  /* Boundary module above the space loops: skip filters and buffer lookup. */
  if (io_level > space_dim && boundary == 1) {    
    goto OUTER_INSERT_STMT;
  }

  upper_io_level = io_level + 1;
  /* Add the filters. */
  /* Every band between the array mark and the io mark of the level above
   * gets a filter built by set_schedule_eq() from the next unused io id.
   */
  n_io_ids = 0;
  node = autosa_tree_move_down_to_array(node, kernel->core);
  while (!isl_schedule_node_is_io_mark(node, upper_io_level))
  {
    if (isl_schedule_node_get_type(node) == isl_schedule_node_band)
    {
      isl_id *id;
      isl_id_list *ids;
      isl_union_set *uset;

      ids = isl_id_list_from_id(isl_id_list_get_id(io_ids, n_io_ids));
      uset = set_schedule_eq(node, ids);
      n_io_ids++;
      node = isl_schedule_node_insert_filter(node, uset);
      isl_id_list_free(ids);
      node = isl_schedule_node_child(node, 0);
    }
    node = isl_schedule_node_child(node, 0);
  }
  node = autosa_tree_move_up_to_kernel(node);

  /* Locate the buffer */
  buf = group->io_buffers[io_level - 1];
  if (buf->tile && buf->hoist_depth != - 1) {
    /* This buffer has been hoisted. */
    /* Restrict the tree to the hoist domain and recompute group_core
     * from the restricted domain.
     */
    node = isl_schedule_node_child(node, 0); // context
    node = isl_schedule_node_child(node, 0); // last inserted filter
    node = isl_schedule_node_child(node, 0);
    node = isl_schedule_node_insert_filter(node, isl_union_set_copy(buf->hoist_domain));
    node = isl_schedule_node_child(node, 0);
    isl_union_set_free(group_core);
    group_core = isl_union_set_universe(isl_schedule_node_get_domain(node));    
    node = autosa_tree_move_up_to_kernel(node);
  }

OUTER_INSERT_STMT:  
  /* Select the node whose subtree is replaced by the module calls. */
  if (gen->options->target != AUTOSA_TARGET_CATAPULT_HLS_C) {
    if (gen->options->autosa->local_reduce && group->attached_drain_group) {
      node = autosa_tree_move_down_to_depth(
                node, 
                get_local_reduce_sched_depth(isl_schedule_node_copy(node), kernel), 
                kernel->core);        
    } else {
      if (io_level > space_dim && boundary == 1) {
        node = autosa_tree_move_down_to_array(node, kernel->core);
        node = isl_schedule_node_child(node, 0);              
      } else {      
        node = autosa_tree_move_down_to_io_mark(node, group_core, io_level);
        node = isl_schedule_node_parent(node);      
      }    
    }
  } else {
    /* Move to the node below the kernel mark. */
    node = isl_schedule_node_child(node, 0);
  }
  isl_union_set_free(group_core);

  /* Add the inter_trans and intra_trans function calls. */  
  /* stmt_name3 and stmt_name4 are heap strings built with a string printer;
   * they are freed once the graft domains have been created.
   */
  stmt_name1 = boundary == 0 ? "io_module.inter_trans.0" : "io_module.inter_trans.1";
  stmt_name2 = "io_module.intra_trans";
  isl_printer *p_str = isl_printer_to_str(ctx);
  if (boundary == 0)
    p_str = isl_printer_print_str(p_str, "io_module.inter_intra.0.");
  else
    p_str = isl_printer_print_str(p_str, "io_module.inter_intra.1.");
  if (module->double_buffer)
    p_str = isl_printer_print_int(p_str, 1);
  else
    p_str = isl_printer_print_int(p_str, 0);
  stmt_name3 = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  p_str = isl_printer_to_str(ctx);
  if (boundary == 0)
    p_str = isl_printer_print_str(p_str, "io_module.intra_inter.0.");
  else
    p_str = isl_printer_print_str(p_str, "io_module.intra_inter.1.");
  if (module->double_buffer)
    p_str = isl_printer_print_int(p_str, 1);
  else
    p_str = isl_printer_print_int(p_str, 0);
  stmt_name4 = isl_printer_get_str(p_str);
  isl_printer_free(p_str);
  
  stmt_name5 = "io_module.state_handle";  
  
  /* Drop the original subtree below the selected node. */
  node = isl_schedule_node_cut(node);

  /* Build one zero-dimensional domain node per statement name. */
  space = isl_space_set_alloc(ctx, 0, 0);
  space = isl_space_set_tuple_name(space, isl_dim_set, stmt_name1);
  domain = isl_union_set_from_set(isl_set_universe(space));
  graft1 = isl_schedule_node_from_domain(domain);

  space = isl_space_set_alloc(ctx, 0, 0);
  space = isl_space_set_tuple_name(space, isl_dim_set, stmt_name2);
  domain = isl_union_set_from_set(isl_set_universe(space));
  graft2 = isl_schedule_node_from_domain(domain);

  space = isl_space_set_alloc(ctx, 0, 0);
  space = isl_space_set_tuple_name(space, isl_dim_set, stmt_name3);
  domain = isl_union_set_from_set(isl_set_universe(space));
  graft3 = isl_schedule_node_from_domain(domain);

  space = isl_space_set_alloc(ctx, 0, 0);
  space = isl_space_set_tuple_name(space, isl_dim_set, stmt_name4);
  domain = isl_union_set_from_set(isl_set_universe(space));
  graft4 = isl_schedule_node_from_domain(domain);

  space = isl_space_set_alloc(ctx, 0, 0);
  space = isl_space_set_tuple_name(space, isl_dim_set, stmt_name5);
  domain = isl_union_set_from_set(isl_set_universe(space));
  graft5 = isl_schedule_node_from_domain(domain);

  free(stmt_name3);
  free(stmt_name4);

  /* Read modules call inter_intra, write modules call intra_inter. */
  if (read)
  {
    node = isl_schedule_node_graft_before(node, isl_schedule_node_copy(graft3));
  }
  else
  {
    node = isl_schedule_node_graft_before(node, isl_schedule_node_copy(graft4));
  }
  if (module->double_buffer && gen->options->target != AUTOSA_TARGET_CATAPULT_HLS_C)
  {
    /* Add misc statements for saving and switching states. */
    node = isl_schedule_node_graft_before(node, isl_schedule_node_copy(graft5));
  }
  node = isl_schedule_node_cut(node);
  /* Insert an empty filter */
  empty_filter = isl_union_set_from_set(isl_set_empty(
      isl_set_get_space(kernel->context)));
  node = isl_schedule_node_insert_filter(node, empty_filter);

  if (module->double_buffer && gen->options->target != AUTOSA_TARGET_CATAPULT_HLS_C)
  {
    /* Ignore it if tuning_method is 1. It will considered later in the latency estimation. */
    if (gen->options->autosa->tuning_method != 1) {
      /* Add the last function call. */
      node = autosa_tree_move_up_to_kernel(node);
      node = isl_schedule_node_child(node, 0);
      node = isl_schedule_node_child(node, 0);
      node = isl_schedule_node_child(node, 0);
      if (read)
        node = isl_schedule_node_graft_after(node, isl_schedule_node_copy(graft2));
      else
        node = isl_schedule_node_graft_after(node, isl_schedule_node_copy(graft1));
    }
  }
  /* Only copies were grafted; release the originals. */
  isl_schedule_node_free(graft1);
  isl_schedule_node_free(graft2);
  isl_schedule_node_free(graft3);
  isl_schedule_node_free(graft4);
  isl_schedule_node_free(graft5);

  /* Add the module mark. */
  /* The mark id carries "module" as its user pointer. */
  id = isl_id_alloc(ctx, "module", module);
  node = autosa_tree_move_up_to_kernel(node);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_mark(node, id);

  new_sched = isl_schedule_node_get_schedule(node);
  isl_schedule_node_free(node);

  /* Update module information. */
  if (!boundary || (io_level > space_dim && boundary == 1))
  {
    module->type = (group->group_type == AUTOSA_DRAIN_GROUP) ? DRAIN_MODULE : IO_MODULE;
    module->level = io_level;
    module->n_io_group++;
    /* NOTE(review): the realloc() result is stored without a NULL check. */
    module->io_groups = (struct autosa_array_ref_group **)realloc(module->io_groups,
                                                                  module->n_io_group * sizeof(struct autosa_array_ref_group *));
    module->io_groups[module->n_io_group - 1] = group;
    /* The module keeps the io id list. */
    module->inst_ids = io_ids;
    module->kernel = kernel;
    module->is_buffer = 1;
    module->is_filter = 1;
    /* Create IO module variables. */
    /* Use the tile of the closest buffer at or below io_level that has one;
     * if none has a tile, "buf" ends at io_buffers[0] with a NULL tile.
     */
    for (int i = io_level; i >= 1; i--)
    {
      buf = group->io_buffers[i - 1];
      if (buf->tile != NULL)
        break;
    }
    /* NOTE(review): the isl_stat returned by create_io_module_vars() is ignored. */
    if (gen->options->autosa->local_reduce && group->attached_drain_group) {
      create_io_module_vars(module, kernel, buf->tile, 1);
    } else {
      create_io_module_vars(module, kernel, buf->tile, 0);
    }
  }
  else
  {
    /* Boundary variant of an existing module: the ids are not kept. */
    isl_id_list_free(io_ids);
  }

  return new_sched;
}

/* We will generate five separate schedules for this type of I/O module.
 * Schedule 1: Outer loops contains two marks for inter_transfer 
 *             and intra_transfer modules
 * Schedule 2: Inter_transfer function
 * Schedule 3: Intra_transfer function
 * Schedule 4: The boundary module for outer loops that is the last module
 *             in the chain.
 * Schedule 5: The boundary module for inter_transfer that is the last module
 *             in the chain.
 *
 * Schedules 4 and 5 are only generated when "is_filter" is set; otherwise
 * the corresponding pointers stay NULL.
 * "node" is only used to extract the schedule the five variants are
 * derived from.  The schedules are stored in "module", which is returned.
 * When tuning_method is 1, the tuning schedules of "module" are filled in
 * as well.
 */
static __isl_give struct autosa_hw_module *generate_filter_buffer_io_module(
    __isl_take struct autosa_hw_module *module,
    __isl_keep isl_schedule_node *node,
    struct autosa_array_ref_group *group, struct autosa_kernel *kernel,
    struct autosa_gen *gen,
    int io_level, int space_dim, int is_filter, int is_buffer, int read)
{
  isl_schedule *sched;
  isl_schedule *sched1, *sched2, *sched3;
  /* The boundary schedules are only assigned when "is_filter" is set, but
   * they are read below as fallbacks in the tuning path and whenever
   * module->boundary is non-zero.  Start them out as NULL so that those
   * reads never see an indeterminate pointer.
   */
  isl_schedule *boundary_sched2 = NULL, *boundary_sched1 = NULL;

  sched = isl_schedule_node_get_schedule(node);
  
  /* Decide whether this module is double buffered. */
  if (gen->options->autosa->double_buffer && kernel->array_part_w > 0)
  {
    isl_union_map *double_buffer_assignment;
    /* Check if the double buffer assignment exists. */    
    double_buffer_assignment = extract_sizes_from_str(kernel->ctx, gen->options->autosa->double_buffer_assignment);    
    if (!double_buffer_assignment) {
      /* Use the default strategy:
       * Set all the modules to double buffer except the drain module.       
       */      
      if (group->group_type == AUTOSA_DRAIN_GROUP) {
        module->double_buffer = 0;
      } else {
        module->double_buffer = 1;
      }      
    } else {
      isl_set *tmp;
      tmp = extract_sa_sizes(double_buffer_assignment, group->local_array->array->name);      
      
      /* Arrays without an entry in the assignment leave
       * module->double_buffer at its current value.
       */
      if (tmp) {        
        module->double_buffer = 1;        
      }
      isl_set_free(tmp);
    }
    isl_union_map_free(double_buffer_assignment);
  }
  else
  {    
    module->double_buffer = 0;
  }

  /* Inter transfer function. */
  sched2 = generate_io_module_inter_trans(sched, module, group, kernel, gen,
                                          io_level, space_dim, read, 0);
  if (is_filter)
  {
    /* Add the boundary module schedule. */
    module->boundary = 1;
    boundary_sched2 = generate_io_module_inter_trans(sched, module, group,
                                                     kernel, gen, io_level, space_dim, read, 1);
  }  
  /* Intra transfer function. */
  sched3 = generate_io_module_intra_trans(sched, module, group, kernel, gen,
                                          io_level, space_dim, read, is_buffer);
  /* Outer loops. */  
  sched1 = generate_io_module_outer(sched, module, group, kernel, gen,
                                    io_level, space_dim, read, 0);
  if (is_filter)
  {
    /* Add the boundary module schedule. */    
    module->boundary = 1;
    boundary_sched1 = generate_io_module_outer(sched, module, group, kernel, gen,
                                               io_level, space_dim, read, 1);
  }

  isl_schedule_free(sched);

  module->sched = NULL;
  module->outer_sched = sched1;
  module->inter_sched = sched2;
  module->intra_sched = sched3;
  if (gen->options->autosa->tuning_method == 1) {
    /* Each tuning schedule is derived from a duplicate, falling back to the
     * boundary variant when the regular schedule was not generated.
     */
    module->tuning_sched = NULL;
    if (sched2)
      module->tuning_inter_sched = kernel->tuning_program->generate_tuning_schedule(isl_schedule_dup(sched2));
    else
      module->tuning_inter_sched = kernel->tuning_program->generate_tuning_schedule(isl_schedule_dup(boundary_sched2));
    module->tuning_intra_sched = kernel->tuning_program->generate_tuning_schedule(isl_schedule_dup(sched3));
    
    module->tuning_num_sched = NULL;
    if (sched2)
      module->tuning_num_inter_sched = kernel->tuning_program->generate_tuning_schedule(isl_schedule_dup(sched2));
    else
      module->tuning_num_inter_sched = kernel->tuning_program->generate_tuning_schedule(isl_schedule_dup(boundary_sched2));
    module->tuning_num_intra_sched = kernel->tuning_program->generate_tuning_schedule(isl_schedule_dup(sched3));

    if (sched1)
      module->tuning_outer_sched = kernel->tuning_program->generate_tuning_schedule(isl_schedule_dup(sched1));
    else
      module->tuning_outer_sched = kernel->tuning_program->generate_tuning_schedule(isl_schedule_dup(boundary_sched1));
    /* Remove the filter ids */
    isl_schedule *tuning_sched;
    if (sched1)
      tuning_sched = isl_schedule_dup(sched1);
    else
      tuning_sched = isl_schedule_dup(boundary_sched1);    
    isl_schedule_node *root = isl_schedule_get_root(tuning_sched);        
    if (io_level <= space_dim) {
      /* Walk up from the io mark of the level above and delete every filter
       * node on the way, stopping at the "array" mark.
       */
      root = autosa_tree_move_down_to_io_mark(root, kernel->core, io_level + 1);      
      while (root && isl_schedule_node_has_parent(root)) {
        root = isl_schedule_node_parent(root);
        if (isl_schedule_node_get_type(root) == isl_schedule_node_filter) {
          root = isl_schedule_node_delete(root);
        }
        if (autosa_tree_node_is_mark(root, "array"))
          break;
      }
    }
    if (root) {
      isl_schedule_free(tuning_sched);
      tuning_sched = isl_schedule_node_get_schedule(root);
    }
    isl_schedule_node_free(root);
    module->tuning_num_outer_sched = kernel->tuning_program->generate_tuning_schedule(tuning_sched);
  }  

  if (module->boundary)
  {
    module->boundary_outer_sched = boundary_sched1;
    module->boundary_inter_sched = boundary_sched2;
  }

  return module;
}

/* Internal struct for add_drain_merge_stmt_acc_single.
 * It is initialized positionally by add_drain_merge_stmt_acc, so the
 * order of the fields must not be changed.
 */
struct drain_merge_stmt_acc_data
{
  /* Kernel handed to io_comm_access_ref(). */
  struct autosa_kernel *kernel;
  /* Drain group whose values are merged; also names the statement. */
  struct autosa_array_ref_group *group;
  /* Reference currently being processed; updated for every group reference. */
  struct autosa_stmt_access *ref;
};

/* Create the identity mapping that represents a drain merge statement.
 *
 * With D the schedule dimensions at "node" (as many as its schedule depth)
 * and A the space of the array of "io_group", the result is the identity
 * on
 *
 *   stmt_name[D -> A] -> [D -> A]
 *
 * i.e. only the domain tuple carries the identifier "stmt_name".
 *
 * "stmt_name" is handed directly to isl_id_alloc(), which keeps its own
 * copy of the name, so the caller remains free to release the string
 * afterwards.  No fixed-size intermediate buffer is used, hence statement
 * names of arbitrary length are safe.
 */
static __isl_give isl_multi_aff *autosa_create_drain_merge_stmt(
    isl_ctx *ctx,
    struct autosa_array_ref_group *io_group,
    isl_schedule_node *node,
    char *stmt_name)
{
  isl_space *space;
  int depth;
  isl_id *id;

  depth = isl_schedule_node_get_schedule_depth(node);

  /* A  =>  [] -> A  =>  D -> A  =>  [D -> A]  =>  [D -> A] -> [D -> A] */
  space = isl_space_copy(io_group->array->space);
  space = isl_space_from_range(space);
  space = isl_space_add_dims(space, isl_dim_in, depth);
  space = isl_space_wrap(space);
  space = isl_space_map_from_set(space);

  id = isl_id_alloc(ctx, stmt_name, NULL);
  space = isl_space_set_tuple_id(space, isl_dim_in, id);

  return isl_multi_aff_identity(space);
}

/* Callback for isl_schedule_node_map_descendant_bottom_up() that grafts a
 * "drain_merge.<array name>" statement in front of a leaf.
 *
 * - Non-leaf nodes are returned unchanged.
 * - A leaf whose statement differs from the statement of data->ref gets an
 *   empty filter inserted above it.
 * - A leaf of the matching statement whose access relation (as computed by
 *   io_comm_access_ref()) is empty is returned unchanged.
 * - Otherwise an extension node carrying the merge statement, scheduled by
 *   a register tiling, is grafted before the leaf, the leaf is covered by an
 *   empty filter, and the node three levels above is returned.
 *
 * If the emptiness test fails, the node is freed and NULL is returned.
 */
static __isl_give isl_schedule_node *add_drain_merge_stmt_acc_single(
    __isl_take isl_schedule_node *node, void *user)
{
  struct drain_merge_stmt_acc_data *args =
      (struct drain_merge_stmt_acc_data *)(user);
  struct autosa_array_ref_group *io_group = args->group;
  struct autosa_stmt_access *acc = args->ref;
  struct autosa_array_tile *reg_tile;
  isl_union_set *leaf_domain, *no_stmts, *wrapped;
  isl_set *leaf_set;
  isl_space *stmt_space, *acc_space;
  isl_id *leaf_id, *acc_id;
  isl_ctx *ctx;
  isl_union_map *comm;
  isl_printer *name_printer;
  char *merge_name;
  isl_multi_aff *stmt_ma, *sched_ma;
  isl_multi_union_pw_aff *sched_mupa;
  isl_schedule_node *ext;
  int same_stmt, is_empty;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf)
    return node;

  /* Identify the statement executed at this leaf. */
  leaf_domain = isl_schedule_node_get_domain(node);
  leaf_set = isl_set_from_union_set(isl_union_set_copy(leaf_domain));
  stmt_space = isl_set_get_space(leaf_set);
  isl_set_free(leaf_set);
  leaf_id = isl_space_get_tuple_id(stmt_space, isl_dim_set);
  isl_space_free(stmt_space);

  /* Identify the statement performing the access. */
  acc_space = isl_map_get_space(acc->access);
  acc_id = isl_space_get_tuple_id(acc_space, isl_dim_in);
  isl_space_free(acc_space);

  no_stmts = isl_union_set_empty(isl_union_set_get_space(leaf_domain));
  isl_union_set_free(leaf_domain);

  same_stmt = (leaf_id == acc_id);
  isl_id_free(leaf_id);
  isl_id_free(acc_id);
  if (!same_stmt)
    return isl_schedule_node_insert_filter(node, no_stmts);

  ctx = isl_schedule_node_get_ctx(node);

  comm = io_comm_access_ref(args->kernel, node, io_group, acc, 0);
  is_empty = isl_union_map_is_empty(comm);
  if (is_empty != 0)
  {
    /* Either an error (negative) or nothing to transfer (positive). */
    isl_union_map_free(comm);
    isl_union_set_free(no_stmts);
    return is_empty < 0 ? isl_schedule_node_free(node) : node;
  }

  /* Build the statement "drain_merge.<array name>". */
  name_printer = isl_printer_to_str(ctx);
  name_printer = isl_printer_print_str(name_printer, "drain_merge.");
  name_printer = isl_printer_print_str(name_printer,
                                       io_group->local_array->array->name);
  merge_name = isl_printer_get_str(name_printer);
  isl_printer_free(name_printer);

  stmt_ma = autosa_create_drain_merge_stmt(ctx, io_group, node, merge_name);
  free(merge_name);

  /* Schedule the new statement according to a register tiling. */
  reg_tile = create_register_tiling(node, io_group, acc);
  sched_ma = isl_multi_aff_pullback_multi_aff(
      isl_multi_aff_copy(reg_tile->tiling), isl_multi_aff_copy(stmt_ma));
  sched_mupa = isl_multi_union_pw_aff_from_multi_pw_aff(
      isl_multi_pw_aff_from_multi_aff(sched_ma));

  /* Turn the accessed elements into the extension relation. */
  wrapped = isl_union_set_preimage_multi_aff(isl_union_map_range(comm), stmt_ma);
  comm = isl_union_set_wrapped_domain_map(wrapped);
  comm = isl_union_map_coalesce(isl_union_map_reverse(comm));

  ext = isl_schedule_node_from_extension(comm);
  ext = isl_schedule_node_child(ext, 0);
  ext = isl_schedule_node_insert_partial_schedule(ext, sched_mupa);
  /* Move back to the root of the graft. */
  while (ext && isl_schedule_node_has_parent(ext))
    ext = isl_schedule_node_parent(ext);

  node = isl_schedule_node_graft_before(node, ext);
  node = isl_schedule_node_insert_filter(node, no_stmts);
  for (int up = 0; up < 3; up++)
    node = isl_schedule_node_parent(node);

  autosa_array_tile_free(reg_tile);

  return node;
}

/* Insert the drain merge statements for every reference of "group"
 * into the subtree rooted at "node".
 * The subtree is traversed bottom-up once per reference with
 * add_drain_merge_stmt_acc_single() as the callback.
 */
static __isl_give isl_schedule_node *add_drain_merge_stmt_acc(
    __isl_take isl_schedule_node *node, struct autosa_array_ref_group *group,
    struct autosa_kernel *kernel)
{
  struct drain_merge_stmt_acc_data cb_args;
  int ref_idx = 0;

  cb_args.kernel = kernel;
  cb_args.group = group;
  cb_args.ref = NULL;

  while (ref_idx < group->n_ref)
  {
    cb_args.ref = group->refs[ref_idx];
    ref_idx++;
    node = isl_schedule_node_map_descendant_bottom_up(
        node, &add_drain_merge_stmt_acc_single, &cb_args);
  }

  return node;
}

/* Generate the function that merges all drained values of the drain group
 * "group".
 *
 * Starting from the I/O schedule of the group, the bands between the array
 * mark and the io mark at the group's io_level are each filtered on one of
 * the generated "p" identifiers, the merge statements are added below that
 * io mark, the tree is filtered on the union of the access domains of all
 * group references, and a "drain_merge" mark carrying the new function
 * object is inserted below the kernel mark.
 * The function object keeps the resulting schedule and the identifier list.
 */
static __isl_give struct autosa_drain_merge_func *generate_drain_merge_func(
    struct autosa_array_ref_group *group, struct autosa_kernel *kernel,
    struct autosa_gen *gen)
{
  const int level = group->io_level;
  isl_ctx *ctx = gen->ctx;
  isl_schedule_node *node;
  isl_id_list *inst_ids;
  isl_union_map *accesses;
  isl_union_set *filter_domain;
  struct autosa_drain_merge_func *merge_func;
  int n_used = 0;

  node = isl_schedule_get_root(group->io_schedule);
  inst_ids = ppcg_scop_generate_names(gen->prog->scop,
                                      group->space_dim - level + 1, "p");

  /* Add the filters. */
  node = autosa_tree_move_down_to_array(node, kernel->core);
  for (; !isl_schedule_node_is_io_mark(node, level);
       node = isl_schedule_node_child(node, 0))
  {
    isl_id_list *single;
    isl_union_set *eq;

    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
      continue;

    /* Pin this band to the next unused identifier. */
    single = isl_id_list_from_id(isl_id_list_get_id(inst_ids, n_used));
    n_used++;
    eq = set_schedule_eq(node, single);
    isl_id_list_free(single);
    node = isl_schedule_node_insert_filter(node, eq);
    node = isl_schedule_node_child(node, 0);
  }
  node = autosa_tree_move_up_to_kernel(node);

  /* Add the data transfer statements. */
  node = autosa_tree_move_down_to_io_mark(node, kernel->core, level);
  node = add_drain_merge_stmt_acc(node, group, kernel);

  /* Compute the union of domains of all the array references in the group. */
  accesses = isl_union_map_empty(isl_map_get_space(group->access));
  for (int r = 0; r < group->n_ref; r++)
  {
    accesses = isl_union_map_union(
        accesses,
        autosa_drain_group_ref_access_relation(group, group->refs[r], 0, 1,
                                               kernel->expanded_domain));
  }
  filter_domain = isl_union_set_coalesce(isl_union_map_domain(accesses));

  /* Add the group domain as the filter. */
  node = autosa_tree_move_up_to_kernel(node);
  node = isl_schedule_node_child(node, 0); // context
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_filter(node, filter_domain);

  /* Add the func mark. */
  merge_func = autosa_drain_merge_func_alloc(gen);
  node = autosa_tree_move_up_to_kernel(node);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_mark(
      node, isl_id_alloc(ctx, "drain_merge", merge_func));

  merge_func->sched = isl_schedule_node_get_schedule(node);
  merge_func->group = group;
  merge_func->kernel = kernel;
  merge_func->inst_ids = inst_ids;

  isl_schedule_node_free(node);

  return merge_func;
}

/* Internal struct for add_serialize_stmt_acc_single.
 * It is initialized positionally by add_serialize_stmt_acc, so the
 * order of the fields must not be changed.
 */
struct add_serialize_stmt_acc_data
{
  /* I/O group being serialized. */
  struct autosa_array_ref_group *group;
  /* Reference currently being processed; updated for every group reference. */
  struct autosa_stmt_access *ref;
  /* Kernel handed to io_comm_access_ref(). */
  struct autosa_kernel *kernel;
  /* Local tile handed to autosa_create_io_access_stmt(). */
  struct autosa_array_tile *local_tile;
  /* Name of the statement to create. */
  char *stmt_name;
  /* Direction flag handed to io_comm_access_ref(). */
  int read;
  /* Module the statements belong to; not read by add_serialize_stmt_acc_single. */
  struct autosa_hw_module *module;
};

/* Callback for isl_schedule_node_map_descendant_bottom_up() that grafts a
 * serialization statement in front of a leaf.
 *
 * "user" points to a struct add_serialize_stmt_acc_data.
 *
 * - Non-leaf nodes are returned unchanged.
 * - A leaf whose statement differs from the statement of data->ref gets an
 *   empty filter inserted above it.
 * - A leaf of the matching statement whose access relation (as computed by
 *   io_comm_access_ref()) is empty is returned unchanged.
 * - Otherwise an extension node carrying the statement data->stmt_name,
 *   scheduled by a register tiling, is grafted before the leaf, the leaf is
 *   covered by an empty filter, and the node three levels above is returned.
 *   The serialize bound of the local array is updated to the cardinality of
 *   the accessed elements.
 *
 * If the emptiness test fails, the node is freed and NULL is returned.
 */
static __isl_give isl_schedule_node *add_serialize_stmt_acc_single(
    __isl_take isl_schedule_node *node, void *user)
{
  struct add_serialize_stmt_acc_data *data =
      (struct add_serialize_stmt_acc_data *)user;
  struct autosa_array_ref_group *group = data->group;
  struct autosa_stmt_access *ref = data->ref;
  struct autosa_array_tile *tile;
  isl_union_set *uset, *empty_filter, *domain;
  isl_set *set;
  isl_space *space;
  isl_id *id, *id2;
  isl_ctx *ctx;
  isl_union_map *access;
  int empty;
  isl_multi_aff *from_access;
  isl_multi_aff *ma;
  isl_multi_pw_aff *mpa;
  isl_multi_union_pw_aff *mupa;
  isl_schedule_node *graft;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf)
    return node;

  /* Examine if the statement contains the access. */
  uset = isl_schedule_node_get_domain(node);
  set = isl_set_from_union_set(isl_union_set_copy(uset));
  space = isl_set_get_space(set);
  isl_set_free(set);
  id = isl_space_get_tuple_id(space, isl_dim_set);
  isl_space_free(space);
  space = isl_map_get_space(ref->access);
  id2 = isl_space_get_tuple_id(space, isl_dim_in);
  empty_filter = isl_union_set_empty(isl_union_set_get_space(uset));
  isl_union_set_free(uset);
  isl_space_free(space);
  /* The two identifiers must be compared, not assigned: only leaves of
   * other statements are filtered out here.
   */
  if (id != id2)
  {
    isl_id_free(id);
    isl_id_free(id2);
    node = isl_schedule_node_insert_filter(node, empty_filter);
    return node;
  }
  isl_id_free(id);
  isl_id_free(id2);
  ctx = isl_schedule_node_get_ctx(node);

  /* S -> [D -> A] */
  access = io_comm_access_ref(data->kernel, node, group, ref, data->read);

  empty = isl_union_map_is_empty(access);
  if (empty < 0 || empty)
  {
    isl_union_map_free(access);
    isl_union_set_free(empty_filter);
    if (empty < 0)
      return isl_schedule_node_free(node);
    return node;
  }

  from_access = autosa_create_io_access_stmt(
      ctx, group, group, data->local_tile,
      isl_schedule_node_get_schedule_depth(node), data->stmt_name);

  /* Create a register tiling. */
  tile = create_register_tiling(node, group, ref);
  ma = isl_multi_aff_copy(tile->tiling);
  ma = isl_multi_aff_pullback_multi_aff(ma,
                                        isl_multi_aff_copy(from_access));
  mpa = isl_multi_pw_aff_from_multi_aff(ma);
  mupa = isl_multi_union_pw_aff_from_multi_pw_aff(mpa);

  /* [D -> A] */
  domain = isl_union_map_range(access);
  /* Update the serialization bound. */
  group->local_array->serialize_bound = isl_set_card(isl_set_from_union_set(isl_union_set_copy(domain)));

  domain = isl_union_set_preimage_multi_aff(domain, from_access);
  access = isl_union_set_wrapped_domain_map(domain);
  access = isl_union_map_reverse(access);
  access = isl_union_map_coalesce(access);

  graft = isl_schedule_node_from_extension(access);
  graft = isl_schedule_node_child(graft, 0);
  graft = isl_schedule_node_insert_partial_schedule(graft, mupa);

  /* Move back to the root of the graft. */
  while (graft && isl_schedule_node_has_parent(graft))
    graft = isl_schedule_node_parent(graft);

  node = isl_schedule_node_graft_before(node, graft);
  node = isl_schedule_node_insert_filter(node, empty_filter);
  node = isl_schedule_node_parent(node);
  node = isl_schedule_node_parent(node);
  node = isl_schedule_node_parent(node);

  autosa_array_tile_free(tile);

  return node;
}

/* Insert the serialization statement "stmt_name" for every array reference
 * of the I/O group "group".
 *
 * For each reference in turn, the subtree rooted at "node" is traversed
 * bottom-up with add_serialize_stmt_acc_single() as the callback; the
 * callback receives the shared arguments plus the reference currently being
 * processed. The (possibly updated) node is returned.
 */
static __isl_give isl_schedule_node *add_serialize_stmt_acc(
  __isl_take isl_schedule_node *node,
  struct autosa_array_ref_group *group,
  struct autosa_kernel *kernel,
  struct autosa_array_tile *tile,
  char *stmt_name,
  int read,
  struct autosa_hw_module *module)
{
  /* The second field (the current reference) is filled in per iteration. */
  struct add_serialize_stmt_acc_data cb_data = {
      group, NULL, kernel, tile, stmt_name, read, module};
  int ref_idx = 0;

  while (ref_idx < group->n_ref)
  {
    cb_data.ref = group->refs[ref_idx];
    node = isl_schedule_node_map_descendant_bottom_up(
        node, &add_serialize_stmt_acc_single, &cb_data);
    ref_idx++;
  }

  return node;
}

/* Graft, in front of "node", an extension subtree that carries the
 * serialization statement "stmt_name" for the I/O group "group".
 *
 * "local_tile" is used to build the statement (the local buffer) and "tile"
 * provides the tiling that becomes the partial schedule of the new statement.
 * "read" selects which accesses of the group are collected by
 * io_comm_access().
 *
 * If the group has no such accesses, "node" is returned unchanged; if the
 * emptiness test fails, "node" is freed and the result of
 * isl_schedule_node_free() is returned.
 *
 * Side effect: group->local_array->serialize_bound is overwritten with the
 * cardinality of the domain that is serialized.
 *
 * "module" is not used by this function.
 */
static __isl_give isl_schedule_node *add_serialize_stmt_tile(
  __isl_take isl_schedule_node *node,
  struct autosa_array_ref_group *group,
  struct autosa_kernel *kernel,
  struct autosa_array_tile *local_tile, /* Local buffer */
  struct autosa_array_tile *tile,       /* Tile to be copied */
  char *stmt_name,
  int read,
  struct autosa_hw_module *module)
{
  isl_union_map *access;
  int empty;
  isl_multi_aff *from_access;
  isl_multi_aff *ma;
  isl_multi_pw_aff *mpa;
  isl_multi_union_pw_aff *mupa;
  isl_union_set *domain;
  isl_schedule_node *graft;
  isl_ctx *ctx;

  ctx = isl_schedule_node_get_ctx(node);
  access = io_comm_access(kernel, node, group, read);
  empty = isl_union_map_is_empty(access);
  /* Nothing to serialize (or an error occurred): bail out early. */
  if (empty < 0 || empty)
  {
    isl_union_map_free(access);
    if (empty < 0)
      return isl_schedule_node_free(node);
    return node;
  }

  from_access = autosa_create_io_access_stmt(kernel->ctx, group, group,
                                             local_tile, isl_schedule_node_get_schedule_depth(node), stmt_name);

  /* Express the tiling in terms of the new statement's instances; this
   * becomes the partial schedule of the grafted band below.
   * A copy of from_access is used here because it is needed again later.
   */
  ma = isl_multi_aff_copy(tile->tiling);
  ma = isl_multi_aff_pullback_multi_aff(ma,
                                        isl_multi_aff_copy(from_access));
  mpa = isl_multi_pw_aff_from_multi_aff(ma);
  mupa = isl_multi_union_pw_aff_from_multi_pw_aff(mpa);

  /* [D -> A] */
  domain = isl_union_map_range(access);
  /* Restrain the buffer to the local tile size. */
  if (!autosa_array_is_scalar(group->array))
  {
    isl_map *map;
    isl_set *set;
    /* Keep only the D part of [D -> A] and pair it with the tile buffer
     * relation returned by group_tile_buffer().
     */
    set = isl_map_domain(isl_map_from_union_map(isl_union_set_unwrap(domain)));
    map = group_tile_buffer(group, tile);
    map = isl_map_intersect_domain(map, set);
    domain = isl_union_set_from_set(isl_map_wrap(map));
  }

  /* Extract the serialization bound. */
  group->local_array->serialize_bound = isl_set_card(
      isl_set_from_union_set(isl_union_set_copy(domain)));  

  /* Turn the wrapped domain into the extension map of the graft.
   * from_access is consumed here.
   */
  domain = isl_union_set_preimage_multi_aff(domain, from_access);
  access = isl_union_set_wrapped_domain_map(domain);
  access = isl_union_map_reverse(access);
  access = isl_union_map_coalesce(access);

  graft = isl_schedule_node_from_extension(access);
  graft = isl_schedule_node_child(graft, 0);
  graft = isl_schedule_node_insert_partial_schedule(graft, mupa);

  if (group->local_array->is_sparse) {
    /* We will need to modify the last dimension accordingly. */
    int n = isl_schedule_node_band_n_member(graft);
    /* Isolate the last band member in a band of its own. */
    if (n > 1) {
      graft = isl_schedule_node_band_split(graft, n - 1);
      graft = isl_schedule_node_child(graft, 0);
    }
    if (group->local_array->eff_compress_ratio > 1) {
      int tile_size[1];
      isl_union_set *filter;
      
      /* Tile the last dimension by the effective compression ratio and
       * filter the resulting point loop with the set built by
       * schedule_eq_lb() (presumably "iterator equals its lower bound";
       * confirm against that helper).
       */
      tile_size[0] = group->local_array->eff_compress_ratio;
      graft = autosa_tile_band(graft, tile_size);
      graft = isl_schedule_node_child(graft, 0);
      filter = schedule_eq_lb(graft);
      graft = isl_schedule_node_insert_filter(graft, filter);
      graft = isl_schedule_node_parent(graft);
    }
  }

  /* Move back to the root of the graft before attaching it. */
  while (graft && isl_schedule_node_has_parent(graft))
    graft = isl_schedule_node_parent(graft);

  node = isl_schedule_node_graft_before(node, graft);

  return node;
}

/* Generate a schedule for serializing/deserializing the host data.
 *
 * The schedule is derived from the I/O schedule of "group" (or from its
 * lowered L1 schedule when lower_int_io_L1_buffer is set and that schedule
 * exists). A statement named "serialize" (if "in" is non-zero) or
 * "deserialize" (otherwise) is inserted at the level of the first buffer
 * with a tile found when walking from module->level towards level 1, and a
 * "host_serialize" mark that refers to "module" is inserted below the
 * kernel mark.
 *
 * Returns the new schedule, or NULL if serialization is disabled for this
 * module (no I/O level to search, or no tiled buffer while the group holds
 * more than one reference). On success group->local_array->host_serialize
 * is set to 1.
 */
static __isl_give isl_schedule *generate_serialize_schedule(
    struct autosa_kernel *kernel,
    struct autosa_array_ref_group *group,
    struct autosa_hw_module *module,
    struct autosa_gen *gen,
    int in)
{
  isl_printer *p;
  isl_schedule_node *node;
  isl_ctx *ctx;
  struct autosa_io_buffer *buf = NULL;
  int io_level, i;
  char *stmt_name;
  isl_union_set *empty_filter;
  isl_id *id;
  isl_schedule *sched;
  isl_union_set *group_core = NULL;

  ctx = gen->ctx;
  if (gen->options->autosa->lower_int_io_L1_buffer && group->io_L1_lower_schedule)
    node = isl_schedule_get_root(group->io_L1_lower_schedule);
  else
    node = isl_schedule_get_root(group->io_schedule);
  node = autosa_tree_move_down_to_kernel(node);

  /* Compute the union of domains of all the array references in the group. */
  node = isl_schedule_node_child(node, 0); // context
  node = isl_schedule_node_child(node, 0);
  if (gen->options->autosa->local_reduce && group->attached_drain_group)
    node = insert_io_group_access_domain_local_reduce(node, group, kernel, in, 0, 1);
  else
    node = insert_io_group_access_domain(node, group, kernel, in);
  node = isl_schedule_node_child(node, 0);
  group_core = isl_union_set_universe(isl_schedule_node_get_domain(node));
  node = autosa_tree_move_up_to_kernel(node);

  /* Generate the statement */
  p = isl_printer_to_str(ctx);
  p = isl_printer_print_str(p, in ? "serialize" : "deserialize");
  stmt_name = isl_printer_get_str(p);
  isl_printer_free(p);

  io_level = module->level;
  /* Locate the next buffer. */
  for (i = io_level; i >= 1; i--)
  {
    buf = group->io_buffers[i - 1];
    if (buf->tile != NULL)
      break;
  }
  if (!buf)
  {
    /* module->level < 1: there is no buffer to inspect at all. */
    free(stmt_name);
    isl_union_set_free(group_core);
    isl_schedule_node_free(node);
    return NULL;
  }

  if (!buf->tile)
  {
    /* No tiled buffer was found, so there is no buffer depth to move to;
     * the node stays at the kernel mark and the per-reference insertion
     * walks the subtree itself.
     * If there is more than one reference in the I/O group to be serialized.
     * We will disable the serialization for this module.
     */
    if (group->n_ref > 1)
    {
      /* Release everything acquired above before giving up. */
      free(stmt_name);
      isl_union_set_free(group_core);
      isl_schedule_node_free(node);
      return NULL;
    }
    node = add_serialize_stmt_acc(node, group, kernel, buf->tile, stmt_name, in, module);
  }
  else
  {
    /* Move the schedule node to the level of the buffer. */
    node = autosa_tree_move_down_to_depth(node, buf->tile->depth, group_core);
    node = add_serialize_stmt_tile(node, group, kernel, buf->tile, buf->tile, stmt_name, in, module);
    /* Drop the original subtree and leave only an empty filter below the
     * grafted serialization statement.
     */
    node = isl_schedule_node_cut(node);
    empty_filter = isl_union_set_from_set(isl_set_empty(isl_set_get_space(kernel->context)));
    node = isl_schedule_node_insert_filter(node, empty_filter);
  }
  free(stmt_name);

  /* Add the host_serialize mark. */
  id = isl_id_alloc(ctx, "host_serialize", module);
  node = autosa_tree_move_up_to_kernel(node);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_mark(node, id);

  /* Update the array information */
  group->local_array->host_serialize = 1;

  sched = isl_schedule_node_get_schedule(node);
  isl_schedule_node_free(node);
  isl_union_set_free(group_core);

  return sched;
}

/* Recompute the bounds of the I/O module ids and fold them into the
 * context node of the I/O schedule.
 *
 * On a scratch copy of "node" we insert, for each of the "n_io_ids" ids,
 * a filter that equates the id to the corresponding schedule dimension.
 * The parameter domain reached that way describes the valid id values and
 * is intersected with the existing context.
 * The input node points to "array"; the returned node is the re-inserted
 * context node below the kernel mark.
 */
static __isl_give isl_schedule_node *update_io_module_context(
  __isl_take isl_schedule_node *node,
  struct autosa_gen *gen,
  int io_level, int n_io_ids)
{
  isl_schedule_node *probe;
  isl_id_list *id_names;
  isl_set *id_bounds;
  isl_set *new_context;

  /* Add the io id filters on a throw-away copy of the tree. */
  id_names = ppcg_scop_generate_names(gen->prog->scop, n_io_ids, "p");
  probe = add_io_ids_filter(isl_schedule_node_copy(node), id_names,
                            1, n_io_ids, 0, 0, 0);

  /* The parameter part of the filtered domain bounds the io ids. */
  id_bounds = isl_set_from_params(
      isl_union_set_params(isl_schedule_node_get_domain(probe)));

  isl_id_list_free(id_names);
  isl_schedule_node_free(probe);

  /* Replace the context node by one restricted to these bounds. */
  node = autosa_tree_move_up_to_kernel(node);
  node = isl_schedule_node_child(node, 0); // context
  new_context = isl_set_intersect(
      isl_schedule_node_context_get_context(node), id_bounds);
  new_context = isl_set_coalesce(new_context);

  node = isl_schedule_node_delete(node);
  return isl_schedule_node_insert_context(node, new_context);
}

/* Generate the schedule for the I/O module.  
 * We will insert statements at the corresponding position in the schedule tree
 * to transfer the data.
 * The statement is in the format of:
 * in_trans/out_trans[_dram]/[_dram_serialize]/[_boundary].fifo_suffix[_local].
 * is_filter.is_buffer.filter_depth.filter_dim.buf_cur_lane.buf_nxt_lane.coalesce_depth.coalesce_ub
 * 
 * If is_buffer is disabled, we will insert one I/O statement for 
 * transferring the data between the same-level I/O modules and lower-level modules.
 * If is_buffer is enabled, we will insert two I/O statements:
 * - one for transferring the data between the same-level I/O modules and store
 *   the data required for the lower-level I/O modules in the buffers.
 * - one for transferring the data to/from the lower-level I/O modules from/to 
 *   the local buffers.
 * If host data serialization is enabled, we will generate a separate schedule 
 * for serializing/deserializing the host data.
 *
 * "node" is only used to obtain a private copy of the schedule; the caller
 * keeps ownership of it.
 * If "boundary" is zero, the resulting schedule is stored in module->sched
 * and the remaining module fields (type, level, io_groups, inst_ids, ...)
 * are filled in. Otherwise the schedule is stored in module->boundary_sched.
 * The data pack factors of the module are updated in both cases.
 * Always returns isl_stat_ok.
 */
static isl_stat generate_default_io_module_schedule(
  __isl_take struct autosa_hw_module *module,
  __isl_keep isl_schedule_node *node,
  struct autosa_array_ref_group *group,
  struct autosa_kernel *kernel,
  struct autosa_gen *gen,
  int io_level, int space_dim,
  int is_filter, int is_buffer,
  int read, int boundary)
{
  isl_schedule *sched1, *sched2;
  isl_ctx *ctx;
  isl_printer *p;
  int n_io_ids = 0;
  isl_id_list *io_ids;
  isl_id *id;
  char *fifo_suffix, *buf_suffix;
  struct autosa_io_buffer *buf = NULL;
  int i;
  isl_union_set *group_core = NULL;

  /* Work on a duplicate of the schedule so the caller's tree is untouched. */
  ctx = isl_schedule_node_get_ctx(node);
  sched1 = isl_schedule_node_get_schedule(node);
  sched2 = isl_schedule_dup(sched1);
  isl_schedule_free(sched1);
  node = isl_schedule_get_root(sched2);
  isl_schedule_free(sched2);

  /* Compute the union of domains of all the array references in the group. */
  node = autosa_tree_move_down_to_kernel(node);
  node = isl_schedule_node_child(node, 0); // context
  node = isl_schedule_node_child(node, 0);
  if (gen->options->autosa->local_reduce && group->attached_drain_group)
    node = insert_io_group_access_domain_local_reduce(node, group, kernel, read, 0, 1);
  else
    node = insert_io_group_access_domain(node, group, kernel, read);
  node = isl_schedule_node_child(node, 0);
  group_core = isl_union_set_universe(isl_schedule_node_get_domain(node));
  node = autosa_tree_move_up_to_kernel(node);

  /* Add the module id filters. */
  n_io_ids = space_dim - io_level + 1;
  io_ids = ppcg_scop_generate_names(gen->prog->scop, n_io_ids, "p");
  node = autosa_tree_move_down_to_array(node, kernel->core);
  node = add_io_ids_filter(node, io_ids, io_level, space_dim - io_level + 1, is_filter, module->to_pe, read);
  node = autosa_tree_move_up_to_kernel(node);

  /* Add the data transfer statements. */
  init_suffix(module, group, &fifo_suffix, &buf_suffix);
  /* Locate the next buffer. */
  for (i = io_level; i >= 1; i--)
  {
    buf = group->io_buffers[i - 1];
    if (buf->tile != NULL)
      break;
  }
  if (is_buffer)
  {
    if (i != io_level)
    {
      /* The buffer is optimized out at this level. */
      is_buffer = 0;
    }
  }

  if (buf->tile && buf->hoist_depth != -1) {
    /* This buffer has been hoisted. */
    node = isl_schedule_node_child(node, 0); // context
    node = isl_schedule_node_child(node, 0); // last inserted filter
    node = isl_schedule_node_child(node, 0);
    node = isl_schedule_node_insert_filter(node, isl_union_set_copy(buf->hoist_domain));
    node = isl_schedule_node_child(node, 0);
    isl_union_set_free(group_core);
    group_core = isl_union_set_universe(isl_schedule_node_get_domain(node));
    node = autosa_tree_move_up_to_kernel(node);
  }

  /* Move the schedule node to the level of the buffer. 
   * In the current implementation, there will also be a buffer at the 
   * innermost level.
   */
  if (is_filter) {
    module->data_pack_inter = buf->n_lane;
    module->data_pack_intra = buf->n_lane;
    node = insert_filter_trans_stmts(
              node, io_ids, space_dim - io_level, io_level, read,
              buf, module, kernel, gen, boundary, 1, is_buffer, fifo_suffix, group, group_core, 0);
  } else {
    if (is_buffer) {
      /* Insert two statements:
       * - Load from upper stream I/O modules/DRAM to buffer
       * - Write to downstream I/O modules from buffer
       */
      module->data_pack_inter = buf->n_lane;
      /* Locate the next buffer after the current buffer. */
      int cur_level = buf->level;
      struct autosa_io_buffer *cur_buf = buf;
      for (int i = cur_level - 1; i >= 1; i--)
      {
        buf = group->io_buffers[i - 1];
        if (buf->tile != NULL)
          break;
      }

      if (!buf->tile) {
        module->data_pack_intra = group->n_lane;
      } else {
        module->data_pack_intra = buf->n_lane;
      }

      /* Insert the first statement. */
      node = autosa_tree_move_down_to_depth(node, cur_buf->tile->depth, kernel->core);
      p = isl_printer_to_str(ctx);
      p = print_io_trans_stmt_prefix(
              p, read, module->to_mem, gen->options->autosa->host_serialize, boundary, 0, NULL,
              0, 0, is_buffer, fifo_suffix, cur_buf->n_lane);
      node = insert_io_stmts_tile(node, cur_buf->n_lane, p, kernel, group, 
              cur_buf, cur_buf, read, is_buffer, module, 0, 0, -1);

      /* Insert the second statement. */
      p = isl_printer_to_str(ctx);
      p = print_io_trans_stmt_prefix(
              p, !read, 0, gen->options->autosa->host_serialize, boundary, 0, NULL,
              !read, read, is_buffer, fifo_suffix, cur_buf->n_lane);
      if (module->to_pe || !buf->tile) {
        node = insert_io_stmts_acc(
                  node, group->n_lane, p, kernel, group, cur_buf, read, is_buffer, module, 0);
      } else {
        node = autosa_tree_move_down_to_io_mark(node, group_core, buf->level);
        node = isl_schedule_node_child(node, 0);
        node = insert_io_stmts_tile(node, buf->n_lane, p, kernel, group, 
                  cur_buf, buf, read, is_buffer, module, 1, 0, -1);
      }
    } else {
      /* Insert one statement.
       * Load from upper stream I/O modules/DRAM and write to
       * downstream I/O modules.
       */
      if (buf->tile) {
        int pe_depth;
        isl_schedule_node *node_tmp;

        module->data_pack_inter = group->io_buffers[io_level - 1]->n_lane;
        module->data_pack_intra = buf->n_lane;

        /* Probe the schedule depth of the PE mark on a copy of the node. */
        node_tmp = isl_schedule_node_copy(node);
        node_tmp = autosa_tree_move_down_to_pe(node_tmp, kernel->core);
        pe_depth = isl_schedule_node_get_schedule_depth(node_tmp);
        isl_schedule_node_free(node_tmp);
        if (pe_depth == buf->tile->depth) {
          node = autosa_tree_move_down_to_pe(node, kernel->core);
        } else if (pe_depth > buf->tile->depth){
          node = autosa_tree_move_down_to_depth(node, buf->tile->depth, kernel->core);
        } else {
          node = autosa_tree_move_up_to_kernel(node);
          node = autosa_tree_move_down_to_depth(node, buf->tile->depth, kernel->core);
        }
        p = isl_printer_to_str(ctx);
        p = print_io_trans_stmt_prefix(
              p, read, module->to_mem, gen->options->autosa->host_serialize, boundary, 0, NULL,
              !read, read, is_buffer, fifo_suffix, module->data_pack_inter);
        node = insert_io_stmts_tile(node, module->data_pack_intra, p, kernel, group, 
                  group->io_buffers[io_level - 1], buf, read, is_buffer, module, 1, 0, -1);
      } else {
        module->data_pack_inter = group->n_lane;
        module->data_pack_intra = group->n_lane;

        /* The printer must be created before the prefix is printed into it,
         * exactly as in the other branches.
         */
        p = isl_printer_to_str(ctx);
        p = print_io_trans_stmt_prefix(
                p, read, module->to_mem, gen->options->autosa->host_serialize, boundary, 0, NULL,
                !read, read, is_buffer, fifo_suffix, group->n_lane);
        node = insert_io_stmts_acc(node, group->n_lane, p, kernel, group, NULL, read, is_buffer, module, 0);
      }
    }
  }

  free(fifo_suffix);
  free(buf_suffix);
  isl_union_set_free(group_core);

  /* Add the module mark. */
  id = isl_id_alloc(ctx, "module", module);
  node = autosa_tree_move_up_to_kernel(node);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_mark(node, id);

  if (gen->options->autosa->tuning_method == 1 && !boundary) {
    isl_schedule *orig_sched = isl_schedule_node_get_schedule(node);
    module->tuning_sched = kernel->tuning_program->generate_tuning_schedule(isl_schedule_dup(orig_sched));

    isl_schedule *tuning_sched = isl_schedule_dup(orig_sched);
    isl_schedule_free(orig_sched);
    /* Remove module filters. */
    isl_schedule_node *root = isl_schedule_get_root(tuning_sched);
    isl_schedule_free(tuning_sched);
    root = autosa_tree_move_down_to_io_mark(root, kernel->core, io_level);
    /* Walk upwards, deleting every filter node, until the "array" mark
     * (or the root) is reached.
     */
    while (isl_schedule_node_has_parent(root)) {
      root = isl_schedule_node_parent(root);
      if (isl_schedule_node_get_type(root) == isl_schedule_node_filter) {
        root = isl_schedule_node_delete(root);
      }
      if (autosa_tree_node_is_mark(root, "array"))
        break;
    }
    tuning_sched = isl_schedule_node_get_schedule(root);
    isl_schedule_node_free(root);
    module->tuning_num_sched = kernel->tuning_program->generate_tuning_schedule(tuning_sched);
  }

  sched1 = isl_schedule_node_get_schedule(node);
  isl_schedule_node_free(node);

  if (!boundary)
  {
    module->sched = sched1;
    module->type = (group->group_type == AUTOSA_DRAIN_GROUP) ? DRAIN_MODULE : IO_MODULE;
    module->level = io_level;
    module->n_io_group++;
    module->io_groups = (struct autosa_array_ref_group **)realloc(module->io_groups,
                                                                  module->n_io_group * sizeof(struct autosa_array_ref_group *));
    module->io_groups[module->n_io_group - 1] = group;
    /* The module takes over the list of io ids. */
    module->inst_ids = io_ids;
    module->kernel = kernel;
    module->is_buffer = is_buffer;
    module->is_filter = is_filter;
    /* Create IO module variables. */
    if (is_buffer)
    {
      for (int i = io_level; i >= 1; i--)
      {
        buf = group->io_buffers[i - 1];
        if (buf->tile != NULL)
          break;
      }
      create_io_module_vars(module, kernel, buf->tile, 0);
    }
  }
  else
  {
    isl_id_list_free(io_ids);
    module->boundary_sched = sched1;
  }

  return isl_stat_ok;
}

/* Build the default I/O module, i.e. the one used when is_filter and
 * is_buffer are not both set.
 *
 * The regular schedule is always generated. Filter modules additionally
 * get a boundary schedule, and are flagged through module->boundary.
 * The (updated) module is handed back to the caller.
 */
static __isl_give struct autosa_hw_module *generate_default_io_module(
    __isl_take struct autosa_hw_module *module, __isl_keep isl_schedule_node *node,
    struct autosa_array_ref_group *group, struct autosa_kernel *kernel,
    struct autosa_gen *gen,
    int io_level, int space_dim, int is_filter, int is_buffer, int read)
{
  /* Regular (non-boundary) schedule. */
  generate_default_io_module_schedule(
      module, node, group, kernel, gen,
      io_level, space_dim, is_filter, is_buffer, read, 0);

  if (!is_filter)
    return module;

  /* Add the boundary module schedule. */
  module->boundary = 1;
  generate_default_io_module_schedule(
      module, node, group, kernel, gen,
      io_level, space_dim, is_filter, is_buffer, read, 1);

  return module;
}

/* Generate the I/O module for transferring the data.
 * An I/O module is described by two features:
 * - is_filter: a filter module keeps the data that belongs to it and sends
 *   it to the lower-level I/O modules or PEs; a non-filter module simply
 *   passes the data on to the downstream modules.
 * - is_buffer: a buffered module has a local buffer allocated inside it.
 *
 * Modules that are both filtering and buffered are produced by
 * generate_filter_buffer_io_module(); every other combination goes through
 * generate_default_io_module().
 */
static __isl_give struct autosa_hw_module *generate_io_module_by_type(
    __isl_take struct autosa_hw_module *module, __isl_keep isl_schedule_node *node,
    struct autosa_array_ref_group *group, struct autosa_kernel *kernel,
    struct autosa_gen *gen, int io_level, int space_dim,
    int is_filter, int is_buffer, int read)
{
  if (!is_filter || !is_buffer)
    return generate_default_io_module(module, node, group, kernel,
                                      gen, io_level, space_dim,
                                      is_filter, is_buffer, read);

  return generate_filter_buffer_io_module(module, node, group, kernel,
                                          gen, io_level, space_dim,
                                          is_filter, is_buffer, read);
}

/* This function updates the data pack factors for I/O modules that access
 * the external DRAM. The module data should also be serialized.
 *
 * The search starts from the DRAM width limit in bytes (64 by default, or
 * the third entry of the user-supplied data pack sizes for the array) and
 * steps down in units of one currently packed element group until a width
 * is found that is a multiple of that unit and whose packing ratio divides
 * - module->coalesce_bound for sparse arrays, or
 * - the serialization bound of the array otherwise (the width must then
 *   also be a power of two).
 *
 * Returns the selected packing factor in number of elements, or
 * module->data_pack_intra if no suitable width exists.
 */
static int update_serialize_data_pack(struct autosa_gen *gen, struct autosa_hw_module *module)
{
  isl_union_map *sizes;
  int *data_pack_ubs = NULL;
  int dram_limit = 64; // bytes
  int ele_size = module->io_groups[0]->array->size;
  int n_lane = module->data_pack_inter;
  int host_pack = -1;

  sizes = extract_sizes_from_str(gen->ctx, module->options->autosa->data_pack_sizes);
  data_pack_ubs = read_data_pack_sizes_array(sizes, module->io_groups[0]->array->name);
  if (data_pack_ubs)
    dram_limit = data_pack_ubs[2];
  free(data_pack_ubs);
  isl_union_map_free(sizes);

  if (module->io_groups[0]->local_array->is_sparse) {
    /* Extract the sparse information */
    int n_nzero = module->io_groups[0]->local_array->n_nzero;
    int n_meta_data = module->io_groups[0]->local_array->n_meta_data;
    /* Bytes occupied by one packed group (non-zeros plus meta data). */
    int unit = ele_size * n_lane * (n_nzero + n_meta_data);

    /* A non-positive unit would never terminate the search and would
     * divide by zero; leave host_pack unset in that case.
     */
    if (unit > 0) {
      for (int limit = dram_limit; limit >= unit; limit -= unit) {
        if (limit % unit == 0 &&
            module->coalesce_bound % (limit / unit) == 0) {
          host_pack = limit / ele_size;
          break;
        }
      }
    }
  } else {
    /* Print the serialization bound and parse it back as an integer. */
    isl_printer *p_str = isl_printer_to_str(gen->ctx);
    p_str = isl_printer_set_output_format(p_str, ISL_FORMAT_C);
    p_str = isl_printer_print_pw_qpolynomial(p_str, module->io_groups[0]->local_array->serialize_bound);
    char *serialize_bound = isl_printer_get_str(p_str);
    isl_printer_free(p_str);
    std::string serialize_bound_str(serialize_bound);
    int serialize_bound_int = stoi(serialize_bound_str);
    free(serialize_bound);
    /* Bytes occupied by one currently packed element group. */
    int unit = ele_size * n_lane;

    /* Same guard as above against a non-terminating search. */
    if (unit > 0) {
      for (int limit = dram_limit; limit >= unit; limit -= unit)
      {
        /* Limit should be a power of two (limit >= unit > 0 here, so the
         * bit test is exact).
         */
        if ((limit & (limit - 1)) != 0)
          continue;
        if (limit % unit == 0 && serialize_bound_int % (limit / unit) == 0)
        {
          host_pack = limit / ele_size;
          break;
        }
      }
    }
  }

  return host_pack != -1? host_pack : module->data_pack_intra;
}

/* This function builds a set of I/O modules for each I/O group.
 * We will first examine if any flow dependence that is associated with the 
 * current group is carried by the array part loops. 
 * In that case, credit control should be added to enforce the dependence.
 * TODO: to be implemented.
 * Next, we will generate the copy-in set and copy-out set of I/O modules for 
 * the I/O groups. At each I/O level, we generate one I/O module.
 * We apply the I/O module pruning by default here.
 * Specifically, if the copy-out set at the current array_part loops equals 
 * the copy-in set of the next array_part loops, there is no need
 * to go off-chip, and we will prune away such I/O modules.
 * If the I/O group has interior I/O at the PE level, the data required for the 
 * next iteration should reside in the PEs.
 * Otherwise, we will connect the copy-out I/O modules to the copy-in I/O modules,
 * and buffer the data on-chip. (TODO: not supported yet.)
 *
 * "in" and "out" select whether copy-in and/or copy-out modules are
 * generated; each is further gated by group->copy_in / group->copy_out.
 * The number of generated modules is stored in *n_modules and the
 * realloc-grown array of modules is returned (NULL if none were generated).
 * As a side effect, group->local_array is updated: its group_ref/mem_port
 * map and n_io_group_refs for modules that talk to memory, and its n_lane
 * for serialized modules.
 */
static __isl_give struct autosa_hw_module **sa_io_module_gen(
    struct autosa_array_ref_group *group,
    struct autosa_gen *gen, int *n_modules, int in, int out)
{  
  isl_schedule_node *node;
  isl_ctx *ctx;
  struct autosa_kernel *kernel;
  int space_dim;
  int io_level;
  struct autosa_hw_module **modules = NULL;
  int module_cnt = 0;
  int credit = 0;

  ctx = gen->ctx;
  /* Prefer the lowered L1 schedule when that option is on and it exists. */
  if (gen->options->autosa->lower_int_io_L1_buffer && group->io_L1_lower_schedule) 
    node = isl_schedule_get_root(group->io_L1_lower_schedule);
  else
    node = isl_schedule_get_root(group->io_schedule);
  
  io_level = group->io_level;
  space_dim = group->space_dim;  
  kernel = gen->kernel;
  node = autosa_tree_move_down_to_kernel(node);

  /* Test if the deps in this I/O group are carried by array part loops.
   * If so, data hazards are possible, and we will set the credit as true
   * so that we could enable credit control between read and write I/O modules to 
   * prevent the data hazards. 
   * TODO: This is not supported yet.
   */
  if (gen->options->autosa->credit_control)
  {
    if (is_flow_dep_carried_by_array_part_loops(group->io_schedule, group, kernel))
      credit = 1;
  }

  /* At each I/O level, generate one I/O module. */
  /* Copy-in group: levels are visited from the outermost to the innermost. */  
  if (in && group->copy_in)
  {    
    for (int i = io_level; i >= 1; i--)
    {
      struct autosa_hw_module *module;
      char *module_name = NULL;
      char *io_mark = NULL;
      isl_printer *p_str;
      int is_filter;
      int is_buffer;
      int innermost, outermost;

      /* Classify the module type. */
      outermost = io_level;
      if (group->io_type == AUTOSA_INT_IO)
        innermost = 1;
      else
        innermost = 2; // IO_L1 is integrated into PEs. No need to generate.

      /* Since we perform I/O clustering automatically, all the I/O modules
       * except the outermost level will be in the filter mode:
       * which means that they will pass data to downstream modules
       * and filter out the data that they need for the lower-level modules
       * they are connected to.
       * With lower_int_io_L1_buffer enabled, the outermost module is a
       * filter as well.
       */  
      if (i == outermost && outermost != innermost) {
        is_filter = 0;
        if (gen->options->autosa->lower_int_io_L1_buffer) {
          is_filter = 1;
        }
      } else
        is_filter = 1;
      
      /* All the innermost modules will be buffered to isolate the computation 
       * and data communication. Otherwise, possible data hazards might cause 
       * the design to get stuck.
       */
      if (i == innermost) 
        is_buffer = 1;
      else
        is_buffer = 0;

      if (gen->options->autosa->two_level_buffer)
      {
        /* When two-level buffering is enabled, 
         * we will implement a second-level buffer at the outermost I/O module.
         */
        if (i == outermost)
          is_buffer = 1;
      }
      if (gen->options->autosa->lower_int_io_L1_buffer)
      {
        /* The outermost module is buffered only if a tile exists for its
         * I/O buffer; this overrides the settings above.
         */
        if (i == outermost) 
          is_buffer = group->io_buffers[outermost - 1]->tile? 1 : 0;
      }      

      /* Generate the I/O module */
      if (i >= innermost && i <= outermost)
      {
        module = autosa_hw_module_alloc(gen);
        module_name = generate_io_module_name(ctx, group, i, 1);
        module->name = module_name;
        module->to_pe = (i == innermost) ? 1 : 0;
        module->to_mem = (i == outermost) ? 1 : 0;
        module->credit = (i == outermost) ? credit : 0;
        /* Recorded before the counter is advanced below. */
        module->n_array_ref = group->local_array->n_io_group_refs;
        module->in = 1;
        module->is_serialized = (gen->options->autosa->host_serialize && module->to_mem) ? 1 : 0;
        if (module->to_mem)
        {
          /* Create the group_ref and mem_port mapping.
           * The map is a flat list of (group_ref, mem_port) pairs.
           */
          for (int p = 0; p < group->n_mem_ports; p++)
          {
            int group_ref_offset = group->local_array->n_io_group_refs;
            int mem_port_offset = group->mem_port_id;                 
            group->local_array->group_ref_mem_port_map.push_back(group_ref_offset + p);
            group->local_array->group_ref_mem_port_map.push_back(mem_port_offset + p);
          }
          group->local_array->n_io_group_refs += group->n_mem_ports;
        }

        module = generate_io_module_by_type(module, node, group, kernel,
                                            gen, i, space_dim, is_filter, is_buffer, 1);
        if (module->is_serialized)
        {
          /* Generate the schedule for serializing/deserializing the host data. */          
          module->serialize_sched = generate_serialize_schedule(
              kernel, group, module, gen, 1);
          if (module->serialize_sched) {
            /* Update the data packing factor. */            
            module->data_pack_serialize = update_serialize_data_pack(gen, module);            
            module->io_groups[0]->local_array->n_lane = module->data_pack_serialize;
            module->io_groups[0]->local_array->array->n_lane = module->data_pack_serialize;
          }
        } else {
          module->is_serialized = 0;
        }

        /* Append the module to the result array. */
        module_cnt++;
        modules = (struct autosa_hw_module **)realloc(modules,
                                                      module_cnt * sizeof(struct autosa_hw_module *));
        modules[module_cnt - 1] = module;
      }
    }
  }
  
  /* Copy-out group: levels are visited from the innermost to the outermost. */  
  if (out && group->copy_out)
  {    
    for (int i = 1; i <= io_level; i++)
    {
      struct autosa_hw_module *module;
      char *module_name = NULL;
      char *io_mark = NULL;
      isl_printer *p_str;
      int is_filter;
      int is_buffer;
      int innermost, outermost;

      /* Classify the module type. */
      outermost = io_level;
      if (group->io_type == AUTOSA_INT_IO)
        innermost = 1;
      else
        innermost = 2; // IO_L1 is integrated into PEs.

      /* Unlike the copy-in case, lower_int_io_L1_buffer is not consulted
       * here.
       */
      if (i == outermost && outermost != innermost)
        is_filter = 0;
      else
        is_filter = 1;
      
      if (i == innermost) 
        is_buffer = 1;
      else
        is_buffer = 0;

      if (gen->options->autosa->two_level_buffer)
      {
        /* When two-level buffering is enabled, 
         * we will implement a second-level buffer at the outermost I/O module.
         */
        if (i == outermost)
          is_buffer = 1;
      }

      /* Generate the I/O module. */
      if (i >= innermost && i <= outermost)
      {
        module = autosa_hw_module_alloc(gen);
        module_name = generate_io_module_name(ctx, group, i, 0);
        module->name = module_name;
        module->to_pe = (i == innermost) ? 1 : 0;
        module->to_mem = (i == outermost) ? 1 : 0;
        module->credit = (i == outermost) ? credit : 0;
        /* Recorded before the counter is advanced below. */
        module->n_array_ref = group->local_array->n_io_group_refs;
        module->in = 0;
        module->is_serialized = (gen->options->autosa->host_serialize && module->to_mem) ? 1 : 0;
        if (module->to_mem)
        {
          /* Create the group_ref and mem_port mapping.
           * The map is a flat list of (group_ref, mem_port) pairs.
           */
          for (int p = 0; p < group->n_mem_ports; p++)
          {
            int group_ref_offset = group->local_array->n_io_group_refs;
            int mem_port_offset = group->mem_port_id;                        
            group->local_array->group_ref_mem_port_map.push_back(group_ref_offset + p);
            group->local_array->group_ref_mem_port_map.push_back(mem_port_offset + p);
          }
          group->local_array->n_io_group_refs += group->n_mem_ports;
        }
        
        module = generate_io_module_by_type(module, node, group, kernel,
                                            gen, i, space_dim, is_filter, is_buffer, 0);
        if (module->is_serialized)
        {
          /* Generate the schedule for serializing/deserializing the host data. */          
          module->serialize_sched = generate_serialize_schedule(
              kernel, group, module, gen, 0);
          if (module->serialize_sched) {
            /* Update the data packing factor. */
            module->data_pack_serialize = update_serialize_data_pack(gen, module);            
            module->io_groups[0]->local_array->n_lane = module->data_pack_serialize;
            module->io_groups[0]->local_array->array->n_lane = module->data_pack_serialize;
          }            
        } else {
          module->is_serialized = 0;
        }

        /* Append the module to the result array. */
        module_cnt++;
        modules = (struct autosa_hw_module **)realloc(modules,
                                                      module_cnt * sizeof(struct autosa_hw_module *));
        modules[module_cnt - 1] = module;
      }
    }
  }

  isl_schedule_node_free(node);
  *n_modules = module_cnt;
  return modules;
}

/* Split off the first "n" members of the band node "node", provided
 * the band has more than "n" members. Otherwise the node is returned
 * as is.
 */
static __isl_give isl_schedule_node *split_band(
    __isl_take isl_schedule_node *node, int n)
{
  if (isl_schedule_node_band_n_member(node) > n)
    return isl_schedule_node_band_split(node, n);

  return node;
}

/* Compute the effective systolic array size, one entry per PE dimension.
 *
 * The size requested by the user (or chosen by default) and enforced by
 * the PE filter may describe PEs that never execute anything.  Instead
 * of that size, the smallest grid covering every PE that actually
 * executes code is returned.
 *
 * "domain" is first restricted to kernel->pe_filter and reduced to its
 * parameter constraints.  The PE ids are parameters there; each of them
 * is turned into the set dimension with the same index (equate, then
 * project out the parameter).
 *
 * The per-dimension size is then derived from the extent of the
 * resulting set (maximal PE id plus one) and simplified with respect to
 * the parameter constraints of kernel->context.
 *
 * Returns NULL if the grid set is lost while converting the PE ids.
 */
static __isl_give isl_multi_pw_aff *extract_sa_grid_size(
    struct autosa_kernel *kernel, __isl_take isl_union_set *domain)
{
  isl_set *pe_grid;
  isl_set *params;
  isl_multi_pw_aff *grid_size;
  int dim;

  domain = isl_union_set_intersect(domain,
                                   isl_union_set_copy(kernel->pe_filter));

  pe_grid = isl_set_from_params(isl_union_set_params(domain));
  pe_grid = isl_set_add_dims(pe_grid, isl_dim_set, kernel->n_sa_dim);

  dim = 0;
  while (dim < kernel->n_sa_dim)
  {
    isl_id *pe_id;
    int param_pos;

    if (!pe_grid)
      return NULL;

    /* Locate the parameter that carries the PE id of this dimension. */
    pe_id = isl_id_list_get_id(kernel->pe_ids, dim);
    param_pos = isl_set_find_dim_by_id(pe_grid, isl_dim_param, pe_id);
    isl_id_free(pe_id);
    if (param_pos < 0)
      isl_die(isl_set_get_ctx(pe_grid), isl_error_internal,
              "missing constraints on PE identifier",
              pe_grid = isl_set_free(pe_grid));

    /* Move the parameter into set dimension "dim". */
    pe_grid = isl_set_equate(pe_grid, isl_dim_param, param_pos,
                             isl_dim_set, dim);
    pe_grid = isl_set_project_out(pe_grid, isl_dim_param, param_pos, 1);

    dim++;
  }

  pe_grid = isl_set_coalesce(pe_grid);
  grid_size = ppcg_size_from_extent(pe_grid);
  params = isl_set_params(isl_set_copy(kernel->context));

  return isl_multi_pw_aff_gist(grid_size, params);
}

/* Internal struct for add_pe_ext_io_copies.
 *
 * Passed as the "user" pointer to add_pe_ext_io_copies_stmt().
 * The callers fill it with a positional initializer, so the order of the
 * fields must not be changed.
 */
struct autosa_add_pe_ext_io_copies_data
{
  /* Kernel being processed; its context provides the space of the
   * empty filters inserted in dummy mode. */
  struct autosa_kernel *kernel;
  /* PE group used to build the I/O access statement and register tiling. */
  struct autosa_array_ref_group *pe_group;
  /* I/O group the transfer statements are generated for. */
  struct autosa_array_ref_group *io_group;
  /* Array reference handled by the current traversal. */
  struct autosa_stmt_access *ref;
  /* If set, the graft is placed before the node; otherwise after it. */
  int read;
  int in; /* I/O direction */
  /* If set, empty filters are inserted and "filter" is recorded. */
  int dummy;
  /* Forwarded to print_io_stmt_prefix() in dummy mode. */
  int reduce;
  /* Output (dummy mode only): domain of the node at which the transfer
   * statement was inserted. */
  isl_union_set *filter;
};

/* Look up the PE group of "local_array" that corresponds to the
 * reference "ref" of an I/O group.
 *
 * For an internal array all accesses are merged into a single PE group,
 * so the first PE group is returned regardless of "ref".
 * Otherwise the PE group whose first reference is "ref" is returned,
 * or NULL if there is no such group.
 * "io_group" is currently not consulted.
 */
static struct autosa_array_ref_group *autosa_find_pe_group(
    struct autosa_local_array_info *local_array,
    struct autosa_array_ref_group *io_group,
    struct autosa_stmt_access *ref)
{
  int g;

  if (local_array->array_type == AUTOSA_INT_ARRAY)
    return local_array->pe_groups[0];

  g = 0;
  while (g < local_array->n_pe_group)
  {
    struct autosa_array_ref_group *candidate = local_array->pe_groups[g];

    if (candidate->refs[0] == ref)
      return candidate;
    g++;
  }

  return NULL;
}

/* Check whether the leaf "node" executes statements that were introduced
 * by an extension node above it.
 *
 * The closest extension ancestor is looked up first; if there is none,
 * the answer is isl_bool_false.  Otherwise the filter of the closest
 * filter ancestor is intersected with the range of that extension:
 * a non-empty intersection yields isl_bool_true.
 *
 * Returns isl_bool_error if "node" is not a leaf.  A failure while
 * computing the intersection is reported as isl_bool_false.
 */
static isl_bool leaf_node_is_extended(__isl_keep isl_schedule_node *node)
{
  isl_schedule_node *ext_node;
  isl_schedule_node *filter_node;
  isl_union_map *extension;
  isl_union_set *overlap;
  isl_bool no_overlap;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf)
    return isl_bool_error;

  /* Climb towards the root until an extension node is met. */
  for (ext_node = isl_schedule_node_copy(node);
       ext_node && isl_schedule_node_has_parent(ext_node);
       ext_node = isl_schedule_node_parent(ext_node))
  {
    if (isl_schedule_node_get_type(ext_node) == isl_schedule_node_extension)
      break;
  }

  if (!ext_node ||
      isl_schedule_node_get_type(ext_node) != isl_schedule_node_extension)
  {
    isl_schedule_node_free(ext_node);
    return isl_bool_false;
  }

  extension = isl_schedule_node_extension_get_extension(ext_node);
  isl_schedule_node_free(ext_node);

  /* Climb again, this time stopping at the closest filter node. */
  for (filter_node = isl_schedule_node_copy(node);
       filter_node && isl_schedule_node_has_parent(filter_node);
       filter_node = isl_schedule_node_parent(filter_node))
  {
    if (isl_schedule_node_get_type(filter_node) == isl_schedule_node_filter)
      break;
  }

  overlap = isl_schedule_node_filter_get_filter(filter_node);
  isl_schedule_node_free(filter_node);
  overlap = isl_union_set_intersect(overlap, isl_union_map_range(extension));

  /* Any non-"false" result of the emptiness test (including an error)
   * means the leaf is not treated as extended.
   */
  no_overlap = isl_union_set_is_empty(overlap);
  isl_union_set_free(overlap);

  return no_overlap ? isl_bool_false : isl_bool_true;
}

/* Insert data transfer statements beside the program statements. 
 * If the statement is under the SIMD loop, the data transfer statements 
 * are inserted before/after the SIMD loop. 
 * Otherwise, it is inserted before/after the statement.
 *
 * This is a callback for isl_schedule_node_map_descendant_bottom_up();
 * "user" points to a struct autosa_add_pe_ext_io_copies_data.
 *
 * Nodes that are returned unchanged:
 * - any node that is not a leaf,
 * - a leaf for which leaf_node_is_extended() reports true,
 * - a leaf whose statement tuple id differs from the input tuple id of
 *   data->ref->access (in dummy mode an empty filter is inserted above
 *   such a leaf first).
 *
 * For the remaining leaves an extension subtree is built that holds the
 * I/O statement, scheduled by a register tiling, optionally packed by
 * "n_lane" elements, and grafted before (read) or after (write) the
 * current position.  In dummy mode the domain of the insertion point is
 * stored in data->filter and the original statement is disabled through
 * an empty filter.
 *
 * On the normal path the returned node is three levels above the
 * position obtained after grafting.  If the access relation is empty,
 * the node is moved up to the kernel mark; NULL is returned if the
 * emptiness test fails.
 */
__isl_give isl_schedule_node *add_pe_ext_io_copies_stmt(
    __isl_take isl_schedule_node *node, void *user)
{
  struct autosa_add_pe_ext_io_copies_data *data =
      (struct autosa_add_pe_ext_io_copies_data *)(user);
  isl_union_set *domain;
  isl_space *space;
  isl_space *acc_space;
  isl_id *id;
  isl_union_map *access;
  int empty;
  isl_multi_aff *from_access;
  isl_ctx *ctx;
  isl_schedule_node *graft;
  isl_multi_aff *ma;
  isl_multi_pw_aff *mpa;
  isl_multi_union_pw_aff *mupa;
  struct autosa_array_ref_group *pe_group = data->pe_group;
  struct autosa_array_ref_group *io_group = data->io_group;
  struct autosa_array_tile *tile;
  int read = data->read;
  isl_union_map *sched;
  isl_union_map *ref_access;
  isl_map *acc;
  isl_bool ok;
  int is_simd;
  isl_printer *p_str;
  char *stmt_name;
  isl_union_set *empty_filter;
  /* Data packing factor; scaled further below for sparse arrays. */
  int n_lane = io_group->n_lane;

  /* Test if the current stmt contains the reference. */
  if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf)
    return node;

  /* Test if the node is under any extension node and if the 
   * node is extended by the extension node. 
   */
  if (!leaf_node_is_extended(node))
  {
    isl_set *set;
    isl_id *new_id;
    /* Tuple id of the statement executed at this leaf. */
    domain = isl_schedule_node_get_domain(node);
    set = isl_set_from_union_set(domain);
    space = isl_set_get_space(set);
    isl_set_free(set);
    id = isl_space_get_tuple_id(space, isl_dim_set);
    isl_space_free(space);
    /* Tuple id of the statement that performs the access "ref". */
    acc_space = isl_map_get_space(data->ref->access);
    new_id = isl_space_get_tuple_id(acc_space, isl_dim_in);
    /* The two isl_id handles are compared by pointer. */
    if (id != new_id)
    {
      isl_space_free(acc_space);
      isl_id_free(id);
      isl_id_free(new_id);

      /* Insert empty filter for dummy module. */
      if (data->dummy)
      {
        empty_filter = isl_union_set_from_set(
            isl_set_empty(isl_set_get_space(data->kernel->context)));
        node = isl_schedule_node_insert_filter(node, empty_filter);
      }
      return node;
    }
    isl_id_free(id);
    isl_id_free(new_id);
    isl_space_free(acc_space);
  }
  else
  {
    /* Simply return for the extension nodes. */
    return node;
  }

  ctx = isl_schedule_node_get_ctx(node);
  tile = NULL;
  /* Examine if there is any SIMD mark above. */
  is_simd = is_node_under_simd(node);

//#ifdef _DEBUG
//  DBGSCHDNODE(stdout, node, ctx);
//#endif

  /* Aggregate the copy-in/out access
   * S -> [D -> A]
   * S: statement domain elements
   * D: prefix schedule dimensions
   * A: access
   */
  if (is_simd)
  {
    /* We will insert the statements before/after the SIMD loop. */
    if (data->dummy)
    {
      isl_union_set *empty_filter;
      empty_filter = isl_union_set_from_set(isl_set_empty(
          isl_set_get_space(data->kernel->context)));
      node = isl_schedule_node_insert_filter(node, empty_filter);
    }
    node = autosa_tree_move_up_to_mark(node, "simd");
  }
  access = io_comm_access_ref(data->kernel, node, io_group, data->ref, read);
  empty = isl_union_map_is_empty(access);
  if (empty < 0 || empty)
  {
    /* Nothing to transfer (or an error occurred): no graft is built. */
    isl_union_map_free(access);
    if (empty < 0)
      return isl_schedule_node_free(node);
    return autosa_tree_move_up_to_kernel(node);
  }

  if (data->dummy)
  {
    /* Record the domain of the insertion point for the caller.
     * Any value previously stored in data->filter is overwritten.
     */
    data->filter = isl_schedule_node_get_domain(node);
  }

  //pe_group->array->global = 1;
  //pe_group->local_array->global = 1;

  /* read.fifoX[D -> A] -> [D -> A] */
  p_str = isl_printer_to_str(ctx);
  if (data->dummy)
    p_str = print_io_stmt_prefix(p_str, data->in, data->dummy, data->reduce, io_group);  
  else
    p_str = print_io_stmt_prefix(p_str, read, data->dummy, 0, io_group);
  
  stmt_name = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  from_access = autosa_create_io_access_stmt(ctx, pe_group, io_group,
                                             autosa_array_ref_group_tile(pe_group),
                                             isl_schedule_node_get_schedule_depth(node), stmt_name);
  free(stmt_name);

  /* Create a register tiling. */
  tile = create_register_tiling(node, pe_group, data->ref);
  /* [D -> A] -> T */
  ma = isl_multi_aff_copy(tile->tiling);
  ma = isl_multi_aff_pullback_multi_aff(ma,
                                        isl_multi_aff_copy(from_access));
  mpa = isl_multi_pw_aff_from_multi_aff(ma);
  /* read.fifoX[D -> A] -> T */
  mupa = isl_multi_union_pw_aff_from_multi_pw_aff(mpa);
  /* [D -> A] */
  domain = isl_union_map_range(access);
  /* read.fifoX[D -> A] */
  domain = isl_union_set_preimage_multi_aff(domain, from_access);
  /* read.fifoX[D -> A] -> D */
  access = isl_union_set_wrapped_domain_map(domain);
  /* D -> read.fifoX[D -> A] */
  access = isl_union_map_reverse(access);
  access = isl_union_map_coalesce(access);

//#ifdef _DEBUG
//  DBGUMAP(stdout, access, ctx);
//#endif

  /* Build the subtree to be grafted: an extension node introducing the
   * I/O statement instances, with a band below it that schedules them
   * according to the tiling.
   */
  graft = isl_schedule_node_from_extension(access);

//#ifdef _DEBUG
//  DBGSCHDNODE(stdout, graft, ctx);
//  DBGMUPA(stdout, mupa, ctx);
//#endif  
  graft = isl_schedule_node_child(graft, 0);
  graft = isl_schedule_node_insert_partial_schedule(graft, mupa);

  /* Modify the n_lane for the sparse data */
  if (io_group->local_array->is_sparse) {
    n_lane *= (io_group->local_array->compress_ratio * io_group->local_array->n_nzero);
  }

  if (n_lane > 1)
  {
    /* Perform data packing. */
    int n_index;
    int tile_size[1];
    isl_id *id;
    isl_union_map *umap;
    isl_union_set *filter;

    n_index = isl_schedule_node_band_n_member(graft);
    /* Split off the last dimension. */
    if (n_index > 1)
    {
      graft = isl_schedule_node_band_split(graft, n_index - 1);
      graft = isl_schedule_node_child(graft, 0);
    }
    /* Tile the last dimension. */
    tile_size[0] = n_lane;
    graft = autosa_tile_band(graft, tile_size);
    graft = isl_schedule_node_child(graft, 0);
    /* Create a filter. */
    filter = schedule_eq_lb(graft);
    graft = isl_schedule_node_insert_filter(graft, filter);
  }

  /* Move back to the root of the graft before attaching it. */
  while (graft && isl_schedule_node_has_parent(graft))
    graft = isl_schedule_node_parent(graft);

  /* Reads are placed before the current position, writes after it. */
  if (read) {
    node = isl_schedule_node_graft_before(node, graft);
  } else {
    node = isl_schedule_node_graft_after(node, graft);
  }

//#ifdef _DEBUG
//  DBGSCHDNODE(stdout, node, isl_schedule_node_get_ctx(node));
//#endif

  if (data->dummy) {
    /* insert an empty filter. */
    empty_filter = isl_union_set_from_set(isl_set_empty(
        isl_set_get_space(data->kernel->context)));
    node = isl_schedule_node_insert_filter(node, empty_filter);
  }

  /* Climb out of the structure created by the graft. */
  node = isl_schedule_node_parent(node); // filter
  node = isl_schedule_node_parent(node); // sequence
  node = isl_schedule_node_parent(node); // extension

  autosa_array_tile_free(tile);

  return node;
}

/* Add data transfer statements for the array references of "io_group".
 * "node" points to the "PE" mark.
 *
 * Every selected reference triggers one bottom-up traversal of the
 * subtree with add_pe_ext_io_copies_stmt(), in non-dummy mode and with
 * the I/O direction equal to "read".
 *
 * NOTE(review): because "&&" binds tighter than "||", a reference is
 * selected if the array is external, or the array is internal and the
 * reference is read in read mode, or the reference is written in write
 * mode (whatever the array type).  The parenthesization of the original
 * condition suggests the last case was meant to be restricted to
 * internal arrays as well; the evaluated behavior is kept here.
 */
static __isl_give isl_schedule_node *add_pe_ext_io_copies(
    struct autosa_kernel *kernel,
    struct autosa_local_array_info *local_array,
    struct autosa_array_ref_group *io_group,
    __isl_take isl_schedule_node *node, int read)
{
  int r;

  for (r = 0; r < io_group->n_ref; ++r)
  {
    struct autosa_stmt_access *access = io_group->refs[r];
    struct autosa_add_pe_ext_io_copies_data cb_data;
    const int ext_array =
        io_group->local_array->array_type == AUTOSA_EXT_ARRAY;
    const int int_array_read =
        io_group->local_array->array_type == AUTOSA_INT_ARRAY &&
        read && access->read;
    const int written = !read && access->write;

    if (!ext_array && !int_array_read && !written)
      continue;

    cb_data.kernel = kernel;
    cb_data.pe_group = autosa_find_pe_group(local_array, io_group, access);
    cb_data.io_group = io_group;
    cb_data.ref = access;
    cb_data.read = read;
    cb_data.in = read;
    cb_data.dummy = 0;
    cb_data.reduce = 0;
    cb_data.filter = NULL;

    node = isl_schedule_node_map_descendant_bottom_up(
        node, &add_pe_ext_io_copies_stmt, &cb_data);
  }

  return node;
}

/* Add the statements for copy-in/out the data for array references associated with
 * interior I/O.
 * The "node" is pointed to the "PE" mark.
 *
 * "read" selects copy-in (graft placed before the child of the "PE" mark)
 * or copy-out (graft placed after it).
 * The transfer statement is scheduled by the tiling of the PE group's
 * tile, optionally packed by io_group->n_lane elements, and the band is
 * annotated with an "hls_pipeline" mark.
 *
 * The returned node points to the "PE" mark again.  If the access
 * relation is empty nothing is inserted; NULL is returned if the
 * emptiness test fails.
 */
__isl_give isl_schedule_node *add_pe_int_io_copies(
    struct autosa_kernel *kernel,
    struct autosa_local_array_info *local_array,
    struct autosa_array_ref_group *io_group,
    __isl_take isl_schedule_node *node, int read)
{
  struct autosa_array_tile *tile;
  isl_union_map *access;
  isl_schedule_node *graft;
  int empty;
  isl_multi_aff *from_access;
  isl_multi_aff *ma;
  isl_multi_pw_aff *mpa;
  isl_multi_union_pw_aff *mupa;
  isl_union_set *domain;
  struct autosa_array_ref_group *pe_group;
  /* Data packing factor of the I/O group. */
  int n_lane = io_group->n_lane;
  isl_printer *p_str;
  char *stmt_name;
  isl_id *id;

  /* Work on the child of the "PE" mark. */
  node = isl_schedule_node_child(node, 0);
  /* For array references with interior I/O, 
   * search for the corresponding PE group. */
  /* No reference is given: for an internal array autosa_find_pe_group()
   * then returns the first PE group. */
  pe_group = autosa_find_pe_group(local_array, io_group, NULL);
  tile = autosa_array_ref_group_tile(pe_group);

  /* Aggregate the copy-in/out access 
   * S -> [D -> A] 
   * S: statement domain elements
   * D: prefix schedule dimensions 
   * A: access */
  access = io_comm_access(kernel, node, io_group, read);
  empty = isl_union_map_is_empty(access);
  if (empty < 0 || empty)
  {
    /* Nothing to transfer (or an error occurred): no graft is built. */
    isl_union_map_free(access);
    if (empty < 0)
      return isl_schedule_node_free(node);
    return autosa_tree_move_up_to_pe(node);
  }

  //pe_group->array->global = 1;
  //pe_group->local_array->global = 1;

  /* read.fifoX[D -> A] -> [D -> A] */
  /* Generate statement name. */
  p_str = isl_printer_to_str(kernel->ctx);
  p_str = print_io_stmt_prefix(p_str, read, 0, 0, io_group);  
  stmt_name = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  from_access = autosa_create_io_access_stmt(kernel->ctx, pe_group, io_group,
                                             autosa_array_ref_group_tile(pe_group),
                                             isl_schedule_node_get_schedule_depth(node), stmt_name);
  free(stmt_name);

  /* [D -> A] -> T */
  ma = isl_multi_aff_copy(tile->tiling);
  ma = isl_multi_aff_pullback_multi_aff(ma,
                                        isl_multi_aff_copy(from_access));
  mpa = isl_multi_pw_aff_from_multi_aff(ma);
  /* read.fifoX[D -> A] -> T */
  mupa = isl_multi_union_pw_aff_from_multi_pw_aff(mpa);
  /* [D -> A] */
  domain = isl_union_map_range(access);
  /* If the array is not a scalar, then we copy in/out the entire
   * tile to/from the local memory. 
   */
  /* NOTE(review): the replacement below is only applied for reads,
   * although the comment above speaks of copy in/out. */
  if (read && !autosa_array_is_scalar(io_group->array))
  {
    isl_map *map;
    isl_set *set;
    set = isl_map_domain(isl_map_from_union_map(isl_union_set_unwrap(domain)));
    map = group_tile_buffer(io_group, io_group->pe_tile);
    map = isl_map_intersect_domain(map, set);
    domain = isl_union_set_from_set(isl_map_wrap(map));
  }

  /* read.fifoX[D -> A] */
  domain = isl_union_set_preimage_multi_aff(domain, from_access);
  /* read.fifoX[D -> A] -> D */
  access = isl_union_set_wrapped_domain_map(domain);
  /* D -> read.fifoX[D -> A] */
  access = isl_union_map_reverse(access);
  access = isl_union_map_coalesce(access);

  /* Build the subtree to be grafted: an extension node introducing the
   * I/O statement instances, with a band below it that schedules them
   * according to the tiling.
   */
  graft = isl_schedule_node_from_extension(access);
  graft = isl_schedule_node_child(graft, 0);
  graft = isl_schedule_node_insert_partial_schedule(graft, mupa);

  if (n_lane > 1)
  {
    /* Perform data packing. */
    int n_index;
    int tile_size[1];
    isl_id *id;
    isl_union_map *umap;
    isl_union_set *filter;

    n_index = isl_schedule_node_band_n_member(graft);
    /* Split off the last dimension. */
    if (n_index > 1)
    {
      graft = isl_schedule_node_band_split(graft, n_index - 1);
      graft = isl_schedule_node_child(graft, 0);
    }
    /* Tile the last dimension. */
    tile_size[0] = n_lane;
    graft = autosa_tile_band(graft, tile_size);
    graft = isl_schedule_node_child(graft, 0);
    /* Create a filter. */
    filter = schedule_eq_lb(graft);
    graft = isl_schedule_node_insert_filter(graft, filter);
    /* Move to the tile loop. */
    graft = isl_schedule_node_parent(graft);
  }

  /* Insert a "pipeline" mark inside the band node. */
  id = isl_id_alloc(kernel->ctx, "hls_pipeline", NULL);
  graft = isl_schedule_node_child(graft, 0);
  graft = isl_schedule_node_insert_mark(graft, id);
  graft = isl_schedule_node_parent(graft);

  /* Move back to the root of the graft before attaching it. */
  while (graft && isl_schedule_node_has_parent(graft))
    graft = isl_schedule_node_parent(graft);

  /* Copy-in goes before the current position, copy-out after it. */
  if (read)
  {
    node = isl_schedule_node_graft_before(node, graft);
  }
  else
  {
    node = isl_schedule_node_graft_after(node, graft);
  }

  node = autosa_tree_move_up_to_pe(node);

  return node;
}

/* Callback for isl_schedule_node_every_descendant().
 * Returns isl_bool_false if "node" is a mark node named "latency" and
 * isl_bool_true for every other node.  "user" is not used.
 */
static isl_bool find_latency_mark(__isl_keep isl_schedule_node *node, void *user)
{
  isl_id *mark_id;
  int is_latency;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
    return isl_bool_true;

  mark_id = isl_schedule_node_mark_get_id(node);
  is_latency = strcmp(isl_id_get_name(mark_id), "latency") == 0;
  isl_id_free(mark_id);

  return is_latency ? isl_bool_false : isl_bool_true;
}

/* Insert an "hls_pipeline" mark below the innermost "latency" mark,
 * so that the corresponding loop is eventually pipelined.
 *
 * Callback for isl_schedule_node_map_descendant_bottom_up(); "user"
 * points to the autosa_kernel.  Only "latency" mark nodes whose subtree
 * contains no further "latency" mark are modified: the new mark is
 * inserted at the grandchild of the "latency" mark, i.e. under the band
 * node.  The returned node is at the same position as the input.
 */
static __isl_give isl_schedule_node *insert_pipeline_mark(
    __isl_take isl_schedule_node *node, void *user)
{
  struct autosa_kernel *kernel = (struct autosa_kernel *)user;
  isl_id *mark_id;
  isl_bool innermost;
  int is_latency;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
    return node;

  mark_id = isl_schedule_node_mark_get_id(node);
  is_latency = strcmp(isl_id_get_name(mark_id), "latency") == 0;
  isl_id_free(mark_id);
  if (!is_latency)
    return node;

  /* Look for another "latency" mark strictly below this one. */
  node = isl_schedule_node_child(node, 0);
  innermost = isl_schedule_node_every_descendant(node,
                                                 &find_latency_mark, NULL);
  node = isl_schedule_node_parent(node);
  if (!innermost)
    return node;

  /* Go down two levels, add the mark there and come back up. */
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_mark(
      node, isl_id_alloc(kernel->ctx, "hls_pipeline", NULL));
  node = isl_schedule_node_parent(node);
  node = isl_schedule_node_parent(node);

  return node;
}

/* Tile the SIMD loop for the sparsity.
 *
 * Callback for schedule tree traversals; "user" points to the
 * autosa_kernel.  Only mark nodes named "simd" are modified.
 *
 * For such a node, the schedule of the subtree is restricted to the
 * iterations whose first schedule dimension is smaller than
 * kernel->simd_w / kernel->compress_ratio, and the statement instances
 * that remain are inserted as a filter above the first band node found
 * below the mark.  The returned node points to the "simd" mark again.
 */
static __isl_give isl_schedule_node *tile_simd_sparse(
  __isl_take isl_schedule_node *node, void *user)
{
  struct autosa_kernel *kernel = (struct autosa_kernel *)user;
  isl_id *id;
  int is_simd;
  int new_ub;
  isl_union_map *umap;
  isl_union_set *uset, *filter;
  isl_set *set;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
    return node;

  /* Inspect the name before releasing our reference to the id. */
  id = isl_schedule_node_mark_get_id(node);
  is_simd = !strcmp(isl_id_get_name(id), "simd");
  isl_id_free(id);
  if (!is_simd)
    return node;

  new_ub = kernel->simd_w / kernel->compress_ratio;

  /* Schedule values of the subtree, bounded by the new upper bound. */
  umap = isl_schedule_node_get_subtree_schedule_union_map(node);
  uset = isl_union_map_range(isl_union_map_copy(umap));
  set = isl_set_from_union_set(uset);
  set = isl_set_upper_bound_si(set, isl_dim_set, 0, new_ub - 1);
  /* Statement instances mapped to the bounded schedule values. */
  filter = isl_union_map_range(isl_union_map_intersect_domain(
              isl_union_map_reverse(umap), isl_union_set_from_set(set)));

  /* Descend to the first band node; stop if the node is lost on error. */
  while (node &&
         isl_schedule_node_get_type(node) != isl_schedule_node_band) {
    node = isl_schedule_node_child(node, 0);
  }
  node = isl_schedule_node_insert_filter(node, filter);

  /* Climb back to the "simd" mark. */
  while (node && isl_schedule_node_has_parent(node)) {
    if (isl_schedule_node_get_type(node) == isl_schedule_node_mark) {
      isl_id *mark_id;
      int found;

      mark_id = isl_schedule_node_mark_get_id(node);
      found = !strcmp(isl_id_get_name(mark_id), "simd");
      isl_id_free(mark_id);
      if (found)
        break;
    }
    node = isl_schedule_node_parent(node);
  }

  return node;
}

/* Insert an "hls_unroll" mark below each "simd" mark, so that the
 * corresponding loop is eventually unrolled.
 *
 * Callback for schedule tree traversals; "user" points to the
 * autosa_kernel.  For the Catapult HLS target the mark is inserted at
 * the child of the "simd" mark (above the loop); for every other target
 * it is inserted at the grandchild (under the band node).
 * The returned node is at the same position as the input.
 */
static __isl_give isl_schedule_node *insert_unroll_mark(
  __isl_take isl_schedule_node *node, void *user)
{
  struct autosa_kernel *kernel = (struct autosa_kernel *)user;
  isl_id *mark_id;
  int is_simd;
  int depth;
  int step;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
    return node;

  mark_id = isl_schedule_node_mark_get_id(node);
  is_simd = strcmp(isl_id_get_name(mark_id), "simd") == 0;
  isl_id_free(mark_id);
  if (!is_simd)
    return node;

  /* Number of levels to descend before inserting the mark. */
  depth = (kernel->options->target == AUTOSA_TARGET_CATAPULT_HLS_C) ? 1 : 2;

  for (step = 0; step < depth; ++step)
    node = isl_schedule_node_child(node, 0);

  node = isl_schedule_node_insert_mark(
      node, isl_id_alloc(kernel->ctx, "hls_unroll", NULL));

  for (step = 0; step < depth; ++step)
    node = isl_schedule_node_parent(node);

  return node;
}

/* Insert a context node at "node" that introduces the PE identifiers
 * (kernel->pe_ids) together with their bounds, which are taken from
 * kernel->sa_grid_size.  The context starts out as the universe of the
 * space of kernel->context.
 */
static __isl_give isl_schedule_node *insert_context(struct autosa_kernel *kernel,
                                                    __isl_take isl_schedule_node *node)
{
  isl_set *pe_bounds = isl_set_universe(isl_set_get_space(kernel->context));

  pe_bounds = add_bounded_parameters_dynamic(pe_bounds,
                                             kernel->sa_grid_size,
                                             kernel->pe_ids);

  return isl_schedule_node_insert_context(node, pe_bounds);
}

/* Fill in "var", the description of the local buffer inside the PE that
 * backs the PE group "group".
 *
 * The array partitioning factor (var->n_part) is the least common
 * multiple of the data packing factors of all I/O groups of "local", so
 * that every I/O group can access its packed elements without bank
 * conflicts.
 *
 * The variable is named after the group, followed by "suffix" if it is
 * not NULL.  Without a tile the variable holds a single element;
 * otherwise its extents are the tile sizes, where the last extent is
 * scaled by kernel->n_nzero / kernel->vec_len if the array is sparse or
 * "sparse_modify_size" is set.
 */
static void create_pe_module_var(isl_ctx *ctx,
                                 struct autosa_kernel *kernel,
                                 struct autosa_array_ref_group *group,
                                 struct autosa_kernel_var *var, struct autosa_local_array_info *local,
                                 const char *suffix, int sparse_modify_size)
{
  struct autosa_array_tile *tile;
  isl_printer *name_printer;
  isl_val *lcm;
  int scale_last;
  int g;
  int d;

  var->array = group->array;
  var->type = autosa_array_ref_group_type(group);
  var->n_lane = 1;

  /* lcm(a, b) = a * b / gcd(a, b), accumulated over all I/O groups. */
  lcm = isl_val_int_from_si(ctx, 1);
  for (g = 0; g < local->n_io_group; ++g)
  {
    isl_val *lanes = isl_val_int_from_si(ctx, local->io_groups[g]->n_lane);
    isl_val *product = isl_val_mul(isl_val_copy(lanes), isl_val_copy(lcm));

    lcm = isl_val_div(product, isl_val_gcd(lanes, lcm));
  }
  var->n_part = isl_val_get_num_si(lcm);
  isl_val_free(lcm);

  tile = autosa_array_ref_group_tile(group);

  name_printer = isl_printer_to_str(ctx);
  name_printer = autosa_array_ref_group_print_name(group, name_printer);
  if (suffix)
    name_printer = isl_printer_print_str(name_printer, suffix);
  var->name = isl_printer_get_str(name_printer);
  isl_printer_free(name_printer);

  /* No tile: the variable is a single element. */
  if (tile == NULL)
  {
    var->size = isl_vec_alloc(ctx, 1);
    var->size = isl_vec_set_element_si(var->size, 0, 1);
    return;
  }

  scale_last = group->local_array->is_sparse || sparse_modify_size;
  var->size = isl_vec_alloc(ctx, group->array->n_index);
  for (d = 0; d < group->array->n_index; ++d)
  {
    isl_val *extent = isl_val_copy(tile->bound[d].size);

    /* Only the innermost dimension is affected by the sparse format. */
    if (scale_last && d == group->array->n_index - 1)
    {
      extent = isl_val_mul_ui(extent, kernel->n_nzero);
      extent = isl_val_div_ui(extent, kernel->vec_len);
    }
    var->size = isl_vec_set_element_val(var->size, d, extent);
  }
}

/* Create the local buffer variables inside the PE module.
 *
 * One variable is created for every PE group, over all arrays of the
 * kernel, whose access type is not AUTOSA_ACCESS_GLOBAL.  The variables
 * are stored in module->var and counted in module->n_var.
 * For a dense external array in a sparse kernel the variable size is
 * adjusted for sparsity (last argument of create_pe_module_var()).
 *
 * Returns isl_stat_error if the variable array cannot be allocated.
 */
static isl_stat create_pe_module_vars(struct autosa_hw_module *module,
                                      struct autosa_kernel *kernel)
{
  int n_var = 0;
  int slot = 0;
  int a, g;

  /* First pass: count the variables that are needed. */
  for (a = 0; a < kernel->n_array; ++a)
  {
    struct autosa_local_array_info *local = &kernel->array[a];

    for (g = 0; g < local->n_pe_group; ++g)
    {
      if (autosa_array_ref_group_type(local->pe_groups[g]) !=
          AUTOSA_ACCESS_GLOBAL)
        n_var++;
    }
  }

  module->var = isl_calloc_array(kernel->ctx, struct autosa_kernel_var, n_var);
  if (!module->var)
    return isl_stat_error;
  module->n_var = n_var;

  /* Second pass: describe each variable. */
  for (a = 0; a < kernel->n_array; ++a)
  {
    struct autosa_local_array_info *local = &kernel->array[a];

    for (g = 0; g < local->n_pe_group; ++g)
    {
      struct autosa_array_ref_group *pe_group = local->pe_groups[g];
      int sparse_modify_size;

      if (autosa_array_ref_group_type(pe_group) == AUTOSA_ACCESS_GLOBAL)
        continue;

      sparse_modify_size = kernel->sparse &&
                           local->array_type == AUTOSA_EXT_ARRAY &&
                           local->is_sparse == 0;
      create_pe_module_var(kernel->ctx, kernel, pe_group,
                           &module->var[slot], local, NULL,
                           sparse_modify_size);
      slot++;
    }
  }

  return isl_stat_ok;
}

/* Add the data transfer statements of a PE dummy module for the array
 * references of "io_group".  "node" points to the "PE" mark.
 *
 * Every selected reference triggers one bottom-up traversal of the
 * subtree with add_pe_ext_io_copies_stmt() in dummy mode; "in" and
 * "reduce" are forwarded to it, while its "read" field is always 1
 * (the "read" argument only takes part in selecting the references).
 * The domains recorded by the traversals are collected and inserted as
 * a filter at the child of the "PE" mark.
 *
 * NOTE(review): because "&&" binds tighter than "||", a reference is
 * selected if the array is external, or the array is internal and the
 * reference is read in read mode, or the reference is written in write
 * mode (whatever the array type).  The evaluated behavior is kept here.
 *
 * NOTE(review): if a traversal never records a domain, the recorded
 * filter stays NULL and is passed to isl_union_set_union() as is.
 */
static __isl_give isl_schedule_node *add_pe_ext_io_copies_dummy(
    struct autosa_kernel *kernel,
    struct autosa_local_array_info *local_array,
    struct autosa_array_ref_group *io_group,
    __isl_take isl_schedule_node *node, int read, int in, int reduce)
{
  isl_union_set *collected;
  int r;

  collected = isl_union_set_from_set(
      isl_set_empty(isl_set_get_space(kernel->context)));

  for (r = 0; r < io_group->n_ref; ++r)
  {
    struct autosa_stmt_access *access = io_group->refs[r];
    struct autosa_add_pe_ext_io_copies_data cb_data;
    const int ext_array =
        io_group->local_array->array_type == AUTOSA_EXT_ARRAY;
    const int int_array_read =
        io_group->local_array->array_type == AUTOSA_INT_ARRAY &&
        read && access->read;
    const int written = !read && access->write;

    if (!ext_array && !int_array_read && !written)
      continue;

    cb_data.kernel = kernel;
    cb_data.pe_group = autosa_find_pe_group(local_array, io_group, access);
    cb_data.io_group = io_group;
    cb_data.ref = access;
    cb_data.read = 1;
    cb_data.in = in;
    cb_data.dummy = 1;
    cb_data.reduce = reduce;
    cb_data.filter = NULL;

    node = isl_schedule_node_map_descendant_bottom_up(
        node, &add_pe_ext_io_copies_stmt, &cb_data);
    collected = isl_union_set_union(collected, cb_data.filter);
  }

  collected = isl_union_set_coalesce(collected);

  /* Restrict the subtree below the "PE" mark to the collected domains. */
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_filter(node, collected);
  return isl_schedule_node_parent(node);
}

/* Create the schedule for the PE dummy module that collects/sends the dummy data.
 * If "in" is 1, generate dummy module collects the dummy data.
 * Else, generate dummy module sends the dummy data.
 *
 * The schedule is derived from a duplicate of gen->schedule:
 * - the band below the "array" mark is split after kernel->n_sa_dim
 *   members,
 * - the dummy I/O statements of "group" are added below the "PE" mark,
 * - "hls_pipeline" marks are added (except for the Catapult HLS target),
 * - a "module" mark referring to "module" is inserted below the kernel
 *   mark, with a context node introducing the PE ids above it,
 * - the PE filter is inserted below the "array" mark.
 */
static __isl_give isl_schedule *pe_module_dummy_gen(struct autosa_gen *gen,
                                                    struct autosa_hw_module *module, 
                                                    struct autosa_array_ref_group *group,
                                                    int in)
{
  isl_schedule *schedule;
  isl_schedule_node *node;
  isl_id *id, *hw_id;
  struct autosa_kernel *kernel;

  schedule = gen->schedule;
  schedule = isl_schedule_dup(schedule);
  node = isl_schedule_get_root(schedule);
  isl_schedule_free(schedule);
  node = autosa_tree_move_down_to_kernel(node);

  /* The kernel is attached to the kernel mark as its user pointer. */
  id = isl_schedule_node_mark_get_id(node);
  kernel = (struct autosa_kernel *)isl_id_get_user(id);
  isl_id_free(id);

  node = autosa_tree_move_down_to_array(node, kernel->core);
  node = isl_schedule_node_child(node, 0);
  node = split_band(node, kernel->n_sa_dim);
  node = autosa_tree_move_down_to_pe(node, kernel->core);
  node = add_pe_ext_io_copies_dummy(
            kernel, group->local_array, group, node, 1, in, 
            gen->options->autosa->local_reduce && group->attached_drain_group);

  if (gen->options->target != AUTOSA_TARGET_CATAPULT_HLS_C) {
    /* Insert "pipeline" mark under the last "latency" mark. */
    node = isl_schedule_node_map_descendant_bottom_up(node,
                                                      &insert_pipeline_mark, kernel);
  }

  /* No "hls_unroll" mark is inserted for the dummy module. */

  /* Add module mark after the kernel mark. */
  hw_id = isl_id_alloc(gen->ctx, "module", module);
  node = autosa_tree_move_up_to_kernel(node);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_mark(node, hw_id);

  /* Add the PE id filter. */
  node = autosa_tree_move_up_to_kernel(node);
  /* isl_schedule_node_child() consumes its argument, so the result has
   * to be kept: the old pointer is not guaranteed to stay valid.
   */
  node = isl_schedule_node_child(node, 0);
  node = insert_context(kernel, node);
  node = autosa_tree_move_down_to_array(node, kernel->core);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_filter(node,
                                         isl_union_set_copy(kernel->pe_filter));

  schedule = isl_schedule_node_get_schedule(node);
  isl_schedule_node_free(node);

  return schedule;
}

/* Modify the input "schedule" to describe the PE module.
 * Set the schedule dimensions of space loops as parameters.
 *
 * For interior I/O groups
 * - add copy-in before PE computation (RAW, RAR)
 * - add copy-out after PE computation (RAW)
 *   - domain: S -> type[D -> access]
 *   - schedule: type[D -> access] -> tiling
 * For exterior I/O groups
 *   for each access in the group
 *   - add copy-in before user statement (RAW, RAR)
 *   - add copy-out after user statement (RAW, RAR)
 *     - domain: S -> type[D -> access]
 *     - schedule: type[D -> access] -> tiling 
 *       (if any, otherwise, create a register tiling)
 * For WAW group 
 * - for each access in the group
 *   - add write-out after user statement (WAW)
 *     - domain: S -> type[D -> access]
 *     - schedule: type[D -> access] -> tiling
 *
 * The function works on a duplicate of gen->schedule and returns a freshly
 * allocated module of type PE_MODULE named "PE" whose "sched" field holds
 * the transformed schedule.  Every I/O group of every kernel array (and each
 * drain group whose array_io_dir is not IO_NULL) is recorded in
 * module->io_groups.  A PE dummy module is additionally created for each
 * I/O group that is not AUTOSA_INT_IO, has pe_io_dir == IO_INOUT and has
 * copy_in or copy_out set.
 *
 * As side effects, kernel->pe_ids, kernel->pe_filter and
 * kernel->sa_grid_size are (re)assigned.
 */
static __isl_give struct autosa_hw_module *sa_pe_module_gen(struct autosa_gen *gen)
{
  isl_schedule_node *node;
  isl_id *id;
  struct autosa_kernel *kernel;
  isl_schedule *schedule, *new_schedule;
  isl_union_set *domain;
  struct autosa_hw_module *module;
  isl_id *hw_id;

  module = autosa_hw_module_alloc(gen);

  /* Add the filters for PEs. */
  schedule = gen->schedule;
  schedule = isl_schedule_dup(schedule);
  node = isl_schedule_get_root(schedule);
  node = autosa_tree_move_down_to_kernel(node);

  /* The kernel description is stored as the user pointer of the kernel mark. */
  id = isl_schedule_node_mark_get_id(node);
  kernel = (struct autosa_kernel *)isl_id_get_user(id);
  isl_id_free(id);
  domain = isl_schedule_node_get_domain(node);

  /* Split off the first n_sa_dim members of the band below the array mark
   * and derive the PE ids, the PE filter and the grid size from it.
   */
  node = autosa_tree_move_down_to_array(node, kernel->core);
  node = isl_schedule_node_child(node, 0);
  node = split_band(node, kernel->n_sa_dim);
  kernel->pe_ids = ppcg_scop_generate_names(gen->prog->scop,
                                            kernel->n_sa_dim, "p");
  kernel->pe_filter = set_schedule_modulo(node, kernel->pe_ids,
                                          kernel->sa_dim);
  kernel->sa_grid_size = extract_sa_grid_size(kernel, domain);

  /* Add the statements for I/O groups with exterior I/O at the user 
   * statement level. 
   * Add the statements for I/O group with interior I/O at the PE level.
   */
  node = autosa_tree_move_down_to_pe(node, kernel->core);
  /* Add copy-in/copy-out statements */
  for (int i = 0; i < kernel->n_array; ++i)
  {
    struct autosa_local_array_info *array = &kernel->array[i];
    for (int j = 0; j < array->n_io_group; j++)
    {
      struct autosa_array_ref_group *group = array->io_groups[j];
      if (group->local_array->array_type == AUTOSA_EXT_ARRAY)
      {
        /* The last argument selects copy-in (1) or copy-out (0). */
        if (group->pe_io_dir == IO_IN || group->pe_io_dir == IO_INOUT)
          node = add_pe_ext_io_copies(kernel, array, group, node, 1);
        if (group->pe_io_dir == IO_OUT || group->pe_io_dir == IO_INOUT)
          node = add_pe_ext_io_copies(kernel, array, group, node, 0);
      }
      else if (group->local_array->array_type == AUTOSA_INT_ARRAY)
      {
        if (group->io_type == AUTOSA_INT_IO)
        {
          if (group->pe_io_dir == IO_IN || group->pe_io_dir == IO_INOUT)
            node = add_pe_int_io_copies(kernel, array, group, node, 1);
          if (group->pe_io_dir == IO_OUT || group->pe_io_dir == IO_INOUT)
            node = add_pe_int_io_copies(kernel, array, group, node, 0);
        }
        else
        {
          if (group->pe_io_dir == IO_IN || group->pe_io_dir == IO_INOUT)
            node = add_pe_ext_io_copies(kernel, array, group, node, 1);
          if (group->pe_io_dir == IO_OUT || group->pe_io_dir == IO_INOUT)
            node = add_pe_ext_io_copies(kernel, array, group, node, 0);
        }
      }
      /* Record the group in the module, whatever its array type. */
      module->n_io_group++;
      module->io_groups = (struct autosa_array_ref_group **)realloc(
          module->io_groups,
          module->n_io_group * sizeof(struct autosa_array_ref_group *));
      module->io_groups[module->n_io_group - 1] = group;
    }
    /* The drain group, if active, only ever copies out of the PE. */
    if (array->drain_group && array->drain_group->array_io_dir != IO_NULL)
    {
      node = add_pe_ext_io_copies(kernel, array, array->drain_group, node, 0);

      module->n_io_group++;
      module->io_groups = (struct autosa_array_ref_group **)realloc(
          module->io_groups,
          module->n_io_group * sizeof(struct autosa_array_ref_group *));
      module->io_groups[module->n_io_group - 1] = array->drain_group;
    }
  }

  if (gen->options->target != AUTOSA_TARGET_CATAPULT_HLS_C) {
    /* Insert "pipeline" mark under the last "latency" mark. */
    node = isl_schedule_node_map_descendant_bottom_up(node,
                                                      &insert_pipeline_mark, kernel);
  }

  //DBGSCHDNODE(stdout, node, isl_schedule_node_get_ctx(node));

  /* Insert "unroll" mark under the last "simd" mark */
  node = isl_schedule_node_map_descendant_bottom_up(node,
                                                    &insert_unroll_mark, kernel);

  /* Tile the SIMD loop for sparsity */
  if (kernel->sparse) {
    node = isl_schedule_node_map_descendant_bottom_up(node,
                                                      &tile_simd_sparse, kernel);
  }

  /* Add module mark after the kernel mark. */
  hw_id = isl_id_alloc(gen->ctx, "module", module);
  node = autosa_tree_move_up_to_kernel(node);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_mark(node, hw_id);

  if (gen->options->autosa->tuning_method == 1) {
    /* Generate another schedule for latency estimation.
     * This one is taken before the context and the PE id filter are added.
     */
    isl_schedule *tuning_sched = isl_schedule_node_get_schedule(node);
    module->tuning_num_sched = kernel->tuning_program->generate_tuning_schedule(tuning_sched);
  }

  /* Add the PE id filter. */
  node = autosa_tree_move_up_to_kernel(node);
  /* isl_schedule_node_child() takes ownership of its argument, so the
   * returned node has to be kept.  Dropping it leaves "node" dangling
   * whenever isl hands back NULL (error) or a different pointer.
   */
  node = isl_schedule_node_child(node, 0);
  node = insert_context(kernel, node);
  node = autosa_tree_move_down_to_array(node, kernel->core);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_filter(node,
                                         isl_union_set_copy(kernel->pe_filter));

  //DBGSCHDNODE(stdout, node, isl_schedule_node_get_ctx(node));

  if (gen->options->autosa->tuning_method == 1) {
    /* Generate another schedule for latency estimation.
     * This one includes the context and the PE id filter.
     */
    isl_schedule *tuning_sched = isl_schedule_node_get_schedule(node);
    module->tuning_sched = kernel->tuning_program->generate_tuning_schedule(tuning_sched);
  }

  /* The duplicate created at the top is no longer needed; the final
   * schedule is extracted from the node itself.
   */
  isl_schedule_free(schedule);
  new_schedule = isl_schedule_node_get_schedule(node);
  isl_schedule_node_free(node);

  module->sched = new_schedule;
  module->type = PE_MODULE;
  module->name = strdup("PE");
  module->inst_ids = isl_id_list_copy(kernel->pe_ids);
  create_pe_module_vars(module, kernel);
  module->kernel = kernel;

  /* For io group with exterior I/O, we create input and output ports for each
   * PE. However, for the first/last PE on the data transfer direction, 
   * the input/output port consumes/produces dummy data. 
   * We add dummy modules to handle these cases to consume the dummy data.
   * 
   * In addition, when local reduce is enabled, the boundary PEs should only take 
   * in init values (i.e., 0), we will also add dummy module for such a case.
   */
  module->n_pe_dummy_modules = 0;
  module->pe_dummy_modules = NULL;
  for (int i = 0; i < kernel->n_array; ++i)
  {
    struct autosa_local_array_info *array = &kernel->array[i];
    //if (array->array_type == AUTOSA_INT_ARRAY)
    //  continue;
    for (int j = 0; j < array->n_io_group; j++)
    {
      struct autosa_array_ref_group *group = array->io_groups[j];
      if (group->io_type == AUTOSA_INT_IO)
        continue;
      if (group->pe_io_dir != IO_INOUT)
        continue;
      if (group->copy_in == 0 && group->copy_out == 0)
        continue;

      /* Generate the dummy module.
       * Internal arrays get an "out" dummy (in == 0), all others an "in" one.
       */
      isl_schedule *sched;
      int in = array->array_type == AUTOSA_INT_ARRAY? 0 : 1;

      sched = pe_module_dummy_gen(gen, module, group, in);
      module->n_pe_dummy_modules++;
      module->pe_dummy_modules =
          (struct autosa_pe_dummy_module **)realloc(module->pe_dummy_modules,
                                                    module->n_pe_dummy_modules * sizeof(struct autosa_pe_dummy_module *));
      struct autosa_pe_dummy_module *dummy_module = autosa_pe_dummy_module_alloc();
      dummy_module->module = module;
      dummy_module->io_group = group;
      dummy_module->sched = sched;
      dummy_module->in = in;
      module->pe_dummy_modules[module->n_pe_dummy_modules - 1] = dummy_module;
    }
  }

  return module;
}

/* The input modules are organized in the sequence of:
 * PE module
 * I/O module (copy-in and copy-out)
 * Drain module
 * We will reorder the modules following the below sequence:
 * I/O module (copy-in) 
 * PE module 
 * I/O module (copy-out)
 * Drain module
 * The reason for the re-ordering is for CSim to proceed in Xilinx environment.
 *
 * "modules" is an array of "n_module" module pointers whose first entry is
 * taken to be the PE module.  On success the input array is freed and a new
 * array of the same length is returned; the module objects themselves are
 * not copied.  If "modules" is NULL, "n_module" is not positive, or the new
 * array cannot be allocated, the input array is returned unchanged.
 * Entries after index 0 that are neither IO_MODULE nor DRAIN_MODULE are not
 * carried over; the corresponding trailing slots of the result are NULL.
 */
static __isl_give struct autosa_hw_module **hw_module_reorder(
    __isl_take struct autosa_hw_module **modules, int n_module)
{
  struct autosa_hw_module **modules_new;
  int pos = 0;

  /* Nothing to reorder; also keeps modules[0] from being read out of bounds. */
  if (!modules || n_module <= 0)
    return modules;

  /* Zero-initialize so that any slot left unfilled is NULL, not garbage. */
  modules_new = (struct autosa_hw_module **)
      calloc(n_module, sizeof(struct autosa_hw_module *));
  if (!modules_new)
    return modules;

  /* The scans below start at index 1: slot 0 is placed explicitly as the
   * PE module, so it must never be picked up a second time (which would
   * write one element past the end of modules_new).
   */

  /* I/O module (copy-in) */
  for (int i = 1; i < n_module; i++)
  {
    struct autosa_hw_module *module = modules[i];
    if (module->type == IO_MODULE && module->in)
    {
      modules_new[pos] = module;
      pos++;
    }
  }

  /* PE module */
  modules_new[pos] = modules[0];
  pos++;

  /* I/O module (copy-out) */
  for (int i = 1; i < n_module; i++)
  {
    struct autosa_hw_module *module = modules[i];
    if (module->type == IO_MODULE && !module->in)
    {
      modules_new[pos] = module;
      pos++;
    }
  }

  /* Drain module */
  for (int i = 1; i < n_module; i++)
  {
    struct autosa_hw_module *module = modules[i];
    if (module->type == DRAIN_MODULE)
    {
      modules_new[pos] = module;
      pos++;
    }
  }

  free(modules);
  return modules_new;
}

/* Create the schedule that calls all the PE dummy modules.
 * We will work on the transformed IO schedule for the io group.
 * We delete the schedule nodes above the array mark and below the PE mark,
 * add a filter to only consider the last module in the transfer chain.
 * Then insert the module call extension nodes right under the space bands.
 *
 * "pe_dummy_module" supplies the owning PE module, the I/O group whose
 * io_L1_schedule is duplicated and transformed, and the direction flag "in".
 * The returned schedule is owned by the caller; group->io_L1_schedule
 * itself is left untouched.
 *
 * The grafted statement is named
 *   module_call.<group prefix>_PE_dummy{_in|_out}.0.0[.<lower_band_num>]
 * where the last component is only appended when get_last_sched_dim_val()
 * returns something other than -1.
 */
static __isl_give isl_schedule *pe_dummy_gen_module_call(struct autosa_gen *gen,
                                                         struct autosa_pe_dummy_module *pe_dummy_module)
{
  struct autosa_array_ref_group *group;
  isl_schedule *sched;
  isl_schedule_node *node;
  struct autosa_kernel *kernel;
  struct autosa_hw_module *module;
  int n_member;
  /* Only assigned (and only read) when insert_L1 is set below. */
  isl_union_set *L1_filter;
  isl_bool insert_L1 = isl_bool_false;
  isl_printer *p_str;
  isl_ctx *ctx;
  char *stmt_name;
  isl_id *id;
  isl_union_map *prefix, *extension;
  isl_union_set *domain, *range;

  module = pe_dummy_module->module;
  kernel = module->kernel;
  ctx = gen->ctx;
  group = pe_dummy_module->io_group;
  /* Work on a private copy; only the root node is kept alive. */
  sched = isl_schedule_dup(group->io_L1_schedule);
  node = isl_schedule_get_root(sched);
  isl_schedule_free(sched);
  isl_space *space;
  isl_union_set *empty_filter;
  isl_schedule_node *graft;  
  int lower_band_num = -1;

  /* Delete the node above the array mark.
   * Deletion walks upwards and stops at the kernel mark or at a context node.
   */
  node = autosa_tree_move_down_to_array(node, kernel->core);
  node = isl_schedule_node_parent(node);
  while (!(autosa_tree_node_is_kernel(node) || isl_schedule_node_get_type(node) == isl_schedule_node_context)) {
    node = isl_schedule_node_delete(node);
    node = isl_schedule_node_parent(node);
  }

//#ifdef _DEBUG
//  if (!strcmp(group->array->name, "U_tmp") && pe_dummy_module->in == 0) {
//    printf("here\n");
//    printf("group id: %d\n", group->nr);
//    DBGSCHDNODE(stdout, node, isl_schedule_node_get_ctx(node));
//    isl_schedule *sched_tmp = isl_schedule_node_get_schedule(node);
//    print_code(gen, isl_schedule_copy(sched_tmp), "U_tmp_out.c");
//    isl_schedule_free(sched_tmp);
//  }
//#endif

  /* Insert a filter.
   * The filter is derived from the node right above the "io_L1" mark.  If
   * that band has several members, it is first split so that only its last
   * member is considered.
   */
  node = autosa_tree_move_down_to_mark(node, kernel->core, "io_L1");
  node = isl_schedule_node_parent(node);
  n_member = isl_schedule_node_band_n_member(node);
  if (n_member > 1)
  {
    node = isl_schedule_node_band_split(node, n_member - 1);
    node = isl_schedule_node_child(node, 0);
  }
  if (isl_schedule_node_get_type(node) == isl_schedule_node_band)
  {
    /* "in" dummies use schedule_eq_ub(), "out" dummies schedule_eq_lb();
     * presumably these pin the band to its upper/lower bound, i.e. to one
     * end of the transfer chain -- NOTE(review): confirm against the helpers.
     */
    if (pe_dummy_module->in)
      L1_filter = schedule_eq_ub(node);
    else
      L1_filter = schedule_eq_lb(node);    
    insert_L1 = isl_bool_true;
  }

//#ifdef _DEBUG
//  if (!strcmp(group->array->name, "U_tmp") && pe_dummy_module->in == 0) {
//    DBGUSET(stdout, L1_filter, gen->ctx);
//  }
//#endif

//#ifdef _DEBUG
//  if (!strcmp(group->array->name, "U_tmp") && !pe_dummy_module->in)
//    DBGUSET(stdout, L1_filter, isl_schedule_node_get_ctx(node));
//#endif

  /* The filter itself goes right below the "io_L1" mark. */
  node = autosa_tree_move_down_to_mark(node, kernel->core, "io_L1");
  node = isl_schedule_node_child(node, 0);
  if (insert_L1)
  {
    node = isl_schedule_node_insert_filter(node, L1_filter);
  }

  /* Delete the node under the pe mark. */
  node = autosa_tree_move_down_to_pe(node, kernel->core);
  node = isl_schedule_node_cut(node);

  /* Make the ancestors atomic */
  node = autosa_atomic_ancestors(node);

//#ifdef _DEBUG
//  if (!strcmp(group->array->name, "U_tmp") && pe_dummy_module->in == 0) {
//    printf("here\n");
//    printf("group id: %d\n", group->nr);
//    DBGSCHDNODE(stdout, node, isl_schedule_node_get_ctx(node));
//    isl_schedule *sched_tmp = isl_schedule_node_get_schedule(node);
//    print_code(gen, isl_schedule_copy(sched_tmp), "U_tmp_out2.c");
//    isl_schedule_free(sched_tmp);
//  }
//#endif

  /* Test if the range of the last dimension contains single element */
  lower_band_num = get_last_sched_dim_val(node);

//#ifdef _DEBUG
//  if (!strcmp(group->array->name, "U_tmp") && pe_dummy_module->in) {
//    DBGSCHDNODE(stdout, node, isl_schedule_node_get_ctx(node));
//  }
//#endif

  /* Graft an extension node.
   * Its domain is the range of the prefix schedule at the current node,
   * after pulling the prefix back through kernel->contraction.
   */
  prefix = isl_schedule_node_get_prefix_schedule_relation(node);
  prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix,
                                                            isl_union_pw_multi_aff_copy(kernel->contraction));
  domain = isl_union_map_range(prefix);

  /* Build the statement name (see the function header for the format). */
  p_str = isl_printer_to_str(ctx);
  p_str = isl_printer_print_str(p_str, "module_call.");
  p_str = autosa_array_ref_group_print_prefix(group, p_str);
  p_str = isl_printer_print_str(p_str, "_PE_dummy");
  p_str = isl_printer_print_str(p_str, pe_dummy_module->in? "_in" : "_out");
  p_str = isl_printer_print_str(p_str, ".0.0");
  if (lower_band_num != -1) {
    p_str = isl_printer_print_str(p_str, ".");
    p_str = isl_printer_print_int(p_str, lower_band_num);
  }
  stmt_name = isl_printer_get_str(p_str);
  isl_printer_free(p_str);
  /* One-dimensional statement space carrying the name built above. */
  space = isl_space_set_alloc(ctx, 0, 1);
  space = isl_space_set_tuple_name(space, isl_dim_set, stmt_name);
  free(stmt_name);

  /* The extension maps every prefix point to the single instance [0]. */
  isl_point *pnt = isl_point_zero(space);
  isl_set *set = isl_set_from_point(pnt);
  range = isl_union_set_from_set(isl_set_copy(set));
  extension = isl_union_map_from_domain_and_range(domain, range);
  graft = isl_schedule_node_from_extension(extension);

  /* Give the grafted statement an identity partial schedule (with the
   * output tuple id dropped) so that it gets a band node of its own.
   */
  isl_map *map = isl_set_identity(set);
  map = isl_map_reset_tuple_id(map, isl_dim_out);
  isl_union_map *umap = isl_union_map_from_map(map);
  isl_multi_union_pw_aff *mupa = isl_multi_union_pw_aff_from_union_map(umap);

  graft = isl_schedule_node_child(graft, 0);
  graft = isl_schedule_node_insert_partial_schedule(graft, mupa);
  graft = ppcg_set_schedule_node_type(graft, isl_ast_loop_atomic);
//#ifdef _DEBUG
//  DBGSCHDNODE(stdout, graft, isl_schedule_node_get_ctx(node));
//#endif

  /* Move back to the root of the graft before attaching it. */
  while (graft && isl_schedule_node_has_parent(graft))
    graft = isl_schedule_node_parent(graft);

  node = isl_schedule_node_graft_before(node, graft);

  /* Insert an empty filter. */
  empty_filter = isl_union_set_from_set(isl_set_empty(
      isl_set_get_space(kernel->context)));
  node = isl_schedule_node_insert_filter(node, empty_filter);

  /* Add module mark after the kernel mark. */
  id = isl_id_alloc(ctx, "module", module);
  node = autosa_tree_move_up_to_kernel(node);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_mark(node, id);

  /* Add pe_dummy module mark after the module mark. */
  id = isl_id_alloc(ctx, "pe_dummy_module", pe_dummy_module);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_mark(node, id);

//#ifdef _DEBUG
//  DBGSCHDNODE(stdout, node, isl_schedule_node_get_ctx(node));
//#endif

  sched = isl_schedule_node_get_schedule(node);
  isl_schedule_node_free(node);

  return sched;
}

/* Build the schedule from which the top module calls the PE module, and
 * append it to top->module_call_scheds.
 * Starting from a copy of gen->schedule, the nodes between the kernel mark
 * and the array mark are deleted, the subtree under the space band is cut,
 * and a "module_call.<module name>.0.0" statement is grafted in its place.
 *
 * Unless the target is AUTOSA_TARGET_CATAPULT_HLS_C, one call schedule is
 * also added for every PE dummy module of "module": "in" dummies are
 * appended at the end, "out" dummies are inserted in front of the PE call.
 *
 * Always returns isl_stat_ok.
 */
static isl_stat top_module_pe_gen_module_call(struct autosa_gen *gen,
                                              struct autosa_hw_top_module *top, struct autosa_hw_module *module)
{
  struct autosa_kernel *kernel = gen->kernel;
  isl_schedule *call_sched;
  isl_schedule_node *cursor;
  isl_ctx *ctx;
  isl_printer *name_printer;
  char *call_name;
  isl_space *call_space;
  isl_union_set *call_domain;
  isl_union_set *nothing;
  isl_id *module_id;

  /* Start from the root of a private copy of the global schedule. */
  call_sched = isl_schedule_dup(gen->schedule);
  cursor = isl_schedule_get_root(call_sched);
  isl_schedule_free(call_sched);
  ctx = isl_schedule_node_get_ctx(cursor);

  /* Remove every node sitting between the kernel mark and the array mark. */
  cursor = isl_schedule_node_parent(
      autosa_tree_move_down_to_array(cursor, kernel->core));
  while (!autosa_tree_node_is_kernel(cursor))
    cursor = isl_schedule_node_parent(isl_schedule_node_delete(cursor));

  /* Split the band below the array mark at n_sa_dim and cut everything
   * under the child of the resulting node.
   */
  cursor = autosa_tree_move_down_to_array(cursor, kernel->core);
  cursor = isl_schedule_node_child(cursor, 0);
  cursor = split_band(cursor, kernel->n_sa_dim);
  cursor = isl_schedule_node_cut(isl_schedule_node_child(cursor, 0));

  /* Name of the grafted call statement. */
  name_printer = isl_printer_to_str(ctx);
  name_printer = isl_printer_print_str(name_printer, "module_call.");
  name_printer = isl_printer_print_str(name_printer, module->name);
  name_printer = isl_printer_print_str(name_printer, ".0.0");
  call_name = isl_printer_get_str(name_printer);
  isl_printer_free(name_printer);

  /* Zero-dimensional statement space carrying that name. */
  call_space = isl_space_set_tuple_name(isl_space_set_alloc(ctx, 0, 0),
                                        isl_dim_set, call_name);
  free(call_name);
  call_domain = isl_union_set_from_set(isl_set_universe(call_space));
  cursor = isl_schedule_node_graft_before(
      cursor, isl_schedule_node_from_domain(call_domain));

  /* An empty filter goes on top of what is left at this position. */
  nothing = isl_union_set_from_set(
      isl_set_empty(isl_set_get_space(kernel->context)));
  cursor = isl_schedule_node_insert_filter(cursor, nothing);

  /* Tag the tree with a "module" mark right below the kernel mark. */
  module_id = isl_id_alloc(ctx, "module", module);
  cursor = autosa_tree_move_up_to_kernel(cursor);
  cursor = isl_schedule_node_child(cursor, 0);
  cursor = isl_schedule_node_insert_mark(cursor, module_id);

  call_sched = isl_schedule_node_get_schedule(cursor);
  isl_schedule_node_free(cursor);

  /* Append the PE call schedule to the top module. */
  top->n_module_calls++;
  top->module_call_scheds = (isl_schedule **)realloc(
      top->module_call_scheds,
      top->n_module_calls * sizeof(isl_schedule *));
  top->module_call_scheds[top->n_module_calls - 1] = call_sched;

  if (module->n_pe_dummy_modules <= 0 ||
      gen->options->target == AUTOSA_TARGET_CATAPULT_HLS_C)
    return isl_stat_ok;

  /* Generate dummy module calls.
   * "k" doubles as the number of dummy calls already stored, which is what
   * locates the start of the block made of the PE call and those dummies.
   */
  for (int k = 0; k < module->n_pe_dummy_modules; k++)
  {
    struct autosa_pe_dummy_module *dummy = module->pe_dummy_modules[k];
    isl_schedule *dummy_sched = pe_dummy_gen_module_call(gen, dummy);
    int last = top->n_module_calls; /* index of the slot about to be added */
    int slot = last;

    top->n_module_calls++;
    top->module_call_scheds = (isl_schedule **)realloc(
        top->module_call_scheds,
        top->n_module_calls * sizeof(isl_schedule *));

    if (!dummy->in)
    {
      /* An "out" dummy must come before the PE module call: shift the PE
       * call and the k dummies stored so far up by one and take the
       * position that frees.
       */
      int first = last - k - 1;
      for (int j = last; j > first; j--)
        top->module_call_scheds[j] = top->module_call_scheds[j - 1];
      slot = first;
    }
    top->module_call_scheds[slot] = dummy_sched;
  }

  return isl_stat_ok;
}

/* Generate the schedule that declares the fifos used in PEs. 
 * If the io group data transfer direction at the PE level is INOUT,
 * we will add another extension node at the boundary of the transfer chain
 * to declare one more fifo.
 *
 * One schedule and one name are appended to top->fifo_decl_scheds and
 * top->fifo_decl_names for every I/O group of "module" whose pe_io_dir is
 * not IO_NULL.  Each schedule is derived from a copy of the group's
 * io_L1_schedule.
 *
 * Always returns isl_stat_ok.
 */
static isl_stat top_module_pe_gen_fifo_decl(struct autosa_gen *gen,
                                            struct autosa_hw_top_module *top, struct autosa_hw_module *module)
{
  isl_schedule *schedule;
  isl_schedule_node *node, *graft;
  isl_id *id;
  struct autosa_kernel *kernel = gen->kernel;
  isl_space *space;
  isl_ctx *ctx = gen->ctx;
  isl_union_set *domain;
  isl_union_set *empty_filter;
  isl_printer *p_str;
  char *stmt_name;

  for (int i = 0; i < module->n_io_group; i++)
  {
    struct autosa_array_ref_group *group = module->io_groups[i];
    /* NOTE(review): io_trans and io_trans_mat are not referenced below. */
    isl_multi_aff *io_trans;
    isl_mat *io_trans_mat;
    /* Shadows the function-level "id". */
    isl_id *id;
    /* Stays NULL unless a boundary filter is computed for an INOUT group. */
    isl_union_set *L1_filter = NULL;
    bool insert_L1 = isl_bool_false;
    if (group->pe_io_dir == IO_NULL)
      continue;

    /* Work on a private copy; only the root node is kept alive. */
    schedule = isl_schedule_dup(group->io_L1_schedule);
    node = isl_schedule_get_root(schedule);
    isl_schedule_free(schedule);

    /* Delete the node above the array mark. */
    node = autosa_tree_move_down_to_array(node, kernel->core);
    node = isl_schedule_node_parent(node);
    while (!autosa_tree_node_is_kernel(node))
    {
      node = isl_schedule_node_delete(node);
      node = isl_schedule_node_parent(node);
    }

    if (group->pe_io_dir == IO_INOUT)
    {
      /* Split the band above the "io_L1" mark so that its last member
       * stands alone, and derive the boundary filter from that member via
       * schedule_eq_ub().  The split is done even when the band has a
       * single member (split position 0).
       */
      int n_member;
      node = autosa_tree_move_down_to_mark(node, kernel->core, "io_L1");
      node = isl_schedule_node_parent(node);
      n_member = isl_schedule_node_band_n_member(node);
      node = isl_schedule_node_band_split(node, n_member - 1);
      node = isl_schedule_node_child(node, 0);
      if (isl_schedule_node_get_type(node) == isl_schedule_node_band)
      {
        L1_filter = schedule_eq_ub(node);
        insert_L1 = isl_bool_true;
      }
      node = autosa_tree_move_up_to_array(node);
    }

    /* Delete the node under the pe mark. */
    node = autosa_tree_move_down_to_pe(node, kernel->core);
    node = isl_schedule_node_cut(node);

    /* Graft an extension node.
     * The statement is named "fifo_decl.<fifo name>"; its tuple id carries
     * the group as user pointer.
     */
    p_str = isl_printer_to_str(ctx);
    p_str = isl_printer_print_str(p_str, "fifo_decl.");
    p_str = autosa_array_ref_group_print_fifo_name(group, p_str);
    stmt_name = isl_printer_get_str(p_str);
    isl_printer_free(p_str);
    space = isl_space_set_alloc(ctx, 0, 0);
    id = isl_id_alloc(ctx, stmt_name, group);
    space = isl_space_set_tuple_id(space, isl_dim_set, id);
    free(stmt_name);
    domain = isl_union_set_from_set(isl_set_universe(space));
    graft = isl_schedule_node_from_domain(domain);

    node = isl_schedule_node_graft_before(node, graft);

    if (insert_L1)
    {
      /* Several of these locals shadow the function-level ones. */
      isl_set *set;
      isl_multi_union_pw_aff *mupa;
      isl_union_map *prefix;
      isl_union_set *domain;
      isl_union_set *range;
      isl_union_map *extension;
      isl_map *map;
      isl_union_map *umap;

      /* Graft an extension node for boundary PE.
       * The boundary filter is inserted first, so the extension domain
       * (range of the prefix schedule, pulled back through
       * kernel->contraction) is computed below that filter.
       */
      node = isl_schedule_node_insert_filter(node, L1_filter);
      node = isl_schedule_node_child(node, 0);
      prefix = isl_schedule_node_get_prefix_schedule_relation(node);
      prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix,
                                                                isl_union_pw_multi_aff_copy(kernel->contraction));
      domain = isl_union_map_range(prefix);

      /* Statement "fifo_decl_boundary.<fifo name>" in a 1-D space. */
      p_str = isl_printer_to_str(ctx);
      p_str = isl_printer_print_str(p_str, "fifo_decl_boundary.");
      p_str = autosa_array_ref_group_print_fifo_name(group, p_str);
      stmt_name = isl_printer_get_str(p_str);
      isl_printer_free(p_str);
      space = isl_space_set_alloc(ctx, 0, 1);
      id = isl_id_alloc(ctx, stmt_name, group);
      space = isl_space_set_tuple_id(space, isl_dim_set, id);
      free(stmt_name);

      /* The extension maps every prefix point to the single instance [0]. */
      isl_point *pnt = isl_point_zero(space);
      set = isl_set_from_point(pnt);
      range = isl_union_set_from_set(isl_set_copy(set));

      extension = isl_union_map_from_domain_and_range(domain, range);
      graft = isl_schedule_node_from_extension(extension);

      /* Identity partial schedule (output tuple id dropped) for the graft. */
      map = isl_set_identity(set);
      map = isl_map_reset_tuple_id(map, isl_dim_out);
      umap = isl_union_map_from_map(map);
      mupa = isl_multi_union_pw_aff_from_union_map(umap);

      graft = isl_schedule_node_child(graft, 0);
      graft = isl_schedule_node_insert_partial_schedule(graft, mupa);

      /* Move back to the root of the graft before attaching it. */
      while (graft && isl_schedule_node_has_parent(graft))
        graft = isl_schedule_node_parent(graft);

      node = isl_schedule_node_graft_before(node, graft);
    }
    else
    {
      /* L1_filter is NULL on this path (it is only set together with
       * insert_L1), so this call has nothing to release.
       */
      isl_union_set_free(L1_filter);
    }

    /* Insert an empty filter. */
    empty_filter = isl_union_set_from_set(isl_set_empty(
        isl_set_get_space(kernel->context)));
    node = isl_schedule_node_insert_filter(node, empty_filter);

    /* Add module mark after the kernel mark. */
    id = isl_id_alloc(ctx, "module", module);
    node = autosa_tree_move_up_to_kernel(node);
    node = isl_schedule_node_child(node, 0);
    node = isl_schedule_node_insert_mark(node, id);

    schedule = isl_schedule_node_get_schedule(node);
    isl_schedule_node_free(node);

    /* Grow both per-declaration arrays by one entry. */
    top->n_fifo_decls++;
    top->fifo_decl_scheds = (isl_schedule **)realloc(top->fifo_decl_scheds,
                                                     top->n_fifo_decls * sizeof(isl_schedule *));
    top->fifo_decl_scheds[top->n_fifo_decls - 1] = schedule;
    top->fifo_decl_names = (char **)realloc(top->fifo_decl_names,
                                            top->n_fifo_decls * sizeof(char *));
    /* Generate fifo_decl name in the format of 
     * [fifo_name]_[module_name].[fifo_width] 
     * where fifo_width is group->array->size times the number of lanes.
     */
    p_str = isl_printer_to_str(ctx);
    p_str = autosa_array_ref_group_print_fifo_name(group, p_str);
    p_str = isl_printer_print_str(p_str, "_");
    p_str = isl_printer_print_str(p_str, module->name);
    p_str = isl_printer_print_str(p_str, ".");
    int n_lane = get_io_group_n_lane(module, NULL, group);    
    int data_size = group->array->size;
    int width = data_size * n_lane; // in bytes
    p_str = isl_printer_print_int(p_str, width);
    top->fifo_decl_names[top->n_fifo_decls - 1] = isl_printer_get_str(p_str);
    isl_printer_free(p_str);
  }

  return isl_stat_ok;
}

/* Generate module calls and fifo decls for the PE module.
 *
 * The module call schedules are generated first, then the fifo declaration
 * schedules; both are appended to "top".
 * Returns isl_stat_ok on success.  If either step reports an error,
 * isl_stat_error is returned and the remaining step is skipped.
 */
static isl_stat top_module_pe_gen(struct autosa_gen *gen,
                                  struct autosa_hw_top_module *top, struct autosa_hw_module *module)
{
  /* Generate the function call schedule. */
  if (top_module_pe_gen_module_call(gen, top, module) < 0)
    return isl_stat_error;

  /* Generate the fifo declaration schedule. */
  if (top_module_pe_gen_fifo_decl(gen, top, module) < 0)
    return isl_stat_error;

  return isl_stat_ok;
}

/* The input "node" points to the node below io_[module->level] mark.
 * Return a node that points to the "kernel" mark.
 * We will insert two module call extension nodes: 
 * module_call_upper: which contains the module name and arguments for the 
 * inter-module transfer
 * module_call_lower: which contains arguments for the intra-module transfer
 * (i.e., transfer to the lower-level modules)
 */
static __isl_give isl_schedule_node *io_gen_module_call(
    __isl_take isl_schedule_node *node, struct autosa_hw_module *module,
    struct autosa_kernel *kernel, struct autosa_array_ref_group *group,
    int boundary, int serialize,
    __isl_take isl_union_set *filter_domain)
{
  isl_printer *p_str;
  char *stmt_name;
  isl_space *space;
  isl_union_set *domain, *empty_filter, *lower_level_filter;
  isl_schedule_node *graft;
  isl_bool insert_lower = isl_bool_false;
  isl_ctx *ctx = isl_schedule_node_get_ctx(node);
  isl_id *id;
  isl_union_map *prefix, *extension, *umap;
  isl_union_set *range;
  isl_set *set;
  isl_map *map;
  isl_multi_union_pw_aff *mupa;
  int lower_band_num = -1;  
  isl_union_set *filter_range;
  isl_bool upper_inserted;

//#ifdef _DEBUG
//  DBGSCHDNODE(stdout, node, isl_schedule_node_get_ctx(node));
//#endif

  /* Collect the filter for the lower I/O module. */
  if (isl_schedule_node_get_type(node) == isl_schedule_node_band)
  {
    if (module->level > 1)
    {
      if (module->to_pe) {
        if (module->in)
          lower_level_filter = schedule_eq_lb(node);
        else
          lower_level_filter = schedule_eq_ub(node);
      } else {
        lower_level_filter = schedule_eq_lb(node);
      }
      
      insert_lower = isl_bool_true;
    }
  }

  /* Graft an extension node for module call. */
  prefix = isl_schedule_node_get_prefix_schedule_relation(node);  
  prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix,
                                                            isl_union_pw_multi_aff_copy(kernel->contraction));
  domain = isl_union_map_range(isl_union_map_copy(prefix));
  if (filter_domain) {
    filter_range = isl_union_set_apply(isl_union_set_copy(filter_domain), isl_union_map_copy(prefix));
    domain = isl_union_set_intersect(domain, filter_range);
  }
  isl_union_map_free(prefix);

  p_str = isl_printer_to_str(ctx);
  p_str = isl_printer_print_str(p_str, "module_call_upper.");
  p_str = isl_printer_print_str(p_str, module->name);  
  if (boundary) 
    p_str = isl_printer_print_str(p_str, ".1");
  else
    p_str = isl_printer_print_str(p_str, ".0");
  if (serialize)
    p_str = isl_printer_print_str(p_str, ".1");
  else
    p_str = isl_printer_print_str(p_str, ".0");

  stmt_name = isl_printer_get_str(p_str);
  isl_printer_free(p_str);
  space = isl_space_set_alloc(ctx, 0, 1);
  space = isl_space_set_tuple_name(space, isl_dim_set, stmt_name);
  free(stmt_name);

  isl_point *pnt = isl_point_zero(space);
  set = isl_set_from_point(pnt);
  range = isl_union_set_from_set(isl_set_copy(set));

  extension = isl_union_map_from_domain_and_range(domain, range);
  graft = isl_schedule_node_from_extension(extension);

  map = isl_set_identity(set);
  map = isl_map_reset_tuple_id(map, isl_dim_out);
  umap = isl_union_map_from_map(map);
  mupa = isl_multi_union_pw_aff_from_union_map(umap);

  graft = isl_schedule_node_child(graft, 0);
  graft = isl_schedule_node_insert_partial_schedule(graft, mupa);

  while (graft && isl_schedule_node_has_parent(graft))
    graft = isl_schedule_node_parent(graft);

  node = isl_schedule_node_graft_before(node, graft);

  if (module->level > 1)
  {
    node = autosa_tree_move_down_to_io_mark(node, kernel->core, module->level - 1);
  }
  node = isl_schedule_node_cut(node);

  /* Graft an extension node for lower level transfer. */
  if (insert_lower)
  {    
    if (module->to_pe) {
      node = isl_schedule_node_insert_filter(node, lower_level_filter);
      node = isl_schedule_node_child(node, 0);
    } else {
      /* In case the lower band only contains one element, we will compute the 
       * value and append to the module_call name.
       */
      isl_schedule_node *node_tmp;
      node_tmp = isl_schedule_node_copy(node);
      node_tmp = isl_schedule_node_parent(node_tmp); // band
      node_tmp = isl_schedule_node_insert_filter(node_tmp, isl_union_set_copy(lower_level_filter));
      node_tmp = isl_schedule_node_child(node_tmp, 0);
      lower_band_num = get_band_single_schedule_val(node_tmp);
      isl_schedule_node_free(node_tmp);

//#ifdef _DEBUG
//      if (!strcmp(module->name, "U_drain_IO_L2_out")) {
//        printf("test %d\n", lower_band_num);
//      }
//#endif 

      node = isl_schedule_node_insert_filter(node, lower_level_filter);
      node = isl_schedule_node_child(node, 0);

      //node = isl_schedule_node_parent(node); // band
      //node = isl_schedule_node_insert_filter(node, lower_level_filter);
      //node = isl_schedule_node_child(node, 0);      
      //lower_band_num = get_band_single_schedule_val(node);     
      //node = isl_schedule_node_child(node, 0);
    }
  }
  {
    isl_union_map *prefix;
    isl_union_set *domain, *range;
    isl_point *pnt;
    isl_set *set;
    isl_union_map *extension, *umap;
    isl_map *map;
    isl_multi_union_pw_aff *mupa;

    prefix = isl_schedule_node_get_prefix_schedule_relation(node);
    prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix,
                                                              isl_union_pw_multi_aff_copy(kernel->contraction));
    domain = isl_union_map_range(isl_union_map_copy(prefix));
    if (filter_domain) {
      filter_range = isl_union_set_apply(isl_union_set_copy(filter_domain), isl_union_map_copy(prefix));    
      domain = isl_union_set_intersect(domain, filter_range);     
    }
    isl_union_map_free(prefix);

    p_str = isl_printer_to_str(ctx);
    p_str = isl_printer_print_str(p_str, "module_call_lower.");
    p_str = isl_printer_print_str(p_str, module->name);    
    if (boundary) 
      p_str = isl_printer_print_str(p_str, ".1");
    else
      p_str = isl_printer_print_str(p_str, ".0");
    if (serialize)
      p_str = isl_printer_print_str(p_str, ".1");
    else
      p_str = isl_printer_print_str(p_str, ".0");

    if (lower_band_num != -1) {
      p_str = isl_printer_print_str(p_str, ".");
      p_str = isl_printer_print_int(p_str, lower_band_num);
    }

    stmt_name = isl_printer_get_str(p_str);
    isl_printer_free(p_str);
    space = isl_space_set_alloc(ctx, 0, 1);
    id = isl_id_alloc(ctx, stmt_name, group);
    space = isl_space_set_tuple_id(space, isl_dim_set, id);
    free(stmt_name);

    pnt = isl_point_zero(space);
    set = isl_set_from_point(pnt);
    range = isl_union_set_from_set(isl_set_copy(set));

    /* Build an identical union map from domain.
     * Project out the range dims and only keep the last dim.
     * Set the range name as stmt_name. */    
    extension = isl_union_map_from_domain_and_range(domain, range);
    graft = isl_schedule_node_from_extension(extension);

    map = isl_set_identity(set);
    map = isl_map_reset_tuple_id(map, isl_dim_out);
    umap = isl_union_map_from_map(map);
    mupa = isl_multi_union_pw_aff_from_union_map(umap);

    graft = isl_schedule_node_child(graft, 0);
    graft = isl_schedule_node_insert_partial_schedule(graft, mupa);

    while (graft && isl_schedule_node_has_parent(graft))
      graft = isl_schedule_node_parent(graft);

    node = isl_schedule_node_graft_after(node, graft);

//    if (!strcmp(module->name, "U_drain_IO_L2_out")) {
//      DBGSCHDNODE(stdout, node, isl_schedule_node_get_ctx(node));
//    }    
  }

  /* Insert an empty filter. */
  empty_filter = isl_union_set_from_set(isl_set_empty(isl_set_get_space(kernel->context)));
  node = isl_schedule_node_insert_filter(node, empty_filter);

  node = autosa_tree_move_up_to_kernel(node);
  isl_union_set_free(filter_domain);

  return node;
}

/* Insert the "ext_module_upper" call statement for "module".
 *
 * On entry, "node" points to the node below the io_[module->level] mark.
 * On return, the node points to the "kernel" mark.
 *
 * One extension statement is grafted before "node". Its name is
 * "ext_module_upper.[module->name]", with ".boundary" appended when
 * "boundary" is non-zero. The subtree at "node" is then cut and an
 * empty filter is inserted above it.
 *
 * This function is used for Intel OpenCL only. Only the module_call_upper
 * style statement (module name and arguments for the inter-module transfer)
 * is produced; the module_call_lower statement, which would describe the
 * intra-module transfer to the lower-level modules, is not generated.
 *
 * "group" is accepted for signature symmetry and is not read here.
 */
static __isl_give isl_schedule_node *io_gen_ext_module(
    __isl_take isl_schedule_node *node, struct autosa_hw_module *module,
    struct autosa_kernel *kernel, struct autosa_array_ref_group *group,
    int boundary)
{
  isl_ctx *ctx = isl_schedule_node_get_ctx(node);
  isl_union_map *sched_prefix;
  isl_union_set *ext_domain, *ext_range, *no_stmts;
  isl_printer *name_printer;
  char *call_name;
  isl_space *call_space;
  isl_set *call_set;
  isl_map *ident;
  isl_multi_union_pw_aff *call_sched;
  isl_schedule_node *ext;

  /* The extension is attached to the range of the prefix schedule at "node",
   * expressed over the contracted statement instances.
   */
  sched_prefix = isl_schedule_node_get_prefix_schedule_relation(node);
  sched_prefix = isl_union_map_preimage_domain_union_pw_multi_aff(
      sched_prefix, isl_union_pw_multi_aff_copy(kernel->contraction));
  ext_domain = isl_union_map_range(sched_prefix);

  /* Compose the statement name. */
  name_printer = isl_printer_to_str(ctx);
  name_printer = isl_printer_print_str(name_printer, "ext_module_upper.");
  name_printer = isl_printer_print_str(name_printer, module->name);
  if (boundary)
    name_printer = isl_printer_print_str(name_printer, ".boundary");
  call_name = isl_printer_get_str(name_printer);
  isl_printer_free(name_printer);

  /* A zero-dimensional set space carrying the statement name. */
  call_space = isl_space_set_alloc(ctx, 0, 0);
  call_space = isl_space_set_tuple_name(call_space, isl_dim_set, call_name);
  free(call_name);

  call_set = isl_set_from_point(isl_point_zero(call_space));
  ext_range = isl_union_set_from_set(isl_set_copy(call_set));
  ext = isl_schedule_node_from_extension(
      isl_union_map_from_domain_and_range(ext_domain, ext_range));

  /* Schedule the new statement by its own (anonymous) identity map. */
  ident = isl_map_reset_tuple_id(isl_set_identity(call_set), isl_dim_out);
  call_sched = isl_multi_union_pw_aff_from_union_map(
      isl_union_map_from_map(ident));

  ext = isl_schedule_node_child(ext, 0);
  ext = isl_schedule_node_insert_partial_schedule(ext, call_sched);

  /* Walk back up to the root of the graft before attaching it. */
  while (ext && isl_schedule_node_has_parent(ext))
    ext = isl_schedule_node_parent(ext);

  node = isl_schedule_node_graft_before(node, ext);
  node = isl_schedule_node_cut(node);

  /* Insert an empty filter. */
  no_stmts = isl_union_set_from_set(
      isl_set_empty(isl_set_get_space(kernel->context)));
  node = isl_schedule_node_insert_filter(node, no_stmts);

  return autosa_tree_move_up_to_kernel(node);
}

/* Generate the calls for the io module connected to the external memory.
 * This function is used for Intel OpenCL only.
 * Since all fifos will be replaced with channels later, this function only
 * generates the upper module calls, ignoring the lower module call.
 *
 * The resulting schedule is appended to top->ext_module_scheds and
 * top->n_ext_module is incremented. PE modules and io modules that are not
 * connected to the external memory (module->to_mem == 0) are left untouched.
 *
 * Always returns isl_stat_ok.
 */
static isl_stat top_module_io_gen_ext_module(
    struct autosa_gen *gen, struct autosa_hw_top_module *top,
    struct autosa_hw_module *module,
    struct autosa_array_ref_group *group)
{
  struct autosa_kernel *kernel = gen->kernel;
  isl_ctx *ctx = gen->ctx;
  isl_schedule *schedule;
  isl_schedule_node *node;
  isl_union_set *group_domain_filter, *group_domain_filter_level;
  int boundary = module->boundary;
  /* Value returned by get_band_single_schedule_val(); -1 is treated as
   * "the band does not reduce to a single element".
   */
  int single_ele = -1;

  /* Only the top-level io module connected to the external memory is handled.
   */
  if (module->type == PE_MODULE || !module->to_mem)
    return isl_stat_ok;

  /* Work on a private copy of the io schedule. */
  schedule = isl_schedule_dup(group->io_schedule);
  node = isl_schedule_get_root(schedule);
  isl_schedule_free(schedule);

  /* Compute the union of domains of all the array references in the group. */
  group_domain_filter = compute_io_group_domain(node, group, kernel, gen, module->in);
  group_domain_filter = extend_io_group_domain(group_domain_filter, node, group, kernel, module->level);
  group_domain_filter_level = compute_io_group_domain_at_level(group_domain_filter, node, group, kernel, module->level);

  /* Delete the nodes above the array mark, stopping at the kernel mark or
   * at a context node.
   */
  node = autosa_tree_move_down_to_array(node, kernel->core);
  node = isl_schedule_node_parent(node);
  while (!autosa_tree_node_is_kernel(node) &&
         isl_schedule_node_get_type(node) != isl_schedule_node_context)
  {
    node = isl_schedule_node_delete(node);
    node = isl_schedule_node_parent(node);
  }

  node = autosa_tree_move_up_to_kernel(node);

  if (boundary && (module->level <= group->space_dim))
  {
    /* Collect the filters for the boundary and non-boundary I/O module. */
    node = autosa_tree_move_down_to_io_mark(node, kernel->core, module->level);
    node = isl_schedule_node_parent(node);
    if (isl_schedule_node_get_type(node) == isl_schedule_node_band)
    {
      isl_union_set *boundary_filter, *non_boundary_filter;
      isl_union_set_list *boundary_filters = NULL;
      isl_schedule_node *probe;

      /* Test if the band only contains one element, looking at the band
       * restricted to the group domain at this level when available.
       */
      probe = isl_schedule_node_copy(node);
      if (group_domain_filter_level)
      {
        probe = isl_schedule_node_insert_filter(probe, isl_union_set_copy(group_domain_filter_level));
        probe = isl_schedule_node_child(probe, 0);
      }
      single_ele = get_band_single_schedule_val(probe);
      if (single_ele == -1)
      {
        boundary_filter = schedule_eq_ub(probe);
        non_boundary_filter = schedule_neq_ub(probe);
        /* Non-boundary filter first, boundary filter second. */
        boundary_filters = isl_union_set_list_from_union_set(non_boundary_filter);
        boundary_filters = isl_union_set_list_add(boundary_filters, boundary_filter);
      }
      isl_schedule_node_free(probe);

      node = isl_schedule_node_child(node, 0); // io_mark
      node = isl_schedule_node_child(node, 0); // band
      if (single_ele == -1)
      {
        node = isl_schedule_node_insert_sequence(node, boundary_filters);
        /* The node now is right below the io_[module->level] mark. */
      }
      else
      {
        node = isl_schedule_node_insert_filter(node, isl_union_set_copy(group_domain_filter_level));
        node = isl_schedule_node_child(node, 0); // band
      }
    }

    if (single_ele == -1)
    {
      node = isl_schedule_node_child(node, 0); // filter
      node = isl_schedule_node_child(node, 0); // band

      /* non-boundary */
      node = io_gen_ext_module(node, module, kernel, group, 0);
      node = autosa_tree_move_down_to_io_mark(node, kernel->core, module->level);
      node = isl_schedule_node_child(node, 0); // sequence
      node = isl_schedule_node_child(node, 1); // filter
      node = isl_schedule_node_child(node, 0); // band
    }

    /* boundary */
    node = io_gen_ext_module(node, module, kernel, group, 1);
  }
  else
  {
    node = autosa_tree_move_down_to_io_mark(node, kernel->core, module->level);
    node = isl_schedule_node_child(node, 0);
    node = io_gen_ext_module(node, module, kernel, group, 0);
  }

  /* Cleanup the schedule tree. Remove "array" and "io_LX" mark.
   */
  node = autosa_tree_move_down_to_io_mark(node, kernel->core, module->level);
  node = isl_schedule_node_delete(node);
  node = autosa_tree_move_up_to_array(node);
  node = isl_schedule_node_delete(node);
  node = autosa_tree_move_up_to_kernel(node);

  /* Add module mark after the kernel mark. */
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_mark(node, isl_id_alloc(ctx, "module", module));

  schedule = isl_schedule_node_get_schedule(node);
  isl_schedule_node_free(node);
  isl_union_set_free(group_domain_filter);
  isl_union_set_free(group_domain_filter_level);

  /* Record the schedule in the top module. */
  top->n_ext_module++;
  top->ext_module_scheds = (isl_schedule **)realloc(top->ext_module_scheds,
                                                    top->n_ext_module * sizeof(isl_schedule *));
  top->ext_module_scheds[top->n_ext_module - 1] = schedule;

  return isl_stat_ok;
}

/* Generate the module calls for the io module.
 * If serialize is set as 1, we are generating the extra serialization module.
 *
 * The resulting schedule is appended to top->module_call_scheds and
 * top->n_module_calls is incremented.
 *
 * Always returns isl_stat_ok.
 */
static isl_stat top_module_io_gen_module_call(
    struct autosa_gen *gen, struct autosa_hw_top_module *top,
    struct autosa_hw_module *module,
    struct autosa_array_ref_group *group,
    int serialize)
{
  struct autosa_kernel *kernel = gen->kernel;
  isl_ctx *ctx = gen->ctx;
  isl_schedule *schedule;
  isl_schedule_node *node;
  isl_union_set *group_domain_filter, *group_domain_filter_level;
  int boundary = module->boundary;
  /* Value returned by get_band_single_schedule_val(); -1 is treated as
   * "the band does not reduce to a single element".
   */
  int single_ele = -1;

  /* Work on a private copy of the io schedule. */
  schedule = isl_schedule_dup(group->io_schedule);
  node = isl_schedule_get_root(schedule);
  isl_schedule_free(schedule);

  /* Compute the union of domains of all the array references in the group. */
  group_domain_filter = compute_io_group_domain(node, group, kernel, gen, module->in);
  group_domain_filter = extend_io_group_domain(group_domain_filter, node, group, kernel, module->level);
  group_domain_filter_level = compute_io_group_domain_at_level(group_domain_filter, node, group, kernel, module->level);

  /* Delete the nodes above the array mark, stopping at the kernel mark or
   * at a context node.
   */
  node = autosa_tree_move_down_to_array(node, kernel->core);
  node = isl_schedule_node_parent(node);
  while (!autosa_tree_node_is_kernel(node) &&
         isl_schedule_node_get_type(node) != isl_schedule_node_context)
  {
    node = isl_schedule_node_delete(node);
    node = isl_schedule_node_parent(node);
  }

  node = autosa_tree_move_up_to_kernel(node);

  if (boundary && (module->level <= group->space_dim))
  {
    /* Collect the filters for the boundary and non-boundary I/O module. */
    node = autosa_tree_move_down_to_io_mark(node, kernel->core, module->level);
    node = isl_schedule_node_parent(node);
    if (isl_schedule_node_get_type(node) == isl_schedule_node_band)
    {
      isl_union_set *boundary_filter, *non_boundary_filter;
      isl_union_set_list *boundary_filters = NULL;
      isl_schedule_node *probe;

      /* Test if the band only contains one element, looking at the band
       * restricted to the group domain at this level when available.
       */
      probe = isl_schedule_node_copy(node);
      if (group_domain_filter_level)
      {
        probe = isl_schedule_node_insert_filter(probe, isl_union_set_copy(group_domain_filter_level));
        probe = isl_schedule_node_child(probe, 0);
      }
      single_ele = get_band_single_schedule_val(probe);
      if (single_ele == -1)
      {
        boundary_filter = schedule_eq_ub(probe);
        non_boundary_filter = schedule_neq_ub(probe);
        /* Non-boundary filter first, boundary filter second. */
        boundary_filters = isl_union_set_list_from_union_set(non_boundary_filter);
        boundary_filters = isl_union_set_list_add(boundary_filters, boundary_filter);
      }
      isl_schedule_node_free(probe);

      node = isl_schedule_node_child(node, 0); // io_mark
      node = isl_schedule_node_child(node, 0); // band
      if (single_ele == -1)
      {
        node = isl_schedule_node_insert_sequence(node, boundary_filters);
        /* The node now is right below the io_[module->level] mark. */
      }
      else
      {
        node = isl_schedule_node_insert_filter(node, isl_union_set_copy(group_domain_filter_level));
        node = isl_schedule_node_child(node, 0); // band
      }
    }

    if (single_ele == -1)
    {
      node = isl_schedule_node_child(node, 0); // filter
      node = isl_schedule_node_child(node, 0); // band

      /* non-boundary */
      node = io_gen_module_call(node, module, kernel, group, 0, serialize, isl_union_set_copy(group_domain_filter));
      node = autosa_tree_move_down_to_io_mark(node, kernel->core, module->level);
      node = isl_schedule_node_child(node, 0); // sequence
      node = isl_schedule_node_child(node, 1); // filter
      node = isl_schedule_node_child(node, 0); // band
    }

    /* boundary */
    node = io_gen_module_call(node, module, kernel, group, 1, serialize, isl_union_set_copy(group_domain_filter));
  }
  else
  {
    node = autosa_tree_move_down_to_io_mark(node, kernel->core, module->level);
    node = isl_schedule_node_child(node, 0);
    node = io_gen_module_call(node, module, kernel, group, boundary, serialize, isl_union_set_copy(group_domain_filter));
  }

  /* Add module mark after the kernel mark. */
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_mark(node, isl_id_alloc(ctx, "module", module));

  schedule = isl_schedule_node_get_schedule(node);
  isl_schedule_node_free(node);
  isl_union_set_free(group_domain_filter);
  isl_union_set_free(group_domain_filter_level);

  /* Record the schedule in the top module. */
  top->n_module_calls++;
  top->module_call_scheds = (isl_schedule **)realloc(top->module_call_scheds,
                                                     top->n_module_calls * sizeof(isl_schedule *));
  top->module_call_scheds[top->n_module_calls - 1] = schedule;

  return isl_stat_ok;
}

/* Generate fifo decls for the I/O module.
 * Currently only works for filter I/O modules.
 *
 * Modules connected to the external memory (module->to_mem set) are skipped.
 * Otherwise one schedule is appended to top->fifo_decl_scheds together with
 * a matching entry in top->fifo_decl_names, and top->n_fifo_decls is
 * incremented.
 *
 * Always returns isl_stat_ok.
 */
static isl_stat top_module_io_gen_fifo_decl(struct autosa_gen *gen,
                                            struct autosa_hw_top_module *top,
                                            struct autosa_hw_module *module, struct autosa_array_ref_group *group)
{
  struct autosa_kernel *kernel = gen->kernel;
  isl_ctx *ctx = gen->ctx;
  isl_schedule *schedule;
  isl_schedule_node *node, *graft;
  isl_union_set *filter = NULL, *empty_filter;
  int has_boundary_filter = 0;
  isl_printer *p_str;
  char *stmt_name;
  isl_space *space;
  isl_id *id;

  if (module->to_mem)
    return isl_stat_ok;

  /* Work on a private copy of the io schedule. */
  schedule = isl_schedule_dup(group->io_schedule);
  node = isl_schedule_get_root(schedule);
  isl_schedule_free(schedule);

  /* Delete the node above the array mark. */
  node = autosa_tree_move_down_to_array(node, kernel->core);
  for (node = isl_schedule_node_parent(node);
       !autosa_tree_node_is_kernel(node);
       node = isl_schedule_node_parent(node))
  {
    node = isl_schedule_node_delete(node);
  }

  /* If the io mark sits directly below a band, remember the filter
   * returned by schedule_eq_ub() for that band.
   */
  node = autosa_tree_move_down_to_io_mark(node, kernel->core, module->level);
  node = isl_schedule_node_parent(node);
  if (isl_schedule_node_get_type(node) == isl_schedule_node_band)
  {
    filter = schedule_eq_ub(node);
    has_boundary_filter = 1;
  }
  node = autosa_tree_move_up_to_array(node);
  node = autosa_tree_move_down_to_io_mark(node, kernel->core, module->level);
  node = isl_schedule_node_cut(node);

  /* Graft the "fifo_decl.[fifo_name]" statement before the io mark. */
  p_str = isl_printer_to_str(ctx);
  p_str = isl_printer_print_str(p_str, "fifo_decl.");
  p_str = autosa_array_ref_group_print_fifo_name(group, p_str);
  stmt_name = isl_printer_get_str(p_str);
  isl_printer_free(p_str);
  space = isl_space_set_alloc(ctx, 0, 0);
  id = isl_id_alloc(ctx, stmt_name, group);
  space = isl_space_set_tuple_id(space, isl_dim_set, id);
  free(stmt_name);
  graft = isl_schedule_node_from_domain(
      isl_union_set_from_set(isl_set_universe(space)));

  node = isl_schedule_node_graft_before(node, graft);

  if (has_boundary_filter)
  {
    isl_union_map *prefix, *extension, *umap;
    isl_union_set *ext_domain, *ext_range;
    isl_set *set;
    isl_map *map;
    isl_multi_union_pw_aff *mupa;

    node = isl_schedule_node_insert_filter(node, filter);
    node = isl_schedule_node_child(node, 0);

    /* The boundary statement is attached to the range of the prefix
     * schedule below the filter.
     */
    prefix = isl_schedule_node_get_prefix_schedule_relation(node);
    prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix,
                                                              isl_union_pw_multi_aff_copy(kernel->contraction));
    ext_domain = isl_union_map_range(prefix);

    p_str = isl_printer_to_str(ctx);
    p_str = isl_printer_print_str(p_str, "fifo_decl_boundary.");
    p_str = autosa_array_ref_group_print_fifo_name(group, p_str);
    stmt_name = isl_printer_get_str(p_str);
    isl_printer_free(p_str);
    space = isl_space_set_alloc(ctx, 0, 1);
    id = isl_id_alloc(ctx, stmt_name, group);
    space = isl_space_set_tuple_id(space, isl_dim_set, id);
    free(stmt_name);

    set = isl_set_from_point(isl_point_zero(space));
    ext_range = isl_union_set_from_set(isl_set_copy(set));

    extension = isl_union_map_from_domain_and_range(ext_domain, ext_range);
    graft = isl_schedule_node_from_extension(extension);

    /* Schedule the new statement by its own (anonymous) identity map. */
    map = isl_set_identity(set);
    map = isl_map_reset_tuple_id(map, isl_dim_out);
    umap = isl_union_map_from_map(map);
    mupa = isl_multi_union_pw_aff_from_union_map(umap);

    graft = isl_schedule_node_child(graft, 0);
    graft = isl_schedule_node_insert_partial_schedule(graft, mupa);

    /* Walk back up to the root of the graft before attaching it. */
    while (graft && isl_schedule_node_has_parent(graft))
      graft = isl_schedule_node_parent(graft);

    node = isl_schedule_node_graft_before(node, graft);
  }

  /* Insert an empty filter. */
  empty_filter = isl_union_set_from_set(isl_set_empty(
      isl_set_get_space(kernel->context)));
  node = isl_schedule_node_insert_filter(node, empty_filter);

  /* Add module mark after the kernel mark. */
  id = isl_id_alloc(ctx, "module", module);
  node = autosa_tree_move_up_to_kernel(node);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_mark(node, id);

  schedule = isl_schedule_node_get_schedule(node);
  isl_schedule_node_free(node);

  /* Record the schedule and reserve the matching name slot. */
  top->n_fifo_decls++;
  top->fifo_decl_scheds = (isl_schedule **)realloc(top->fifo_decl_scheds,
                                                   top->n_fifo_decls * sizeof(isl_schedule *));
  top->fifo_decl_scheds[top->n_fifo_decls - 1] = schedule;
  top->fifo_decl_names = (char **)realloc(top->fifo_decl_names,
                                          top->n_fifo_decls * sizeof(char *));

  /* Generate fifo_decl name in the format of
   * [fifo_name]_[module_name].[fifo_width]
   * where fifo_width = group->array->size * number of lanes
   * (in bytes, assuming array->size is the element size in bytes).
   */
  p_str = isl_printer_to_str(ctx);
  p_str = autosa_array_ref_group_print_fifo_name(group, p_str);
  p_str = isl_printer_print_str(p_str, "_");
  p_str = isl_printer_print_str(p_str, module->name);
  p_str = isl_printer_print_str(p_str, ".");
  int n_lane = get_io_group_n_lane(module, NULL, group);
  int width = group->array->size * n_lane;
  p_str = isl_printer_print_int(p_str, width);
  top->fifo_decl_names[top->n_fifo_decls - 1] = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  return isl_stat_ok;
}

/* Generate the module calls and fifo decls for the io group.
 *
 * "module" must own exactly one io group. For a serialized module an extra
 * module-call schedule (serialize == 1) is emitted: before the regular call
 * when the module is an input module, after it otherwise.
 * For the Intel OpenCL target the external memory module schedule is
 * generated as well.
 *
 * Always returns isl_stat_ok.
 */
static isl_stat top_module_io_gen(struct autosa_gen *gen,
                                  struct autosa_hw_top_module *top,
                                  struct autosa_hw_module *module)
{
  struct autosa_array_ref_group *group;

  assert(module->n_io_group == 1);
  group = module->io_groups[0];

  /* Generate the function call schedule. */
  if (module->is_serialized)
  {
    /* Generate an extra function call schedule for the serialize module. */
    if (module->in)
      top_module_io_gen_module_call(gen, top, module, group, 1);
  }
  top_module_io_gen_module_call(gen, top, module, group, 0);
  if (module->is_serialized)
  {
    /* Generate an extra function call schedule for the serialize module. */
    if (!module->in)
      top_module_io_gen_module_call(gen, top, module, group, 1);
  }

  /* Generate the fifo declaration schedule. */
  top_module_io_gen_fifo_decl(gen, top, module, group);

  /* Generate the external memory module arguments setting schedule. */
  if (gen->options->target == AUTOSA_TARGET_INTEL_OPENCL)
    top_module_io_gen_ext_module(gen, top, module, group);

  return isl_stat_ok;
}

/* Generate the top module that contains module calls and fifo declarations.
 *
 * The returned top module shares gen->hw_modules and gen->kernel with "gen".
 * Every hardware module is dispatched to the PE or the I/O generator
 * according to its type.
 */
__isl_give struct autosa_hw_top_module *sa_top_module_gen(struct autosa_gen *gen)
{
  struct autosa_hw_top_module *top = autosa_hw_top_module_alloc();
  int idx;

  top->hw_modules = gen->hw_modules;
  top->kernel = gen->kernel;
  top->n_hw_modules = gen->n_hw_modules;

  for (idx = 0; idx < gen->n_hw_modules; idx++)
  {
    struct autosa_hw_module *module = gen->hw_modules[idx];

    if (module->type != PE_MODULE)
      top_module_io_gen(gen, top, module);
    else
      top_module_pe_gen(gen, top, module);
  }

  return top;
}

/* Build new schedules for each hardware components.
 * The total number of schedules =
 * [1. the default schedule (CPU code)]
 * 2. PE schedule
 * 3. I/O module schedule
 * 4. drain module schedule
 * 5. top module schedule
 *
 * "schedule" is stored in gen->schedule. The generated modules are stored
 * in gen->hw_modules (gen->n_hw_modules entries), the top module in
 * gen->hw_top_module, and the drain merge functions are appended to
 * gen->drain_merge_funcs.
 */
void generate_hw_modules(__isl_take isl_schedule *schedule,
                         struct autosa_gen *gen, struct autosa_kernel *kernel)
{
  gen->schedule = schedule;
  /* Slot 0 is reserved for the PE module, which is generated after the
   * I/O and drain modules below.
   */
  gen->n_hw_modules = 1;
  gen->hw_modules = isl_calloc_array(gen->ctx,
                                     struct autosa_hw_module *, gen->n_hw_modules);
  gen->hw_modules[0] = NULL;

  /* IO module */
  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *info = &kernel->array[i];
    info->n_io_group_refs = 0;
    for (int j = 0; j < info->n_io_group; j++)
    {
      int n_new = 0;
      struct autosa_hw_module **new_modules;
      new_modules = sa_io_module_gen(info->io_groups[j], gen, &n_new, 1, 1);

      if (n_new > 0)
      {
        /* Size the array by its own element type rather than by an
         * unrelated struct name.
         */
        gen->hw_modules = (struct autosa_hw_module **)realloc(gen->hw_modules,
                                                              (gen->n_hw_modules + n_new) * sizeof(*gen->hw_modules));
        for (int k = 0; k < n_new; k++)
        {
          gen->hw_modules[gen->n_hw_modules + k] = new_modules[k];
        }
        gen->n_hw_modules += n_new;
      }
      /* Only the temporary array is released; the modules themselves are
       * now referenced from gen->hw_modules.
       */
      free(new_modules);
    }
  }

  /* Drain module */
  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *info = &kernel->array[i];
    if (!info->drain_group)
      continue;
    int n_new = 0;
    struct autosa_hw_module **new_modules;
    new_modules = sa_io_module_gen(info->drain_group, gen, &n_new, 0, 1);

    if (n_new > 0)
    {
      gen->hw_modules = (struct autosa_hw_module **)realloc(gen->hw_modules,
                                                            (gen->n_hw_modules + n_new) * sizeof(*gen->hw_modules));
      for (int j = 0; j < n_new; j++)
      {
        gen->hw_modules[gen->n_hw_modules + j] = new_modules[j];
      }
      gen->n_hw_modules += n_new;
    }
    free(new_modules);
  }

  /* PE module */
  gen->hw_modules[0] = sa_pe_module_gen(gen);

  /* Reorder the sequence of the modules. */
  gen->hw_modules = hw_module_reorder(gen->hw_modules, gen->n_hw_modules);

  /* top module */
  struct autosa_hw_top_module *top_module = sa_top_module_gen(gen);
  gen->hw_top_module = top_module;

  /* Generate drain merge functions.
   * Arrays without a drain group, or with a single memory port, are skipped.
   */
  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *info = &kernel->array[i];
    if (!info->drain_group || info->n_mem_ports == 1)
      continue;
    struct autosa_drain_merge_func *func =
        generate_drain_merge_func(info->drain_group, kernel, gen);
    gen->drain_merge_funcs = (struct autosa_drain_merge_func **)realloc(
        gen->drain_merge_funcs, (gen->n_drain_merge_funcs + 1) *
                                    sizeof(*gen->drain_merge_funcs));
    gen->drain_merge_funcs[gen->n_drain_merge_funcs] = func;
    gen->n_drain_merge_funcs++;
  }
}

/* Over-approximate the range of "copy" array by array.
 *
 * For every array of "prog", the elements of "copy" whose range lies in
 * the space of that array are replaced by a relation from the same domain
 * elements to all array elements (defined by the extent of the array).
 * The union of these relations is returned; "copy" is consumed.
 */
static __isl_give isl_union_map *approximate_copy_out(
    __isl_take isl_union_map *copy, struct autosa_prog *prog)
{
  isl_union_map *approx = isl_union_map_empty(isl_union_map_get_space(copy));
  int idx;

  for (idx = 0; idx < prog->n_array; ++idx)
  {
    isl_union_set *array_universe, *full_extent, *sources;
    isl_union_map *touching;

    /* Keep only the part of "copy" that refers to this array. */
    array_universe = isl_union_set_from_set(
        isl_set_universe(isl_space_copy(prog->array[idx].space)));
    touching = isl_union_map_intersect_range(isl_union_map_copy(copy),
                                             array_universe);

    /* Map the corresponding domain elements to the whole array extent. */
    full_extent = isl_union_set_from_set(
        isl_set_copy(prog->array[idx].extent));
    sources = isl_union_map_domain(touching);
    approx = isl_union_map_union(
        approx, isl_union_map_from_domain_and_range(sources, full_extent));
  }

  isl_union_map_free(copy);

  return approx;
}

/* Internal data structure for node_may_persist.
 *
 * "tagger" maps tagged iteration domains to the corresponding untagged
 *	iteration domain.  node_may_persist stores the pointer held by
 *	the scop here without taking a copy, so users copy it before
 *	handing it to an isl function that consumes its argument.
 *
 * "may_persist_flow" is the set of all tagged dataflow dependences
 * with those dependences removed that either precede or follow
 * the kernel launch in a sequence.
 * "inner_band_flow" is the set of all tagged dataflow dependences
 * that are local to a given iteration of the outer band nodes
 * with respect to the current node.
 * "local_flow" is equal to "inner_band_flow", except that the domain
 * and the range have been intersected with intermediate filters
 * on children of sets or sequences.
 */
struct ppcg_may_persist_data
{
  /* Tagged-to-untagged mapping of iteration domains (borrowed). */
  isl_union_pw_multi_aff *tagger;

  /* Dependences local to the outer bands and restricted by filters. */
  isl_union_map *local_flow;
  /* Value of "local_flow" at the innermost band ancestor seen so far. */
  isl_union_map *inner_band_flow;
  /* Dependences that may still cross a kernel launch. */
  isl_union_map *may_persist_flow;
};

/* Account for the band ancestor "node" in "data".
 *
 * The partial schedule of the band is expressed in terms of the tagged
 * leaf instances (through the subtree contraction and data->tagger) and
 * data->local_flow is then reduced to the dependences whose source and
 * sink are scheduled in the same iteration of the band.
 * data->inner_band_flow is replaced by a copy of the new data->local_flow.
 *
 * A band without members does not restrict anything and is ignored.
 * Always returns 0.
 */
static int update_may_persist_at_band(__isl_keep isl_schedule_node *node,
                                      struct ppcg_may_persist_data *data)
{
  isl_multi_union_pw_aff *band_sched;

  if (isl_schedule_node_band_n_member(node) == 0)
    return 0;

  band_sched = isl_schedule_node_band_get_partial_schedule(node);
  band_sched = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(
      band_sched, isl_schedule_node_get_subtree_contraction(node));
  band_sched = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(
      band_sched, isl_union_pw_multi_aff_copy(data->tagger));

  data->local_flow =
      isl_union_map_eq_at_multi_union_pw_aff(data->local_flow, band_sched);

  isl_union_map_free(data->inner_band_flow);
  data->inner_band_flow = isl_union_map_copy(data->local_flow);

  return 0;
}

/* Turn the local reaching domain elements "domain" into tagged leaf
 * domain elements: first take the preimage under "contraction" to obtain
 * the leaf domain elements and then the preimage under data->tagger
 * to introduce the array reference tags.
 * Both "domain" and "contraction" are consumed.
 */
static __isl_give isl_union_set *expand_and_tag(
    __isl_take isl_union_set *domain,
    __isl_take isl_union_pw_multi_aff *contraction,
    struct ppcg_may_persist_data *data)
{
  isl_union_set *leaves;
  isl_union_pw_multi_aff *tagger;

  tagger = isl_union_pw_multi_aff_copy(data->tagger);
  leaves = isl_union_set_preimage_union_pw_multi_aff(domain, contraction);

  return isl_union_set_preimage_union_pw_multi_aff(leaves, tagger);
}

/* Restrict data->local_flow to the dependences that have both their
 * source and their sink inside the filter of "node", a filter node
 * that is the child of a set or sequence node.
 * "contraction" maps the leaf domain elements of the schedule tree
 * to the corresponding domain elements at (the parent of) "node";
 * it is consumed.
 * Always returns 0.
 */
static int filter_flow(__isl_keep isl_schedule_node *node,
                       struct ppcg_may_persist_data *data,
                       __isl_take isl_union_pw_multi_aff *contraction)
{
  isl_union_set *tagged_filter;

  tagged_filter = expand_and_tag(isl_schedule_node_filter_get_filter(node),
                                 contraction, data);
  data->local_flow = isl_union_map_intersect_domain(
      data->local_flow, isl_union_set_copy(tagged_filter));
  data->local_flow = isl_union_map_intersect_range(data->local_flow,
                                                   tagged_filter);

  return 0;
}

/* Extend "filters" with the filters of all siblings that precede
 * the filter node "node" (these siblings are filter nodes as well)
 * and return the result.
 * If walking the siblings fails, "filters" is freed and NULL is returned.
 */
static __isl_give isl_union_set *add_previous_filters(
    __isl_take isl_union_set *filters, __isl_keep isl_schedule_node *node)
{
  isl_schedule_node *cursor = isl_schedule_node_copy(node);

  while (cursor && isl_schedule_node_has_previous_sibling(cursor))
  {
    cursor = isl_schedule_node_previous_sibling(cursor);
    filters = isl_union_set_union(
        filters, isl_schedule_node_filter_get_filter(cursor));
  }

  if (!cursor)
    return isl_union_set_free(filters);

  isl_schedule_node_free(cursor);

  return filters;
}

/* Extend "filters" with the filters of all siblings that follow
 * the filter node "node" (these siblings are filter nodes as well)
 * and return the result.
 * If walking the siblings fails, "filters" is freed and NULL is returned.
 */
static __isl_give isl_union_set *add_next_filters(
    __isl_take isl_union_set *filters, __isl_keep isl_schedule_node *node)
{
  isl_schedule_node *cursor = isl_schedule_node_copy(node);

  while (cursor && isl_schedule_node_has_next_sibling(cursor))
  {
    cursor = isl_schedule_node_next_sibling(cursor);
    filters = isl_union_set_union(
        filters, isl_schedule_node_filter_get_filter(cursor));
  }

  if (!cursor)
    return isl_union_set_free(filters);

  isl_schedule_node_free(cursor);

  return filters;
}

/* Drop from data->may_persist_flow the dependences of data->local_flow
 * (i.e., those within the same iteration of all outer band nodes)
 * that have both their source and their sink in "domain".
 * "contraction" maps the leaf domain elements of the schedule tree
 * to the corresponding elements of "domain"; it is only borrowed,
 * while "domain" is consumed.
 */
static void remove_external_flow(struct ppcg_may_persist_data *data,
                                 __isl_take isl_union_set *domain,
                                 __isl_keep isl_union_pw_multi_aff *contraction)
{
  isl_union_set *tagged;
  isl_union_map *internal;

  tagged = expand_and_tag(domain, isl_union_pw_multi_aff_copy(contraction),
                          data);

  internal = isl_union_map_intersect_domain(
      isl_union_map_copy(data->local_flow), isl_union_set_copy(tagged));
  internal = isl_union_map_intersect_range(internal, tagged);

  data->may_persist_flow =
      isl_union_map_subtract(data->may_persist_flow, internal);
}

/* Account for the filter ancestor "node" in "data".
 * Nothing changes unless the filter is the child of a set or
 * a sequence node.
 *
 * Below a sequence, the dependences between statement instances that
 * are both executed before, or both executed after, the subtree that
 * will be mapped to a kernel (within the same iteration of the outer
 * bands) are removed from data->may_persist_flow.
 *
 * Below both a set and a sequence, data->local_flow is restricted
 * to the current child.
 */
static int update_may_persist_at_filter(__isl_keep isl_schedule_node *node,
                                        struct ppcg_may_persist_data *data)
{
  enum isl_schedule_node_type parent_type;
  isl_schedule_node *parent;
  isl_union_pw_multi_aff *contraction;

  parent_type = isl_schedule_node_get_parent_type(node);
  if (parent_type != isl_schedule_node_sequence &&
      parent_type != isl_schedule_node_set)
    return 0;

  parent = isl_schedule_node_parent(isl_schedule_node_copy(node));
  contraction = isl_schedule_node_get_subtree_contraction(parent);
  isl_schedule_node_free(parent);

  if (parent_type == isl_schedule_node_sequence)
  {
    isl_union_set *own, *earlier, *later;

    /* The filter of "node" is only needed for its parameter space. */
    own = isl_schedule_node_filter_get_filter(node);
    earlier = isl_union_set_empty(isl_union_set_get_space(own));
    isl_union_set_free(own);
    later = isl_union_set_copy(earlier);

    earlier = add_previous_filters(earlier, node);
    later = add_next_filters(later, node);

    remove_external_flow(data, earlier, contraction);
    remove_external_flow(data, later, contraction);
  }

  /* filter_flow takes over the ownership of "contraction". */
  return filter_flow(node, data, contraction);
}

/* Callback for isl_schedule_node_foreach_ancestor_top_down:
 * account for the ancestor "node" in the ppcg_may_persist_data
 * pointed to by "user".
 * Only band and filter nodes carry information; every other valid
 * node type is skipped.
 */
static isl_stat update_may_persist_at(__isl_keep isl_schedule_node *node,
                                      void *user)
{
  struct ppcg_may_persist_data *data = (struct ppcg_may_persist_data *)user;
  enum isl_schedule_node_type kind = isl_schedule_node_get_type(node);
  int status = 0;

  if (kind == isl_schedule_node_error)
    return isl_stat_error;

  if (kind == isl_schedule_node_band)
    status = update_may_persist_at_band(node, data);
  else if (kind == isl_schedule_node_filter)
    status = update_may_persist_at_filter(node, data);

  return status < 0 ? isl_stat_error : isl_stat_ok;
}

/* Determine the set of array elements that may need to be preserved
 * by a kernel constructed from the subtree at "node".
 * This includes the set of array elements that may need to be preserved
 * by the entire scop (prog->may_persist) and the elements for which
 * there is a potential flow dependence that may cross a kernel launch.
 *
 * To determine the second set, we start from all flow dependences.
 * From this set of dependences, we remove those that cannot possibly
 * require data to be preserved by a kernel launch.
 * In particular, we consider the following sets of dependences.
 * 1. dependences of which the write occurs inside the kernel.
 *    If the data is needed outside the kernel, then it will
 *    be copied out immediately after the kernel launch, so there
 *    is no need for any special care.
 * 2. dependences of which the read occurs inside the kernel and the
 *    corresponding write occurs inside the same iteration of the
 *    outer band nodes.  This means that the data is needed in
 *    the first kernel launch after the write, which is already
 *    taken care of by the standard copy-in.  That is, the data
 *    do not need to be preserved by any intermediate call to
 *    the same kernel.
 * 3. dependences of which the write and the read either both occur
 *    before the kernel launch or both occur after the kernel launch,
 *    within the same iteration of the outer band nodes with respect
 *    to the sequence that determines the ordering of the dependence
 *    and the kernel launch.  Such flow dependences cannot cross
 *    any kernel launch.
 *
 * For the remaining (tagged) dependences, we take the domain
 * (i.e., the tagged writes) and apply the tagged access relation
 * to obtain the accessed data elements.
 * These are then combined with the elements that may need to be
 * preserved by the entire scop.
 */
static __isl_give isl_union_set *node_may_persist(
    __isl_keep isl_schedule_node *node, struct autosa_prog *prog)
{
  struct ppcg_may_persist_data data;
  isl_union_set *kernel_insts, *persist;
  isl_union_map *crossing, *same_iter;

  /* The tagger is borrowed from the scop, not copied. */
  data.tagger = prog->scop->tagger;
  data.may_persist_flow = isl_union_map_copy(prog->scop->tagged_dep_flow);
  data.local_flow = isl_union_map_copy(data.may_persist_flow);
  data.inner_band_flow = isl_union_map_copy(data.may_persist_flow);

  /* Case 3 is handled while walking down the ancestors of "node". */
  if (isl_schedule_node_foreach_ancestor_top_down(node,
                                                  &update_may_persist_at, &data) < 0)
    data.may_persist_flow = isl_union_map_free(data.may_persist_flow);
  isl_union_map_free(data.local_flow);
  crossing = data.may_persist_flow;

  /* Tagged leaf instances executed by the subtree at "node". */
  kernel_insts = expand_and_tag(isl_schedule_node_get_domain(node),
                                isl_schedule_node_get_subtree_contraction(node),
                                &data);

  /* Subtract case 1: the write occurs inside the kernel. */
  crossing = isl_union_map_subtract_domain(crossing,
                                           isl_union_set_copy(kernel_insts));
  /* Subtract case 2: the read occurs inside the kernel and the write
   * in the same iteration of the outer bands. */
  same_iter = isl_union_map_intersect_range(data.inner_band_flow,
                                            kernel_insts);
  crossing = isl_union_map_subtract(crossing, same_iter);

  persist = isl_union_map_domain(crossing);
  persist = isl_union_set_apply(
      persist, isl_union_map_copy(prog->scop->tagged_may_writes));

  return isl_union_set_union(persist,
                             isl_union_set_copy(prog->may_persist));
}

/* Collect (the universe sets of the spaces of) the arrays of "prog"
 * that are marked local and for which every potential write
 * in prog->may_write originates from an element of "domain".
 * Return NULL if the subset test fails.
 */
static __isl_give isl_union_set *extract_local_accesses(struct autosa_prog *prog,
                                                        __isl_keep isl_union_set *domain)
{
  isl_union_set *local;

  local = isl_union_set_empty(isl_union_set_get_space(domain));

  for (int idx = 0; idx < prog->n_array; ++idx)
  {
    isl_set *universe;
    isl_union_set *fields, *writers;
    isl_union_map *selected;
    int subset;

    if (!prog->array[idx].local)
      continue;

    universe = isl_set_universe(isl_space_copy(prog->array[idx].space));

    /* Elements that prog->to_outer maps to this array. */
    selected = isl_union_map_intersect_range(
        isl_union_map_copy(prog->to_outer),
        isl_union_set_from_set(isl_set_copy(universe)));
    fields = isl_union_map_domain(selected);

    /* Statement instances that may write any of those elements. */
    selected = isl_union_map_intersect_range(
        isl_union_map_copy(prog->may_write), fields);
    writers = isl_union_map_domain(selected);

    subset = isl_union_set_is_subset(writers, domain);
    isl_union_set_free(writers);

    if (subset > 0)
    {
      local = isl_union_set_add_set(local, universe);
      continue;
    }

    isl_set_free(universe);
    if (subset < 0)
      return isl_union_set_free(local);
  }

  return local;
}

/* Build the list of copy statements for the arrays of "prog".
 *
 * Every array that is not a read-only scalar and that has an element
 * in "accessed" contributes a zero-dimensional universe set whose tuple
 * identifier is named by concatenating "prefix" and the array name
 * and whose user pointer refers to the array (autosa_array_info).
 *
 * Each such array is marked global (together with its local_array),
 * is marked to be declared if it is local to "prog", and is marked
 * copy_in/copy_out when "prefix" is "to_device"/"from_device".
 *
 * "accessed" is consumed.  NULL is returned if an emptiness test fails.
 */
static __isl_give isl_union_set_list *create_copy_filters(struct autosa_prog *prog,
                                                          const char *prefix, __isl_take isl_union_set *accessed)
{
  isl_ctx *ctx = prog->ctx;
  isl_union_set_list *filters = isl_union_set_list_alloc(ctx, 0);

  for (int idx = 0; idx < prog->n_array; ++idx)
  {
    struct autosa_array_info *array = &prog->array[idx];
    isl_set *touched;
    isl_space *stmt_space;
    isl_id *stmt_id = NULL;
    char *stmt_name;
    int empty;

    if (autosa_array_is_read_only_scalar(array))
      continue;

    touched = isl_union_set_extract_set(accessed,
                                        isl_space_copy(array->space));
    empty = isl_set_plain_is_empty(touched);
    isl_set_free(touched);
    if (empty < 0)
    {
      filters = isl_union_set_list_free(filters);
      break;
    }
    if (empty)
      continue;

    array->global = 1;
    array->local_array->global = 1;
    if (array->local)
      array->declare_local = 1;
    if (strcmp(prefix, "to_device") == 0)
      array->copy_in = 1;
    if (strcmp(prefix, "from_device") == 0)
      array->copy_out = 1;

    stmt_name = concat(ctx, prefix, array->name);
    if (stmt_name)
      stmt_id = isl_id_alloc(ctx, stmt_name, array);
    free(stmt_name);

    stmt_space = isl_space_set_alloc(ctx, 0, 0);
    stmt_space = isl_space_set_tuple_id(stmt_space, isl_dim_set, stmt_id);
    filters = isl_union_set_list_add(
        filters, isl_union_set_from_set(isl_set_universe(stmt_space)));
  }
  isl_union_set_free(accessed);

  return filters;
}

/* Return the set of parameter values for which "array" has a strictly
 * positive size in each of its dimensions.
 * If the sizes are only valid for some parameter values, then those
 * constraints are also taken into account.
 * Return NULL if "array" is NULL.
 */
__isl_give isl_set *autosa_array_positive_size_guard(struct autosa_array_info *array)
{
  isl_set *guard;

  if (!array)
    return NULL;

  guard = isl_set_universe(isl_space_params(isl_space_copy(array->space)));

  for (int dim = 0; dim < array->n_index; ++dim)
  {
    isl_pw_aff *bound = isl_multi_pw_aff_get_pw_aff(array->bound, dim);
    isl_set *nonneg = isl_pw_aff_nonneg_set(isl_pw_aff_copy(bound));
    isl_set *zero = isl_pw_aff_zero_set(bound);

    /* positive = non-negative minus zero */
    guard = isl_set_intersect(guard, isl_set_subtract(nonneg, zero));
  }

  return guard;
}

/* Make sure that code for the statements in "filters" that
 * copy arrays to or from the device is only generated when
 * the size of the corresponding array is positive.
 * That is, add a set node underneath "graft" with "filters" as children
 * and for each child add a guard that selects the parameter
 * values for which the corresponding array has a positive size.
 * The array is available in the user pointer of the statement identifier.
 * "depth" is the schedule depth of the position where "graft"
 * will be added.
 */
static __isl_give isl_schedule_node *insert_positive_size_guards(
    __isl_take isl_schedule_node *graft,
    __isl_take isl_union_set_list *filters, int depth)
{
  int n_filter;

  graft = isl_schedule_node_child(graft, 0);
  graft = isl_schedule_node_insert_set(graft, filters);
  n_filter = isl_schedule_node_n_children(graft);

  for (int pos = 0; pos < n_filter; ++pos)
  {
    isl_set *stmt, *guard;
    isl_id *stmt_id;
    struct autosa_array_info *array;

    /* Move down to the filter and recover the array from its id. */
    graft = isl_schedule_node_child(graft, pos);
    stmt = isl_set_from_union_set(isl_schedule_node_filter_get_filter(graft));
    stmt_id = isl_set_get_tuple_id(stmt);
    array = (struct autosa_array_info *)isl_id_get_user(stmt_id);
    isl_id_free(stmt_id);
    isl_set_free(stmt);

    guard = isl_set_from_params(autosa_array_positive_size_guard(array));
    guard = isl_set_add_dims(guard, isl_dim_set, depth);

    /* Insert the guard below the filter, then return to the set node. */
    graft = isl_schedule_node_child(graft, 0);
    graft = isl_schedule_node_insert_guard(graft, guard);
    graft = isl_schedule_node_parent(isl_schedule_node_parent(graft));
  }

  return isl_schedule_node_parent(graft);
}

/* Create a graft for copying arrays to or from the device,
 * whenever the size of the array is strictly positive.
 * The statements and their identifiers are those constructed
 * by create_copy_filters from "prefix" and "copy".
 * The graft will be added at the position specified by "node".
 * "copy" contains the array elements that need to be copied; it is consumed.
 * Only arrays of which some elements need to be copied
 * will have a corresponding statement in the graft.
 * Note though that each such statement will copy the entire array.
 */
static __isl_give isl_schedule_node *create_copy_device(struct autosa_prog *prog,
                                                        __isl_keep isl_schedule_node *node, const char *prefix,
                                                        __isl_take isl_union_set *copy)
{
  isl_ctx *ctx = prog->ctx;
  int depth = isl_schedule_node_get_schedule_depth(node);
  isl_union_set_list *filters;
  isl_union_set *stmts, *outer;
  isl_space *outer_space = NULL;
  isl_schedule_node *graft;

  filters = create_copy_filters(prog, prefix, copy);
  stmts = isl_union_set_list_union(isl_union_set_list_copy(filters));

  /* The extension maps the outer schedule space to all copy statements. */
  if (depth >= 0)
    outer_space = isl_space_set_alloc(ctx, 0, depth);
  outer = isl_union_set_from_set(isl_set_universe(outer_space));
  graft = isl_schedule_node_from_extension(
      isl_union_map_from_domain_and_range(outer, stmts));

  if (!filters)
    return isl_schedule_node_free(graft);

  if (isl_union_set_list_n_union_set(filters) != 0)
    return insert_positive_size_guards(graft, filters, depth);

  /* Nothing to copy: return the bare extension. */
  isl_union_set_list_free(filters);
  return graft;
}

/* Add nodes for copying outer arrays in and out of the device
 * before and after the subtree "node", which contains one or more kernels.
 * "domain" contains the original statement instances, i.e.,
 * those that correspond to the domains of the access relations in "prog".
 * In particular, the domain has not been contracted in any way.
 * "prefix" contains the prefix schedule at that point, in terms
 * of the same original statement instances.
 * "node", "domain" and "prefix" are all consumed; the updated node
 * is returned.
 *
 * We first compute the sets of outer array elements that need
 * to be copied in and out and then graft in the nodes for
 * performing this copying.
 *
 * In particular, for each array that is possibly written anywhere in
 * the subtree "node" and that may be used after "node"
 * or that may be visible outside the corresponding scop,
 * we copy out its entire extent.
 *
 * Any array element that is read without first being written inside
 * the subtree "node" needs to be copied in.
 * Furthermore, if there are any array elements that
 * are copied out, but that may not be written inside "node", then
 * they also need to be copied in to ensure that the value after execution
 * is the same as the value before execution, at least for those array
 * elements that may have their values preserved by the scop or that
 * may be written before "node" and read after "node".
 * In case the array elements are structures, we need to take into
 * account that all members of the structures need to be written
 * by "node" before we can avoid copying the data structure in.
 *
 * Note that the may_write relation is intersected with the domain,
 * which has been intersected with the context.
 * This helps in those cases where the arrays are declared with a fixed size,
 * while the accesses are parametric and the context assigns a fixed value
 * to the parameters.
 *
 * If an element from a local array is read without first being written,
 * then there is no point in copying it in since it cannot have been
 * written prior to the scop. Warn about the uninitialized read instead.
 */
__isl_give isl_schedule_node *sa_add_to_from_device(
    __isl_take isl_schedule_node *node, __isl_take isl_union_set *domain,
    __isl_take isl_union_map *prefix, struct autosa_prog *prog)
{
  isl_union_set *local;
  isl_union_set *may_persist;
  isl_union_map *may_write, *must_write, *copy_out, *not_written;
  isl_union_map *read, *copy_in;
  isl_union_map *tagged;
  isl_union_map *local_uninitialized;
  isl_schedule_node *graft;

  /* Compute the copy-out that contains the live-out union
   * domain of non-local flow dep.
   * "tagged" is the union of the tagged reads and tagged may-writes.
   */
  tagged = isl_union_map_copy(prog->scop->tagged_reads);
  tagged = isl_union_map_union(tagged,
                               isl_union_map_copy(prog->scop->tagged_may_writes));
  may_write = isl_union_map_copy(prog->may_write);
  may_write = isl_union_map_intersect_domain(may_write,
                                             isl_union_set_copy(domain));
  /* Keep only the live-out union domain of non-local flow. */
  may_write = remove_local_accesses(prog,
                                    isl_union_map_copy(tagged), may_write,
                                    isl_union_map_copy(prefix), 0);
  /* Express the writes in terms of outer arrays and prefix schedule. */
  may_write = isl_union_map_apply_range(may_write,
                                        isl_union_map_copy(prog->to_outer));
  may_write = isl_union_map_apply_domain(may_write,
                                         isl_union_map_copy(prefix));
  /* Copy out the entire extent of every array that is touched. */
  may_write = approximate_copy_out(may_write, prog);
  copy_out = isl_union_map_copy(may_write);

  /* Compute the copy-in. */
  may_write = isl_union_map_apply_range(may_write,
                                        isl_union_map_copy(prog->to_inner));
  must_write = isl_union_map_copy(prog->must_write);
  must_write = isl_union_map_apply_domain(must_write,
                                          isl_union_map_copy(prefix));

  /* Elements that are copied out and may need to be preserved, but
   * that are not definitely written, have to be copied in as well.
   */
  may_persist = node_may_persist(node, prog);
  may_write = isl_union_map_intersect_range(may_write, may_persist);
  not_written = isl_union_map_subtract(may_write, must_write);

  /* Detect the uninitialized reads. */
  /* "local" contains (the universe spaces of) arrays that are declared
   * locally and only written by "domain". */
  local = extract_local_accesses(prog, domain);
  local = isl_union_set_apply(local, isl_union_map_copy(prog->to_inner));
  local_uninitialized = isl_union_map_copy(prog->scop->live_in);
  /* A local uninitialized read is a read of a local array without
   * first being written. */
  local_uninitialized = isl_union_map_intersect_range(local_uninitialized,
                                                      local);
  read = isl_union_map_copy(prog->read);
  /* Last use of "domain": it is passed on without taking a copy. */
  read = isl_union_map_intersect_domain(read, domain);
  /* Last use of "tagged": it is passed on without taking a copy. */
  read = remove_local_accesses(prog, tagged, read,
                               isl_union_map_copy(prefix), 1);
  local_uninitialized = isl_union_map_intersect(local_uninitialized,
                                                isl_union_map_copy(read));
  if (!isl_union_map_is_empty(local_uninitialized))
  {
    fprintf(stderr,
            "possibly uninitialized reads (not copied in):\n");
    isl_union_map_dump(local_uninitialized);
  }
  /* Do not copy in the uninitialized reads of local arrays. */
  read = isl_union_map_subtract(read, local_uninitialized);
  /* Last use of "prefix": it is passed on without taking a copy. */
  read = isl_union_map_apply_domain(read, prefix);
  copy_in = isl_union_map_union(read, not_written);
  copy_in = isl_union_map_apply_range(copy_in,
                                      isl_union_map_copy(prog->to_outer));

  /* Add in the copy-in/copy-out nodes. */
  graft = create_copy_device(prog, node, "to_device",
                             isl_union_map_range(copy_in));
  node = isl_schedule_node_graft_before(node, graft);
  graft = create_copy_device(prog, node, "from_device",
                             isl_union_map_range(copy_out));
  node = isl_schedule_node_graft_after(node, graft);

  return node;
}

/* Graft a statement called "init_device" before "node" and a statement
 * called "clear_device" after "node" for initializing and clearing
 * the device.
 * Both statements are zero-dimensional and their identifiers carry
 * "kernel" as user pointer.
 */
__isl_give isl_schedule_node *sa_add_init_clear_device(
    __isl_take isl_schedule_node *node, struct autosa_kernel *kernel)
{
  isl_ctx *ctx = isl_schedule_node_get_ctx(node);
  isl_space *stmt_space;
  isl_schedule_node *stmt;

  stmt_space = isl_space_set_alloc(ctx, 0, 0);
  stmt_space = isl_space_set_tuple_id(
      stmt_space, isl_dim_set, isl_id_alloc(ctx, "init_device", kernel));
  stmt = isl_schedule_node_from_domain(
      isl_union_set_from_set(isl_set_universe(stmt_space)));
  node = isl_schedule_node_graft_before(node, stmt);

  stmt_space = isl_space_set_alloc(ctx, 0, 0);
  stmt_space = isl_space_set_tuple_id(
      stmt_space, isl_dim_set, isl_id_alloc(ctx, "clear_device", kernel));
  stmt = isl_schedule_node_from_domain(
      isl_union_set_from_set(isl_set_universe(stmt_space)));
  node = isl_schedule_node_graft_after(node, stmt);

  return node;
}

/* Graft a zero-dimensional statement called "drain_merge" after "node"
 * for each drain merge function in "gen".
 * The identifier of the statement carries the autosa_drain_merge_func
 * as user pointer.
 * Functions whose local array has exactly one memory port are skipped.
 */
__isl_give isl_schedule_node *sa_add_drain_merge(
    __isl_take isl_schedule_node *node, struct autosa_gen *gen)
{
  isl_ctx *ctx = isl_schedule_node_get_ctx(node);

  for (int idx = 0; idx < gen->n_drain_merge_funcs; idx++)
  {
    struct autosa_drain_merge_func *func = gen->drain_merge_funcs[idx];
    struct autosa_array_ref_group *group = func->group;

    if (group->local_array->n_mem_ports != 1)
    {
      isl_space *stmt_space;
      isl_schedule_node *stmt;

      stmt_space = isl_space_set_alloc(ctx, 0, 0);
      stmt_space = isl_space_set_tuple_id(
          stmt_space, isl_dim_set, isl_id_alloc(ctx, "drain_merge", func));
      stmt = isl_schedule_node_from_domain(
          isl_union_set_from_set(isl_set_universe(stmt_space)));
      node = isl_schedule_node_graft_after(node, stmt);
    }
  }

  return node;
}

/***************************************************************
 * AST Codegen
 ***************************************************************/
/* Internal data structure for at_domain.
 * "prog" represents the entire scop.
 * "kernel" points to the kernel to which the current schedule node
 * belongs. It is set by before_mark and reset by after_mark.
 * It may be NULL if we are outside any kernel.
 *
 * NOTE(review): the remaining fields are not described by the original
 * author and their users are not visible next to this definition;
 * the per-field notes below are inferred from the names and types only
 * and should be confirmed against the AST generation callbacks.
 */
struct autosa_at_domain_data
{
  struct autosa_prog *prog;
  struct autosa_kernel *kernel;
  /* Presumably the hardware module being generated, if any -- confirm. */
  struct autosa_hw_module *module;
  /* Presumably the top module being generated, if any -- confirm. */
  struct autosa_hw_top_module *top;
  /* Presumably the PE dummy module being generated, if any -- confirm. */
  struct autosa_pe_dummy_module *pe_dummy_module;
  /* Presumably the drain merge function being generated -- confirm. */
  struct autosa_drain_merge_func *drain_merge_func;
  /* Flags; presumably select the variant of the module for which
   * the AST is generated -- confirm.
   */
  int filter_buffer;
  int boundary;
  int pe_dummy;
  /* In the tuning mode. */
  int tuning;
  /* NOTE(review): meaning not visible here; related to tuning mode. */
  int tuning_num;

  /* Under a "pipeline" mark */
  int under_pipeline;
  /* Under a "unroll" mark */
  int under_unroll;
  /* Inside a "pipeline" for loop */
  int in_pipeline_for;
  /* Inside a "unroll" for loop */
  int in_unroll_for;
  /* Inside a for loop */
  int in_for;
};

/* Internal data structure for the index and AST expression transformation
 * callbacks for pet_stmt_build_ast_exprs.
 *
 * "kernel" is the kernel for which we are computing AST expressions and
 * may be NULL if we are not inside a kernel.
 * "accesses" is the list of autosa_stmt_access in the statement.
 * "iterator_map" expresses the statement iterators in terms of
 * the AST loop iterators.
 * "sched2copy" expresses the outer copy_schedule_dim dimensions of
 * the kernel schedule in terms of the AST loop iterators and
 * may be NULL if we are not inside a kernel.
 *
 * The following fields are set in transform_index and used in transform_expr.
 * "array" is the array that is being accessed.
 * "global" is set if the global array is accessed (rather than
 * shared/private memory).
 * "local_array" refers to information on the array specialized
 * to the current kernel.
 *
 * NOTE(review): "reg" and "group" are not described by the original
 * author; presumably "reg" flags an access that is mapped to a register
 * and "group" is the array reference group of the access -- confirm
 * against transform_index.
 */
struct autosa_transform_data
{
  struct autosa_kernel *kernel;
  struct autosa_stmt_access *accesses;
  isl_pw_multi_aff *iterator_map;
  isl_pw_multi_aff *sched2copy;

  struct autosa_array_info *array;
  int global;
  int reg;
  struct autosa_local_array_info *local_array;
  struct autosa_array_ref_group *group;
};

/* Schedule tree traversal callback that keeps track of the maximal
 * schedule depth of the leaves it is called on.
 * "user" points to the running maximum (an int initialized to 0
 * by the caller).
 * Non-leaf nodes are descended into; the traversal stops at leaves.
 */
static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user)
{
  int *max_depth = (int *)user;

  if (isl_schedule_node_get_type(node) == isl_schedule_node_leaf)
  {
    int leaf_depth = isl_schedule_node_get_schedule_depth(node);

    if (*max_depth < leaf_depth)
      *max_depth = leaf_depth;

    return isl_bool_false;
  }

  return isl_bool_true;
}

/* Given a mapping "iterator_map" from the AST schedule to a domain,
 * return the corresponding mapping from the AST schedule
 * to the outer kernel->copy_schedule_dim dimensions of
 * the schedule computed by AutoSA for this kernel.
 * "iterator_map" is consumed.
 *
 * Note that kernel->copy_schedule_dim is at least as large as
 * the largest depth of any array reference group associated to the kernel.
 * This is needed as the returned schedule is used to extract a mapping
 * to the outer tile->depth dimensions in transform_index.
 */
static __isl_give isl_pw_multi_aff *compute_sched_to_copy(
    struct autosa_kernel *kernel, __isl_take isl_pw_multi_aff *iterator_map)
{
  isl_space *sched_space;
  isl_union_pw_multi_aff *copy_sched;
  isl_pw_multi_aff *stmt_sched;

  /* Space "domain of the statement -> copy_schedule_dim dimensions". */
  sched_space = isl_space_from_domain(
      isl_space_range(isl_pw_multi_aff_get_space(iterator_map)));
  sched_space = isl_space_add_dims(sched_space, isl_dim_out,
                                   kernel->copy_schedule_dim);

  /* Pick the piece of the copy schedule that lives in that space. */
  copy_sched = isl_union_pw_multi_aff_copy(kernel->copy_schedule);
  stmt_sched = isl_union_pw_multi_aff_extract_pw_multi_aff(copy_sched,
                                                           sched_space);
  isl_union_pw_multi_aff_free(copy_sched);

  return isl_pw_multi_aff_pullback_pw_multi_aff(stmt_sched, iterator_map);
}

/* Walk the linked list "accesses" and return the first entry whose
 * ref_id is the same isl_id pointer as "ref_id".
 * Return NULL if the list holds no such entry.
 */
static struct autosa_stmt_access *find_access(struct autosa_stmt_access *accesses,
                                              __isl_keep isl_id *ref_id)
{
  struct autosa_stmt_access *cur = accesses;

  while (cur && cur->ref_id != ref_id)
    cur = cur->next;

  return cur;
}

/* Return the name of the outer array (of structs) accessed by "access".
 *
 * The range space of "access" is unwrapped repeatedly, keeping the domain
 * part each time, until it is no longer a wrapping space; the tuple name
 * of what remains is returned.
 */
static const char *get_outer_array_name(__isl_keep isl_map *access)
{
  const char *outer_name;
  isl_space *sp = isl_space_range(isl_map_get_space(access));

  for (; sp && isl_space_is_wrapping(sp);
       sp = isl_space_domain(isl_space_unwrap(sp)))
    ;

  outer_name = isl_space_get_tuple_name(sp, isl_dim_set);
  isl_space_free(sp);

  return outer_name;
}

/* Look up the array called "name" among the kernel->n_array entries of
 * kernel->array and return its position.
 * Return -1 if no array carries that name.
 */
static int find_array_index(struct autosa_kernel *kernel, const char *name)
{
  int idx = 0;

  while (idx < kernel->n_array)
  {
    if (strcmp(name, kernel->array[idx].array->name) == 0)
      return idx;
    ++idx;
  }

  return -1;
}

/* Search the local->n_group entries of local->groups for the reference
 * group whose refs list contains the pointer "access" and return it.
 * Return NULL if no group holds the reference.
 */
static struct autosa_array_ref_group *find_ref_group(
    struct autosa_local_array_info *local, struct autosa_stmt_access *access)
{
  for (int g = 0; g < local->n_group; ++g)
  {
    struct autosa_array_ref_group *candidate = local->groups[g];

    for (int r = 0; r < candidate->n_ref; ++r)
    {
      if (candidate->refs[r] != access)
        continue;
      return candidate;
    }
  }

  return NULL;
}

/* Group-specific variant of compute_sched_to_copy.
 *
 * Translate "iterator_map", which maps the AST schedule to a statement
 * domain, into a mapping from the AST schedule to the outer
 * group->copy_schedule_dim dimensions of group->copy_schedule.
 *
 * group->copy_schedule_dim is at least as large as the largest depth of
 * any array reference associated to the group, which is what allows
 * transform_index to cut the result down to the outer tile->depth dimensions.
 */
static __isl_give isl_pw_multi_aff *compute_sched_to_copy_group(
    __isl_take isl_pw_multi_aff *iterator_map,
    struct autosa_array_ref_group *group)
{
  isl_space *stmt_space;
  isl_union_pw_multi_aff *copy_schedule;
  isl_pw_multi_aff *stmt2copy;

  /* Space of the form "statement domain -> [copy_schedule_dim]". */
  stmt_space = isl_pw_multi_aff_get_space(iterator_map);
  stmt_space = isl_space_from_domain(isl_space_range(stmt_space));
  stmt_space = isl_space_add_dims(stmt_space, isl_dim_out,
                                  group->copy_schedule_dim);

  copy_schedule = isl_union_pw_multi_aff_copy(group->copy_schedule);
  stmt2copy = isl_union_pw_multi_aff_extract_pw_multi_aff(copy_schedule,
                                                          stmt_space);
  isl_union_pw_multi_aff_free(copy_schedule);

  return isl_pw_multi_aff_pullback_pw_multi_aff(stmt2copy, iterator_map);
}

/* Apply "tiling" to the outer array of the index expression "index".
 *
 * "index" is of the form
 *
 *	L -> F(A),
 *
 * with L the AST loop iterators and F(A) either A itself or some subfield
 * of A, while "tiling" is of the form
 *
 *	[L -> A] -> T
 *
 * The result is
 *
 *	L -> T(A)
 *
 * Member access (the range of "index" is wrapping): split "index" into
 * its base index expression and its field index expression, tile the base
 * recursively and glue the field back on.
 *
 * Plain access: extend "index" to
 *
 *	L -> [L -> A]
 *
 * so that the iterators are available, and plug it into the tiling
 * to obtain
 *
 *	L -> T
 *
 * Both arguments are consumed; NULL is returned on error.
 */
static __isl_give isl_multi_pw_aff *tile_outer(
    __isl_take isl_multi_pw_aff *index, __isl_take isl_multi_pw_aff *tiling)
{
  isl_bool wraps = isl_multi_pw_aff_range_is_wrapping(index);

  if (wraps < 0)
  {
    isl_multi_pw_aff_free(index);
    isl_multi_pw_aff_free(tiling);
    return NULL;
  }

  if (!wraps)
  {
    isl_space *loop_space;
    isl_multi_pw_aff *loop_id;

    /* L -> L */
    loop_space = isl_space_domain(isl_multi_pw_aff_get_space(index));
    loop_id = isl_multi_pw_aff_identity(isl_space_map_from_set(loop_space));
    /* L -> [L -> A] */
    index = isl_multi_pw_aff_range_product(loop_id, index);
    /* L -> T */
    return isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index);
  }

  {
    isl_multi_pw_aff *member;
    isl_multi_pw_aff *base;

    member = isl_multi_pw_aff_copy(index);
    member = isl_multi_pw_aff_range_factor_range(member);
    base = isl_multi_pw_aff_range_factor_domain(index);
    base = tile_outer(base, tiling);
    return isl_multi_pw_aff_range_product(base, member);
  }
}

/* Index transformation callback for pet_stmt_build_ast_exprs.
 *
 * "index" expresses the array indices in terms of statement iterators.
 * "ref_id" identifies the access and "user" points to
 * a struct autosa_transform_data.
 *
 * We first reformulate "index" in terms of the AST loop iterators.
 * Then we check if we are accessing the global array or
 * a local copy.  In particular, if we are not inside a kernel
 * then we must be accessing a global array.
 * In the former case, we simply return
 * the updated index.  If "index" is an affine expression rather
 * than an array access, then we also return the updated index here.
 *
 * If no reference group contains the access, or if the group has no tile,
 * then data->global and data->reg are set and the updated index is returned.
 *
 * Otherwise, we apply the tiling to the index.
 * This tiling is of the form
 *
 *	[D -> A] -> T
 *
 * where D corresponds to the outer tile->depth dimensions of
 * the copy schedule.  For groups of type AUTOSA_PE_GROUP the copy schedule
 * of the group itself is used (recomputed for each index); for all other
 * groups data->sched2copy is used.
 * The index is of the form
 *
 *	L -> A
 *
 * We update the tiling to refer to the AST loop iterators
 *
 *	[L -> A] -> T
 *
 * and combine it with the index to obtain a tiled index expression in terms
 * of the AST loop iterators
 *
 *	L -> T
 *
 * Note that while the tiling applies directly to an outer array,
 * the index may refer to some subfield of this outer array.
 * In such cases, the result will refer to the same subfield of the tile.
 * That is, an index expression of the form  L -> F(A) will be transformed
 * into an index expression of the form L -> F(T).
 *
 * Side effects on "data": array, local_array, group, global and reg
 * are updated for later use by transform_expr.
 */
static __isl_give isl_multi_pw_aff *transform_index(
    __isl_take isl_multi_pw_aff *index, __isl_keep isl_id *ref_id,
    void *user)
{
  struct autosa_transform_data *data = (struct autosa_transform_data *)user;
  struct autosa_stmt_access *access;
  struct autosa_array_ref_group *group;
  struct autosa_array_tile *tile;
  isl_pw_multi_aff *iterator_map;
  int i;
  int dim;
  const char *name;
  isl_space *space;
  isl_multi_pw_aff *tiling;
  isl_pw_multi_aff *pma;
  isl_pw_multi_aff *sched2depth;

  /* Signals "not an array access" to transform_expr until proven otherwise. */
  data->array = NULL;

  iterator_map = isl_pw_multi_aff_copy(data->iterator_map);
  index = isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map);

  if (!data->kernel)
    return index;

  access = find_access(data->accesses, ref_id);
  if (!access)
    return index;
  if (!isl_map_has_tuple_name(access->access, isl_dim_out))
    return index;

  name = get_outer_array_name(access->access);
  if (!name)
    return isl_multi_pw_aff_free(index);
  i = find_array_index(data->kernel, name);
  if (i < 0)
    isl_die(isl_multi_pw_aff_get_ctx(index), isl_error_internal,
            "cannot find array",
            return isl_multi_pw_aff_free(index));
  data->local_array = &data->kernel->array[i];
  data->array = data->local_array->array;
  group = find_ref_group(data->local_array, access);
  data->group = group;
  if (!group)
  {
    data->global = 1;
    data->reg = 1;
    return index;
  }

  tile = autosa_array_ref_group_tile(group);
  data->global = !tile;
  data->reg = !tile;
  if (!tile)
    return index;

  /* Select the schedule from which the outer tile->depth dimensions
   * are taken: PE groups carry their own copy schedule, which is
   * recomputed for each index; other groups share data->sched2copy.
   */
  if (group->group_type == AUTOSA_PE_GROUP)
    sched2depth = compute_sched_to_copy_group(
        isl_pw_multi_aff_copy(data->iterator_map), group);
  else
    sched2depth = isl_pw_multi_aff_copy(data->sched2copy);

  /* A -> A */
  space = isl_space_domain(isl_multi_aff_get_space(tile->tiling));
  space = isl_space_range(isl_space_unwrap(space));
  space = isl_space_map_from_set(space);
  pma = isl_pw_multi_aff_identity(space);

  /* L -> D: keep only the outer tile->depth output dimensions. */
  dim = isl_pw_multi_aff_dim(sched2depth, isl_dim_out);
  sched2depth = isl_pw_multi_aff_drop_dims(sched2depth, isl_dim_out,
                                           tile->depth, dim - tile->depth);
  /* [L -> A] -> [D -> A] */
  pma = isl_pw_multi_aff_product(sched2depth, pma);
  /* [L -> A] -> T */
  tiling = isl_multi_pw_aff_from_multi_aff(
      isl_multi_aff_copy(tile->tiling));
  tiling = isl_multi_pw_aff_pullback_pw_multi_aff(tiling, pma);

  index = tile_outer(index, tiling);

  return index;
}

/* Turn "expr", which is assumed to carry no indices yet, into an access
 * to its element [0].
 *
 * When the first argument of "expr" is a member access, the dereferencing
 * is instead applied (recursively) to the structure argument of
 * that member access.
 *
 * "expr" is consumed; NULL is returned if its first argument
 * cannot be obtained.
 */
static __isl_give isl_ast_expr *dereference(__isl_take isl_ast_expr *expr)
{
  isl_ast_expr *base;
  int base_is_member;

  base = isl_ast_expr_get_op_arg(expr, 0);
  if (!base)
    return isl_ast_expr_free(expr);

  base_is_member = isl_ast_expr_get_type(base) == isl_ast_expr_op &&
                   isl_ast_expr_get_op_type(base) == isl_ast_op_member;
  if (!base_is_member)
  {
    isl_ctx *ctx;
    isl_ast_expr *zero;
    isl_ast_expr *accessed;
    isl_ast_expr_list *indices;

    isl_ast_expr_free(base);

    /* Build <first argument of expr>[0]. */
    ctx = isl_ast_expr_get_ctx(expr);
    zero = isl_ast_expr_from_val(isl_val_zero(ctx));
    indices = isl_ast_expr_list_from_ast_expr(zero);
    accessed = isl_ast_expr_get_op_arg(expr, 0);
    accessed = isl_ast_expr_access(accessed, indices);
    isl_ast_expr_free(expr);

    return accessed;
  }

  {
    isl_ast_expr *inner;

    /* Push the dereference down into the structure argument. */
    inner = isl_ast_expr_get_op_arg(base, 0);
    inner = dereference(inner);
    base = isl_ast_expr_set_op_arg(base, 0, inner);

    return isl_ast_expr_set_op_arg(expr, 0, base);
  }
}

/* Flatten the multi-dimensional access "expr" into a one-dimensional one
 * using the bounds stored in array->bound_expr.
 *
 * That is, the expression
 *
 *	A[i_0][i_1]...[i_n]
 *
 * becomes
 *
 *	A[(..((i_0 * b_1 + i_1) ... ) * b_n + i_n]
 *
 * where b_0, b_1, ..., b_n are the bounds on the array.
 *
 * If the base of "expr" is a member access, the linearization is applied
 * (recursively) to the structure argument of that member access.
 *
 * If "expr" has no arguments besides the array name, an entire array is
 * being passed to a function and "expr" is returned untouched.
 * (Scalars and single structs are handled by the caller, so an
 * argument-less expression can only be an entire array here.)
 *
 * If "expr" specifies fewer index expressions than array->n_index,
 * the missing i_j are left out of the linearized expression as well.
 * Since such an expression does not designate a single element,
 * the pointer form
 *
 *	A + (..((i_0 * b_1 + i_1) ... ) * b_l + i_l)
 *
 * is returned instead of an access.  Thanks to the special case above,
 * at least one index expression is present at that point.
 */
__isl_give isl_ast_expr *autosa_local_array_info_linearize_index(
    struct autosa_local_array_info *array, __isl_take isl_ast_expr *expr)
{
  int dim, n_arg;
  isl_ast_expr *base;
  isl_ast_expr *offset;
  isl_ast_expr *linear;

  base = isl_ast_expr_get_op_arg(expr, 0);
  if (isl_ast_expr_get_type(base) == isl_ast_expr_op &&
      isl_ast_expr_get_op_type(base) == isl_ast_op_member)
  {
    isl_ast_expr *inner;

    inner = isl_ast_expr_get_op_arg(base, 0);
    inner = autosa_local_array_info_linearize_index(array, inner);
    base = isl_ast_expr_set_op_arg(base, 0, inner);

    return isl_ast_expr_set_op_arg(expr, 0, base);
  }
  isl_ast_expr_free(base);

  n_arg = isl_ast_expr_get_op_n_arg(expr);
  if (n_arg == 1)
    return expr;

  /* Horner-style accumulation of the flattened offset. */
  offset = isl_ast_expr_get_op_arg(expr, 1);
  for (dim = 1; dim < array->n_index; ++dim)
  {
    isl_ast_expr *term;

    term = isl_ast_expr_get_op_arg(array->bound_expr, dim + 1);
    offset = isl_ast_expr_mul(offset, term);

    /* Only add i_dim if "expr" actually specifies it. */
    if (dim + 1 < n_arg)
    {
      term = isl_ast_expr_get_op_arg(expr, dim + 1);
      offset = isl_ast_expr_add(offset, term);
    }
  }

  if (array->n_index + 1 > n_arg)
  {
    /* Partial access: produce the pointer expression A + offset. */
    linear = isl_ast_expr_add(isl_ast_expr_get_op_arg(expr, 0), offset);
  }
  else
  {
    isl_ast_expr_list *indices;

    indices = isl_ast_expr_list_from_ast_expr(offset);
    linear = isl_ast_expr_get_op_arg(expr, 0);
    linear = isl_ast_expr_access(linear, indices);
  }

  isl_ast_expr_free(expr);

  return linear;
}

/* AST expression transformation callback for pet_stmt_build_ast_exprs.
 *
 * "user" points to the struct autosa_transform_data that was filled in
 * by the index transformation callback for the same access.
 *
 * - No array recorded (data->array is NULL): return "expr" unchanged.
 * - The array is not accessed at all: the value of the expression is
 *   not used, so replace it by zero (NULL pointer).
 * - Read-only scalar, or not an access to the global array:
 *   return "expr" unchanged.
 * - Global scalar that is not read-only: its address was passed to
 *   the kernel, so dereference it.
 * - Global array that is to be linearized: linearize the access using
 *   the bounds in data->local_array.
 */
static __isl_give isl_ast_expr *transform_expr(__isl_take isl_ast_expr *expr,
                                               __isl_keep isl_id *id, void *user)
{
  struct autosa_transform_data *data = (struct autosa_transform_data *)user;
  struct autosa_array_info *array = data->array;

  if (!array)
    return expr;

  if (!array->accessed)
  {
    isl_ctx *ctx = isl_ast_expr_get_ctx(expr);

    isl_ast_expr_free(expr);
    return isl_ast_expr_from_val(isl_val_zero(ctx));
  }

  if (autosa_array_is_read_only_scalar(array) || !data->global)
    return expr;

  if (array->n_index == 0)
    return dereference(expr);

  if (array->linearize)
    return autosa_local_array_info_linearize_index(data->local_array, expr);

  return expr;
}

/* Annotate the AST node of a user statement instance.
 *
 * "autosa_stmt" identifies the statement and "kernel" is the kernel it
 * lives in, or NULL if we are not inside a kernel.
 *
 * A struct autosa_kernel_stmt of type AUTOSA_KERNEL_STMT_DOMAIN is
 * attached to "node" through an annotation called "user".  It holds an AST
 * expression for each access of the statement, computed from
 * - iterator_map, expressing the domain elements in terms of
 *   the generated loops, and
 * - sched2copy, expressing the outer copy_schedule_dim dimensions of the
 *   kernel schedule computed by AutoSA in terms of the generated loops
 *   (NULL outside of a kernel).
 */
static __isl_give isl_ast_node *create_domain_leaf(
    struct autosa_kernel *kernel, __isl_take isl_ast_node *node,
    __isl_keep isl_ast_build *build, struct autosa_stmt *autosa_stmt)
{
  struct autosa_transform_data data;
  struct autosa_kernel_stmt *stmt;
  isl_ctx *ctx;
  isl_id *annotation;
  isl_map *sched_inv;
  isl_pw_multi_aff *iterator_map;
  isl_pw_multi_aff *sched2copy = NULL;

  if (!node)
    return NULL;
  ctx = isl_ast_node_get_ctx(node);

  stmt = isl_calloc_type(ctx, struct autosa_kernel_stmt);
  if (!stmt)
    return isl_ast_node_free(node);

  /* AST schedule -> statement domain */
  sched_inv = isl_map_from_union_map(isl_ast_build_get_schedule(build));
  sched_inv = isl_map_reverse(sched_inv);
  iterator_map = isl_pw_multi_aff_from_map(sched_inv);
  if (kernel)
    sched2copy = compute_sched_to_copy(kernel,
                                       isl_pw_multi_aff_copy(iterator_map));

  stmt->type = AUTOSA_KERNEL_STMT_DOMAIN;
  stmt->u.d.stmt = autosa_stmt;

  data.kernel = kernel;
  data.accesses = stmt->u.d.stmt->accesses;
  data.iterator_map = iterator_map;
  data.sched2copy = sched2copy;
  stmt->u.d.ref2expr = pet_stmt_build_ast_exprs(stmt->u.d.stmt->stmt,
                                                build, &transform_index, &data,
                                                &transform_expr, &data);
  isl_pw_multi_aff_free(iterator_map);
  isl_pw_multi_aff_free(sched2copy);

  /* Hand ownership of "stmt" to the annotation id. */
  annotation = isl_id_alloc(ctx, "user", stmt);
  annotation = isl_id_set_free_user(annotation, &autosa_kernel_stmt_free);
  if (!annotation)
    autosa_kernel_stmt_free(stmt);

  return isl_ast_node_set_annotation(node, annotation);
}

/* Does "array" need to be allocated on the device?
 *
 * A read-only scalar is passed as an argument to the kernel and therefore
 * needs no allocation.  An array whose "global" flag is not set needs no
 * allocation either.
 *
 * Return 1 if an allocation is required and 0 otherwise.
 */
int autosa_array_requires_device_allocation(struct autosa_array_info *array)
{
  if (autosa_array_is_read_only_scalar(array))
    return 0;

  return array->global ? 1 : 0;
}

/* Build the size expressions of the arrays in "prog" using "build".
 *
 * First pass: for every array that requires allocation on the device,
 * store the AST expression of its bound in array->bound_expr.
 * Second pass: for every array with declare_local set, store the AST
 * expression of the size derived from its declared extent in
 * array->declared_size.
 *
 * "node" is returned unchanged on success and freed (NULL is returned)
 * as soon as one of the expressions cannot be built.
 */
static __isl_give isl_ast_node *build_array_bounds(
    __isl_take isl_ast_node *node, struct autosa_prog *prog,
    __isl_keep isl_ast_build *build)
{
  int a;

  for (a = 0; a < prog->n_array; ++a)
  {
    struct autosa_array_info *info = &prog->array[a];

    if (autosa_array_requires_device_allocation(info))
    {
      isl_multi_pw_aff *bound = isl_multi_pw_aff_copy(info->bound);

      info->bound_expr = ppcg_build_size_expr(bound, build);
      if (!info->bound_expr)
        return isl_ast_node_free(node);
    }
  }

  for (a = 0; a < prog->n_array; ++a)
  {
    struct autosa_array_info *info = &prog->array[a];

    if (info->declare_local)
    {
      isl_multi_pw_aff *declared;

      declared = ppcg_size_from_extent(isl_set_copy(info->declared_extent));
      info->declared_size = ppcg_build_size_expr(declared, build);
      if (!info->declared_size)
        return isl_ast_node_free(node);
    }
  }

  return node;
}

/* Annotate the AST node of a statement that copies to or from local memory.
 *
 * A struct autosa_kernel_stmt of type AUTOSA_KERNEL_STMT_COPY describing
 * the copy is attached to "node" through an annotation called "copy".
 * The statement name is "read" or "write", depending on whether we are
 * reading from global memory or writing to global memory.
 *
 * The schedule is of the form
 *
 *	type[D -> A] -> L
 *
 * where D corresponds to the outer tile->depth dimensions of
 * the kernel schedule, A to the global array and L to the outer
 * generated AST schedule.
 * Inverting it and stripping off the type yields
 *
 *	L -> [D -> A]
 *
 * which is combined with the projection
 *
 *	[D -> A] -> A
 *
 * on the one hand and with the group tiling
 *
 *	[D -> A] -> T
 *
 * on the other hand, giving
 *
 *	L -> A		and 	L -> T
 *
 * The corresponding expressions end up in stmt->u.c.index and
 * stmt->u.c.local_index, where stmt is the autosa_kernel_stmt attached
 * to the node.  The global index is linearized if the global memory array
 * is linearized.
 */
static __isl_give isl_ast_node *create_access_leaf(struct autosa_kernel *kernel,
                                                   struct autosa_array_ref_group *group, __isl_take isl_ast_node *node,
                                                   __isl_keep isl_ast_build *build)
{
  struct autosa_kernel_stmt *stmt;
  struct autosa_array_tile *tile;
  isl_id *annotation;
  isl_ast_expr *global_expr, *local_expr;
  isl_space *wrapped;
  isl_map *sched;
  isl_pw_multi_aff *loops2access, *loops2global, *loops2local;
  const char *type;

  stmt = isl_calloc_type(kernel->ctx, struct autosa_kernel_stmt);
  if (!stmt)
    return isl_ast_node_free(node);

  /* type[D -> A] -> L */
  sched = isl_map_from_union_map(isl_ast_build_get_schedule(build));
  type = isl_map_get_tuple_name(sched, isl_dim_in);
  stmt->u.c.read = type && !strcmp(type, "read");

  /* L -> [D -> A], with the type stripped off */
  sched = isl_map_reverse(sched);
  loops2access = isl_pw_multi_aff_from_map(sched);
  loops2access = isl_pw_multi_aff_reset_tuple_id(loops2access, isl_dim_out);

  /* [D -> A] -> A, pulled back to L -> A */
  wrapped = isl_space_range(isl_pw_multi_aff_get_space(loops2access));
  wrapped = isl_space_unwrap(wrapped);
  loops2global = isl_pw_multi_aff_range_map(wrapped);
  loops2global = isl_pw_multi_aff_pullback_pw_multi_aff(
      loops2global, isl_pw_multi_aff_copy(loops2access));
  global_expr = isl_ast_build_access_from_pw_multi_aff(build, loops2global);
  if (group->array->linearize)
    global_expr = autosa_local_array_info_linearize_index(group->local_array,
                                                          global_expr);
  stmt->u.c.index = global_expr;

  /* [D -> A] -> T, pulled back to L -> T */
  tile = autosa_array_ref_group_tile(group);
  loops2local = isl_pw_multi_aff_from_multi_aff(
      isl_multi_aff_copy(tile->tiling));
  loops2local = isl_pw_multi_aff_pullback_pw_multi_aff(loops2local,
                                                       loops2access);
  local_expr = isl_ast_build_access_from_pw_multi_aff(build, loops2local);
  stmt->u.c.local_index = local_expr;

  stmt->u.c.array = group->array;
  stmt->u.c.local_array = group->local_array;
  stmt->type = AUTOSA_KERNEL_STMT_COPY;

  /* Hand ownership of "stmt" to the annotation id. */
  annotation = isl_id_alloc(kernel->ctx, "copy", stmt);
  annotation = isl_id_set_free_user(annotation, &autosa_kernel_stmt_free);
  if (!annotation)
    autosa_kernel_stmt_free(stmt);

  return isl_ast_node_set_annotation(node, annotation);
}

/* AST build callback invoked for each instance of a user statement.
 * The statement may be one of the original user statements
 * or a statement introduced by AutoSA.
 *
 * The id of the statement is first looked up with find_stmt.  A match
 * means that the statement is an original user statement, which is handled
 * by create_domain_leaf.  Anything else has been introduced by AutoSA and
 * is dispatched on its name:
 * - "to_device_*" / "from_device_*" (array copies to/from the device),
 *   "clear_device" and "drain_merge" need no further treatment;
 * - "init_device" triggers build_array_bounds;
 * - "read" / "write" are copy statements, handled by create_access_leaf
 *   with the reference group stored as user pointer of the id.
 * Any other statement is returned unchanged.
 */
static __isl_give isl_ast_node *at_domain(__isl_take isl_ast_node *node,
                                          __isl_keep isl_ast_build *build, void *user)
{
  struct autosa_at_domain_data *data = (struct autosa_at_domain_data *)user;
  struct autosa_stmt *device_stmt;
  isl_ast_expr *call, *callee;
  isl_id *stmt_id;
  const char *name;
  void *payload;

  /* The statement id is the first argument of the call expression. */
  call = isl_ast_node_user_get_expr(node);
  callee = isl_ast_expr_get_op_arg(call, 0);
  stmt_id = isl_ast_expr_get_id(callee);
  name = isl_id_get_name(stmt_id);
  payload = isl_id_get_user(stmt_id);
  isl_ast_expr_free(call);
  isl_ast_expr_free(callee);

  device_stmt = find_stmt(data->prog, stmt_id);
  isl_id_free(stmt_id);

  if (device_stmt)
  {
    return create_domain_leaf(data->kernel, node, build, device_stmt);
  }
  else if (!prefixcmp(name, "to_device_") || !prefixcmp(name, "from_device_"))
  {
    return node;
  }
  else if (!strcmp(name, "init_device"))
  {
    return build_array_bounds(node, data->prog, build);
  }
  else if (!strcmp(name, "clear_device"))
  {
    return node;
  }
  else if (!strcmp(name, "drain_merge"))
  {
    return node;
  }
  else if (!strcmp(name, "read") || !strcmp(name, "write"))
  {
    struct autosa_array_ref_group *group;

    group = (struct autosa_array_ref_group *)payload;
    return create_access_leaf(data->kernel, group, node, build);
  }

  return node;
}

/* Build an access AST expression, with tuple name "grid", for
 * kernel->grid_size using "build" and store it in kernel->grid_size_expr.
 *
 * Return isl_stat_ok on success and isl_stat_error if the expression
 * could not be built.
 */
static isl_stat build_grid_size(struct autosa_kernel *kernel,
                                __isl_keep isl_ast_build *build)
{
  isl_multi_pw_aff *grid;

  grid = isl_multi_pw_aff_copy(kernel->grid_size);
  grid = isl_multi_pw_aff_set_tuple_name(grid, isl_dim_out, "grid");
  kernel->grid_size_expr = ppcg_build_size_expr(grid, build);

  return kernel->grid_size_expr ? isl_stat_ok : isl_stat_error;
}

/* Build access AST expressions for the localized array sizes using "build"
 * and store each of them in the bound_expr field of the corresponding
 * entry of kernel->array.
 * Arrays without any reference group (n_group == 0) are skipped.
 *
 * Return isl_stat_error as soon as one expression cannot be built,
 * isl_stat_ok otherwise.
 */
static isl_stat build_local_array_sizes(struct autosa_kernel *kernel,
                                        __isl_keep isl_ast_build *build)
{
  int a;

  for (a = 0; a < kernel->n_array; ++a)
  {
    struct autosa_local_array_info *local = &kernel->array[a];

    if (local->n_group != 0)
    {
      isl_multi_pw_aff *bound = isl_multi_pw_aff_copy(local->bound);

      local->bound_expr = ppcg_build_size_expr(bound, build);
      if (!local->bound_expr)
        return isl_stat_error;
    }
  }

  return isl_stat_ok;
}

/* Build access AST expressions for the effective grid size and, if that
 * succeeds, for the localized array sizes of "kernel" using "build".
 *
 * Return isl_stat_error if either step fails, isl_stat_ok otherwise.
 */
static isl_stat build_grid_and_local_array_sizes(struct autosa_kernel *kernel,
                                                 __isl_keep isl_ast_build *build)
{
  if (build_grid_size(kernel, build) < 0 ||
      build_local_array_sizes(kernel, build) < 0)
    return isl_stat_error;

  return isl_stat_ok;
}

/* AST build callback invoked before the AST generator starts traversing
 * the schedule subtree of a node with mark "mark".
 *
 * For a mark called "kernel", the kernel pointer carried by the mark is
 * stored in data->kernel for use in at_domain, and AST expressions are
 * built for the grid size and the localized array sizes.
 * Marks with any other name are ignored.
 *
 * Return isl_stat_error if "mark" is NULL or if the size expressions
 * cannot be built, isl_stat_ok otherwise.
 */
static isl_stat before_mark(__isl_keep isl_id *mark,
                            __isl_keep isl_ast_build *build, void *user)
{
  struct autosa_at_domain_data *data = (struct autosa_at_domain_data *)user;

  if (!mark)
    return isl_stat_error;

  if (strcmp(isl_id_get_name(mark), "kernel") != 0)
    return isl_stat_ok;

  data->kernel = (struct autosa_kernel *)isl_id_get_user(mark);
  if (build_grid_and_local_array_sizes(data->kernel, build) < 0)
    return isl_stat_error;

  return isl_stat_ok;
}

/* AST build callback invoked after the AST generator has finished
 * traversing the schedule subtree of a mark node.
 * "node" is the corresponding mark AST node.
 *
 * For a mark called "kernel" (with data->kernel set), "node" is replaced
 * by a user node that "calls" the kernel, representing the launch of
 * the kernel.  The AST below the mark is stored in kernel->tree, and the
 * schedule space of "build" in kernel->space, so that the device code can
 * be printed later on.  This assumes that a kernel is only launched once.
 * data->kernel is cleared in the process.
 *
 * Any other mark node is returned unchanged.
 */
static __isl_give isl_ast_node *after_mark(__isl_take isl_ast_node *node,
                                           __isl_keep isl_ast_build *build, void *user)
{
  struct autosa_at_domain_data *data = (struct autosa_at_domain_data *)user;
  struct autosa_kernel *kernel;
  isl_ctx *ctx;
  isl_id *mark_id;
  isl_ast_expr *launch;
  isl_ast_expr_list *no_args;
  int is_kernel_mark;

  ctx = isl_ast_node_get_ctx(node);
  mark_id = isl_ast_node_mark_get_id(node);
  if (!mark_id)
    return isl_ast_node_free(node);

  is_kernel_mark = !strcmp(isl_id_get_name(mark_id), "kernel");
  if (!is_kernel_mark || !data->kernel)
  {
    isl_id_free(mark_id);
    return node;
  }

  /* Detach the kernel from the traversal state and store its AST. */
  kernel = data->kernel;
  data->kernel = NULL;
  kernel->space = isl_ast_build_get_schedule_space(build);
  kernel->tree = isl_ast_node_mark_get_node(node);
  isl_ast_node_free(node);

  /* Replace the mark node by an argument-less call to the kernel id. */
  launch = isl_ast_expr_from_id(isl_id_copy(mark_id));
  no_args = isl_ast_expr_list_alloc(ctx, 0);
  launch = isl_ast_expr_call(launch, no_args);
  node = isl_ast_node_alloc_user(launch);

  return isl_ast_node_set_annotation(node, mark_id);
}

/* Use isl to generate code for both the host and the device
 * from "schedule".
 *
 * The device code is marked by "kernel" mark nodes in the schedule tree,
 * containing a pointer to an autosa_kernel object.
 * The returned AST only contains the AST for the host code.
 * The ASTs for the device code are embedded in the autosa_kernel objects
 * attached to the leaf nodes that call "kernel".
 *
 * The AST iterators are named with prefix "c"; their number is the maximal
 * schedule depth over all leaves of "schedule".
 * Return NULL if "schedule" is NULL.
 */
__isl_give isl_ast_node *sa_generate_code(struct autosa_gen *gen,
                                          __isl_take isl_schedule *schedule)
{
  struct autosa_at_domain_data data;
  isl_ast_build *build;
  isl_ast_node *host_tree;
  isl_id_list *names;
  int max_depth = 0;

  if (!schedule)
    return NULL;

  data.prog = gen->prog;
  data.kernel = NULL;

  if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth,
                                                  &max_depth) < 0)
    schedule = isl_schedule_free(schedule);

  build = isl_ast_build_alloc(gen->prog->ctx);
  names = ppcg_scop_generate_names(gen->prog->scop, max_depth, "c");
  build = isl_ast_build_set_iterators(build, names);

  /* Hook up the callbacks that annotate statements and kernel marks. */
  build = isl_ast_build_set_at_each_domain(build, &at_domain, &data);
  build = isl_ast_build_set_before_each_mark(build, &before_mark, &data);
  build = isl_ast_build_set_after_each_mark(build, &after_mark, &data);

  if (gen->prog->scop->options->debug->dump_final_schedule)
    isl_schedule_dump(schedule);

  host_tree = isl_ast_build_node_from_schedule(build, schedule);
  isl_ast_build_free(build);

  return host_tree;
}

/* Reset "data" to its initial state for the program of "gen":
 * data->prog is set to gen->prog, every pointer field listed below is
 * cleared to NULL and every flag/counter listed below is cleared to 0.
 */
static void autosa_at_domain_data_init(
    struct autosa_at_domain_data *data, struct autosa_gen *gen)
{
  /* Program being generated. */
  data->prog = gen->prog;

  /* No kernel, module or helper function is active yet. */
  data->kernel = NULL;
  data->module = NULL;
  data->pe_dummy_module = NULL;
  data->drain_merge_func = NULL;

  /* Loop-context tracking. */
  data->under_unroll = 0;
  data->under_pipeline = 0;
  data->in_unroll_for = 0;
  data->in_pipeline_for = 0;
  data->in_for = 0;

  /* Remaining flags and counters. */
  data->filter_buffer = 0;
  data->boundary = 0;
  data->pe_dummy = 0;
  data->tuning = 0;
  data->tuning_num = 0;
}

/* Search the local->n_pe_group entries of local->pe_groups for the
 * reference group whose refs list contains the pointer "access"
 * and return it.
 * Return NULL if no PE group holds the reference.
 */
static struct autosa_array_ref_group *find_ref_group_module(
    struct autosa_local_array_info *local, struct autosa_stmt_access *access)
{
  for (int g = 0; g < local->n_pe_group; ++g)
  {
    struct autosa_array_ref_group *candidate = local->pe_groups[g];

    for (int r = 0; r < candidate->n_ref; ++r)
    {
      if (candidate->refs[r] != access)
        continue;
      return candidate;
    }
  }

  return NULL;
}

/* Index transformation callback for pet_stmt_build_ast_exprs.
 *
 * "index" expresses the array indices in terms of statement iterators
 *
 * We first reformulate "index" in terms of the AST loop iterators.
 * Then we check if we are accessing the global array or
 * a shared/private copy.  In particular, if we are not inside a kernel
 * then we must be accessing a global array.
 * In the former case, we simply return
 * the updated index.  If "index" is an affine expression rather
 * than an array access, then we also return the updated index here.
 *
 * If no reference groups have been computed for the array,
 * then we can only be accessing the global array.
 *
 * Otherwise, we apply the tiling to the index.
 * This tiling is of the form
 *
 *	[D -> A] -> T
 *
 * where D corresponds to the outer tile->depth dimensions of
 * the kernel schedule.
 * The index is of the form
 *
 *	L -> A
 *
 * We update the tiling to refer to the AST loop iterators
 *
 *	[L -> A] -> T
 *
 * and combine it with the index to obtain a tiled index expression in terms
 * of the AST loop iterators
 *
 *	L -> T
 *
 * Note that while the tiling applies directly to an outer array,
 * the index may refer to some subfield of this outer array.
 * In such cases, the result will refer to the same subfield of the tile.
 * That is, an index expression of the form  L -> F(A) will be transformed
 * into an index expression of the form L -> F(T).
 */
static __isl_give isl_multi_pw_aff *transform_index_module(
    __isl_take isl_multi_pw_aff *index, __isl_keep isl_id *ref_id,
    void *user)
{
  struct autosa_transform_data *data = (struct autosa_transform_data *)user;
  struct autosa_stmt_access *access;
  struct autosa_array_ref_group *group;
  struct autosa_array_tile *tile;
  isl_pw_multi_aff *iterator_map;
  int i;
  int dim;
  const char *name;
  isl_space *space;
  isl_multi_pw_aff *tiling;
  isl_pw_multi_aff *pma;
  isl_pw_multi_aff *sched2depth;
  isl_pw_multi_aff *sched2copy;

  data->array = NULL;

  iterator_map = isl_pw_multi_aff_copy(data->iterator_map);
  index = isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map);

  if (!data->kernel)
    return index;

  access = find_access(data->accesses, ref_id);
  if (!access)
    return index;
  if (!isl_map_has_tuple_name(access->access, isl_dim_out))
    return index;

  name = get_outer_array_name(access->access);
  if (!name)
    return isl_multi_pw_aff_free(index);
  i = find_array_index(data->kernel, name);
  if (i < 0)
    isl_die(isl_multi_pw_aff_get_ctx(index), isl_error_internal,
            "cannot find array",
            return isl_multi_pw_aff_free(index));
  data->local_array = &data->kernel->array[i];
  data->array = data->local_array->array;

  group = find_ref_group_module(data->local_array, access);
  data->group = group;
  if (!group)
  {
    data->global = 1;
    data->reg = 1;
    return index;
  }

  tile = autosa_array_ref_group_tile(group);
  data->global = !tile;
  data->reg = !tile;
  if (!tile)
    return index;

  /* recompute the sched2copy for each index. */
  if (group->group_type == AUTOSA_PE_GROUP)
  {    
    sched2copy = compute_sched_to_copy_group(
        isl_pw_multi_aff_copy(data->iterator_map), group);    
  }

  space = isl_space_domain(isl_multi_aff_get_space(tile->tiling));
  space = isl_space_range(isl_space_unwrap(space));
  space = isl_space_map_from_set(space);
  pma = isl_pw_multi_aff_identity(space);
  if (group->group_type == AUTOSA_PE_GROUP)
  {
    sched2depth = sched2copy;
  }
  else
  {
    sched2depth = isl_pw_multi_aff_copy(data->sched2copy);
  }
  dim = isl_pw_multi_aff_dim(sched2depth, isl_dim_out);
  sched2depth = isl_pw_multi_aff_drop_dims(sched2depth, isl_dim_out,
                                           tile->depth, dim - tile->depth);
  pma = isl_pw_multi_aff_product(sched2depth, pma);
  tiling = isl_multi_pw_aff_from_multi_aff(
      isl_multi_aff_copy(tile->tiling));
  tiling = isl_multi_pw_aff_pullback_pw_multi_aff(tiling, pma);
  index = tile_outer(index, tiling);

  return index;
}

/* AST expression transformation callback for pet_stmt_build_ast_exprs.
 *
 * "user" points to the autosa_transform_data that was filled in by
 * the index callback for the same access.
 *
 * If no array was recorded for the access, "expr" is returned as is.
 *
 * If the AST expression refers to an array that is not accessed
 * at all, then this means the value of the expression is not used,
 * so we might as well print zero (NULL pointer) instead.
 *
 * Accesses to read-only scalars and accesses that are not placed
 * in registers (data->reg is not set) are returned unchanged.
 *
 * If the access is placed in registers and a reference group is known,
 * "expr" is replaced by an access "<group name>[0]", where the name
 * is the one printed by autosa_array_ref_group_print_name.
 *
 * If the access is marked as a register access but no reference group
 * was found, there is no group name to print.  In that case
 * the access is treated as a global access: scalars (n_index == 0) are
 * dereferenced and arrays are linearized using data->local_array
 * if the array is marked for linearization.
 */
static __isl_give isl_ast_expr *transform_expr_module(__isl_take isl_ast_expr *expr,
                                                      __isl_keep isl_id *id, void *user)
{
  struct autosa_transform_data *data = (struct autosa_transform_data *)user;

  if (!data->array)
    return expr;

  if (!data->array->accessed)
  {
    isl_ctx *ctx;

    ctx = isl_ast_expr_get_ctx(expr);
    isl_ast_expr_free(expr);
    return isl_ast_expr_from_val(isl_val_zero(ctx));
  }
  if (autosa_array_is_read_only_scalar(data->array))
    return expr;
  if (!data->reg)
    return expr;

  if (data->group)
  {
    isl_ctx *ctx;
    isl_printer *p_str;
    char *local_name;
    isl_id *reg_id;
    isl_ast_expr *array;
    isl_ast_expr *indice;
    isl_ast_expr_list *indices;

    ctx = isl_ast_expr_get_ctx(expr);
    isl_ast_expr_free(expr);

    /* Create a register access. */
    p_str = isl_printer_to_str(ctx);
    p_str = autosa_array_ref_group_print_name(data->group, p_str);
    local_name = isl_printer_get_str(p_str);
    isl_printer_free(p_str);
    if (!local_name)
      return NULL;

    /* The name is handed to isl_id_alloc directly instead of being
     * copied through a fixed-size stack buffer, so that arbitrarily
     * long group names cannot overflow anything.
     */
    reg_id = isl_id_alloc(ctx, local_name, NULL);
    free(local_name);

    array = isl_ast_expr_from_id(reg_id);
    indice = isl_ast_expr_from_val(isl_val_zero(ctx));
    indices = isl_ast_expr_list_from_ast_expr(indice);

    return isl_ast_expr_access(array, indices);
  }

  /* No reference group: fall back to the global array handling. */
  if (data->array->n_index == 0)
    return dereference(expr);
  if (!data->array->linearize)
    return expr;

  return autosa_local_array_info_linearize_index(data->local_array, expr);
}

/* Annotate "node", an instance of the user statement "autosa_stmt",
 * with a struct autosa_kernel_stmt of type AUTOSA_KERNEL_STMT_DOMAIN.
 * "kernel" may be NULL if we are not inside a kernel.
 *
 * The attached statement holds an AST expression for each access of
 * the statement (ref2expr), built by pet_stmt_build_ast_exprs with
 * transform_index_module and transform_expr_module as callbacks.
 * The callbacks are given
 *  - iterator_map, the inverse of the schedule of "build", which
 *    expresses the domain elements in terms of the generated loops, and
 *  - sched2copy, which expresses the outer copy_schedule_dim dimensions
 *    of the kernel schedule in terms of the generated loops
 *    (NULL outside of a kernel).
 *
 * The annotation is an isl_id called "user" that frees the statement
 * through autosa_kernel_stmt_free.
 */
static __isl_give isl_ast_node *create_domain_leaf_module(
    struct autosa_kernel *kernel, __isl_take isl_ast_node *node,
    __isl_keep isl_ast_build *build, struct autosa_stmt *autosa_stmt)
{
  struct autosa_transform_data data;
  struct autosa_kernel_stmt *leaf;
  isl_ctx *ctx;
  isl_id *annotation;
  isl_pw_multi_aff *loops2domain;
  isl_pw_multi_aff *loops2copy = NULL;

  if (!node)
    return NULL;
  ctx = isl_ast_node_get_ctx(node);

  leaf = isl_calloc_type(ctx, struct autosa_kernel_stmt);
  if (!leaf)
    return isl_ast_node_free(node);

  /* Invert the schedule: generated loops -> statement domain. */
  loops2domain = isl_pw_multi_aff_from_map(
      isl_map_reverse(
          isl_map_from_union_map(isl_ast_build_get_schedule(build))));
  if (kernel)
    loops2copy = compute_sched_to_copy(kernel,
                                       isl_pw_multi_aff_copy(loops2domain));

  leaf->type = AUTOSA_KERNEL_STMT_DOMAIN;
  leaf->u.d.stmt = autosa_stmt;

  data.kernel = kernel;
  data.accesses = autosa_stmt->accesses;
  data.iterator_map = loops2domain;
  data.sched2copy = loops2copy;
  leaf->u.d.ref2expr = pet_stmt_build_ast_exprs(autosa_stmt->stmt,
                                                build, &transform_index_module, &data,
                                                &transform_expr_module, &data);

  isl_pw_multi_aff_free(loops2domain);
  isl_pw_multi_aff_free(loops2copy);

  annotation = isl_id_alloc(ctx, "user", leaf);
  annotation = isl_id_set_free_user(annotation, &autosa_kernel_stmt_free);
  if (!annotation)
    autosa_kernel_stmt_free(leaf);
  return isl_ast_node_set_annotation(node, annotation);
}

/* Extract the reduce op from the io statement name "type", which is
 * in the format of:
 * in/out_trans_reduce_[op]
 *
 * Only the part of "type" before the first '.' is considered.
 * The result consists of the non-underscore characters that appear
 * while exactly three underscores have been seen.
 * The returned string is obtained from isl_printer_get_str.
 */
static char *extract_io_stmt_reduce_op(
  isl_ctx *ctx, const char *type)
{
  isl_printer *p_str = isl_printer_to_str(ctx);
  const char *cur;
  int n_underscore = 0;
  char *op;

  for (cur = type; *cur != '\0' && *cur != '.'; cur++)
  {
    if (*cur == '_')
    {
      n_underscore++;
      continue;
    }
    if (n_underscore == 3)
    {
      char one[2];

      one[0] = *cur;
      one[1] = '\0';
      p_str = isl_printer_print_str(p_str, one);
    }
  }

  op = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  return op;
}

/* AutoSA stmt is in the format of
 * [].[].[]
 * This function extracts the integer field at the pos-th position,
 * i.e., the field that follows the pos-th '.' in "type".
 * If the position is not found (or "type" is NULL), -1 is returned.
 *
 * The field is parsed in place with strtol, which stops at the first
 * character that cannot be part of a decimal number and therefore
 * at the '.' that terminates the field (or at the end of the string).
 * A field that does not start with a number yields 0.
 * No temporary string is allocated, so "ctx" is not used.
 */
static int extract_autosa_stmt_int_field(
  isl_ctx *ctx, const char *type, int pos) 
{
  int loc = 0;
  char ch;
  int dot_time = 0;

  (void)ctx;

  if (!type)
    return -1;

  /* Stop on the character at which the dot count reaches "pos". */
  while ((ch = type[loc]) != '\0')
  {
    if (ch == '.')
      dot_time++;
    if (dot_time == pos)
      break;
    loc++;
  }

  if (ch == '\0')
    return -1;

  /* The field starts right after the character we stopped on. */
  return (int)strtol(type + loc + 1, NULL, 10);
}

/* AutoSA stmt is in the format of
 * [].[].[]
 * Return the string field that follows the pos-th '.' in "type",
 * up to (not including) the next '.' or the end of the string.
 * NULL is returned if the position is not found.
 * The returned string is obtained from isl_printer_get_str.
 */
static __isl_give char *extract_autosa_stmt_str_field(
  isl_ctx *ctx, const char *type, int pos) 
{
  const char *cur;
  int n_dot = 0;
  isl_printer *p_str;
  char *field;

  /* Stop on the character at which the dot count reaches "pos". */
  for (cur = type; *cur != '\0'; cur++)
  {
    if (*cur == '.')
      n_dot++;
    if (n_dot == pos)
      break;
  }

  if (*cur == '\0')
    return NULL;

  /* Copy the field, one character at a time, into a string printer. */
  p_str = isl_printer_to_str(ctx);
  for (cur++; *cur != '\0' && *cur != '.'; cur++)
  {
    char one[2];

    one[0] = *cur;
    one[1] = '\0';
    p_str = isl_printer_print_str(p_str, one);
  }

  field = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  return field;
}

/* Annotate "node" with a struct autosa_kernel_stmt of type
 * AUTOSA_KERNEL_STMT_HOST_SERIALIZE.
 *
 * The schedule of "build" is of the form
 *
 *  type[D -> A] -> L
 *
 * It is inverted, the type is stripped off and the result is projected
 * onto A, giving L -> A.  The corresponding access expression is
 * linearized using the local array of pair->local_group and stored
 * as the index of the statement.
 * The "in" field is set if "name" starts with "serialize".
 *
 * The annotation is an isl_id called "serialize" that frees
 * the statement through autosa_kernel_stmt_free.
 */
static __isl_give isl_ast_node *create_serialize_leaf(struct autosa_kernel *kernel,
                                                      struct autosa_array_ref_group_pair *pair,
                                                      __isl_take isl_ast_node *node,
                                                      const char *name,
                                                      __isl_keep isl_ast_build *build)
{
  struct autosa_kernel_stmt *leaf;
  isl_pw_multi_aff *loops2acc, *loops2array;
  isl_space *acc_space;
  isl_ast_expr *global_index;
  isl_id *annotation;

  leaf = isl_calloc_type(kernel->ctx, struct autosa_kernel_stmt);
  if (!leaf)
    return isl_ast_node_free(node);
  leaf->type = AUTOSA_KERNEL_STMT_HOST_SERIALIZE;

  /* L -> [D -> A] */
  loops2acc = isl_pw_multi_aff_from_map(
      isl_map_reverse(
          isl_map_from_union_map(isl_ast_build_get_schedule(build))));
  loops2acc = isl_pw_multi_aff_reset_tuple_id(loops2acc, isl_dim_out);

  /* [D -> A] -> A, pulled back to L -> A */
  acc_space = isl_space_unwrap(
      isl_space_range(isl_pw_multi_aff_get_space(loops2acc)));
  loops2array = isl_pw_multi_aff_range_map(acc_space);
  loops2array = isl_pw_multi_aff_pullback_pw_multi_aff(loops2array,
                                                       loops2acc);

  global_index = isl_ast_build_access_from_pw_multi_aff(build, loops2array);
  global_index = autosa_local_array_info_linearize_index(
      pair->local_group->local_array, global_index);

  leaf->u.s.index = global_index;
  if (!prefixcmp(name, "serialize"))
    leaf->u.s.in = 1;
  else
    leaf->u.s.in = 0;
  leaf->u.s.group = pair->io_group;

  annotation = isl_id_alloc(kernel->ctx, "serialize", leaf);
  annotation = isl_id_set_free_user(annotation, &autosa_kernel_stmt_free);
  if (!annotation)
    autosa_kernel_stmt_free(leaf);
  return isl_ast_node_set_annotation(node, annotation);
}

/* This function is called for each statement node in the AST
 * for transferring through fifos.
 * Attach a pointer to an autosa_kernel_stmt representing the io
 * statement to the node.
 *
 * The statement name (the input tuple name of the schedule) is
 * in the format of:
 * in/out_trans[_dram]/[_dram_serialize]/[_boundary]/[_reduce_(reduce_op)].
 * [in_fifo_name].[out_fifo_name].[is_buffer].
 * [cur_pack_lane].[nxt_pack_lane].[coalesce_depth].[coalesce_bound]
 * or 
 * in/out[_dummy][_reduce].[fifo_name].[cur_pack_lane].[nxt_pack_lane]
 * Field 8 of a transfer statement is additionally read as if_depth.
 * If the schedule carries no statement name, an internal error is
 * reported and NULL is returned.
 *
 * The schedule is of the form
 *
 *  type[D -> A] -> L
 *
 * where D corresponds to the outer tile->depth dimensions of 
 * the kernel schedule, A to the global array and L to the outer 
 * generated AST schedule.
 * We compute the inverse and strip off the type, resulting in
 *
 *  L -> [D -> A]
 *
 * Projecting onto A gives the global index, which is linearized
 * if the array is marked for linearization and whose last dimension
 * is divided by the data packing factor if that factor exceeds one.
 *
 * If the group pair has a local tile, we combine the mapping with
 * the group tiling (recomputed if the schedule has more D dimensions
 * than the tile depth)
 *
 *  [D -> A] -> T
 *
 * resulting in
 *   
 *  L -> T
 *
 * and store the corresponding expression in the local_index of
 * the statement.  Without a tile, the local index is the access
 * "<group name>[0]" and the statement is marked as a register access.
 *
 * The annotation is an isl_id called "io" that frees the statement
 * through autosa_kernel_stmt_free.
 */
static __isl_give isl_ast_node *create_io_leaf(struct autosa_kernel *kernel,
                                               struct autosa_hw_module *module,
                                               struct autosa_array_ref_group_pair *pair,
                                               __isl_take isl_ast_node *node,
                                               __isl_keep isl_ast_build *build)
{
  struct autosa_kernel_stmt *stmt;
  struct autosa_array_ref_group *group = pair->local_group;
  struct autosa_array_tile *tile;
  isl_ctx *ctx = kernel->ctx;
  isl_map *access;
  isl_set *sched_dom;
  const char *type;
  isl_pw_multi_aff *pma, *pma2;
  isl_space *space;
  isl_ast_expr *expr;
  isl_id *id;
  int is_trans;          // i/o transfer statement between on-chip modules
  int is_trans_dram;     // i/o transfer statement between dram and on-chip modules
  int is_trans_buf;      // i/o transfer statement with local buffers
  int is_trans_boundary;
  int is_trans_reduce;
  int is_dummy;
  int is_dummy_reduce;
  int is_serialize;      // is dram access to be serialized
  int depth;

  stmt = isl_calloc_type(ctx, struct autosa_kernel_stmt);
  if (!stmt)
    return isl_ast_node_free(node);

  /* type[D -> A] -> L */
  access = isl_map_from_union_map(isl_ast_build_get_schedule(build));
  sched_dom = isl_map_domain(isl_set_unwrap(isl_map_domain(isl_map_copy(access))));
  depth = isl_set_dim(sched_dom, isl_dim_set);
  isl_set_free(sched_dom);

  type = isl_map_get_tuple_name(access, isl_dim_in);
  if (!type)
  {
    /* Nothing has been stored in "stmt" yet, so a plain free suffices. */
    isl_map_free(access);
    free(stmt);
    isl_die(ctx, isl_error_internal,
            "io statement without a name",
            return isl_ast_node_free(node));
  }

  /* Classify the io stmt type. */
  is_trans = !prefixcmp(type, "in_trans") || !prefixcmp(type, "out_trans");
  is_trans_dram = !prefixcmp(type, "in_trans_dram") || !prefixcmp(type, "out_trans_dram");
  is_trans_boundary = !prefixcmp(type, "in_trans_boundary") || !prefixcmp(type, "out_trans_boundary");
  is_trans_reduce = !prefixcmp(type, "in_trans_reduce") || !prefixcmp(type, "out_trans_reduce");
  /* Only transfer statements carry an is_buffer field;
   * all other statements are treated as unbuffered.
   */
  if (is_trans)
    is_trans_buf = extract_autosa_stmt_int_field(ctx, type, 3);
  else
    is_trans_buf = 0;
  is_dummy = !is_trans &&
             (!prefixcmp(type, "in_dummy") || !prefixcmp(type, "out_dummy"));
  is_dummy_reduce = is_dummy &&
                    (!prefixcmp(type, "in_dummy_reduce") || !prefixcmp(type, "out_dummy_reduce"));
  is_serialize = is_trans_dram &&
                 (!prefixcmp(type, "in_trans_dram_serialize") || !prefixcmp(type, "out_trans_dram_serialize"));

  stmt->u.i.simd_depth = pair->simd_depth;
  stmt->u.i.dummy = is_dummy;
  stmt->u.i.in = !prefixcmp(type, "in");
  stmt->u.i.buf = is_trans_buf;
  stmt->u.i.serialize = is_serialize;
  if (is_trans)
  {
    stmt->u.i.data_pack = extract_autosa_stmt_int_field(ctx, type, 4);
    stmt->u.i.nxt_data_pack = extract_autosa_stmt_int_field(ctx, type, 5);
    stmt->u.i.coalesce_depth = extract_autosa_stmt_int_field(ctx, type, 6);
    stmt->u.i.coalesce_bound = extract_autosa_stmt_int_field(ctx, type, 7);
    stmt->u.i.if_depth = extract_autosa_stmt_int_field(ctx, type, 8);
  }
  else
  {
    stmt->u.i.data_pack = extract_autosa_stmt_int_field(ctx, type, 2);
    stmt->u.i.nxt_data_pack = extract_autosa_stmt_int_field(ctx, type, 3);
    stmt->u.i.coalesce_depth = -1;
    stmt->u.i.coalesce_bound = -1;
  }
  if (is_trans_reduce)
  {
    stmt->u.i.reduce = 1;
    stmt->u.i.reduce_op = extract_io_stmt_reduce_op(ctx, type);
  }
  else
  {
    stmt->u.i.reduce = is_dummy_reduce;
    stmt->u.i.reduce_op = NULL;
  }

  /* Extract the fifo names while "access", from which "type" was
   * borrowed, has not been consumed yet.
   * Non-transfer statements have a single fifo name that serves
   * as both input and output.
   */
  stmt->u.i.in_fifo_name = extract_autosa_stmt_str_field(ctx, type, 1);
  stmt->u.i.out_fifo_name = extract_autosa_stmt_str_field(ctx, type,
                                                          is_trans ? 2 : 1);

  /* Compute the global index. */
  /* L -> type[D -> A] */
  access = isl_map_reverse(access);
  pma = isl_pw_multi_aff_from_map(access);
  pma = isl_pw_multi_aff_reset_tuple_id(pma, isl_dim_out);

  space = isl_space_range(isl_pw_multi_aff_get_space(pma));
  space = isl_space_unwrap(space);
  /* [D -> A] -> A */
  pma2 = isl_pw_multi_aff_range_map(space);
  /* L -> A */
  pma2 = isl_pw_multi_aff_pullback_pw_multi_aff(pma2,
                                                isl_pw_multi_aff_copy(pma));
  expr = isl_ast_build_access_from_pw_multi_aff(build, pma2);
  if (group->array->linearize)
    expr = autosa_local_array_info_linearize_index(group->local_array,
                                                   expr);
  if (stmt->u.i.data_pack > 1)
  {
    /* Update the last dimension,
     * divide it by the data packing factor.
     * A linearized access has its single index at position 1.
     */
    int last;
    isl_ast_expr *arg, *div;

    if (group->array->linearize)
      last = 1;
    else
      last = isl_ast_expr_get_op_n_arg(expr) - 1;
    arg = isl_ast_expr_get_op_arg(expr, last);
    div = isl_ast_expr_from_val(isl_val_int_from_si(ctx, stmt->u.i.data_pack));
    arg = isl_ast_expr_div(arg, div);
    expr = isl_ast_expr_set_op_arg(expr, last, arg);
  }

  stmt->u.i.index = expr;

  /* Compute the local index. */
  tile = pair->local_tile;
  if (tile)
  {
    /* [D -> A] -> T */
    if (tile->depth < depth)
    {
      /* Extend the D dimension to depth. */
      pma2 = isl_pw_multi_aff_from_multi_aff(
          autosa_array_ref_group_recompute_tiling(tile, group, depth));
    }
    else
    {
      pma2 = isl_pw_multi_aff_from_multi_aff(
          isl_multi_aff_copy(tile->tiling));
    }

    /* L -> T */
    pma2 = isl_pw_multi_aff_pullback_pw_multi_aff(pma2, pma);
    expr = isl_ast_build_access_from_pw_multi_aff(build, pma2);
    stmt->u.i.local_index = expr;
    stmt->u.i.reg = 0;
  }
  else
  {
    /* Create a scalar expr. */
    isl_printer *p_str;
    char *local_name;
    isl_ast_expr *array, *indice;
    isl_ast_expr_list *indices;

    isl_pw_multi_aff_free(pma);
    p_str = isl_printer_to_str(ctx);
    p_str = autosa_array_ref_group_print_name(group, p_str);
    local_name = isl_printer_get_str(p_str);
    isl_printer_free(p_str);

    /* The name is passed on directly; copying it through a fixed-size
     * stack buffer could overflow on long group names.
     */
    id = isl_id_alloc(ctx, local_name, NULL);
    free(local_name);

    array = isl_ast_expr_from_id(id);
    indice = isl_ast_expr_from_val(isl_val_zero(ctx));
    indices = isl_ast_expr_list_from_ast_expr(indice);
    expr = isl_ast_expr_access(array, indices);
    stmt->u.i.local_index = expr;
    stmt->u.i.reg = 1;
  }

  stmt->u.i.group = pair->io_group;
  stmt->u.i.module = module;
  stmt->u.i.array = group->array;
  stmt->u.i.local_array = group->local_array;
  if (!is_trans)
  {
    stmt->type = AUTOSA_KERNEL_STMT_IO;
  }
  else if (is_trans_dram)
  {
    stmt->type = AUTOSA_KERNEL_STMT_IO_DRAM;
  }
  else
  {
    stmt->type = AUTOSA_KERNEL_STMT_IO_TRANSFER;
    stmt->u.i.filter_sched_depth = -1;
    stmt->u.i.filter_param_id = -1;
    stmt->u.i.boundary = is_trans_boundary ? 1 : 0;
  }

  id = isl_id_alloc(ctx, "io", stmt);
  id = isl_id_set_free_user(id, &autosa_kernel_stmt_free);
  if (!id)
    autosa_kernel_stmt_free(stmt);
  return isl_ast_node_set_annotation(node, id);
}

/* Annotate "node" with a struct autosa_kernel_stmt of type
 * AUTOSA_KERNEL_STMT_DRAIN_MERGE that refers to "func".
 *
 * The schedule of "build" is of the form
 *
 *  type[D -> A] -> L
 *
 * It is inverted, the type is stripped off and the result is projected
 * onto A, giving L -> A.  The corresponding access expression is
 * linearized using the local array of func->group and stored as
 * the index of the statement.
 *
 * The annotation is an isl_id called "drain_merge" that frees
 * the statement through autosa_kernel_stmt_free.
 */
static __isl_give isl_ast_node *create_drain_merge_leaf(struct autosa_kernel *kernel,
                                                        struct autosa_drain_merge_func *func, __isl_take isl_ast_node *node,
                                                        __isl_keep isl_ast_build *build)
{
  struct autosa_kernel_stmt *leaf;
  isl_ctx *ctx;
  isl_pw_multi_aff *loops2acc, *loops2array;
  isl_space *acc_space;
  isl_ast_expr *global_index;
  isl_id *annotation;

  leaf = isl_calloc_type(kernel->ctx, struct autosa_kernel_stmt);
  if (!leaf)
    return isl_ast_node_free(node);
  ctx = kernel->ctx;
  leaf->type = AUTOSA_KERNEL_STMT_DRAIN_MERGE;
  leaf->u.dm.func = func;

  /* L -> [D -> A] */
  loops2acc = isl_pw_multi_aff_from_map(
      isl_map_reverse(
          isl_map_from_union_map(isl_ast_build_get_schedule(build))));
  loops2acc = isl_pw_multi_aff_reset_tuple_id(loops2acc, isl_dim_out);

  /* [D -> A] -> A, pulled back to L -> A.
   * The pullback consumes loops2acc, which is not needed afterwards.
   */
  acc_space = isl_space_unwrap(
      isl_space_range(isl_pw_multi_aff_get_space(loops2acc)));
  loops2array = isl_pw_multi_aff_range_map(acc_space);
  loops2array = isl_pw_multi_aff_pullback_pw_multi_aff(loops2array,
                                                       loops2acc);
  global_index = isl_ast_build_access_from_pw_multi_aff(build, loops2array);

  /* Linearize the index. */
  global_index = autosa_local_array_info_linearize_index(
      func->group->local_array, global_index);
  leaf->u.dm.index = global_index;

  annotation = isl_id_alloc(ctx, "drain_merge", leaf);
  annotation = isl_id_set_free_user(annotation, &autosa_kernel_stmt_free);
  if (!annotation)
    autosa_kernel_stmt_free(leaf);
  return isl_ast_node_set_annotation(node, annotation);
}

///* Extract the boundary field from the module call type, which is in the format of:
// * io_module.[].boundary
// * or 
// * module_call.module_name.boundary
// * */
//static int extract_is_boundary(isl_ctx *ctx, const char *type)
//{
//  int ret_val;
//  char *boundary = extract_io_stmt_str_field(ctx, type, 2);
//  if (boundary && !strcmp(boundary, "boundary")) {
//    ret_val = 1;
//  } else {
//    ret_val = 0;
//  }
//  free(boundary);
//  return ret_val;
//}

/* Extract the module_name field from the module call type, which is in the format of:
 * module_call.module_name.boundary 
 *
 * The field consists of the characters between the first '.' and
 * the next '.' (or the end of the string).
 * If "type" does not contain any '.', nothing is printed to the
 * result; the scan never moves past the terminating NUL.
 * The returned string is obtained from isl_printer_get_str.
 */
static char *extract_module_name(isl_ctx *ctx, const char *type)
{
  const char *cur = type;
  isl_printer *p_str;
  char *module_name;

  /* Advance to the first '.', or to the terminator if there is none. */
  while (*cur != '\0' && *cur != '.')
    cur++;

  /* Only step over the separator if one was actually found.
   * Advancing unconditionally would read beyond the end of a string
   * without any '.'.
   */
  if (*cur == '.')
    cur++;

  p_str = isl_printer_to_str(ctx);
  for (; *cur != '\0' && *cur != '.'; cur++)
  {
    char one[2];

    one[0] = *cur;
    one[1] = '\0';
    p_str = isl_printer_print_str(p_str, one);
  }

  module_name = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  return module_name;
}

/* Annotate "node" with a struct autosa_kernel_stmt of type
 * AUTOSA_KERNEL_STMT_EXT_MODULE describing the statement called "name".
 *
 * "name" is parsed as dot-separated fields:
 * field 1 is the module name and field 2 is read as the integer
 * boundary flag (-1 if the field is missing).
 * The statement is marked as lower or upper if "name" starts with
 * "ext_module_lower" or "ext_module_upper", respectively.
 * It is marked as dummy if the module name ends in "dummy_in" or
 * "dummy_out"; a missing module name is never considered a dummy.
 *
 * "module", "pe_dummy_module" and "group" are stored in the statement.
 * "kernel" and "build" are not used.
 *
 * The annotation is an isl_id called "ext_module" that frees
 * the statement through autosa_kernel_stmt_free.
 */
static __isl_give isl_ast_node *create_ext_module_leaf(
    struct autosa_kernel *kernel,
    __isl_take isl_ast_node *node, struct autosa_hw_module *module,
    struct autosa_pe_dummy_module *pe_dummy_module,
    struct autosa_array_ref_group *group, const char *name,
    __isl_keep isl_ast_build *build)
{
  struct autosa_kernel_stmt *stmt;
  isl_id *id;
  isl_ctx *ctx;

  ctx = isl_ast_node_get_ctx(node);
  stmt = isl_calloc_type(ctx, struct autosa_kernel_stmt);
  if (!stmt)
    return isl_ast_node_free(node);

  stmt->type = AUTOSA_KERNEL_STMT_EXT_MODULE;
  stmt->u.m.module = module;
  stmt->u.m.group = group;
  stmt->u.m.boundary = extract_autosa_stmt_int_field(ctx, name, 2);
  stmt->u.m.module_name = extract_autosa_stmt_str_field(ctx, name, 1);
  /* The module name is NULL if "name" has no such field;
   * do not hand a NULL pointer to suffixcmp.
   */
  if (stmt->u.m.module_name &&
      (!suffixcmp(stmt->u.m.module_name, "dummy_in") ||
       !suffixcmp(stmt->u.m.module_name, "dummy_out")))
    stmt->u.m.dummy = 1;
  else
    stmt->u.m.dummy = 0;
  stmt->u.m.pe_dummy_module = pe_dummy_module;

  stmt->u.m.lower = 0;
  stmt->u.m.upper = 0;
  if (!prefixcmp(name, "ext_module_lower"))
    stmt->u.m.lower = 1;
  else if (!prefixcmp(name, "ext_module_upper"))
    stmt->u.m.upper = 1;

  id = isl_id_alloc(ctx, "ext_module", stmt);
  id = isl_id_set_free_user(id, &autosa_kernel_stmt_free);
  if (!id)
    autosa_kernel_stmt_free(stmt);
  return isl_ast_node_set_annotation(node, id);
}

/* Annotate "node" with a struct autosa_kernel_stmt of type
 * AUTOSA_KERNEL_STMT_MODULE_CALL describing the statement called "name".
 *
 * There are two types of module call statements:
 * module_call_upper and module_call_lower
 * "name" is in the format of
 * module_call_lower/upper.module_name.[is_boundary].[is_serialize].[lower_sched_val]
 * Integer fields that are missing are recorded as -1.
 * The statement is marked as dummy if the module name ends in
 * "dummy_in" or "dummy_out"; a missing module name is never
 * considered a dummy.
 *
 * For module_call_lower, if the module is an IO or drain module that
 * is connected to PEs, we calculate the AST expression which is the
 * PE indices described by IO ids: the identity on the range of
 * the schedule of "build", flattened and composed with the inverse of
 * group->io_trans.  The expression is cached in
 * group->io_pe_expr_boundary for boundary calls and in
 * group->io_pe_expr otherwise, and is only computed if the
 * corresponding field has not been set yet.
 *
 * The annotation is an isl_id called "module_call" that frees
 * the statement through autosa_kernel_stmt_free.
 */
static __isl_give isl_ast_node *create_module_call_leaf(
    struct autosa_kernel *kernel,
    __isl_take isl_ast_node *node, struct autosa_hw_module *module,
    struct autosa_pe_dummy_module *pe_dummy_module,
    struct autosa_array_ref_group *group, const char *name,
    __isl_keep isl_ast_build *build)
{
  struct autosa_kernel_stmt *stmt;
  isl_id *id;
  isl_ctx *ctx;

  ctx = isl_ast_node_get_ctx(node);
  stmt = isl_calloc_type(ctx, struct autosa_kernel_stmt);
  if (!stmt)
    return isl_ast_node_free(node);

  stmt->type = AUTOSA_KERNEL_STMT_MODULE_CALL;
  stmt->u.m.module = module;
  stmt->u.m.group = group;
  stmt->u.m.boundary = extract_autosa_stmt_int_field(ctx, name, 2);
  stmt->u.m.module_name = extract_autosa_stmt_str_field(ctx, name, 1);
  /* The module name is NULL if "name" has no such field;
   * do not hand a NULL pointer to suffixcmp.
   */
  if (stmt->u.m.module_name &&
      (!suffixcmp(stmt->u.m.module_name, "dummy_in") ||
       !suffixcmp(stmt->u.m.module_name, "dummy_out")))
    stmt->u.m.dummy = 1;
  else
    stmt->u.m.dummy = 0;
  stmt->u.m.pe_dummy_module = pe_dummy_module;
  stmt->u.m.serialize = extract_autosa_stmt_int_field(ctx, name, 3);
  stmt->u.m.lower_sched_val = extract_autosa_stmt_int_field(ctx, name, 4);

  stmt->u.m.lower = 0;
  stmt->u.m.upper = 0;
  if (!prefixcmp(name, "module_call_lower"))
    stmt->u.m.lower = 1;
  else if (!prefixcmp(name, "module_call_upper"))
    stmt->u.m.upper = 1;

  if (stmt->u.m.lower &&
      (module->type == IO_MODULE || module->type == DRAIN_MODULE))
  {
    /* Boundary and non-boundary calls cache their expression
     * in separate fields of the group.
     */
    isl_ast_expr **cached = stmt->u.m.boundary ? &group->io_pe_expr_boundary
                                               : &group->io_pe_expr;

    if (!*cached && module->to_pe)
    {
      isl_map *sched_id, *trans_inv;
      isl_pw_multi_aff *sched2pe;

      sched_id = isl_set_identity(
          isl_set_from_union_set(
              isl_union_map_range(isl_ast_build_get_schedule(build))));
      sched_id = isl_map_flatten_range(sched_id);
      trans_inv = isl_map_from_multi_aff(isl_multi_aff_copy(group->io_trans));
      trans_inv = isl_map_reverse(trans_inv);
      sched2pe = isl_pw_multi_aff_from_map(
          isl_map_apply_range(sched_id, trans_inv));
      *cached = isl_ast_build_access_from_pw_multi_aff(build, sched2pe);
    }
  }

  id = isl_id_alloc(ctx, "module_call", stmt);
  id = isl_id_set_free_user(id, &autosa_kernel_stmt_free);
  if (!id)
    autosa_kernel_stmt_free(stmt);
  return isl_ast_node_set_annotation(node, id);
}

/* Annotate "node" with a struct autosa_kernel_stmt of type
 * AUTOSA_KERNEL_STMT_FIFO_DECL for the fifo declaration called "name".
 *
 * If "module" is a PE module and group->io_L1_pe_expr has not been
 * set yet, the AST expression of the PE indices described by
 * the IO ids is computed and stored there: the identity on the range
 * of the schedule of "build", flattened and composed with
 * the inverse of group->io_L1_trans.
 *
 * The statement is marked as boundary if "name" starts with
 * "fifo_decl_boundary".
 *
 * The annotation is an isl_id called "fifo_decl" that frees
 * the statement through autosa_kernel_stmt_free.
 */
static __isl_give isl_ast_node *create_fifo_decl_leaf(
    struct autosa_kernel *kernel,
    __isl_take isl_ast_node *node, struct autosa_hw_module *module,
    struct autosa_array_ref_group *group, const char *name,
    __isl_keep isl_ast_build *build)
{
  struct autosa_kernel_stmt *decl;
  isl_id *annotation;
  isl_ctx *ctx = isl_ast_node_get_ctx(node);

  decl = isl_calloc_type(ctx, struct autosa_kernel_stmt);
  if (!decl)
    return isl_ast_node_free(node);

  /* Generate the AST expr of io_trans. */
  if (module->type == PE_MODULE && !group->io_L1_pe_expr)
  {
    isl_map *sched_id, *trans_inv;
    isl_pw_multi_aff *sched2pe;

    sched_id = isl_set_identity(
        isl_set_from_union_set(
            isl_union_map_range(isl_ast_build_get_schedule(build))));
    sched_id = isl_map_flatten_range(sched_id);
    trans_inv = isl_map_reverse(
        isl_map_from_multi_aff(isl_multi_aff_copy(group->io_L1_trans)));
    sched2pe = isl_pw_multi_aff_from_map(
        isl_map_apply_range(sched_id, trans_inv));
    group->io_L1_pe_expr = isl_ast_build_access_from_pw_multi_aff(build,
                                                                  sched2pe);
  }

  decl->type = AUTOSA_KERNEL_STMT_FIFO_DECL;
  decl->u.m.module = module;
  decl->u.m.group = group;
  decl->u.m.boundary = !prefixcmp(name, "fifo_decl_boundary") ? 1 : 0;

  annotation = isl_id_alloc(ctx, "fifo_decl", decl);
  annotation = isl_id_set_free_user(annotation, &autosa_kernel_stmt_free);
  if (!annotation)
    autosa_kernel_stmt_free(decl);
  return isl_ast_node_set_annotation(node, annotation);
}

/* Annotate "node" with a struct autosa_kernel_stmt that describes
 * the IO module call called "name".
 *
 * The statement type is selected from the prefix of "name"
 * ("io_module.inter_trans", "io_module.intra_trans",
 * "io_module.inter_intra", "io_module.intra_inter" or
 * "io_module.state_handle").  If none of them matches, the type is
 * left as initialized by the zeroing allocation.
 * Field 2 of the dot-separated "name" is read as the integer
 * boundary flag.
 *
 * The annotation is an isl_id called "name" that frees the statement
 * through autosa_kernel_stmt_free.
 */
static __isl_give isl_ast_node *create_io_module_call_leaf(
    struct autosa_kernel *kernel,
    __isl_take isl_ast_node *node, struct autosa_hw_module *module,
    const char *name, __isl_keep isl_ast_build *build)
{
  struct autosa_kernel_stmt *call;
  isl_id *annotation;
  isl_ctx *ctx = isl_ast_node_get_ctx(node);

  call = isl_calloc_type(ctx, struct autosa_kernel_stmt);
  if (!call)
    return isl_ast_node_free(node);

  if (!prefixcmp(name, "io_module.inter_trans"))
  {
    call->type = AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTER_TRANS;
  }
  else if (!prefixcmp(name, "io_module.intra_trans"))
  {
    call->type = AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTRA_TRANS;
  }
  else if (!prefixcmp(name, "io_module.inter_intra"))
  {
    call->type = AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTER_INTRA;
  }
  else if (!prefixcmp(name, "io_module.intra_inter"))
  {
    call->type = AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTRA_INTER;
  }
  else if (!prefixcmp(name, "io_module.state_handle"))
  {
    call->type = AUTOSA_KERNEL_STMT_IO_MODULE_CALL_STATE_HANDLE;
  }

  call->u.f.module = module;
  call->u.f.boundary = extract_autosa_stmt_int_field(ctx, name, 2);

  annotation = isl_id_alloc(ctx, name, call);
  annotation = isl_id_set_free_user(annotation, &autosa_kernel_stmt_free);
  if (!annotation)
    autosa_kernel_stmt_free(call);
  return isl_ast_node_set_annotation(node, annotation);
}

/* Callback invoked by the AST generator for every user statement instance
 * in a module schedule.  "user" points to a struct autosa_at_domain_data.
 *
 * The statement id is taken from the first argument of the call expression
 * stored in "node".  If find_stmt() recognizes the id as one of the original
 * program statements, the node is handled by create_domain_leaf_module().
 * Every other statement has been introduced by AutoSA and is dispatched on
 * its name, in this order:
 *   to_device_* / from_device_*  -> returned unchanged
 *   init_device                  -> build_array_bounds()
 *   clear_device                 -> returned unchanged
 *   read / write                 -> create_access_leaf()
 *   in* / out*                   -> create_io_leaf()
 *   module_call*                 -> create_module_call_leaf()
 *   fifo_decl*                   -> create_fifo_decl_leaf()
 *   ext_module*                  -> create_ext_module_leaf()
 *   io_module*                   -> create_io_module_call_leaf()
 *   drain_merge*                 -> create_drain_merge_leaf()
 *   serialize* / deserialize*    -> create_serialize_leaf()
 * Anything else is returned unchanged.
 * The order matters because several prefixes overlap (e.g. "in" is also a
 * prefix of "init_device").
 */
static __isl_give isl_ast_node *at_domain_module(__isl_take isl_ast_node *node,
                                                 __isl_keep isl_ast_build *build, void *user)
{
  struct autosa_at_domain_data *data = (struct autosa_at_domain_data *)user;
  struct autosa_stmt *orig_stmt;
  isl_ast_expr *call, *callee;
  isl_id *stmt_id;
  const char *name;
  void *payload;

  /* The statement id is the callee of the user call expression. */
  call = isl_ast_node_user_get_expr(node);
  callee = isl_ast_expr_get_op_arg(call, 0);
  stmt_id = isl_ast_expr_get_id(callee);
  name = isl_id_get_name(stmt_id);
  payload = isl_id_get_user(stmt_id);
  isl_ast_expr_free(call);
  isl_ast_expr_free(callee);

  orig_stmt = find_stmt(data->prog, stmt_id);
  /* NOTE(review): "name" is still read below after this reference is
   * dropped; this presumably relies on "node" keeping the id alive.
   */
  isl_id_free(stmt_id);

  /* Original user statement. */
  if (orig_stmt)
    return create_domain_leaf_module(data->kernel, node, build, orig_stmt);

  /* Host side statements. */
  if (!prefixcmp(name, "to_device_") || !prefixcmp(name, "from_device_"))
    return node;
  if (!strcmp(name, "init_device"))
    return build_array_bounds(node, data->prog, build);
  if (!strcmp(name, "clear_device"))
    return node;

  /* Array accesses; the id payload is the array reference group. */
  if (!strcmp(name, "read") || !strcmp(name, "write"))
    return create_access_leaf(data->kernel,
                              (struct autosa_array_ref_group *)payload,
                              node, build);

  /* I/O transfers; the id payload is a group pair. */
  if (!prefixcmp(name, "in") || !prefixcmp(name, "out"))
    return create_io_leaf(data->kernel, data->module,
                          (struct autosa_array_ref_group_pair *)payload,
                          node, build);

  /* module_call.[module_name]
   * module_call_lower.[module_name]
   * Only the "lower" variant carries a group in the id payload.
   */
  if (!prefixcmp(name, "module_call"))
  {
    struct autosa_array_ref_group *lower_group = NULL;

    if (!prefixcmp(name, "module_call_lower"))
      lower_group = (struct autosa_array_ref_group *)payload;
    return create_module_call_leaf(data->kernel, node, data->module, data->pe_dummy_module, lower_group, name, build);
  }

  /* fifo_decl.[fifo_name]
   * fifo_decl_boundary.[fifo_name]
   */
  if (!prefixcmp(name, "fifo_decl"))
    return create_fifo_decl_leaf(data->kernel, node, data->module,
                                 (struct autosa_array_ref_group *)payload,
                                 name, build);

  /* ext_module*
   * ext_module_lower*
   * Only the "lower" variant carries a group in the id payload.
   */
  if (!prefixcmp(name, "ext_module"))
  {
    struct autosa_array_ref_group *lower_group = NULL;

    if (!prefixcmp(name, "ext_module_lower"))
      lower_group = (struct autosa_array_ref_group *)payload;
    return create_ext_module_leaf(data->kernel, node, data->module,
                                  data->pe_dummy_module, lower_group, name, build);
  }

  if (!prefixcmp(name, "io_module"))
    return create_io_module_call_leaf(data->kernel, node, data->module, name, build);

  if (!prefixcmp(name, "drain_merge"))
    return create_drain_merge_leaf(data->kernel, data->drain_merge_func, node, build);

  if (!prefixcmp(name, "serialize") || !prefixcmp(name, "deserialize"))
    return create_serialize_leaf(data->kernel,
                                 (struct autosa_array_ref_group_pair *)payload,
                                 node, name, build);

  return node;
}

/* Callback invoked before the AST generator descends into the schedule
 * subtree below a mark node with id "mark".  "user" points to a
 * struct autosa_at_domain_data whose fields are updated for later use by
 * at_domain_module and after_mark_module:
 *
 *   "kernel"                 data->kernel = user pointer of the mark
 *   "module"                 data->module = user pointer of the mark
 *   "pe_dummy_module"        data->pe_dummy_module = user pointer of the
 *                            mark; data->in_for is cleared
 *   "io_module.inter_trans"
 *   "io_module.intra_trans"  data->filter_buffer is set; data->in_for is
 *                            cleared
 *   "hls_pipeline"           data->under_pipeline is set
 *   "hls_unroll"             data->under_unroll is set
 *   "drain_merge"            data->drain_merge_func = user pointer of the
 *                            mark
 *   "host_serialize"         data->module = user pointer of the mark
 *
 * Other marks are ignored.  Returns isl_stat_error if "mark" is NULL.
 */
static isl_stat before_mark_module(__isl_keep isl_id *mark,
                                   __isl_keep isl_ast_build *build, void *user)
{
  struct autosa_at_domain_data *data = (struct autosa_at_domain_data *)user;
  const char *mark_name;

  if (!mark)
    return isl_stat_error;

  /* The names are compared for exact equality, so at most one branch
   * can apply.
   */
  mark_name = isl_id_get_name(mark);
  if (!strcmp(mark_name, "kernel"))
  {
    data->kernel = (struct autosa_kernel *)isl_id_get_user(mark);
  }
  else if (!strcmp(mark_name, "module"))
  {
    data->module = (struct autosa_hw_module *)isl_id_get_user(mark);
  }
  else if (!strcmp(mark_name, "pe_dummy_module"))
  {
    data->pe_dummy_module = (struct autosa_pe_dummy_module *)isl_id_get_user(mark);
    data->in_for = 0;
  }
  else if (!strcmp(mark_name, "io_module.inter_trans") ||
           !strcmp(mark_name, "io_module.intra_trans"))
  {
    data->filter_buffer = 1;
    data->in_for = 0;
  }
  else if (!strcmp(mark_name, "hls_pipeline"))
  {
    data->under_pipeline = 1;
  }
  else if (!strcmp(mark_name, "hls_unroll"))
  {
    data->under_unroll = 1;
  }
  else if (!strcmp(mark_name, "drain_merge"))
  {
    data->drain_merge_func = (struct autosa_drain_merge_func *)isl_id_get_user(mark);
  }
  else if (!strcmp(mark_name, "host_serialize"))
  {
    data->module = (struct autosa_hw_module *)isl_id_get_user(mark);
  }

  return isl_stat_ok;
}

/* Free the mark node "node" and return in its place a user node that calls
 * the function identified by "id" with an empty argument list.
 * The new node is annotated with "id".
 * "ctx" must have been obtained before calling, since "node" is released.
 */
static __isl_give isl_ast_node *autosa_mark_to_call_node(isl_ctx *ctx,
    __isl_take isl_ast_node *node, __isl_take isl_id *id)
{
  isl_ast_expr *callee;
  isl_ast_expr_list *no_args;

  isl_ast_node_free(node);

  callee = isl_ast_expr_from_id(isl_id_copy(id));
  no_args = isl_ast_expr_list_alloc(ctx, 0);
  node = isl_ast_node_alloc_user(isl_ast_expr_call(callee, no_args));

  return isl_ast_node_set_annotation(node, id);
}

/* Callback invoked after the AST generator has finished the schedule
 * subtree below a mark node.  "node" is the resulting mark AST node and
 * "user" points to a struct autosa_at_domain_data.
 *
 * Depending on the name of the mark:
 * - "kernel" (with data->kernel set): record the schedule space in the
 *   kernel (regular run only), clear data->kernel and keep the node.
 * - "io_module.inter_trans", "io_module.intra_trans", "drain_merge",
 *   "host_serialize": store the subtree below the mark in the matching
 *   field of the module / drain merge function and replace "node" by a
 *   user node that calls the mark id.
 * - "hls_pipeline", "hls_unroll": clear the corresponding flag and keep
 *   the node.
 * - "module" (with data->module set and data->filter_buffer == 0): store
 *   the subtree as the boundary tree, the PE dummy module tree or the
 *   device tree, clear data->module and replace "node" by a user node that
 *   calls the mark id.
 * - anything else: keep the node.
 *
 * Which tree field receives the subtree also depends on data->tuning and
 * data->tuning_num; a "regular run" below means both are zero.
 */
static __isl_give isl_ast_node *after_mark_module(__isl_take isl_ast_node *node,
                                                  __isl_keep isl_ast_build *build, void *user)
{
  struct autosa_at_domain_data *data = (struct autosa_at_domain_data *)user;
  struct autosa_hw_module *module;
  isl_ctx *ctx;
  isl_id *id;
  const char *mark;
  int tuning = data->tuning;
  int tuning_num = data->tuning_num;
  int regular_run = (tuning == 0 && tuning_num == 0);

  ctx = isl_ast_node_get_ctx(node);
  id = isl_ast_node_mark_get_id(node);
  if (!id)
    return isl_ast_node_free(node);
  mark = isl_id_get_name(id);

  if (!strcmp(mark, "kernel") && data->kernel)
  {
    isl_id_free(id);
    if (regular_run && !data->kernel->space)
      data->kernel->space = isl_ast_build_get_schedule_space(build);
    data->kernel = NULL;
    return node;
  }

  if (!strcmp(mark, "io_module.inter_trans"))
  {
    module = data->module;
    if (tuning)
    {
      if (!data->boundary)
        module->tuning_inter_tree = isl_ast_node_mark_get_node(node);
    }
    else if (tuning_num)
    {
      if (!data->boundary)
        module->tuning_num_inter_tree = isl_ast_node_mark_get_node(node);
    }
    else
    {
      if (!module->inter_space)
        module->inter_space = isl_ast_build_get_schedule_space(build);

      if (data->boundary)
        module->boundary_inter_tree = isl_ast_node_mark_get_node(node);
      else
        module->inter_tree = isl_ast_node_mark_get_node(node);
    }
    return autosa_mark_to_call_node(ctx, node, id);
  }

  if (!strcmp(mark, "io_module.intra_trans"))
  {
    module = data->module;
    if (tuning)
    {
      module->tuning_intra_tree = isl_ast_node_mark_get_node(node);
    }
    else if (tuning_num)
    {
      module->tuning_num_intra_tree = isl_ast_node_mark_get_node(node);
    }
    else
    {
      if (!module->intra_space)
        module->intra_space = isl_ast_build_get_schedule_space(build);
      module->intra_tree = isl_ast_node_mark_get_node(node);
    }
    return autosa_mark_to_call_node(ctx, node, id);
  }

  if (!strcmp(mark, "drain_merge"))
  {
    if (regular_run)
    {
      struct autosa_drain_merge_func *func = data->drain_merge_func;

      func->device_tree = isl_ast_node_mark_get_node(node);
    }
    return autosa_mark_to_call_node(ctx, node, id);
  }

  if (!strcmp(mark, "host_serialize"))
  {
    module = data->module;
    data->module = NULL;
    if (regular_run)
      module->serialize_tree = isl_ast_node_mark_get_node(node);
    return autosa_mark_to_call_node(ctx, node, id);
  }

  if (!strcmp(mark, "hls_pipeline"))
  {
    isl_id_free(id);
    data->under_pipeline = 0;
    return node;
  }

  if (!strcmp(mark, "hls_unroll"))
  {
    isl_id_free(id);
    data->under_unroll = 0;
    return node;
  }

  /* From here on only "module" marks with a pending module are handled,
   * and only when no buffer filter is active.
   */
  if (strcmp(mark, "module") || !data->module || data->filter_buffer != 0)
  {
    isl_id_free(id);
    return node;
  }

  module = data->module;
  data->module = NULL;

  if (data->boundary)
  {
    /* Boundary I/O module. */
    if (regular_run)
    {
      module->boundary_tree = isl_ast_node_mark_get_node(node);
      if (!module->space)
        module->space = isl_ast_build_get_schedule_space(build);
    }
  }
  else if (data->pe_dummy)
  {
    /* PE dummy module. */
    if (regular_run)
    {
      struct autosa_pe_dummy_module *pe_dummy_module = data->pe_dummy_module;

      pe_dummy_module->device_tree = isl_ast_node_mark_get_node(node);
      if (!module->space)
        module->space = isl_ast_build_get_schedule_space(build);
    }
    data->pe_dummy_module = NULL;
  }
  else
  {
    /* Default module. */
    if (tuning)
    {
      module->tuning_device_tree = isl_ast_node_mark_get_node(node);
    }
    else if (tuning_num)
    {
      module->tuning_num_device_tree = isl_ast_node_mark_get_node(node);
    }
    else
    {
      module->device_tree = isl_ast_node_mark_get_node(node);
      if (!module->space)
        module->space = isl_ast_build_get_schedule_space(build);
    }
  }

  return autosa_mark_to_call_node(ctx, node, id);
}

/* Callback invoked before the AST generator creates a for node.
 * Returns an id with an empty name whose user pointer is a fresh
 * autosa_ast_node_userinfo object; the id releases that object through
 * free_ast_node_userinfo.  after_for_module and the loop passes in this
 * file read the object back through the node annotation.
 * "user" is not used by this function.
 */
static __isl_give isl_id *before_for_module(
    __isl_keep isl_ast_build *build, void *user)
{
  struct autosa_ast_node_userinfo *loop_info = alloc_ast_node_userinfo();
  isl_id *annotation;

  /* TODO: Update the info for Catapult HLS. */

  annotation = isl_id_alloc(isl_ast_build_get_ctx(build), "", loop_info);
  return isl_id_set_free_user(annotation, free_ast_node_userinfo);
}

//static __isl_give isl_id *before_for_module_call(
//    __isl_keep isl_ast_build *build, void *user)
//{
//  isl_id *id;
//  struct autosa_at_domain_data *data = (struct autosa_at_domain_data *)user;
//  struct autosa_ast_node_userinfo *node_info;
//
//#ifdef _DEBUG
//  if (!strcmp(data->module->name, "U_drain_IO_L2_out")) {
//    isl_union_map *sched_tmp;
//    sched_tmp = isl_ast_build_get_schedule(build);
//    DBGUMAP(stdout, sched_tmp, data->kernel->ctx);
//  }
//#endif
//
//  node_info = alloc_ast_node_userinfo();
//  id = isl_id_alloc(isl_ast_build_get_ctx(build), "", node_info);
//  id = isl_id_set_free_user(id, free_ast_node_userinfo);
//
//  return id;
//}

/* Callback invoked after the AST generator has created the for node "node".
 * The annotation of the node is fetched and released again; the node is
 * returned unchanged.  The hook currently performs no update of the
 * attached user info (an earlier reset of is_outermost_for / data->in_for
 * is disabled).
 */
static __isl_give isl_ast_node *after_for_module(
    __isl_take isl_ast_node *node, __isl_keep isl_ast_build *build,
    void *user)
{
  isl_id *annotation = isl_ast_node_get_annotation(node);

  /* The user info is looked up but intentionally left untouched. */
  (void)isl_id_get_user(annotation);
  isl_id_free(annotation);

  return node;
}

/* Generate an AST from "schedule" for an AutoSA hardware module.
 *
 * The loop iterators are named with "iterator_prefix", or with "c" if
 * "iterator_prefix" is NULL; their number is the depth computed by
 * update_depth over the schedule tree.
 * The module callbacks (at_domain_module, before/after_mark_module,
 * before/after_for_module) are installed on the AST build.  Note that
 * "data" is passed by value, so the callbacks operate on this function's
 * own copy.
 *
 * Returns NULL if "schedule" is NULL.  If the depth computation fails, the
 * schedule is freed and generation proceeds with a NULL schedule.
 */
static __isl_give isl_ast_node *autosa_generate_ast_from_schedule(
    __isl_take isl_schedule *schedule,
    struct autosa_at_domain_data data, struct autosa_gen *gen,
    const char *iterator_prefix)
{
  const char *prefix;
  isl_ast_build *build;
  isl_ast_node *ast;
  isl_id_list *names;
  int depth = 0;

  if (!schedule)
    return NULL;

  if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth,
                                                  &depth) < 0)
    schedule = isl_schedule_free(schedule);

  build = isl_ast_build_alloc(gen->prog->ctx);

  prefix = iterator_prefix ? iterator_prefix : "c";
  names = ppcg_scop_generate_names(gen->prog->scop, depth, prefix);
  build = isl_ast_build_set_iterators(build, names);

  /* Install the module specific hooks; all of them share "data". */
  build = isl_ast_build_set_at_each_domain(build, &at_domain_module, &data);
  build = isl_ast_build_set_before_each_mark(build, &before_mark_module, &data);
  build = isl_ast_build_set_after_each_mark(build, &after_mark_module, &data);
  build = isl_ast_build_set_before_each_for(build, &before_for_module, &data);
  build = isl_ast_build_set_after_each_for(build, &after_for_module, &data);

  if (gen->prog->scop->options->debug->dump_final_schedule)
    isl_schedule_dump(schedule);

  ast = isl_ast_build_node_from_schedule(build, schedule);
  isl_ast_build_free(build);

  return ast;
}

/* State shared by the invocations of loop_infinitize_check during one
 * top-down traversal of a module AST.
 * The field order is relied upon by positional initializers in
 * loop_infinitization_optimize.
 */
struct loop_infinitize_check_data
{
  /* Indicates if we are checking the outermost loop bands.
   * Cleared as soon as a block or user node is encountered.
   */
  isl_bool outer_for;
  /* The module whose AST is examined; its inter/intra trees and spaces
   * are consulted and updated by the check.
   */
  struct autosa_hw_module *module;
  /* Indicates if we have found any infinitizable loop. */
  isl_bool found;
  /* Number of infinitizable loops. */
  int n_loops;
};

/* State for iterator_used: the search for a loop iterator inside the body
 * of that loop.
 */
struct iterator_used_data
{
  /* The loop iterator expression that is searched for. */
  isl_ast_expr *iterator;
  /* Set to isl_bool_true once the iterator has been found. */
  isl_bool used;
  /* Module providing the nested inter_tree/intra_tree that are searched
   * when an I/O module call statement is encountered.
   */
  struct autosa_hw_module *module;
  /* Set to isl_bool_true when an inter/intra I/O module call statement
   * is encountered during the search.
   */
  isl_bool has_inter_intra;
};

/* Check whether the identifier expression "key" occurs anywhere in "expr".
 *
 * Identifier expressions are compared with isl_ast_expr_is_equal (whose
 * result is returned as is), operation expressions are searched
 * recursively through their arguments, and every other expression type
 * yields isl_bool_false.
 * Only a definite isl_bool_true from an argument stops the recursive
 * search.
 */
static isl_bool search_expr_id(__isl_keep isl_ast_expr *expr, __isl_keep isl_ast_expr *key)
{
  switch (isl_ast_expr_get_type(expr))
  {
  case isl_ast_expr_id:
    return isl_ast_expr_is_equal(expr, key);

  case isl_ast_expr_op:
  {
    isl_size n_operand = isl_ast_expr_op_get_n_arg(expr);

    for (int pos = 0; pos < n_operand; ++pos)
    {
      isl_ast_expr *operand = isl_ast_expr_op_get_arg(expr, pos);
      isl_bool hit = search_expr_id(operand, key);

      isl_ast_expr_free(operand);
      if (hit == isl_bool_true)
        return isl_bool_true;
    }
    return isl_bool_false;
  }

  default:
    /* Integer constants (and invalid expressions) never match. */
    return isl_bool_false;
  }
}

/* State for search_id_to_expr_id, the per-entry callback used when
 * scanning an isl_id_to_ast_expr map for a loop iterator.
 */
struct search_id_to_expr_id_data
{
  /* Becomes true once any visited expression contains the iterator. */
  bool found;
  /* The loop iterator expression that is searched for. */
  isl_ast_expr *iterator;
};

/* Callback for isl_id_to_ast_expr_foreach.
 * "user" points to a struct search_id_to_expr_id_data.  The "found" flag is
 * raised when search_expr_id() returns anything other than isl_bool_false
 * for "val"; it is never lowered again.
 * Both "key" and "val" are released.  Always returns isl_stat_ok, so every
 * entry of the map is visited.
 */
isl_stat search_id_to_expr_id(__isl_take isl_id *key,
                              __isl_take isl_ast_expr *val, void *user)
{
  struct search_id_to_expr_id_data *state =
      (struct search_id_to_expr_id_data *)user;

  if (search_expr_id(val, state->iterator) != isl_bool_false)
    state->found = 1;

  isl_ast_expr_free(val);
  isl_id_free(key);

  return isl_stat_ok;
}

/* Callback for isl_ast_node_foreach_descendant_top_down that checks
 * whether the iterator in "user" (a struct iterator_used_data) is used by
 * "node".
 *
 * - for node: the init and cond expressions are searched.
 * - if node: the condition is searched.
 * - block and mark nodes: nothing is checked.
 * - user node: the annotated autosa_kernel_stmt decides:
 *   - domain statement: the expressions in ref2expr are searched
 *     (TODO: only the array indices are tested at present);
 *   - inter/intra I/O module call: has_inter_intra is set and the nested
 *     module->inter_tree, then module->intra_tree, are searched;
 *   - I/O transfer: for a non-boundary statement with a non-negative
 *     filter_sched_depth, the printed iterator is compared with the name
 *     "c<filter_sched_depth>".
 *
 * When a use is found, data->used is set and isl_bool_false is returned so
 * that the descendants of "node" are skipped.  Otherwise isl_bool_true is
 * returned.
 */
static isl_bool iterator_used(__isl_keep isl_ast_node *node, void *user)
{
  struct iterator_used_data *data = (struct iterator_used_data *)user;
  /* Non-zero (including an isl error value) counts as a use. */
  int hit = 0;

  switch (isl_ast_node_get_type(node))
  {
  case isl_ast_node_for:
  {
    /* Init first, then the condition. */
    isl_ast_expr *bound = isl_ast_node_for_get_init(node);

    hit = search_expr_id(bound, data->iterator);
    isl_ast_expr_free(bound);
    if (!hit)
    {
      bound = isl_ast_node_for_get_cond(node);
      hit = search_expr_id(bound, data->iterator);
      isl_ast_expr_free(bound);
    }
    break;
  }

  case isl_ast_node_if:
  {
    isl_ast_expr *guard = isl_ast_node_if_get_cond(node);

    hit = search_expr_id(guard, data->iterator);
    isl_ast_expr_free(guard);
    break;
  }

  case isl_ast_node_user:
  {
    isl_id *annotation = isl_ast_node_get_annotation(node);
    struct autosa_kernel_stmt *stmt =
        (struct autosa_kernel_stmt *)isl_id_get_user(annotation);

    isl_id_free(annotation);

    if (stmt->type == AUTOSA_KERNEL_STMT_DOMAIN)
    {
      /* TODO: At present, we only test if the array index contains the
       * iterator.
       */
      struct search_id_to_expr_id_data probe;

      probe.found = isl_bool_false;
      probe.iterator = data->iterator;
      isl_id_to_ast_expr_foreach(stmt->u.d.ref2expr, &search_id_to_expr_id, &probe);
      hit = probe.found;
    }
    else if (stmt->type == AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTER_TRANS ||
             stmt->type == AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTRA_TRANS ||
             stmt->type == AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTER_INTRA ||
             stmt->type == AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTRA_INTER)
    {
      data->has_inter_intra = isl_bool_true;

      /* Search the nested ASTs: the inter tree first, then the intra
       * tree.  The second search is skipped once a use was found.
       */
      for (int which = 0; which < 2 && !hit; ++which)
      {
        struct iterator_used_data nested;
        isl_ast_node *nested_root =
            which == 0 ? data->module->inter_tree : data->module->intra_tree;

        nested.iterator = data->iterator;
        nested.used = data->used;
        nested.module = data->module;
        isl_ast_node_foreach_descendant_top_down(nested_root, &iterator_used,
                                                 &nested);
        hit = nested.used;
      }
    }
    else if (stmt->type == AUTOSA_KERNEL_STMT_IO_TRANSFER)
    {
      int filter_depth = stmt->u.i.boundary ? -1 : stmt->u.i.filter_sched_depth;
      isl_printer *printer;
      char *filter_name;
      char *iterator_name;

      if (filter_depth < 0)
        return isl_bool_true;

      /* Check if the iterator equals to c[filter_depth]. */
      printer = isl_printer_to_str(isl_ast_node_get_ctx(node));
      printer = isl_printer_print_str(printer, "c");
      printer = isl_printer_print_int(printer, filter_depth);
      filter_name = isl_printer_get_str(printer);
      printer = isl_printer_flush(printer);

      printer = isl_printer_set_output_format(printer, ISL_FORMAT_C);
      printer = isl_printer_print_ast_expr(printer, data->iterator);
      iterator_name = isl_printer_get_str(printer);
      isl_printer_free(printer);

      hit = !strcmp(filter_name, iterator_name);
      free(filter_name);
      free(iterator_name);
    }
    break;
  }

  default:
    /* Block, mark and any other node: nothing to check here. */
    break;
  }

  if (hit)
  {
    data->used = isl_bool_true;
    return isl_bool_false;
  }

  return isl_bool_true;
}

/* Callback for isl_ast_node_foreach_descendant_top_down that marks the
 * loops of the outermost loop band that may be infinitized.
 * "user" points to a struct loop_infinitize_check_data.
 *
 * The outermost band ends at the first block or user node; from then on
 * every call returns isl_bool_false.  For each non-degenerate for node in
 * the band, the body is searched for uses of the loop iterator (see
 * iterator_used).  If the iterator is used, the descent stops below this
 * loop.  Otherwise the loop is counted and, if it carries user info, it is
 * marked as legal to infinitize (the first such loop is additionally
 * flagged as the first infinitizable loop).  If the body calls
 * inter/intra I/O sub modules, the dimension named after the iterator is
 * also removed from module->inter_space and module->intra_space.
 */
static isl_bool loop_infinitize_check(__isl_keep isl_ast_node *node, void *user)
{
  struct loop_infinitize_check_data *data = (struct loop_infinitize_check_data *)user;
  struct iterator_used_data probe;
  struct autosa_ast_node_userinfo *loop_info;
  enum isl_ast_node_type type;
  isl_ast_expr *iterator;
  isl_ast_node *body;
  isl_id *annotation;

  /* Only check the for loops in the outermost loop band. */
  if (!data->outer_for)
    return isl_bool_false;

  type = isl_ast_node_get_type(node);
  if (type == isl_ast_node_block || type == isl_ast_node_user)
  {
    data->outer_for = isl_bool_false;
    return isl_bool_false;
  }
  if (type != isl_ast_node_for || isl_ast_node_for_is_degenerate(node))
    return isl_bool_true;

  /* Examine if the iterator exists in any AST expressions in the sub tree. */
  iterator = isl_ast_node_for_get_iterator(node);
  body = isl_ast_node_for_get_body(node);
  probe.iterator = iterator;
  probe.used = isl_bool_false;
  probe.module = data->module;
  probe.has_inter_intra = isl_bool_false;
  isl_ast_node_foreach_descendant_top_down(body, &iterator_used, &probe);

  if (probe.used)
  {
    /* Stop from here. */
    isl_ast_expr_free(iterator);
    isl_ast_node_free(body);
    return isl_bool_false;
  }

  /* This loop is legal to be infinitized. */
  data->n_loops++;
  annotation = isl_ast_node_get_annotation(node);
  loop_info = NULL;
  if (annotation)
    loop_info = (struct autosa_ast_node_userinfo *)isl_id_get_user(annotation);

  if (loop_info)
  {
    loop_info->is_infinitize_legal = 1;
    if (!data->found)
    {
      loop_info->is_first_infinitizable_loop = 1;
      data->found = isl_bool_true;
    }

    if (probe.has_inter_intra)
    {
      /* Update the inter/intra_trans module space.
       * Remove the corresponding iterators from the sub module space.
       */
      isl_space **sub_spaces[2] = {&data->module->inter_space,
                                   &data->module->intra_space};
      isl_printer *printer;
      char *iterator_name;

      printer = isl_printer_to_str(isl_id_get_ctx(annotation));
      printer = isl_printer_set_output_format(printer, ISL_FORMAT_C);
      printer = isl_printer_print_ast_expr(printer, iterator);
      iterator_name = isl_printer_get_str(printer);
      isl_printer_free(printer);

      for (int k = 0; k < 2; ++k)
      {
        int pos = isl_space_find_dim_by_name(*sub_spaces[k], isl_dim_set,
                                             iterator_name);

        if (pos >= 0)
          *sub_spaces[k] = isl_space_drop_dims(*sub_spaces[k], isl_dim_set, pos, 1);
      }

      free(iterator_name);
    }
  }
  if (annotation)
    isl_id_free(annotation);

  isl_ast_expr_free(iterator);
  isl_ast_node_free(body);

  return isl_bool_true;
}

/* Try to apply the loop infinitization optimization.
 * This optimization is useful for Intel devices since we can remove some 
 * for loops with a simple while (1) loop to reduce the loop control overheads.
 * We will examine the outermost for loop band from outside to inside.
 * For each for loop, we examine if the loop iterator appears in any AST
 * expression below. If not, this loop will be marked to be infinitized later.
 * When printing out for loops later, such loops will be skipped. 
 * Since we use the nested AST for module ASTs, we examine the 
 * module->tree.
 * If we encounter any AST node calling io_module.inter_trans/io_module.intra_trans,
 * we will search from module->intra_tree and module->inter_tree
 * otherwise, we will search from module->device_tree.
 */
static void loop_infinitization_optimize(struct autosa_hw_module *module)
{
  if (module->double_buffer || module->to_mem)
    return;

  if (module->device_tree)
  {
    isl_ast_node *node = module->device_tree;
    struct loop_infinitize_check_data data = {isl_bool_true, module, isl_bool_false};
    isl_ast_node_foreach_descendant_top_down(node, &loop_infinitize_check, &data);
  }
  if (module->boundary_tree)
  {
    isl_ast_node *node = module->boundary_tree;
    struct loop_infinitize_check_data data = {isl_bool_true, module, isl_bool_false};
    isl_ast_node_foreach_descendant_top_down(node, &loop_infinitize_check, &data);
  }
}

/* Callback for isl_ast_node_foreach_descendant_top_down that marks every
 * for loop as visited.
 *
 * For each for node that carries an annotation with attached
 * autosa_ast_node_userinfo, the "visited" flag is set.  For nodes without
 * an annotation, or whose annotation has no user info attached, nothing is
 * done; this matches the guards used by loop_coalesce_update and
 * loop_infinitize_check and avoids dereferencing a NULL user pointer.
 * "user" is not used.  Always returns isl_bool_true so that the whole
 * subtree is traversed.
 */
static isl_bool update_for_visit(__isl_keep isl_ast_node *node, void *user)
{
  struct autosa_ast_node_userinfo *info = NULL;
  isl_id *id;

  if (isl_ast_node_get_type(node) != isl_ast_node_for)
    return isl_bool_true;

  id = isl_ast_node_get_annotation(node);
  if (id)
    info = (struct autosa_ast_node_userinfo *)isl_id_get_user(id);
  /* The annotation may exist without user info attached. */
  if (info)
    info->visited = 1;
  isl_id_free(id);

  return isl_bool_true;
}

struct count_loop_data {
  int pe;
  int io;
  int under_simd;
  int find_simd_loop;
  int n_loop;
  int under_latency;  
  int find_latency_loop;
  int n_latency_loop;  
};

/* Callback for isl_ast_node_foreach_descendant_top_down that gathers loop
 * statistics in the struct count_loop_data pointed to by "user".
 *
 * Every for node increments n_loop.  For PE modules, a for node visited
 * after a "simd" mark sets find_simd_loop, and a for node visited after a
 * "latency" mark increments n_latency_loop.
 * A mark node named "simd" or "latency" raises under_simd or
 * under_latency; these flags are never cleared during the traversal.
 * Always returns isl_bool_true.
 */
static isl_bool count_loop(__isl_keep isl_ast_node *node, void *user)
{
  struct count_loop_data *tally = (struct count_loop_data *)user;

  switch (isl_ast_node_get_type(node))
  {
  case isl_ast_node_for:
    tally->n_loop += 1;
    if (!tally->pe)
      break;
    if (tally->under_simd)
      tally->find_simd_loop = 1;
    if (tally->under_latency)
      tally->n_latency_loop += 1;
    break;

  case isl_ast_node_mark:
  {
    isl_id *mark = isl_ast_node_mark_get_id(node);
    const char *label = isl_id_get_name(mark);

    if (!strcmp(label, "simd"))
      tally->under_simd = 1;
    if (!strcmp(label, "latency"))
      tally->under_latency = 1;
    isl_id_free(mark);
    break;
  }

  default:
    break;
  }

  return isl_bool_true;
}

/* Options for loop_coalesce_update, set up by loop_coalesce_optimize.
 * The field order is relied upon by a positional initializer there.
 */
struct loop_coalesce_update_data {
  /* Non-zero for PE modules: the number of loops to coalesce is computed
   * for the outermost loop.
   */
  int update_level_for_pe;
  /* Non-zero while processing the intra tree of an I/O module whose inter
   * and intra data pack factors differ and that is not an input module:
   * the outermost loop is flagged as dependence free.
   */
  int update_level_for_io;
};

/* Callback for isl_ast_node_foreach_descendant_top_down that records the
 * number of loops to coalesce on the first for loop found after a
 * "latency" mark.  "user" points to a struct count_loop_data that already
 * holds the counts gathered by count_loop.
 *
 * The count stored in the loop's user info is the number of latency loops,
 * minus one if a simd loop was found.  Only the first for loop after the
 * mark is updated (find_latency_loop guards the rest).
 * A for node without annotation, or whose annotation has no user info
 * attached, is skipped without dereferencing the missing info, matching
 * the guards in loop_coalesce_update; it still counts as the first
 * latency loop.
 * Always returns isl_bool_true.
 */
static isl_bool update_latency_coalesce(__isl_keep isl_ast_node *node, void *user)
{
  struct count_loop_data *data = (struct count_loop_data *)user;
  enum isl_ast_node_type type;

  type = isl_ast_node_get_type(node);
  if (type == isl_ast_node_for) {
    if (data->under_latency && data->find_latency_loop == 0) {
      struct autosa_ast_node_userinfo *info = NULL;
      isl_id *id;

      id = isl_ast_node_get_annotation(node);
      if (id)
        info = (struct autosa_ast_node_userinfo *)isl_id_get_user(id);
      /* The annotation may exist without user info attached. */
      if (info)
        info->n_coalesce_loop = data->n_latency_loop - ((data->find_simd_loop == 1) ? 1 : 0);
      isl_id_free(id);
      data->find_latency_loop = 1;
    }
  } else if (type == isl_ast_node_mark) {
    isl_id *id;
    const char *mark_name;

    id = isl_ast_node_mark_get_id(node);
    mark_name = isl_id_get_name(id);
    /* Guard against a missing id/name before comparing. */
    if (mark_name && !strcmp(mark_name, "latency")) {
      data->under_latency = 1;
    }
    isl_id_free(id);
  }

  return isl_bool_true;
}

/* Callback for isl_ast_node_foreach_descendant_top_down that selects the
 * outermost loops to receive the loop_coalesce pragma.
 * "user" points to a struct loop_coalesce_update_data.
 *
 * For a for node, the annotated user info is inspected.  Loops that are
 * marked as infinitizable or that were already visited are skipped.
 * Since the AST is visited top-down, the first remaining loop is an
 * outermost loop to be coalesced: it is flagged as such and all for loops
 * in its subtree (including itself) are marked as visited, so that the
 * next unvisited for node encountered is the next outermost loop.
 *
 * In addition:
 * - if update_level_for_io is set, the loop is flagged as dependence free;
 * - otherwise, if update_level_for_pe is set, the loops below are counted
 *   (see count_loop).  If a simd loop was found, the number of loops to
 *   coalesce is the loop count minus the latency loops, and the first
 *   latency loop is updated through update_latency_coalesce; otherwise the
 *   number is the loop count minus one when the I/O flag is set, and zero
 *   in all other cases.
 * Always returns isl_bool_true.
 */
static isl_bool loop_coalesce_update(__isl_keep isl_ast_node *node, void *user)
{
  struct loop_coalesce_update_data *data = (struct loop_coalesce_update_data *)user;
  struct autosa_ast_node_userinfo *loop_info;
  isl_id *annotation;

  if (isl_ast_node_get_type(node) != isl_ast_node_for)
    return isl_bool_true;

  annotation = isl_ast_node_get_annotation(node);
  if (!annotation)
    return isl_bool_true;

  loop_info = (struct autosa_ast_node_userinfo *)isl_id_get_user(annotation);
  if (loop_info && !loop_info->is_infinitize_legal && !loop_info->visited)
  {
    /* This is the outermost loop to be coalesced.
     * Mark it and every for loop below it as visited.
     */
    loop_info->visited = 1;
    loop_info->is_outermost_for = 1;
    isl_ast_node_foreach_descendant_top_down(node, &update_for_visit, NULL);

    if (data->update_level_for_io)
    {
      loop_info->is_dep_free = 1;
    }
    else if (data->update_level_for_pe)
    {
      struct count_loop_data tally;

      tally.pe = data->update_level_for_pe;
      tally.io = data->update_level_for_io;
      tally.under_simd = 0;
      tally.find_simd_loop = 0;
      tally.n_loop = 0;
      tally.under_latency = 0;
      tally.find_latency_loop = 0;
      tally.n_latency_loop = 0;
      isl_ast_node_foreach_descendant_top_down(node, &count_loop, &tally);

      if (tally.pe && tally.find_simd_loop)
      {
        loop_info->n_coalesce_loop = tally.n_loop - tally.n_latency_loop;
        /* Update the coalesce info for the latency hiding loop;
         * the traversal flags are reset for the second pass.
         */
        tally.under_latency = 0;
        tally.find_latency_loop = 0;
        isl_ast_node_foreach_descendant_top_down(node, &update_latency_coalesce, &tally);
      }
      else if (tally.io)
      {
        loop_info->n_coalesce_loop = tally.n_loop - 1;
      }
      else
      {
        loop_info->n_coalesce_loop = 0;
      }
    }
  }
  isl_id_free(annotation);

  return isl_bool_true;
}

/* This function will mark the outermost for loop which is not infinitized 
 * to be added with "loop_coalesce" pragma later in the generated OpenCL code.
 * We will examine all the AST trees to be printed for this module.
 */
static void loop_coalesce_optimize(struct autosa_hw_module *module)
{
  isl_ast_node *node;
  struct loop_coalesce_update_data data = {0, 0};
  if (module->type == PE_MODULE)
    data.update_level_for_pe = 1;      

  if (module->device_tree)
  {
    node = module->device_tree;
    isl_ast_node_foreach_descendant_top_down(node, &loop_coalesce_update, &data);
  }
  if (module->inter_tree)
  {
    node = module->inter_tree;
    isl_ast_node_foreach_descendant_top_down(node, &loop_coalesce_update, &data);
  }
  if (module->intra_tree)
  {
    if (module->data_pack_inter != module->data_pack_intra && module->in == 0)
      data.update_level_for_io = 1;
    node = module->intra_tree;
    isl_ast_node_foreach_descendant_top_down(node, &loop_coalesce_update, &data);
    data.update_level_for_io = 0;
  }
  if (module->boundary_outer_tree)
  {
    node = module->boundary_outer_tree;
    isl_ast_node_foreach_descendant_top_down(node, &loop_coalesce_update, &data);
  }
  if (module->boundary_inter_tree)
  {
    node = module->boundary_inter_tree;
    isl_ast_node_foreach_descendant_top_down(node, &loop_coalesce_update, &data);
  }
  if (module->boundary_tree)
  {
    node = module->boundary_tree;
    isl_ast_node_foreach_descendant_top_down(node, &loop_coalesce_update, &data);
  }
}

/* State shared by the AST visitors that place the loop guards
 * (see loop_guards_optimize, which builds one object per tree using a
 * positional initializer for the first eight fields, so the field order
 * must not be changed).
 */
struct loop_guards_update_data {
  /* Indicates if we are checking the outermost loop bands. */
  isl_bool outer_for;
  /* The hardware module whose tree is being walked. */
  struct autosa_hw_module *module;
  /* Indicates if we have found any infinitizable loop. */
  isl_bool found;
  /* Number of infinitizable loops.
   * loop_guards_update decrements it for every for loop met before the
   * guard end is placed and increments it for every for loop met afterwards.
   * NOTE(review): the initial count is presumably filled in by
   * loop_infinitize_check, which is not visible here - confirm.
   */
  int n_loops;
  /* Set to 1 once the guard start has been placed. */
  int start_updated;
  /* Set to 1 once the guard end has been placed. */
  int end_updated;
  /* Store the last for loop info. */
  struct autosa_ast_node_userinfo *info;
  int module_type; // default: 0 outer: 1 intra: 2 inter: 3
  /* The fields below are copied into the loop info that carries the guard end
   * when module_type is 2 or 3.
   */
  int double_buffer;
  char *module_name;
  char *buf_name;
  /* Set by loop_guards_optimize; loop_guards_update derives the "inter" value
   * it stores in the loop info from module_type instead of from this field.
   */
  int inter;
  int read;
};

/* AST visitor that places the loop guard marks.
 * "user" points to a loop_guards_update_data object.
 *
 * - guard_start is set on the first annotated for loop that is visited.
 * - guard_end is set either on the for loop flagged "is_infinitize_legal"
 *   that brings data->n_loops down to zero or, when a "synth" mark is
 *   reached, on the last annotated for loop seen before that mark.
 * - For the intra/inter trans trees (module_type 2/3), the local buffer
 *   information is copied into the loop info that carries guard_end.
 *
 * Every for loop visited before guard_end is placed decrements data->n_loops
 * and is flagged as visited; every for loop visited afterwards increments
 * data->n_loops, so that once guard_end has been placed the counter holds
 * the number of for loops found after it.
 *
 * Annotations without user data, "synth" marks that are reached before any
 * annotated for loop and marks without a name are tolerated instead of
 * being dereferenced.
 *
 * Always returns isl_bool_true so that the traversal visits all descendants.
 */
static isl_bool loop_guards_update(__isl_keep isl_ast_node *node, void *user)
{
  struct loop_guards_update_data *data = (struct loop_guards_update_data *)user;
  enum isl_ast_node_type type;

  type = isl_ast_node_get_type(node);
  if (type == isl_ast_node_for) {
    struct autosa_ast_node_userinfo *info;
    isl_id *id;

    if (data->end_updated) {
      /* Count the loops inside the guards. */
      data->n_loops++;
    } else {
      data->n_loops--;
    }

    id = isl_ast_node_get_annotation(node);
    if (id) {
      info = (struct autosa_ast_node_userinfo *)isl_id_get_user(id);
      /* Loops above the guard end are excluded from the later pipeline search. */
      if (info && !data->end_updated)
        info->visited = true;

      if (info && !data->start_updated) {
        data->start_updated = 1;
        info->is_guard_start = 1;
      }
      if (info && info->is_infinitize_legal && !data->end_updated) {
        /* All counted loops have been consumed: this loop closes the guards. */
        if (data->n_loops == 0) {
          info->is_guard_end = 1;
          data->end_updated = 1;
          /* Update the local buffer information if needed. */
          if (data->module_type == 2 || data->module_type == 3) {
            info->double_buffer = data->double_buffer;
            info->module_name = data->module_name;
            info->inter = data->module_type - 2;
            info->read = data->read;
            info->buf_name = data->buf_name;
          } else {
            info->double_buffer = -1;
            info->module_name = NULL;
            info->inter = -1;
            info->read = -1;
            info->buf_name = NULL;
          }
        }
      }
      /* Remember the last annotated loop for a later "synth" mark. */
      data->info = info;
    }
    isl_id_free(id);
  } else if (type == isl_ast_node_mark) {
    isl_id *id = isl_ast_node_mark_get_id(node);
    const char *name = isl_id_get_name(id);

    if (name && !strcmp(name, "synth")) {
      data->end_updated = 1;
      data->n_loops = 0;
      /* No annotated for loop may have been seen yet. */
      if (data->info) {
        data->info->is_guard_end = 1;
        if (data->module_type == 2 || data->module_type == 3) {
          data->info->double_buffer = data->double_buffer;
          data->info->module_name = data->module_name;
          data->info->inter = data->module_type - 2;
          data->info->read = data->read;
          data->info->buf_name = data->buf_name;
        }
      }
    }
    isl_id_free(id);
  }

  return isl_bool_true;
}

/* AST visitor that flags the first annotated for loop that has not been
 * visited yet as the loop to be pipelined ("is_pipeline").
 * The loop and, through update_for_visit, the nodes below it are then
 * handled so that they are not selected again.
 * "user" is not used.
 *
 * Always returns isl_bool_true so that the traversal visits all descendants.
 */
static isl_bool loop_pipeline_update(__isl_keep isl_ast_node *node, void *user)
{
  struct autosa_ast_node_userinfo *loop_info;
  isl_id *annotation;

  if (isl_ast_node_get_type(node) != isl_ast_node_for)
    return isl_bool_true;

  annotation = isl_ast_node_get_annotation(node);
  if (annotation) {
    loop_info = (struct autosa_ast_node_userinfo *)isl_id_get_user(annotation);
    if (loop_info && !loop_info->visited) {
      /* Outermost loop to be pipelined: flag it, then flag its children. */
      loop_info->visited = 1;
      loop_info->is_pipeline = 1;
      isl_ast_node_foreach_descendant_top_down(node, &update_for_visit, NULL);
    }
  }
  isl_id_free(annotation);

  return isl_bool_true;
}

/* Mark the loop_guard_start before the outermost loop. 
 * Store the fifo guards information 
 * - name of fifos:
 * - number of elements to be read
 * Mark the loop guard_end.
 * - For inter/intra module, mark it at the end of the outer loop.
 *   Store the infomation about
 *   - module name
 *   - buffer name
 *   - fifo name
 * - For other modules, put it after the last loop in the outermost loop band.
 */
static void loop_guards_optimize(struct autosa_hw_module *module)
{    
  /* Mark the loop guard start before the outermost loop. */
  if (module->device_tree) {    
    isl_ast_node *node = module->device_tree;
    struct loop_guards_update_data data = 
      {isl_bool_true, module, isl_bool_false, 0, 0, 0, NULL, 0};
    data.double_buffer = module->double_buffer;
    data.module_name = module->name;
    data.buf_name = NULL;
    data.inter = -1;
    data.read = -1;
    isl_ast_node_foreach_descendant_top_down(node, &loop_infinitize_check, &data);
    isl_ast_node_foreach_descendant_top_down(node, &loop_guards_update, &data);
    if (data.n_loops == 0)
      module->pipeline_at_default_func = 1;
    else {      
      /* Find the first for loop under the guard_end. Mark it as pipeline. */
      isl_ast_node_foreach_descendant_top_down(node, &loop_pipeline_update, NULL);
    }
  }
  if (module->inter_tree) {    
    isl_ast_node *node = module->inter_tree;
    struct loop_guards_update_data data = {isl_bool_true, module, isl_bool_false, 0, 0, 0, NULL, 3};
    data.double_buffer = module->double_buffer;
    data.module_name = module->name;
    if (module->n_var > 0) {
      data.buf_name = (&(module->var[0]))->name;
    } else {
      data.buf_name = NULL;
    }
    data.inter = 1;
    data.read = (module->in)? 0 : 1;
    isl_ast_node_foreach_descendant_top_down(node, &loop_infinitize_check, &data);
    isl_ast_node_foreach_descendant_top_down(node, &loop_guards_update, &data);    
    if (data.n_loops == 0) {
      module->pipeline_at_filter_func[2] = 1;      
    } else {
      /* Find the first for loop under the guard_end. Mark it as pipeline. */
      isl_ast_node_foreach_descendant_top_down(node, &loop_pipeline_update, NULL);
    }
  }
  if (module->intra_tree) {
    isl_ast_node *node = module->intra_tree;
    struct loop_guards_update_data data = {isl_bool_true, module, isl_bool_false, 0, 0, 0, NULL, 2};
    data.double_buffer = module->double_buffer;
    data.module_name = module->name;
    if (module->n_var > 0) {
      data.buf_name = (&(module->var[0]))->name;
    } else {
      data.buf_name = NULL;
    }
    data.inter = 0;
    data.read = (module->in)? 1 : 0;
    isl_ast_node_foreach_descendant_top_down(node, &loop_infinitize_check, &data);
    isl_ast_node_foreach_descendant_top_down(node, &loop_guards_update, &data);        
    if (data.n_loops == 0) {
      module->pipeline_at_filter_func[1] = 1;          
    } else {
      /* Find the first for loop under the guard_end. Mark it as pipeline. */
      isl_ast_node_foreach_descendant_top_down(node, &loop_pipeline_update, NULL);
    }
  }
  if (module->boundary_outer_tree) {    
    isl_ast_node *node = module->boundary_outer_tree;
    struct loop_guards_update_data data = {isl_bool_true, module, isl_bool_false, 0, 0, 0, NULL, 1};
    data.double_buffer = module->double_buffer;
    data.module_name = module->name;
    data.buf_name = NULL;
    data.inter = -1;
    data.read = -1;
    isl_ast_node_foreach_descendant_top_down(node, &loop_infinitize_check, &data);
    isl_ast_node_foreach_descendant_top_down(node, &loop_guards_update, &data);
    if (data.n_loops != 0) {
      /* Find the first for loop under the guard_end. Mark it as pipeline. */
      isl_ast_node_foreach_descendant_top_down(node, &loop_pipeline_update, NULL);
    }
  }
  if (module->boundary_inter_tree) {    
    isl_ast_node *node = module->boundary_inter_tree;
    struct loop_guards_update_data data = {isl_bool_true, module, isl_bool_false, 0, 0, 0, NULL, 3};
    data.double_buffer = module->double_buffer;
    data.module_name = module->name;
    if (module->n_var > 0) {
      data.buf_name = (&(module->var[0]))->name;
    } else {
      data.buf_name = NULL;
    }
    data.inter = 1;
    data.read = (module->in)? 0 : 1;
    isl_ast_node_foreach_descendant_top_down(node, &loop_infinitize_check, &data);
    isl_ast_node_foreach_descendant_top_down(node, &loop_guards_update, &data);
    if (data.n_loops != 0) {
      /* Find the first for loop under the guard_end. Mark it as pipeline. */
      isl_ast_node_foreach_descendant_top_down(node, &loop_pipeline_update, NULL);
    }
  }
  if (module->boundary_tree) {    
    isl_ast_node *node = module->boundary_tree;
    struct loop_guards_update_data data = {isl_bool_true, module, isl_bool_false, 0, 0, 0, NULL, 0};
    data.double_buffer = module->double_buffer;
    data.module_name = module->name;
    data.buf_name = NULL;
    data.inter = -1;
    data.read = -1;
    isl_ast_node_foreach_descendant_top_down(node, &loop_infinitize_check, &data);
    isl_ast_node_foreach_descendant_top_down(node, &loop_guards_update, &data);
    if (data.n_loops != 0) {
      /* Find the first for loop under the guard_end. Mark it as pipeline. */
      isl_ast_node_foreach_descendant_top_down(node, &loop_pipeline_update, NULL);
    }
  }

  return;
}

/* Schedule tree callback that deletes every mark node whose name is not
 * one of the following:
 * kernel, module, pe_dummy_module,
 * io_module.inter_trans, io_module.intra_trans,
 * hls_pipeline, hls_unroll,
 * drain_merge, host_serialize,
 * synth
 *
 * Nodes of any other type are returned unchanged.  A mark without a name is
 * not in the list above and is therefore deleted as well.
 * "user" is not used.
 */
static __isl_give isl_schedule_node *delete_marker_catapult(
  __isl_take isl_schedule_node *node, void *user)
{
  static const char *const kept_marks[] = {
    "kernel", "module", "pe_dummy_module",
    "io_module.inter_trans", "io_module.intra_trans",
    "hls_pipeline", "hls_unroll",
    "drain_merge", "host_serialize",
    "synth"};
  const int n_kept = (int)(sizeof(kept_marks) / sizeof(kept_marks[0]));
  isl_id *id;
  const char *name;
  int keep = 0;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
    return node;

  id = isl_schedule_node_mark_get_id(node);
  name = isl_id_get_name(id);
  for (int i = 0; name && i < n_kept; i++) {
    if (!strcmp(name, kept_marks[i])) {
      keep = 1;
      break;
    }
  }
  /* "name" is borrowed from "id": only release the id once the
   * comparisons are done.
   */
  isl_id_free(id);

  if (!keep) {
    /* Delete the current marker. */
    node = isl_schedule_node_delete(node);
  }
  return node;
}

/* There are three schedules to handle in this module:
 * - outer loop schedule
 * - inter trans schedule
 * - intra trans schedule
 * We will first generate AST for inter trans function and intra trans function.
 * The AST trees below the inter trans and intra trans mark are stored 
 * seperately.
 * The outer loop AST will print out these two AST trees while handling 
 * the inter trans and intra trans function calls.
 */
isl_stat sa_filter_buffer_io_module_generate_code(struct autosa_gen *gen,
                                                  struct autosa_hw_module *module)
{
  isl_schedule *schedule;
  struct autosa_at_domain_data data;
  isl_ast_node *tree;

  /* Generate AST for inter transfer function call. */
  schedule = module->inter_sched;
  if (gen->options->target == AUTOSA_TARGET_CATAPULT_HLS_C) {
    /* Delete the unnecessary marker. */
    schedule = isl_schedule_map_schedule_node_bottom_up(
      schedule, &delete_marker_catapult, NULL);
  }
  autosa_at_domain_data_init(&data, gen);
  tree = autosa_generate_ast_from_schedule(schedule, data, gen,
                                           module->double_buffer && gen->options->autosa->double_buffer_style == 0 ? "inter_c" : NULL);
  isl_ast_node_free(tree);
  if (gen->options->autosa->tuning_method == 1 && module->tuning_inter_sched) {
    schedule = module->tuning_inter_sched;
    autosa_at_domain_data_init(&data, gen);
    data.tuning = 1;
    data.tuning_num = 0;
    tree = autosa_generate_ast_from_schedule(schedule, data, gen,
                                             module->double_buffer && gen->options->autosa->double_buffer_style == 0 ? "inter_c" : NULL);
    isl_ast_node_free(tree);
  }
  if (gen->options->autosa->tuning_method == 1 && module->tuning_inter_sched) {
    schedule = module->tuning_num_inter_sched;
    autosa_at_domain_data_init(&data, gen);
    data.tuning = 0;
    data.tuning_num = 1;
    tree = autosa_generate_ast_from_schedule(schedule, data, gen,
                                             module->double_buffer && gen->options->autosa->double_buffer_style == 0 ? "inter_c" : NULL);
    isl_ast_node_free(tree);
  }

  if (module->boundary)
  {
    /* Generate boundary module AST. */
    schedule = module->boundary_inter_sched;
    if (gen->options->target == AUTOSA_TARGET_CATAPULT_HLS_C) {
      /* Delete the unnecessary marker. */
      schedule = isl_schedule_map_schedule_node_bottom_up(
        schedule, &delete_marker_catapult, NULL);
    }
    autosa_at_domain_data_init(&data, gen);
    data.boundary = 1;
    tree = autosa_generate_ast_from_schedule(schedule, data, gen,
                                             module->double_buffer && gen->options->autosa->double_buffer_style == 0 ? "inter_c" : NULL);
    isl_ast_node_free(tree);
  }

  /* Generate AST for intra transfer function call. */
  schedule = module->intra_sched;  
  if (gen->options->target == AUTOSA_TARGET_CATAPULT_HLS_C) {
    /* Delete the unnecessary marker. */
    schedule = isl_schedule_map_schedule_node_bottom_up(
      schedule, &delete_marker_catapult, NULL);
  }
  autosa_at_domain_data_init(&data, gen);
  tree = autosa_generate_ast_from_schedule(schedule, data, gen,
                                           module->double_buffer && gen->options->autosa->double_buffer_style == 0 ? "intra_c" : NULL);
  isl_ast_node_free(tree);
  if (gen->options->autosa->tuning_method == 1 && module->tuning_inter_sched) {
    schedule = module->tuning_intra_sched;
    autosa_at_domain_data_init(&data, gen);
    data.tuning = 1;
    data.tuning_num = 0;
    tree = autosa_generate_ast_from_schedule(schedule, data, gen,
                                             module->double_buffer && gen->options->autosa->double_buffer_style == 0 ? "inter_c" : NULL);
    isl_ast_node_free(tree);
  }
  if (gen->options->autosa->tuning_method == 1 && module->tuning_inter_sched) {
    schedule = module->tuning_num_intra_sched;
    autosa_at_domain_data_init(&data, gen);
    data.tuning = 0;
    data.tuning_num = 1;
    tree = autosa_generate_ast_from_schedule(schedule, data, gen,
                                             module->double_buffer && gen->options->autosa->double_buffer_style == 0 ? "inter_c" : NULL);
    isl_ast_node_free(tree);
  }

  /* Generate AST for outer loop function call. */
  schedule = module->outer_sched;  
  if (gen->options->target == AUTOSA_TARGET_CATAPULT_HLS_C) {
    /* Delete the unnecessary marker. */
    schedule = isl_schedule_map_schedule_node_bottom_up(
      schedule, &delete_marker_catapult, NULL);
  }
  autosa_at_domain_data_init(&data, gen);
  tree = autosa_generate_ast_from_schedule(schedule, data, gen,
                                           module->double_buffer && gen->options->autosa->double_buffer_style == 0 ? "outer_c" : NULL);
  module->tree = tree;
  if (gen->options->autosa->tuning_method == 1 && module->tuning_inter_sched) {
    schedule = module->tuning_outer_sched;
    autosa_at_domain_data_init(&data, gen);
    data.tuning = 1;
    data.tuning_num = 0;
    tree = autosa_generate_ast_from_schedule(schedule, data, gen,
                                             module->double_buffer && gen->options->autosa->double_buffer_style == 0 ? "inter_c" : NULL);
    module->tuning_tree = tree;
  }
  if (gen->options->autosa->tuning_method == 1 && module->tuning_inter_sched) {
    schedule = module->tuning_num_outer_sched;
    autosa_at_domain_data_init(&data, gen);
    data.tuning = 0;
    data.tuning_num = 1;
    tree = autosa_generate_ast_from_schedule(schedule, data, gen,
                                             module->double_buffer && gen->options->autosa->double_buffer_style == 0 ? "inter_c" : NULL);
    module->tuning_num_tree = tree;
  }

  if (module->boundary)
  {
    /* Generate boundary module AST. */
    schedule = module->boundary_outer_sched;    
    if (gen->options->target == AUTOSA_TARGET_CATAPULT_HLS_C) {
      /* Delete the unnecessary marker. */
      schedule = isl_schedule_map_schedule_node_bottom_up(
        schedule, &delete_marker_catapult, NULL);
    }
    autosa_at_domain_data_init(&data, gen);
    data.boundary = 1;
    tree = autosa_generate_ast_from_schedule(schedule, data, gen,
                                             module->double_buffer && gen->options->autosa->double_buffer_style == 0 ? "outer_c" : NULL);
    isl_ast_node_free(tree);
  }

  /* Perform loop infinitization optimization. */
  if (gen->options->target == AUTOSA_TARGET_INTEL_OPENCL &&
      gen->options->autosa->loop_infinitize)
  {
    loop_infinitization_optimize(module);
  }
  /* Perform loop coalesce optimization. 
   * This step should be always after the loop infinitization opt.
   */
  if (gen->options->target == AUTOSA_TARGET_INTEL_OPENCL)
  {
    loop_coalesce_optimize(module);
  }
  if (gen->options->target == AUTOSA_TARGET_CATAPULT_HLS_C) 
  {    
    loop_guards_optimize(module);    
  }

  return isl_stat_ok;
}

/* Run the AST generator on the host data serialization/deserialization
 * schedule of "module" (module->serialize_sched).
 * The returned tree is not needed and is freed immediately.
 */
isl_stat sa_host_serialize_generate_code(struct autosa_gen *gen,
                                         struct autosa_hw_module *module)
{
  struct autosa_at_domain_data data;
  isl_ast_node *serialize_ast;

  autosa_at_domain_data_init(&data, gen);
  serialize_ast = autosa_generate_ast_from_schedule(module->serialize_sched,
                                                    data, gen, NULL);
  isl_ast_node_free(serialize_ast);

  return isl_stat_ok;
}

/* Generate the ASTs of the hardware module "module".
 *
 * - module->sched yields module->tree.
 * - With tuning method 1, module->tuning_sched and module->tuning_num_sched
 *   (when present) yield module->tuning_tree and module->tuning_num_tree.
 * - The boundary schedule (if the module has a boundary variant) and the
 *   schedules of the PE dummy modules are run through the AST generator
 *   as well; the trees that are returned for them are freed right away.
 *
 * For the Catapult HLS target, the marks that are not needed are removed
 * from the main and the boundary schedules first.
 *
 * Finally, the target specific loop optimizations are applied:
 * loop infinitization (if enabled) followed by loop coalescing for
 * Intel OpenCL, loop guards for Catapult HLS.
 */
isl_stat sa_module_generate_code(struct autosa_gen *gen,
                                 struct autosa_hw_module *module)
{
  const int for_catapult = (gen->options->target == AUTOSA_TARGET_CATAPULT_HLS_C);
  const int for_intel = (gen->options->target == AUTOSA_TARGET_INTEL_OPENCL);
  const int tuning_on = (gen->options->autosa->tuning_method == 1);
  struct autosa_at_domain_data data;
  isl_schedule *sched;
  isl_ast_node *ast;

  /* Main module AST. */
  sched = module->sched;
  if (for_catapult)
    sched = isl_schedule_map_schedule_node_bottom_up(
        sched, &delete_marker_catapult, NULL);
  autosa_at_domain_data_init(&data, gen);
  module->tree = autosa_generate_ast_from_schedule(sched, data, gen, NULL);

  /* Tuning ASTs. */
  if (tuning_on && module->tuning_sched) {
    autosa_at_domain_data_init(&data, gen);
    data.tuning = 1;
    data.tuning_num = 0;
    module->tuning_tree =
        autosa_generate_ast_from_schedule(module->tuning_sched, data, gen, NULL);
  }
  if (tuning_on && module->tuning_num_sched) {
    autosa_at_domain_data_init(&data, gen);
    data.tuning = 0;
    data.tuning_num = 1;
    module->tuning_num_tree =
        autosa_generate_ast_from_schedule(module->tuning_num_sched, data, gen, NULL);
  }

  /* Boundary module AST. */
  if (module->boundary)
  {
    sched = module->boundary_sched;
    if (for_catapult)
      sched = isl_schedule_map_schedule_node_bottom_up(
          sched, &delete_marker_catapult, NULL);
    autosa_at_domain_data_init(&data, gen);
    data.boundary = 1;
    ast = autosa_generate_ast_from_schedule(sched, data, gen, NULL);
    isl_ast_node_free(ast);
  }

  /* PE dummy module ASTs. */
  for (int i = 0; i < module->n_pe_dummy_modules; i++)
  {
    struct autosa_pe_dummy_module *dummy = module->pe_dummy_modules[i];

    autosa_at_domain_data_init(&data, gen);
    data.pe_dummy = 1;
    data.pe_dummy_module = dummy;
    ast = autosa_generate_ast_from_schedule(dummy->sched, data, gen, NULL);
    isl_ast_node_free(ast);
  }

  if (for_intel)
  {
    /* Loop coalescing must always run after loop infinitization. */
    if (gen->options->autosa->loop_infinitize)
      loop_infinitization_optimize(module);
    loop_coalesce_optimize(module);
  }
  /* Mark the loop guards. */
  if (for_catapult)
    loop_guards_optimize(module);

  return isl_stat_ok;
}

/* Generate the AST of the drain merge function "func" from func->sched
 * and store it in func->tree.
 */
isl_stat sa_drain_merge_generate_code(struct autosa_gen *gen,
                                      struct autosa_drain_merge_func *func)
{
  struct autosa_at_domain_data data;

  autosa_at_domain_data_init(&data, gen);
  func->tree = autosa_generate_ast_from_schedule(func->sched, data, gen, NULL);

  return isl_stat_ok;
}

/* This function is called after the AST generator has finished traversing
 * the schedule subtree of a mark node. "node" points to the corresponding
 * mark AST node.  "user" points to an autosa_at_domain_data object.
 *
 * If the mark is called "kernel" and a kernel is being tracked, record the
 * schedule space of the kernel (if not set yet), stop tracking the kernel
 * and return "node" unchanged.
 *
 * If the mark is called "module" and a module is being tracked, then replace
 * "node" by a user node that "calls" the fifo_decl, representing the
 * printing of fifo decls.  The AST below the mark is appended to
 * the fifo_decl_wrapped_trees of the top module.
 *
 * Any other mark, a mark without a name, or a "module" mark that is reached
 * while no top module is attached to "user" is returned unchanged.
 * NULL is returned if the mark has no id or if the array of wrapped trees
 * cannot be extended.
 */
static __isl_give isl_ast_node *after_mark_fifo_decl(
    __isl_take isl_ast_node *node,
    __isl_keep isl_ast_build *build, void *user)
{
  struct autosa_at_domain_data *data = (struct autosa_at_domain_data *)user;
  struct autosa_hw_top_module *top;
  isl_ast_node **trees;
  isl_ast_expr_list *list;
  isl_ast_expr *expr;
  const char *name;
  isl_ctx *ctx;
  isl_id *id;

  ctx = isl_ast_node_get_ctx(node);
  id = isl_ast_node_mark_get_id(node);
  if (!id)
    return isl_ast_node_free(node);
  name = isl_id_get_name(id);

  if (name && !strcmp(name, "kernel") && data->kernel)
  {
    isl_id_free(id);
    if (!data->kernel->space)
      data->kernel->space = isl_ast_build_get_schedule_space(build);
    data->kernel = NULL;
    return node;
  }
  if (!name || strcmp(name, "module") || !data->module)
  {
    isl_id_free(id);
    return node;
  }

  top = data->top;
  data->top = NULL;
  if (!top)
  {
    /* No top module left to record the tree in. */
    isl_id_free(id);
    return node;
  }

  /* Grow through a temporary so that the array is not lost on failure. */
  trees = (isl_ast_node **)realloc(
      top->fifo_decl_wrapped_trees,
      (top->n_fifo_decl_wrapped + 1) * sizeof(isl_ast_node *));
  if (!trees)
  {
    isl_id_free(id);
    return isl_ast_node_free(node);
  }
  top->fifo_decl_wrapped_trees = trees;
  trees[top->n_fifo_decl_wrapped] = isl_ast_node_mark_get_node(node);
  top->n_fifo_decl_wrapped++;
  isl_ast_node_free(node);

  /* Build the user node that stands for the call; it takes over "id". */
  expr = isl_ast_expr_from_id(isl_id_copy(id));
  list = isl_ast_expr_list_alloc(ctx, 0);
  expr = isl_ast_expr_call(expr, list);
  node = isl_ast_node_alloc_user(expr);
  node = isl_ast_node_set_annotation(node, id);

  return node;
}

/* Generate code for declaring fifos given the input schedule "schedule".
 *
 * The AST build uses at_domain_module, before_mark_module and
 * after_mark_fifo_decl as callbacks, with the top module of "gen" attached
 * to the callback data.  The iterators are named with the "c" prefix and
 * their number is derived from the schedule through update_depth.
 *
 * Return the generated AST, or NULL if "schedule" is NULL.
 */
__isl_give isl_ast_node *sa_fifo_decl_generate_code(
    struct autosa_gen *gen, __isl_take isl_schedule *schedule)
{
  struct autosa_at_domain_data data;
  isl_ast_build *build;
  isl_ast_node *tree;
  isl_id_list *iterators;

  int depth;

  if (schedule == NULL)
    return NULL;

  data.prog = gen->prog;
  data.kernel = NULL;
  data.module = NULL;
  /* Initialized like in the module call and ext module generators, so that
   * the callbacks never see an indeterminate pointer.
   */
  data.pe_dummy_module = NULL;
  data.top = gen->hw_top_module;

  depth = 0;
  if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth,
                                                  &depth) < 0)
    schedule = isl_schedule_free(schedule);
  build = isl_ast_build_alloc(gen->prog->ctx);
  iterators = ppcg_scop_generate_names(gen->prog->scop, depth, "c");
  build = isl_ast_build_set_iterators(build, iterators);
  build = isl_ast_build_set_at_each_domain(build, &at_domain_module, &data);
  build = isl_ast_build_set_before_each_mark(build, &before_mark_module, &data);
  build = isl_ast_build_set_after_each_mark(build, &after_mark_fifo_decl, &data);
  if (gen->prog->scop->options->debug->dump_final_schedule)
    isl_schedule_dump(schedule);
  tree = isl_ast_build_node_from_schedule(build, schedule);
  isl_ast_build_free(build);

  return tree;
}

/* This function is called after the AST generator has finished traversing
 * the schedule subtree of a mark node. "node" points to the corresponding
 * mark AST node.  "user" points to an autosa_at_domain_data object.
 *
 * If the mark is called "kernel" and a kernel is being tracked, record the
 * schedule space of the kernel (if not set yet), stop tracking the kernel
 * and return "node" unchanged.
 *
 * If the mark is called "module" and a module is being tracked, then replace
 * "node" by a user node that "calls" the module call, representing the
 * printing of module calls.  The AST below the mark is appended to
 * the module_call_wrapped_trees of the top module.
 *
 * Any other mark, a mark without a name, or a "module" mark that is reached
 * while no top module is attached to "user" is returned unchanged.
 * NULL is returned if the mark has no id or if the array of wrapped trees
 * cannot be extended.
 */
static __isl_give isl_ast_node *after_mark_module_call(
    __isl_take isl_ast_node *node,
    __isl_keep isl_ast_build *build, void *user)
{
  struct autosa_at_domain_data *data = (struct autosa_at_domain_data *)user;
  struct autosa_hw_top_module *top;
  isl_ast_node **trees;
  isl_ast_expr_list *list;
  isl_ast_expr *expr;
  const char *name;
  isl_ctx *ctx;
  isl_id *id;

  ctx = isl_ast_node_get_ctx(node);
  id = isl_ast_node_mark_get_id(node);
  if (!id)
    return isl_ast_node_free(node);
  name = isl_id_get_name(id);

  if (name && !strcmp(name, "kernel") && data->kernel)
  {
    isl_id_free(id);
    if (!data->kernel->space)
      data->kernel->space = isl_ast_build_get_schedule_space(build);
    data->kernel = NULL;
    return node;
  }
  if (!name || strcmp(name, "module") || !data->module)
  {
    isl_id_free(id);
    return node;
  }

  top = data->top;
  data->top = NULL;
  if (!top)
  {
    /* No top module left to record the tree in. */
    isl_id_free(id);
    return node;
  }

  /* Grow through a temporary so that the array is not lost on failure. */
  trees = (isl_ast_node **)realloc(
      top->module_call_wrapped_trees,
      (top->n_module_call_wrapped + 1) * sizeof(isl_ast_node *));
  if (!trees)
  {
    isl_id_free(id);
    return isl_ast_node_free(node);
  }
  top->module_call_wrapped_trees = trees;
  trees[top->n_module_call_wrapped] = isl_ast_node_mark_get_node(node);
  top->n_module_call_wrapped++;
  isl_ast_node_free(node);

  /* Build the user node that stands for the call; it takes over "id". */
  expr = isl_ast_expr_from_id(isl_id_copy(id));
  list = isl_ast_expr_list_alloc(ctx, 0);
  expr = isl_ast_expr_call(expr, list);
  node = isl_ast_node_alloc_user(expr);
  node = isl_ast_node_set_annotation(node, id);

  return node;
}

/* Build the AST that calls the modules from the input schedule "schedule".
 *
 * The AST build uses at_domain_module, before_mark_module and
 * after_mark_module_call as callbacks, with the top module of "gen"
 * attached to the callback data.  The iterators are named with the "c"
 * prefix and their number is derived from the schedule through update_depth.
 *
 * Return the generated AST, or NULL if "schedule" is NULL.
 */
__isl_give isl_ast_node *sa_module_call_generate_code(
    struct autosa_gen *gen, __isl_take isl_schedule *schedule)
{
  struct autosa_at_domain_data cb_data;
  isl_id_list *iter_names;
  isl_ast_build *ast_build;
  isl_ast_node *call_ast;
  int max_depth = 0;

  if (!schedule)
    return NULL;

  cb_data.prog = gen->prog;
  cb_data.kernel = NULL;
  cb_data.module = NULL;
  cb_data.pe_dummy_module = NULL;
  cb_data.top = gen->hw_top_module;

  /* A failing traversal leaves a NULL schedule for the calls below. */
  if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth,
                                                  &max_depth) < 0)
    schedule = isl_schedule_free(schedule);

  ast_build = isl_ast_build_alloc(gen->prog->ctx);
  iter_names = ppcg_scop_generate_names(gen->prog->scop, max_depth, "c");
  ast_build = isl_ast_build_set_iterators(ast_build, iter_names);
  ast_build = isl_ast_build_set_at_each_domain(ast_build, &at_domain_module, &cb_data);
  ast_build = isl_ast_build_set_before_each_mark(ast_build, &before_mark_module, &cb_data);
  ast_build = isl_ast_build_set_after_each_mark(ast_build, &after_mark_module_call, &cb_data);

  if (gen->prog->scop->options->debug->dump_final_schedule)
    isl_schedule_dump(schedule);

  call_ast = isl_ast_build_node_from_schedule(ast_build, schedule);
  isl_ast_build_free(ast_build);

  return call_ast;
}

/* This function is called after the AST generator has finished traversing
 * the schedule subtree of a mark node. "node" points to the corresponding
 * mark AST node.  "user" points to an autosa_at_domain_data object.
 *
 * If the mark is called "kernel" and a kernel is being tracked, record the
 * schedule space of the kernel (if not set yet), stop tracking the kernel
 * and return "node" unchanged.
 *
 * If the mark is called "module" and a module is being tracked, then replace
 * "node" by a user node that "calls" the module.  The AST below the mark is
 * appended to the ext_module_wrapped_trees of the top module.
 *
 * Any other mark, a mark without a name, or a "module" mark that is reached
 * while no top module is attached to "user" is returned unchanged.
 * NULL is returned if the mark has no id or if the array of wrapped trees
 * cannot be extended.
 */
static __isl_give isl_ast_node *after_mark_ext_module(
    __isl_take isl_ast_node *node,
    __isl_keep isl_ast_build *build, void *user)
{
  struct autosa_at_domain_data *data = (struct autosa_at_domain_data *)user;
  struct autosa_hw_top_module *top;
  isl_ast_node **trees;
  isl_ast_expr_list *list;
  isl_ast_expr *expr;
  const char *name;
  isl_ctx *ctx;
  isl_id *id;

  ctx = isl_ast_node_get_ctx(node);
  id = isl_ast_node_mark_get_id(node);
  if (!id)
    return isl_ast_node_free(node);
  name = isl_id_get_name(id);

  if (name && !strcmp(name, "kernel") && data->kernel)
  {
    isl_id_free(id);
    if (!data->kernel->space)
      data->kernel->space = isl_ast_build_get_schedule_space(build);
    data->kernel = NULL;
    return node;
  }
  if (!name || strcmp(name, "module") || !data->module)
  {
    isl_id_free(id);
    return node;
  }

  top = data->top;
  data->top = NULL;
  if (!top)
  {
    /* No top module left to record the tree in. */
    isl_id_free(id);
    return node;
  }

  /* Grow through a temporary so that the array is not lost on failure. */
  trees = (isl_ast_node **)realloc(
      top->ext_module_wrapped_trees,
      (top->n_ext_module_wrapped + 1) * sizeof(isl_ast_node *));
  if (!trees)
  {
    isl_id_free(id);
    return isl_ast_node_free(node);
  }
  top->ext_module_wrapped_trees = trees;
  trees[top->n_ext_module_wrapped] = isl_ast_node_mark_get_node(node);
  top->n_ext_module_wrapped++;
  isl_ast_node_free(node);

  /* Build the user node that stands for the call; it takes over "id". */
  expr = isl_ast_expr_from_id(isl_id_copy(id));
  list = isl_ast_expr_list_alloc(ctx, 0);
  expr = isl_ast_expr_call(expr, list);
  node = isl_ast_node_alloc_user(expr);
  node = isl_ast_node_set_annotation(node, id);

  return node;
}

/* Build the AST that sets the arguments of the io modules connected to the
 * external memory from the input schedule "schedule".
 *
 * The AST build uses at_domain_module, before_mark_module and
 * after_mark_ext_module as callbacks, with the top module of "gen" attached
 * to the callback data.  The iterators are named with the "c" prefix and
 * their number is derived from the schedule through update_depth.
 *
 * Return the generated AST, or NULL if "schedule" is NULL.
 */
__isl_give isl_ast_node *sa_set_ext_module_args_generate_code(
    struct autosa_gen *gen, __isl_take isl_schedule *schedule)
{
  struct autosa_at_domain_data cb_data;
  isl_id_list *iter_names;
  isl_ast_build *ast_build;
  isl_ast_node *args_ast;
  int max_depth = 0;

  if (!schedule)
    return NULL;

  cb_data.prog = gen->prog;
  cb_data.kernel = NULL;
  cb_data.module = NULL;
  cb_data.pe_dummy_module = NULL;
  cb_data.top = gen->hw_top_module;

  /* A failing traversal leaves a NULL schedule for the calls below. */
  if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth,
                                                  &max_depth) < 0)
    schedule = isl_schedule_free(schedule);

  ast_build = isl_ast_build_alloc(gen->prog->ctx);
  iter_names = ppcg_scop_generate_names(gen->prog->scop, max_depth, "c");
  ast_build = isl_ast_build_set_iterators(ast_build, iter_names);
  ast_build = isl_ast_build_set_at_each_domain(ast_build, &at_domain_module, &cb_data);
  ast_build = isl_ast_build_set_before_each_mark(ast_build, &before_mark_module, &cb_data);
  ast_build = isl_ast_build_set_after_each_mark(ast_build,
                                                &after_mark_ext_module, &cb_data);

  if (gen->prog->scop->options->debug->dump_final_schedule)
    isl_schedule_dump(schedule);

  args_ast = isl_ast_build_node_from_schedule(ast_build, schedule);
  isl_ast_build_free(ast_build);

  return args_ast;
}

/* Generate AST for module calls and fifo decls in the top module.
 *
 * One AST is generated per fifo declaration schedule and per module call
 * schedule of the top module of "gen".  For the Intel OpenCL target, one
 * AST is generated per external module schedule as well.
 * The arrays holding the ASTs are allocated here; nothing is freed by this
 * function.
 *
 * Return isl_stat_ok on success and isl_stat_error if one of the arrays
 * cannot be allocated.
 */
isl_stat sa_top_module_generate_code(struct autosa_gen *gen)
{
  struct autosa_hw_top_module *top = gen->hw_top_module;

  /* fifo declaration */
  top->fifo_decl_trees = (isl_ast_node **)malloc(
      top->n_fifo_decls * sizeof(isl_ast_node *));
  /* A zero-sized request may legitimately yield NULL. */
  if (top->n_fifo_decls > 0 && !top->fifo_decl_trees)
    return isl_stat_error;
  for (int i = 0; i < top->n_fifo_decls; i++)
  {
    top->fifo_decl_trees[i] = sa_fifo_decl_generate_code(gen,
                                                         top->fifo_decl_scheds[i]);
  }

  /* module call */
  top->module_call_trees = (isl_ast_node **)malloc(
      top->n_module_calls * sizeof(isl_ast_node *));
  if (top->n_module_calls > 0 && !top->module_call_trees)
    return isl_stat_error;
  for (int i = 0; i < top->n_module_calls; i++)
  {
    top->module_call_trees[i] = sa_module_call_generate_code(gen,
                                                             top->module_call_scheds[i]);
  }

  /* arguments of the external modules */
  if (gen->options->target == AUTOSA_TARGET_INTEL_OPENCL)
  {
    top->ext_module_trees = (isl_ast_node **)malloc(
        top->n_ext_module * sizeof(isl_ast_node *));
    if (top->n_ext_module > 0 && !top->ext_module_trees)
      return isl_stat_error;
    for (int i = 0; i < top->n_ext_module; i++)
    {
      top->ext_module_trees[i] = sa_set_ext_module_args_generate_code(gen,
                                                                      top->ext_module_scheds[i]);
    }
  }

  return isl_stat_ok;
}

/* Representation of a statement inside a generated AST.
 *
 * "stmt" refers to the original statement.
 * "ref2expr" maps the reference identifier of each access in
 * the statement to an AST expression that should be printed
 * at the place of the access.
 *
 * Objects of this type are retrieved from the annotation of user nodes
 * by print_user, which hands both fields to pet_stmt_print_body.
 */
struct ppcg_stmt {
	/* The original pet statement. */
	struct pet_stmt *stmt;

	/* Reference identifier -> AST expression to print for that access. */
	isl_id_to_ast_expr *ref2expr;
};

/* Print callback for user nodes, used by print_code.
 *
 * If the annotation of "node" carries a ppcg_stmt, print the body of
 * the corresponding pet statement with its accesses replaced according
 * to ref2expr.  Otherwise, print the name of the annotation.
 *
 * "print_options" is always freed; "user" is not used.
 */
static __isl_give isl_printer *print_user(__isl_take isl_printer *p,
  __isl_take isl_ast_print_options *print_options,
  __isl_keep isl_ast_node *node, void *user)
{
	isl_id *annotation = isl_ast_node_get_annotation(node);
	struct ppcg_stmt *info = (struct ppcg_stmt *)isl_id_get_user(annotation);
	const char *label = isl_id_get_name(annotation);

	isl_id_free(annotation);

	p = info ? pet_stmt_print_body(info->stmt, p, info->ref2expr)
		 : isl_printer_print_str(p, label);

	isl_ast_print_options_free(print_options);
	return p;
}

///* Set *depth (initialized to 0 by the caller) to the maximum
// * of the schedule depths of the leaf nodes for which this function is called.
// */
//static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user)
//{
//	int *depth = (int *)user;
//	int node_depth;
//
//	if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf)
//		return isl_bool_true;
//	node_depth = isl_schedule_node_get_schedule_depth(node);
//	if (node_depth > *depth)
//		*depth = node_depth;
//
//	return isl_bool_false;
//}

/* Find the element in scop->stmts that has the given "id".
 */
static struct pet_stmt *pet_find_stmt(struct ppcg_scop *scop, __isl_keep isl_id *id)
{
	int i;

	for (i = 0; i < scop->pet->n_stmt; ++i) {
		struct pet_stmt *stmt = scop->pet->stmts[i];
		isl_id *id_i;

		id_i = isl_set_get_tuple_id(stmt->domain);
		isl_id_free(id_i);

		if (id_i == id)
			return stmt;
	}

	isl_die(isl_id_get_ctx(id), isl_error_internal,
		"statement not found", return NULL);
}

/* Index transformation callback for pet_stmt_build_ast_exprs.
 *
 * "index" maps statement iterators to array indices and
 * "user" points to an isl_pw_multi_aff mapping AST loop iterators
 * to statement iterators.
 *
 * Pull "index" back over (a copy of) that map, such that the result
 * expresses the array indices in terms of the AST loop iterators.
 * "id" is not used.
 */
static __isl_give isl_multi_pw_aff *pullback_index(
	__isl_take isl_multi_pw_aff *index, __isl_keep isl_id *id, void *user)
{
	return isl_multi_pw_aff_pullback_pw_multi_aff(index,
		isl_pw_multi_aff_copy((isl_pw_multi_aff *)user));
}

/* Release a ppcg_stmt passed as an untyped pointer:
 * its ref2expr map and the structure itself.
 * A NULL pointer is accepted and ignored.
 */
static void ppcg_stmt_free(void *user)
{
	struct ppcg_stmt *info = (struct ppcg_stmt *)user;

	if (info) {
		isl_id_to_ast_expr_free(info->ref2expr);
		free(info);
	}
}

/* Transform the accesses in the statement associated to the domain
 * called by "node" to refer to the AST loop iterators, construct
 * corresponding AST expressions using "build",
 * collect them in a ppcg_stmt and annotate the node with the ppcg_stmt.
 *
 * "user" points to the ppcg_scop in which the statement is looked up.
 *
 * If the statement called by "node" is not found among the pet
 * statements of the scop, the node is returned without an annotation.
 * If the ppcg_stmt cannot be allocated, the node is freed and
 * NULL is returned.
 */
static __isl_give isl_ast_node *at_each_domain(__isl_take isl_ast_node *node,
	__isl_keep isl_ast_build *build, void *user)
{
	struct ppcg_scop *scop = (struct ppcg_scop *)user;
	isl_ast_expr *expr, *arg;
	isl_ctx *ctx;
	isl_id *id;
	isl_map *map;
	isl_pw_multi_aff *iterator_map;
	struct ppcg_stmt *stmt;

	ctx = isl_ast_node_get_ctx(node);
	stmt = isl_calloc_type(ctx, struct ppcg_stmt);
	if (!stmt)
		goto error;

	/* The first argument of the call expression identifies the statement. */
	expr = isl_ast_node_user_get_expr(node);
	arg = isl_ast_expr_get_op_arg(expr, 0);
	isl_ast_expr_free(expr);
	id = isl_ast_expr_get_id(arg);
	isl_ast_expr_free(arg);
	stmt->stmt = pet_find_stmt(scop, id);
	isl_id_free(id);
	if (!stmt->stmt) {
		/* Unknown statement: keep the node as is, without annotation.
		 * The braces matter here; without them the return below
		 * was executed unconditionally and no node was ever annotated.
		 */
		ppcg_stmt_free(stmt);
		return node;
	}

	/* The reversed schedule maps AST loop iterators to statement iterators. */
	map = isl_map_from_union_map(isl_ast_build_get_schedule(build));
	map = isl_map_reverse(map);
	iterator_map = isl_pw_multi_aff_from_map(map);
	stmt->ref2expr = pet_stmt_build_ast_exprs(stmt->stmt, build,
				    &pullback_index, iterator_map, NULL, NULL);
	isl_pw_multi_aff_free(iterator_map);

	/* The annotation owns "stmt" and frees it through ppcg_stmt_free. */
	id = isl_id_alloc(isl_ast_node_get_ctx(node), NULL, stmt);
	id = isl_id_set_free_user(id, &ppcg_stmt_free);
	return isl_ast_node_set_annotation(node, id);
error:
	ppcg_stmt_free(stmt);
	return isl_ast_node_free(node);
}

/* For internal debugging.
 * Print out the code from the given schedule.
 */
void print_code(struct autosa_gen *gen, __isl_take isl_schedule *schedule, const char *output_f)
{
  isl_ast_node *tree;
  isl_printer *p;
  isl_ast_print_options *print_options;
  isl_ctx *ctx = gen->ctx;
  FILE *f;
  int depth;
  isl_ast_build *build;
  isl_id_list *iterators;
  
  depth = 0;
  if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth, &depth) < 0)
		return;
  build = isl_ast_build_alloc(ctx);
  iterators = ppcg_scop_generate_names(gen->prog->scop, depth, "c");
  build = isl_ast_build_set_iterators(build, iterators);
  build = isl_ast_build_set_at_each_domain(build, &at_each_domain, gen->prog->scop);
  tree = isl_ast_build_node_from_schedule(build, schedule);
  isl_ast_build_free(build);

  f = fopen(output_f, "w");
  p = isl_printer_to_file(ctx, f);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_user(print_options,
                                                       &print_user, NULL);
  p = isl_ast_node_print(tree, p, print_options);

  isl_ast_node_free(tree);
  fclose(f);
  isl_printer_free(p);
}

/* Dump the intermediate code.
 *
 * Generate an AST from "schedule" using sa_generate_code and print it
 * in C format to the file
 *
 *  <output_dir>/src/tmp.<stage>.cpp
 *
 * where <output_dir> is gen->options->autosa->output_dir.
 * User statements are printed through print_cpu_user.
 *
 * If the file name cannot be constructed or the file cannot be opened
 * for writing, a message is printed to stderr and nothing is dumped.
 */
void dump_intermediate_code(
  struct autosa_gen *gen, __isl_take isl_schedule *schedule, const char *stage)
{
  FILE *tmp_f;
  isl_printer *p;
  isl_ast_print_options *print_options;
  char *f_path;
  isl_ast_node *tree = sa_generate_code(gen, schedule);

  /* Assemble the output path in a string printer. */
  p = isl_printer_to_str(gen->ctx);
  p = isl_printer_print_str(p, gen->options->autosa->output_dir);
  p = isl_printer_print_str(p, "/src/tmp.");
  p = isl_printer_print_str(p, stage);
  p = isl_printer_print_str(p, ".cpp");
  f_path = isl_printer_get_str(p);
  isl_printer_free(p);

  tmp_f = f_path ? fopen(f_path, "w") : NULL;
  if (!tmp_f)
  {
    fprintf(stderr, "dump_intermediate_code: cannot open %s for writing\n",
            f_path ? f_path : "(null)");
    free(f_path);
    isl_ast_node_free(tree);
    return;
  }
  free(f_path);

  p = isl_printer_to_file(gen->ctx, tmp_f);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  print_options = isl_ast_print_options_alloc(gen->ctx);
  print_options = isl_ast_print_options_set_print_user(print_options,
                                                       &print_cpu_user, NULL);
  p = isl_ast_node_print(tree, p, print_options);
  p = isl_printer_free(p);
  fclose(tmp_f);
  isl_ast_node_free(tree);
}

================================================
FILE: src/autosa_codegen.h
================================================
#ifndef _AUTOSA_CODEGEN_H
#define _AUTOSA_CODEGEN_H

#include "print.h"
#include "util.h"

#include "autosa_common.h"

void generate_hw_modules(__isl_take isl_schedule *schedule,
                         struct autosa_gen *gen, struct autosa_kernel *kernel);

__isl_give isl_schedule_node *sa_add_to_from_device(
    __isl_take isl_schedule_node *node, __isl_take isl_union_set *domain,
    __isl_take isl_union_map *prefix, struct autosa_prog *prog);
__isl_give isl_schedule_node *sa_add_init_clear_device(
    __isl_take isl_schedule_node *node, struct autosa_kernel *kernel);
__isl_give isl_schedule_node *sa_add_drain_merge(
    __isl_take isl_schedule_node *node, struct autosa_gen *gen);

/* Generate an AST from "schedule"; the schedule is consumed and
 * the returned AST is owned by the caller.
 */
__isl_give isl_ast_node *sa_generate_code(struct autosa_gen *gen,
                                          __isl_take isl_schedule *schedule);
isl_stat sa_filter_buffer_io_module_generate_code(struct autosa_gen *gen,
                                                  struct autosa_hw_module *module);
isl_stat sa_module_generate_code(struct autosa_gen *gen,
                                 struct autosa_hw_module *module);
/* Generate the ASTs for the module calls and fifo declarations
 * in the top module (gen->hw_top_module).
 */
isl_stat sa_top_module_generate_code(struct autosa_gen *gen);
isl_stat sa_drain_merge_generate_code(struct autosa_gen *gen,
                                      struct autosa_drain_merge_func *func);
isl_stat sa_host_serialize_generate_code(struct autosa_gen *gen,
                                         struct autosa_hw_module *module);                                      

int autosa_array_requires_device_allocation(struct autosa_array_info *array);

__isl_give isl_schedule_node *insert_io_group_domain(
  __isl_take isl_schedule_node *node, 
  struct autosa_array_ref_group *group,
  struct autosa_kernel *kernel,
  struct autosa_gen *gen,
  int read);

/* Debugging helper: print the code generated from "schedule"
 * to the file "output_f".
 */
void print_code(struct autosa_gen *gen, __isl_take isl_schedule *schedule, const char *output_f);
/* Debugging helper: print the code generated from "schedule" to
 * <output_dir>/src/tmp.<stage>.cpp.
 */
void dump_intermediate_code(
  struct autosa_gen *gen, __isl_take isl_schedule *schedule, const char *stage);

#endif

================================================
FILE: src/autosa_comm.cpp
================================================
/* Define functions for communication management. */

#include <isl/ilp.h>

#include "autosa_schedule_tree.h"
#include "autosa_utils.h"
#include "autosa_print.h"
#include "autosa_codegen.h"
#include "autosa_comm.h"

/* Internal data structure for autosa_group_references.
 *
 * It bundles the schedule depths and the schedule prefixes that are
 * consulted while the array references are partitioned into groups.
 */
struct autosa_group_data
{
  struct autosa_gen *gen;
  struct ppcg_scop *scop;
  /* The schedule depth where the kernel launch will be
   * introduced.
   */
  int kernel_depth;
  /* The schedule depth at which the copying in/from local_memory
   * is computed. The copy operation may then later
   * be hoisted to a higher level.
   */
  int local_depth;
  /* The schedule depth of the "pe" mark. */
  int pe_depth;
  isl_schedule *schedule;

  /* All the schedules are formulated in terms of the original statement
   * instances, i.e., those that appear in the domains of the access
   * relations.
   */
  /* Contains the kernel_depth dimensions of the host schedule. */
  isl_union_map *host_sched;
  /* Contains the first local_depth dimensions of the kernel schedule. */
  isl_union_map *local_sched;
  /* Contains the first local_depth dimensions of the kernel schedule. */
  isl_union_map *copy_sched;
  /* Contains the first pe_depth dimensions of the kernel schedule. */
  isl_union_map *pe_sched;
  /* A union map representation of the entire kernel schedule. */
  isl_union_map *full_sched;
};

/* Compute the prefix schedule relation at "node", i.e., the relation
 * between domain elements and the schedule dimensions above "node",
 * and simplify it by detecting the equalities it implies.
 */
static __isl_give isl_union_map *prefix_with_equalities(
    __isl_keep isl_schedule_node *node)
{
  return isl_union_map_detect_equalities(
      isl_schedule_node_get_prefix_schedule_relation(node));
}

/* Plug the contraction "contraction" into the domain of the
 * schedule "s" and return the expanded schedule.
 * "s" is consumed, while "contraction" is only copied.
 */
static isl_union_map *expand(__isl_take isl_union_map *s,
                             __isl_keep isl_union_pw_multi_aff *contraction)
{
  return isl_union_map_preimage_domain_union_pw_multi_aff(
      s, isl_union_pw_multi_aff_copy(contraction));
}

/* Fill up the groups of array with singleton groups, i.e., one group
 * per reference, initializing all the necessary fields.
 * In particular the access field is initialized to the scheduled
 * access relation of the array reference.
 *
 * Return the number of elements initialized, i.e., the number of
 * active references in the current kernel.
 */
static int populate_array_references_pe(struct autosa_local_array_info *local,
                                        struct autosa_array_ref_group **groups, struct autosa_group_data *data)
{
  int i;
  int j;
  int n;
  isl_ctx *ctx = isl_union_map_get_ctx(data->pe_sched);

  n = 0;
  for (i = 0; i < local->array->n_ref; ++i)
  {
    isl_union_map *umap;
    isl_map *map;
    struct autosa_array_ref_group *group;
    struct autosa_stmt_access *access = local->array->refs[i];

    map = isl_map_copy(access->access);
    umap = isl_union_map_from_map(map);
    umap = isl_union_map_apply_domain(umap,
                                      isl_union_map_copy(data->pe_sched));

    if (isl_union_map_is_empty(umap))
    {
      isl_union_map_free(umap);
      continue;
    }

    map = isl_map_from_union_map(umap);
    map = isl_map_detect_equalities(map);
    
    group = new autosa_array_ref_group;
    group = autosa_array_ref_group_init(group);
    if (!group)
    {
      isl_map_free(map);
      return -1;
    }
    group->local_array = local;
    group->array = local->array;
    group->access = map;
    group->write = access->write;
    group->exact_write = access->exact_write;
    group->slice = access->n_index < local->array->n_index;
    group->refs = &local->array->refs[i];
    group->n_ref = 1;
    group->io_type = AUTOSA_UNKNOWN_IO;
    group->dir = NULL;
    group->old_dir = NULL;
    group->group_type = AUTOSA_PE_GROUP;
    group->local_tile = NULL;
    group->io_trans = NULL;
    group->io_pe_expr = NULL;
    group->n_io_buffer = 0;
    group->io_buffers = NULL;
    group->copy_schedule = NULL;
    group->pe_tile = NULL;
    group->tuning_refs.push_back(std::shared_ptr<TPArrayRef>(local->array->tuning_refs[i]));
    group->tuning_pe_tile = NULL;

    groups[n++] = group;
  }

  return n;
}

/* Combine the given two groups into a single group, containing
 * the references of both groups.
 *
 * The references of "group1" are taken over first.  A reference of
 * "group2" is appended only if its tagged access differs from the
 * tagged accesses of all references of "group1".
 * The access relation of the result is the union of both access
 * relations.  The I/O related fields are copied from "group1",
 * while local_tile and pe_tile are reset to NULL.
 *
 * Neither "group1" nor "group2" is freed.
 * Return NULL if either group is NULL or if memory runs out.
 */
static struct autosa_array_ref_group *join_groups(
    struct autosa_array_ref_group *group1,
    struct autosa_array_ref_group *group2)
{
  int i, j;
  isl_ctx *ctx;
  struct autosa_array_ref_group *group;

  if (!group1 || !group2)
    return NULL;

  ctx = isl_map_get_ctx(group1->access);
  group = new autosa_array_ref_group;
  group = autosa_array_ref_group_init(group);
  if (!group)
    return NULL;
  group->local_array = group1->local_array;
  group->array = group1->array;
  group->access = isl_map_union(isl_map_copy(group1->access),
                                isl_map_copy(group2->access));
  group->write = group1->write || group2->write;
  group->exact_write = group1->exact_write && group2->exact_write;
  group->slice = group1->slice || group2->slice;

  /* Start from the references of group1. */
  group->n_ref = group1->n_ref;
  group->refs = isl_alloc_array(ctx, struct autosa_stmt_access *,
                                group->n_ref);
  if (!group->refs)
    return autosa_array_ref_group_free(group);
  for (i = 0; i < group1->n_ref; ++i) {
    group->refs[i] = group1->refs[i];
    group->tuning_refs.push_back(std::shared_ptr<TPArrayRef>(group1->tuning_refs[i]));
  }

  /* Append the references of group2 that do not appear in group1. */
  for (i = 0; i < group2->n_ref; ++i) {
    struct autosa_stmt_access *ref = group2->refs[i];
    struct autosa_stmt_access **grown;
    bool found = false;

    for (j = 0; j < group1->n_ref; j++) {
      if (isl_map_is_equal(ref->tagged_access, group1->refs[j]->tagged_access)) {
        found = true;
        break;
      }
    }
    if (found)
      continue;

    /* Keep the old array reachable until realloc has succeeded. */
    grown = (struct autosa_stmt_access **)realloc(group->refs,
                (group->n_ref + 1) * sizeof(struct autosa_stmt_access *));
    if (!grown)
      return autosa_array_ref_group_free(group);
    group->refs = grown;
    group->refs[group->n_ref] = ref;
    group->n_ref++;
    group->tuning_refs.push_back(std::shared_ptr<TPArrayRef>(group2->tuning_refs[i]));
  }

  group->io_type = group1->io_type;
  group->dir = isl_vec_copy(group1->dir);
  group->group_type = group1->group_type;
  group->pe_io_dir = group1->pe_io_dir;
  group->array_io_dir = group1->array_io_dir;
  group->io_trans = group1->io_trans;
  group->io_pe_expr = group1->io_pe_expr;
  group->io_L1_pe_expr = group1->io_L1_pe_expr;
  group->n_io_buffer = group1->n_io_buffer;
  group->io_buffers = group1->io_buffers;
  group->n_mem_ports = group1->n_mem_ports;
  group->local_tile = NULL;
  group->pe_tile = NULL;
  /* Merge the tuning refs.
   * NOTE(review): the tuning refs of group1 were already appended above,
   * so they end up in the result twice -- confirm whether that is intended.
   */
  for (auto tuning_ref : group1->tuning_refs) {
    group->tuning_refs.push_back(std::shared_ptr<TPArrayRef>(tuning_ref));
  }

  return group;
}

/* Merge "group1" and "group2" into a newly created group and
 * release both inputs afterwards, whether or not the merge succeeded.
 * Return the merged group (NULL if join_groups failed).
 */
static struct autosa_array_ref_group *join_groups_and_free(
    struct autosa_array_ref_group *group1,
    struct autosa_array_ref_group *group2)
{
  struct autosa_array_ref_group *merged = join_groups(group1, group2);

  autosa_array_ref_group_free(group1);
  autosa_array_ref_group_free(group2);

  return merged;
}

/* Attach the "n" groups in "groups" to "array" and number
 * the groups according to their position in the list.
 */
static void set_array_groups_default(struct autosa_local_array_info *array,
                                     int n, struct autosa_array_ref_group **groups)
{
  int pos = 0;

  array->n_group = n;
  array->groups = groups;

  while (pos < n)
  {
    groups[pos]->nr = pos;
    ++pos;
  }
}

/* Default grouping. Simply group all array references together
 * if any of them is associated with RAW/RAR carried by space loops.
 */
static int group_array_references_default(struct autosa_kernel *kernel,
                                          struct autosa_local_array_info *local, struct autosa_group_data *data)
{
  int i, j;
  int n;
  isl_ctx *ctx = isl_union_map_get_ctx(data->pe_sched);
  struct autosa_array_ref_group **groups;
  int merge_all = 0;
  isl_schedule_node *node;

  groups = isl_calloc_array(ctx, struct autosa_array_ref_group *,
                            local->array->n_ref);  
  if (!groups)
    return -1;

  n = populate_array_references_pe(local, groups, data);

  /* Examine if any of the array references is associated with RAW or
   * RAR carried at space loops. If then, merge all the groups. 
   */
  for (int i = 0; i < n; ++i)
  {
    struct autosa_array_ref_group *group_i = groups[i];
    for (int j = 0; j < group_i->n_ref; ++j)
    {
      struct autosa_stmt_access *ref_i = group_i->refs[j];
      for (int k = 0; k < ref_i->n_io_info; ++k)
      {
        if (ref_i->io_info[k]->dep->type == AUTOSA_DEP_RAW)
        {
          merge_all = 1;
          break;
        }
      }
    }
  }

  if (merge_all)
  {
    /* Join all referneces together. */
    for (int i = 1; i < n; ++i)
    {      
      groups[0] = join_groups_and_free(groups[0], groups[i]);
    }
    n = 1;
  }

  set_array_groups_default(local, n, groups);

  return 0;
}

/* Collect the access relations of the references in "group" that are
 * reads (if "read" is set) and/or writes (if "write" is set)
 * into a single union map.
 *
 * The result lives in the space of group->access and is empty
 * if no reference is selected.
 */
__isl_give isl_union_map *autosa_array_ref_group_access_relation(
    struct autosa_array_ref_group *group, int read, int write)
{
  isl_union_map *result = isl_union_map_empty(isl_map_get_space(group->access));

  for (int k = 0; k < group->n_ref; ++k)
  {
    struct autosa_stmt_access *ref = group->refs[k];
    bool selected = (read && ref->read) || (write && ref->write);

    if (selected)
    {
      isl_map *ref_access = isl_map_copy(ref->access);
      result = isl_union_map_union(result,
                                   isl_union_map_from_map(ref_access));
    }
  }

  return result;
}

/* Express "access" in terms of the PE-level schedule, i.e., replace
 * its domain by the outer data->pe_depth schedule dimensions
 * stored in data->pe_sched, and return the result as a single map.
 * "access" itself is left untouched; "group" is not used.
 */
static __isl_give isl_map *local_access_pe(struct autosa_array_ref_group *group,
                                           __isl_keep isl_union_map *access, struct autosa_group_data *data)
{
  isl_union_map *pe_level;

  /* Group at the PE level. */
  pe_level = isl_union_map_apply_domain(isl_union_map_copy(access),
                                        isl_union_map_copy(data->pe_sched));

  return isl_map_from_union_map(pe_level);
}

/* Determine, for each index i of the array access "access", a shift a(p)
 * and a stride g such that
 *
 *	a(p) + i = 0 mod g
 *
 * and store them in tile->bound[i]->shift and tile->bound[i]->stride.
 * If an index has no non-trivial stride, the stride is 1 (and the shift 0).
 * Return isl_bool_true if at least one index has a stride greater than 1,
 * isl_bool_false if none has, and isl_bool_error on failure.
 *
 * isl_map_get_range_stride_info describes the stride as
 *
 *	i = o(p) + g n
 *
 * which means that a(p) is simply -o(p).
 */
static isl_bool detect_strides(struct autosa_array_tile *tile,
                               __isl_keep isl_map *access)
{
  isl_bool found = isl_bool_false;

  for (int d = 0; d < tile->n; ++d)
  {
    struct autosa_array_bound *b = &tile->bound[d];
    isl_stride_info *info = isl_map_get_range_stride_info(access, d);

    b->stride = isl_stride_info_get_stride(info);
    b->shift = isl_aff_neg(isl_stride_info_get_offset(info));
    isl_stride_info_free(info);

    /* Once a non-trivial stride is known, only the bounds
     * still need to be filled in.
     */
    if (found == isl_bool_true)
      continue;

    found = isl_val_gt_si(b->stride, 1);
    if (found < 0)
      return isl_bool_error;
  }

  return found;
}

/* Remove the strides from the array access "access", using the
 * information in tile->bound[i]->stride and tile->bound[i]->shift.
 *
 * With the access written as A[a] and the shifts S(p) and strides G
 * satisfying
 *
 *  S(p) + a = 0 mod G
 *
 * the access is replaced by
 *
 *  A[(a + S(p))/G]
 *
 * The shifts are gathered in an isl_multi_aff defined on the space of
 * the access and the strides in the scaling function A[i] -> A[G i].
 * The shifts are then added to the access and the preimage of the
 * result over the scaling is taken.
 */
static __isl_give isl_map *remove_strides(__isl_take isl_map *access,
                                          struct autosa_array_tile *tile)
{
  isl_space *acc_space, *array_space;
  isl_multi_aff *shifts, *scaling;
  isl_multi_val *strides;

  acc_space = isl_map_get_space(access);
  shifts = isl_multi_aff_zero(isl_space_copy(acc_space));
  array_space = isl_space_range(acc_space);
  strides = isl_multi_val_zero(isl_space_copy(array_space));
  scaling = isl_multi_aff_identity(isl_space_map_from_set(array_space));

  for (int d = 0; d < tile->n; ++d)
  {
    struct autosa_array_bound *b = &tile->bound[d];

    shifts = isl_multi_aff_set_aff(shifts, d, isl_aff_copy(b->shift));
    strides = isl_multi_val_set_val(strides, d, isl_val_copy(b->stride));
  }
  scaling = isl_multi_aff_scale_multi_val(scaling, strides);

  access = isl_map_sum(access, isl_map_from_multi_aff(shifts));
  return isl_map_preimage_range_multi_aff(access, scaling);
}

/* Check if we can find a memory tile for the given array
 * based on the given accesses, and if so, put the results in "tile".
 *
 * We project the accesses on each index in turn and look for a parametric
 * offset such that the size is constant, after removing
 * any stride that may appear in the accesses.
 *
 * tile->depth is initialized to the input dimension of the computed bounds.
 *
 * Return isl_bool_true if a valid fixed-size box was found (in which case
 * tile->bound[i].size and tile->bound[i].lb are filled in),
 * isl_bool_false if not, and isl_bool_error on error or if "tile" is NULL.
 * The stride and shift of each bound are set in any case
 * (see detect_strides).
 */
isl_bool can_tile(__isl_keep isl_map *access,
                  struct autosa_array_tile *tile)
{
  int i;
  isl_bool has_strides, valid;
  isl_fixed_box *box;
  isl_multi_aff *offset;
  isl_multi_val *size;

  if (!tile)
    return isl_bool_error;

  /* NOTE(review): this works on a copy and discards the result, so it
   * looks like it has no effect on "access" -- confirm and consider removing.
   */
  isl_map_free(isl_map_detect_equalities(isl_map_copy(access)));

  has_strides = detect_strides(tile, access);
  if (has_strides < 0)
    return isl_bool_error;

  tile->depth = isl_map_dim(access, isl_dim_in);

  /* Work on a private copy, since "access" is only borrowed. */
  access = isl_map_copy(access);
  if (has_strides)
    access = remove_strides(access, tile);

  box = isl_map_get_range_simple_fixed_box_hull(access);
  isl_map_free(access);

  valid = isl_fixed_box_is_valid(box);
  if (valid >= 0 && valid)
  {
    /* Record the size and the lower bound of the box per array index. */
    offset = isl_fixed_box_get_offset(box);
    size = isl_fixed_box_get_size(box);
    for (i = 0; i < tile->n; ++i)
    {
      tile->bound[i].size = isl_multi_val_get_val(size, i);
      tile->bound[i].lb = isl_multi_aff_get_aff(offset, i);
    }
    isl_multi_aff_free(offset);
    isl_multi_val_free(size);
  }
  isl_fixed_box_free(box);

  return valid;
}

/* Internal data structure for check_contraction.
 *
 * "legal" is cleared as soon as a band member that is not coincident
 * is found above a statement that contains an access of "group".
 * "group" is the array reference group being examined.
 * "kernel" is the kernel that the group belongs to.
 * "prefix" and "prefix_upma" are the prefix schedule (as a union map and
 * as a union_pw_multi_aff) at the node selected so far; both are NULL
 * until the first statement containing an access of "group" is visited.
 * "depth" is the schedule depth of that node.
 */
struct check_contraction_data {
  bool legal;
  struct autosa_array_ref_group *group;
  struct autosa_kernel *kernel;
  isl_union_map *prefix;
  isl_union_pw_multi_aff *prefix_upma;
  int depth;
};

/* Internal data structure for check_stmt_contain_acc.
 *
 * "kernel" provides the program in which the statement is looked up.
 * "group" is the array reference group whose references are searched for.
 */
struct check_stmt_contain_acc_data {
  struct autosa_kernel *kernel;
  struct autosa_array_ref_group *group;
};

/* Callback for isl_union_set_every_set.
 *
 * Look up the statement whose domain is "set" and check whether any of
 * its accesses is one of the references of the array group in "user"
 * (a check_stmt_contain_acc_data).
 *
 * Return isl_bool_false if the statement contains such an access and
 * isl_bool_true otherwise, such that isl_union_set_every_set holds
 * exactly when none of the statements touches the group.
 */
static isl_bool check_stmt_contain_acc(__isl_keep isl_set *set, void *user)
{
  struct check_stmt_contain_acc_data *data = (struct check_stmt_contain_acc_data *)user;
  struct autosa_stmt_access *accesses;
  struct autosa_stmt *stmt;
  isl_space *space;
  isl_id *id;

  /* The tuple identifier of the domain names the statement. */
  space = isl_set_get_space(set);
  id = isl_space_get_tuple_id(space, isl_dim_set);
  isl_space_free(space);
  stmt = find_stmt(data->kernel->prog, id);
  isl_id_free(id);
  accesses = stmt->accesses;

  for (int r = 0; r < data->group->n_ref; r++)
  {
    struct autosa_stmt_access *wanted = data->group->refs[r];

    for (struct autosa_stmt_access *acc = accesses; acc; acc = acc->next)
    {
      if (acc == wanted)
        return isl_bool_false;
    }
  }

  return isl_bool_true;
}

/* Check if the pe_group is mapped to a single register.
 * Specifically, check for each array access in the current pe_group, 
 * if all the loops above the array access and below the PE mark are
 * parallel loops.
 *
 * This is a callback for isl_schedule_node_map_descendant_bottom_up;
 * "user" points to a check_contraction_data.
 * Only leaf nodes are examined, and only as long as data->legal still holds.
 * For a leaf whose statements contain an access of the group:
 * - data->legal is cleared if any band member between the leaf (or the
 *   "simd" mark above it) and the "pe" mark is not coincident;
 * - data->prefix, data->prefix_upma and data->depth are set from the
 *   prefix schedule at the leaf (or "simd" mark) on the first such leaf,
 *   and are recomputed at a shallower node on later ones.
 *
 * "node" itself is always returned unchanged; all navigation is done on
 * copies.
 */
static __isl_give isl_schedule_node *check_contraction(
  __isl_take isl_schedule_node *node, void *user)
{
  struct check_contraction_data *data = (struct check_contraction_data *)user;
  isl_union_set *domain;
  isl_bool not_contain_acc;
  struct check_stmt_contain_acc_data check_data;
  isl_schedule_node *tmp_node;
  isl_ctx *ctx = isl_schedule_node_get_ctx(node);

  //DBGSCHDNODE(stdout, node, isl_schedule_node_get_ctx(node));

  if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf)
    return node;

  if (!data->legal)
    return node;

  /* Test if the statement contains the access from the group. */
  domain = isl_schedule_node_get_domain(node);
  check_data.kernel = data->kernel;
  check_data.group = data->group;
  not_contain_acc = isl_union_set_every_set(domain, &check_stmt_contain_acc, &check_data);
  isl_union_set_free(domain);  

  /* Then check if all the loops above the statement until the PE mark are parallel loops. */
  tmp_node = isl_schedule_node_copy(node);
  if (!not_contain_acc) {    
    isl_schedule_node *tmp_node2;

    /* If the node is under SIMD, we will move up to the "SIMD" mark, and 
     * compute the tiling at this level.
     */
    int is_simd;
    is_simd = is_node_under_simd(tmp_node);
    if (is_simd) {
      tmp_node = autosa_tree_move_up_to_mark(tmp_node, "simd");      
    }

    /* tmp_node walks up to the "pe" mark below; tmp_node2 remembers the
     * starting position for the prefix schedule computation.
     */
    tmp_node2 = isl_schedule_node_copy(tmp_node);

    /* Check if all band nodes above are parallel loops. */    
    while (!(autosa_tree_node_is_mark(tmp_node, "pe"))) {    
      if (isl_schedule_node_get_type(tmp_node) == isl_schedule_node_band) {
        int dim = isl_schedule_node_band_n_member(tmp_node);
        for (int i = 0; i < dim; i++) {
          if (!isl_schedule_node_band_member_get_coincident(tmp_node, i)) {
            data->legal = false;
            /* Only leaves the member loop; the walk up to "pe" continues. */
            break;
          }
        }
      }
      tmp_node = isl_schedule_node_parent(tmp_node);
    }

    if (data->prefix == NULL) {
      /* First statement with an access of the group: take its prefix as is. */
      data->prefix = isl_schedule_node_get_prefix_schedule_union_map(tmp_node2);
      data->prefix_upma = isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(tmp_node2);
      data->depth = isl_schedule_node_get_schedule_depth(tmp_node2);
    } else {
      /* Find the depth that shares the same prefix schedule with the current one. */
      /* Lift the node until it reaches a scheduling depth no greater than data->depth. */
      while (isl_schedule_node_get_schedule_depth(tmp_node2) > data->depth)
        tmp_node2 = isl_schedule_node_parent(tmp_node2);
      if (isl_schedule_node_get_schedule_depth(tmp_node2) < data->depth) {
        /* Lower the node until the scheduling depth equals to the data->depth */                  
        tmp_node2 = isl_schedule_node_band_split(tmp_node2, 
                      data->depth - isl_schedule_node_get_schedule_depth(tmp_node2));
        tmp_node2 = isl_schedule_node_child(tmp_node2, 0);
      }

      /* Lift the node until it achieves the same prefix schedule with the data->prefix. */
      isl_union_map *tmp_prefix = isl_schedule_node_get_prefix_schedule_union_map(tmp_node2);
      int tmp_depth = isl_schedule_node_get_schedule_depth(tmp_node2);      
      isl_set *tmp_prefix_range = isl_set_from_union_set(isl_union_map_range(tmp_prefix));
      isl_set *prefix_range = isl_set_from_union_set(isl_union_map_range(isl_union_map_copy(data->prefix)));
      
      //DBGUSET(stdout, prefix_range, ctx);

      /* Compare the ranges of both prefixes restricted to their first
       * common_depth dimensions, for increasing common_depth, and stop
       * at the first depth where the current range is not covered.
       */
      int common_depth = 0;
      for (common_depth = 0; common_depth < tmp_depth; common_depth++) {
        isl_set *tmp_range = isl_set_project_out(isl_set_copy(tmp_prefix_range), isl_dim_set, common_depth, tmp_depth - common_depth);
        isl_set *range = isl_set_project_out(isl_set_copy(prefix_range), isl_dim_set, common_depth, tmp_depth - common_depth);
        isl_set *diff = isl_set_subtract(tmp_range, range);
        if (!isl_set_is_empty(diff)) {
          common_depth--;
          isl_set_free(diff);
          break;
        }
        isl_set_free(diff);
      }
      isl_set_free(tmp_prefix_range);
      isl_set_free(prefix_range);

      /* Lift the node until if reaches common_depth */
      while (isl_schedule_node_get_schedule_depth(tmp_node2) > common_depth) {
        tmp_node2 = isl_schedule_node_parent(tmp_node2);
      }
      if (isl_schedule_node_get_schedule_depth(tmp_node2) < common_depth) {
        tmp_node2 = isl_schedule_node_band_split(tmp_node2, 
                      common_depth - isl_schedule_node_get_schedule_depth(tmp_node2));
        tmp_node2 = isl_schedule_node_child(tmp_node2, 0);
      }
 
      /* Update the scheduling information */      
      isl_union_map_free(data->prefix);
      isl_union_pw_multi_aff_free(data->prefix_upma);
      data->prefix = isl_schedule_node_get_prefix_schedule_union_map(tmp_node2);
      data->prefix_upma = isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(tmp_node2);
      data->depth = isl_schedule_node_get_schedule_depth(tmp_node2);
    }    
    isl_schedule_node_free(tmp_node2);
  }
  isl_schedule_node_free(tmp_node);

  return node;
}

/* Compute the tiling of the group at the PE level.
 * If array_contraction is enabled, check if all loops under the PE mark
 * and before the SIMD marks are parallel loops. 
 * If so, contract the local tile to a single register.
 *
 * Nothing is computed (and isl_stat_ok is returned) if local memory is
 * disabled, if the array is a read-only scalar, if the group has
 * writes that are not exact or if the group accesses a slice.
 *
 * On success, group->local_tile holds the tile, or NULL if the accesses
 * cannot be tiled.  In the contraction case, group->copy_schedule and
 * group->copy_schedule_dim are set as well.
 * Return isl_stat_error if can_tile fails.
 */
static isl_stat compute_group_bounds_core_pe(struct autosa_kernel *kernel,
                                             struct autosa_array_ref_group *group, struct autosa_group_data *data)
{
  isl_ctx *ctx = isl_space_get_ctx(group->array->space);
  int use_local = kernel->options->autosa->use_local_memory;
  isl_stat r = isl_stat_ok;
  isl_union_map *access;
  isl_map *acc;
  isl_bool ok;

  if (!use_local)
    return isl_stat_ok;
  if (autosa_array_is_read_only_scalar(group->array))
    return isl_stat_ok;
  if (!group->exact_write)
    return isl_stat_ok;
  if (group->slice)
    return isl_stat_ok;

  /* Collect all accesses in the group. */
  access = autosa_array_ref_group_access_relation(group, 1, 1);
  /* Create local tile */
  /* use_local is known to be set here because of the early return above. */
  if (use_local)
  {
    struct check_contraction_data contract_data;
    isl_schedule_node *node;        
    contract_data.legal = false;
    contract_data.prefix = NULL;
    contract_data.prefix_upma = NULL;

    /* Create a tile. */
    group->local_tile = autosa_array_tile_create(ctx,
                                                 group->array->n_index);

    /* Check if array contraction is possible. */
    if ((kernel->options->autosa->local_reduce && kernel->options->autosa->array_contraction) ||
       (kernel->options->autosa->tuning_method == 1 && kernel->options->autosa->array_contraction)) {      
      contract_data.group = group;
      contract_data.kernel = kernel;
      contract_data.legal = true;
      contract_data.prefix = NULL;
      contract_data.prefix_upma = NULL;
      contract_data.depth = -1;      
      node = isl_schedule_get_root(kernel->schedule);
      node = autosa_tree_move_down_to_pe(node, kernel->core);      
      node = isl_schedule_node_map_descendant_bottom_up(node, &check_contraction, &contract_data);
      isl_schedule_node_free(node);      
    }
    
    if (contract_data.legal) {
      /* We are able to create a register tiling. */      
      /* NOTE(review): contract_data.prefix stays NULL if no statement below
       * the "pe" mark contains an access of the group -- confirm that this
       * cannot happen here.
       */
      acc = isl_map_from_union_map(isl_union_map_apply_domain(isl_union_map_copy(access), 
                                                              isl_union_map_copy(contract_data.prefix)));
      group->copy_schedule_dim = contract_data.depth;
      /* Ownership of prefix_upma moves to the group. */
      group->copy_schedule = contract_data.prefix_upma;
      group->copy_schedule = isl_union_pw_multi_aff_pullback_union_pw_multi_aff(group->copy_schedule,
                                                                                isl_union_pw_multi_aff_copy(kernel->contraction));
    } else {
      isl_union_pw_multi_aff_free(contract_data.prefix_upma);
      /* Map the domain to the outer scheduling dimensions */
      acc = local_access_pe(group, access, data);  
      node = isl_schedule_get_root(kernel->schedule);
      node = autosa_tree_move_down_to_pe(node, kernel->core);
      if (kernel->options->autosa->tuning_method == 1)
        group->tuning_local_tile = TP_infer_tiled_array(data->gen, kernel, node, group, 1, 1);
      isl_schedule_node_free(node);
    }
    if (contract_data.prefix) 
      isl_union_map_free(contract_data.prefix);

    /* Collect the shift and scale factors of the tile. */
    ok = can_tile(acc, group->local_tile);
    if (ok < 0)
      r = isl_stat_error;
    else if (!ok)
      group->local_tile =
          autosa_array_tile_free(group->local_tile);
    isl_map_free(acc);
  }

  if (r < 0)
  {
    isl_union_map_free(access);
    return r;
  }

  isl_union_map_free(access);
  return isl_stat_ok;
}

/* Internal struct for compute_group_bounds_core_pe_acc.
 *
 * "kernel" and "group" identify the array reference group being examined.
 * "prefix" and "prefix_upma" hold a prefix schedule as a union map and
 * as a union_pw_multi_aff; "depth" is presumably the corresponding
 * schedule depth -- TODO confirm against compute_local_tile_acc.
 * NOTE(review): the meaning of "status" is defined by
 * compute_local_tile_acc; document its values there.
 */
struct compute_local_tile_acc_data
{
  struct autosa_kernel *kernel;
  struct autosa_array_ref_group *group;
  int depth;
  isl_union_map *prefix;
  isl_union_pw_multi_aff *prefix_upma;
  int status;
};

/* Callback for isl_schedule_node_map_descendant_bottom_up that records the
 * schedule depth and the prefix schedule used to compute the register tiling.
 *
 * Only leaf nodes are examined, and only those for which
 * check_stmt_contain_acc does not hold on every set of the leaf domain
 * (i.e., the statement contains the access).
 * If such a leaf is under the SIMD loop, the prefix schedule and depth are
 * taken at the "simd" mark; otherwise they are taken at the leaf itself.
 *
 * The first match is stored in the user data and "status" is set to 1.
 * Any further match means that the access is found in more than one loop;
 * the newly computed schedules are discarded and "status" is reset to 0,
 * so that the caller creates a local buffer at the PE level instead of
 * a register tiling.
 *
 * The node is always returned unchanged.
 */
static __isl_give isl_schedule_node *compute_local_tile_acc(
    __isl_take isl_schedule_node *node, void *user)
{
  struct compute_local_tile_acc_data *tile_data =
      (struct compute_local_tile_acc_data *)user;
  struct check_stmt_contain_acc_data stmt_check;
  isl_schedule_node *anchor;
  isl_union_set *leaf_domain;
  isl_union_map *sched_umap;
  isl_union_pw_multi_aff *sched_upma;
  isl_bool acc_absent;
  int under_simd;
  int sched_depth;

  /* Only statements (leaves) can contain the access. */
  if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf)
    return node;

  /* Test if the statement contains the access. */
  stmt_check.kernel = tile_data->kernel;
  stmt_check.group = tile_data->group;
  leaf_domain = isl_schedule_node_get_domain(node);
  acc_absent = isl_union_set_every_set(leaf_domain, &check_stmt_contain_acc,
                                       &stmt_check);
  isl_union_set_free(leaf_domain);
  if (acc_absent)
    return node;

  /* Pick the position at which the tiling is computed: the "simd" mark
   * if the leaf is under SIMD, the leaf itself otherwise.
   * A private copy is navigated so that "node" keeps its position.
   */
  under_simd = is_node_under_simd(node);
  anchor = isl_schedule_node_copy(node);
  if (under_simd)
    anchor = autosa_tree_move_up_to_mark(anchor, "simd");
  sched_umap = isl_schedule_node_get_prefix_schedule_union_map(anchor);
  sched_upma = isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(anchor);
  sched_depth = isl_schedule_node_get_schedule_depth(anchor);
  isl_schedule_node_free(anchor);

  if (tile_data->depth != -1)
  {
    /* The array reference is found in more than one loop.
     * We will compute the tiling at the PE level.
     */
    isl_union_map_free(sched_umap);
    isl_union_pw_multi_aff_free(sched_upma);
    tile_data->status = 0;
    return node;
  }

  /* First match: hand the schedules over to the user data. */
  tile_data->depth = sched_depth;
  tile_data->prefix = sched_umap;
  tile_data->prefix_upma = sched_upma;
  tile_data->status = 1;

  return node;
}

/* Compute the tiling of the group at the statement level.
 *
 * Nothing is done (and isl_stat_ok is returned) if local memory is disabled,
 * if the array is a read-only scalar, if the group has no exact write,
 * or if the group is a slice.
 *
 * Otherwise the schedule tree below the PE mark is searched with
 * compute_local_tile_acc.  If the access is found at exactly one position,
 * a register tiling is computed from the prefix schedule at that position
 * and the copy schedule of the group is set accordingly.
 * If not, the tiling and the copy schedule are computed at the PE level.
 *
 * If the access cannot be tiled, group->local_tile is released.
 * Return isl_stat_error if can_tile reports an error.
 */
static isl_stat compute_group_bounds_core_pe_acc(struct autosa_kernel *kernel,
                                                 struct autosa_array_ref_group *group, struct autosa_group_data *data)
{
  isl_ctx *ctx = isl_space_get_ctx(group->array->space);
  int use_local = kernel->options->autosa->use_local_memory;
  struct compute_local_tile_acc_data tile_data;
  isl_stat r = isl_stat_ok;
  isl_union_map *access;
  isl_map *acc;
  isl_bool ok;
  isl_schedule_node *node;

  if (!use_local)
    return isl_stat_ok;
  if (autosa_array_is_read_only_scalar(group->array))
    return isl_stat_ok;
  if (!group->exact_write)
    return isl_stat_ok;
  if (group->slice)
    return isl_stat_ok;

  /* Collect all accesses in the group */
  access = autosa_array_ref_group_access_relation(group, 1, 1);

  /* Every field is initialized so that the schedules can be released
   * unconditionally on the PE-level path below.
   */
  tile_data.kernel = kernel;
  tile_data.group = group;
  tile_data.status = 0;
  tile_data.depth = -1;
  tile_data.prefix = NULL;
  tile_data.prefix_upma = NULL;

  /* Create a tile. */
  group->local_tile = autosa_array_tile_create(ctx, group->array->n_index);
  /* Map the domain to the outer scheduling dimensions */
  node = isl_schedule_get_root(kernel->schedule);
  node = autosa_tree_move_down_to_pe(node, kernel->core);
  node = isl_schedule_node_map_descendant_bottom_up(node, &compute_local_tile_acc, &tile_data);
  isl_schedule_node_free(node);

  if (tile_data.status)
  {
    /* We are able to create a register tiling.
     * Both recorded schedules are handed over here: the union map to the
     * access computation, the union_pw_multi_aff to the copy schedule.
     */
    acc = isl_map_from_union_map(isl_union_map_apply_domain(isl_union_map_copy(access),
                                                            tile_data.prefix));
    /* Update the copy schedule. */
    group->copy_schedule_dim = tile_data.depth;
    group->copy_schedule = tile_data.prefix_upma;
    group->copy_schedule = isl_union_pw_multi_aff_pullback_union_pw_multi_aff(group->copy_schedule,
                                                                              isl_union_pw_multi_aff_copy(kernel->contraction));
  }
  else
  {
    /* The access was found in more than one loop (or not at all).
     * In the former case the callback still holds the schedules of the
     * first match; release them, as they are not used at the PE level.
     */
    isl_union_map_free(tile_data.prefix);
    isl_union_pw_multi_aff_free(tile_data.prefix_upma);

    /* We will create the tiling at the PE level. */
    acc = local_access_pe(group, access, data);
    /* Update the copy schedule */
    node = isl_schedule_get_root(kernel->schedule);
    node = autosa_tree_move_down_to_pe(node, kernel->core);
    group->copy_schedule_dim = isl_schedule_node_get_schedule_depth(node);
    group->copy_schedule =
        isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(node);
    group->copy_schedule = isl_union_pw_multi_aff_pullback_union_pw_multi_aff(
        group->copy_schedule, isl_union_pw_multi_aff_copy(kernel->contraction));
    isl_schedule_node_free(node);
  }

  /* Collect the shift and scale factors of the tile. */
  ok = can_tile(acc, group->local_tile);
  if (ok < 0)
    r = isl_stat_error;
  else if (!ok)
    group->local_tile = autosa_array_tile_free(group->local_tile);
  isl_map_free(acc);

  isl_union_map_free(access);
  return r;
}

/* Compute the local memory tiles for the array reference group "group"
 * by delegating to compute_group_bounds_core_pe.
 * Return 0 on success and -1 on error (including a NULL "group").
 */
static int compute_group_bounds_pe(struct autosa_kernel *kernel,
                                   struct autosa_array_ref_group *group, struct autosa_group_data *data)
{
  if (group && !(compute_group_bounds_core_pe(kernel, group, data) < 0))
    return 0;

  return -1;
}

/* Compute the register tiles for the array reference group "group"
 * by delegating to compute_group_bounds_core_pe_acc.
 * Return 0 on success and -1 on error (including a NULL "group").
 */
static int compute_group_bounds_pe_acc(struct autosa_kernel *kernel,
                                       struct autosa_array_ref_group *group, struct autosa_group_data *data)
{
  int failed;

  if (!group)
    return -1;

  failed = compute_group_bounds_core_pe_acc(kernel, group, data) < 0;
  return failed ? -1 : 0;
}

/* Set array->n_pe_group and array->pe_groups to n and groups.
 *
 * Additionally, set the "nr" field of each group to its position
 * in "groups".
 */
static void set_array_groups_pe(struct autosa_local_array_info *array,
                                int n, struct autosa_array_ref_group **groups)
{
  int pos = 0;

  array->pe_groups = groups;
  array->n_pe_group = n;

  while (pos < n)
  {
    groups[pos]->nr = pos;
    ++pos;
  }
}

/* Populate the array reference groups with single array reference.
 * If any of the array reference is associated with RAW, the array reference
 * is from an internal array, we will merge all the array references into 
 * one single group.
 * Otherwise, the array reference is from an external array, we do nothing
 * here. 
 * For internal array, we compute the group tiling at the PE level.
 * For external array, we compute the group tiling at the statement level.
 * Return -1 on error.
 */
static int group_array_references_pe(struct autosa_kernel *kernel,
                                     struct autosa_local_array_info *local, struct autosa_group_data *data)
{
  int i, j;
  int n;
  isl_ctx *ctx = isl_union_map_get_ctx(data->pe_sched);
  struct autosa_array_ref_group **groups;
  int merge_all = 0;
  isl_schedule_node *node;

  groups = isl_calloc_array(ctx, struct autosa_array_ref_group *,
                            local->array->n_ref);
  if (!groups)
    return -1;

  n = populate_array_references_pe(local, groups, data);

  /* Examine if any of the array references is associated with RAW. 
   * If then, merge all the groups. 
   */
  for (int i = 0; i < n; ++i)
  {
    struct autosa_array_ref_group *group_i = groups[i];
    for (int j = 0; j < group_i->n_ref; ++j)
    {
      struct autosa_stmt_access *ref_i = group_i->refs[j];
      for (int k = 0; k < ref_i->n_io_info; ++k)
      {
        if (ref_i->io_info[k]->dep->type == AUTOSA_DEP_RAW)
        {
          merge_all = 1;
          break;
        }
      }
    }
  }  

  if (merge_all)
  {
    /* Join all referneces together. */
    for (int i = 1; i < n; ++i)
    {
      groups[0] = join_groups_and_free(groups[0], groups[i]);
    }
    n = 1;
  }

  if (merge_all)
  {
    /* Internal array. */
    for (i = 0; i < n; ++i)
    {
      if (compute_group_bounds_pe(kernel, groups[i], data) < 0)
      {
        for (j = 0; j < n; j++)
        {
          autosa_array_ref_group_free(groups[j]);
        }
        free(groups);
        return -1;
      }

      if (groups[i]->copy_schedule_dim == 0) {
        /* Update the copy schedule at the PE level */
        node = isl_schedule_get_root(kernel->schedule);
        node = autosa_tree_move_down_to_pe(node, kernel->core);
        groups[i]->copy_schedule_dim = isl_schedule_node_get_schedule_depth(node);
        groups[i]->copy_schedule =
            isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(node);
        groups[i]->copy_schedule =
            isl_union_pw_multi_aff_pullback_union_pw_multi_aff(groups[i]->copy_schedule,
                                                               isl_union_pw_multi_aff_copy(kernel->contraction));
        isl_schedule_node_free(node);
      }
    }
  }
  else
  {
    /* External array. 
     * We will build the tiling for each array access. */
    for (i = 0; i < n; ++i)
    {
      if (compute_group_bounds_pe_acc(kernel, groups[i], data) < 0)
      {
        for (j = 0; j < n; j++)
        {
          autosa_array_ref_group_free(groups[j]);
        }
        free(groups);
        return -1;
      }
    }
  }

  set_array_groups_pe(local, n, groups);

  return 0;
}

/* Fill up the groups array with singleton groups, i.e., one group
 * per reference, initializing the array, access, write, n_ref and refs fields.
 * In particular the access field is initialized to the scheduled
 * access relation of the array reference.
 *
 * Return the number of elements initialized, i.e., the number of
 * active references in the current kernel.
 */
static int populate_array_references_io(struct autosa_local_array_info *local,
                                        struct autosa_array_ref_group **groups, struct autosa_group_data *data)
{
  int i;
  int j;
  int n;
  isl_ctx *ctx = isl_union_map_get_ctx(data->pe_sched);

  n = 0;
  for (i = 0; i < local->array->n_ref; ++i)
  {
    for (j = 0; j < local->array->refs[i]->n_io_info; ++j)
    {
      if (!((local->array->refs[i]->io_info[j]->dep->type == AUTOSA_DEP_RAR) ||
         (local->array->refs[i]->io_info[j]->dep->type == AUTOSA_DEP_RAW)))
         continue;

      isl_union_map *umap;
      isl_map *map;
      struct autosa_array_ref_group *group;
      struct autosa_stmt_access *access = local->array->refs[i];

      map = isl_map_copy(access->access);
      umap = isl_union_map_from_map(map);
      umap = isl_union_map_apply_domain(umap,
                                        isl_union_map_copy(data->copy_sched));

      if (isl_union_map_is_empty(umap))
      {
        isl_union_map_free(umap);
        continue;
      }

      map = isl_map_from_union_map(umap);
      map = isl_map_detect_equalities(map);

      //group = isl_calloc_type(ctx, struct autosa_array_ref_group);
      group = new autosa_array_ref_group;
      group = autosa_array_ref_group_init(group);
      if (!group)
      {
        isl_map_free(map);
        return -1;
      }
      group->local_array = local;
      group->array = local->array;
      group->access = map; // not used
      group->write = access->write;
      group->exact_write = access->exact_write;
      group->slice = access->n_index < local->array->n_index;
      group->refs = &local->array->refs[i];
      group->n_ref = 1;
      group->io_type = access->io_info[j]->io_type;
      group->dir = isl_vec_copy(access->io_info[j]->dir);
      group->old_dir = isl_vec_copy(group->dir);
      group->group_type = AUTOSA_IO_GROUP;
      group->pe_io_dir = IO_NULL;
      group->array_io_dir = IO_NULL;
      group->io_trans = NULL;
      group->io_pe_expr = NULL;
      group->io_L1_pe_expr = NULL;
      group->n_io_buffer = 0;
      group->io_buffers = NULL;
      group->copy_schedule = NULL;
      group->pe_tile = NULL;
      group->n_mem_ports = 1;
      group->local_tile = NULL;
      //std::cout << local->array->tuning_refs[i]->to_str() << std::endl;
      group->tuning_refs.push_back(std::shared_ptr<TPArrayRef>(local->array->tuning_refs[i]));
      group->tuning_pe_tile = NULL;

      groups[n++] = group;
    }
  }

  return n;
}

/* Examine if two groups share the same I/O modules, i.e., whether they have
 * - the same I/O type, and
 * - the same I/O direction (compared element by element over the
 *   length of the direction vector of "group1").
 * Return 1 if so and 0 otherwise.
 */
static int share_io(struct autosa_array_ref_group *group1,
                    struct autosa_array_ref_group *group2)
{
  int pos = 0;

  if (group1->io_type != group2->io_type)
    return 0;

  while (pos < isl_vec_size(group1->dir))
  {
    if (isl_vec_cmp_element(group1->dir, group2->dir, pos) != 0)
      return 0;
    ++pos;
  }

  return 1;
}

/* Merge every pair of groups that have shared I/O (as determined by
 * the "share" function) into one group.
 * Whenever a group is merged away, the last group of the array is moved
 * into its slot, so the first "n" entries stay densely packed.
 *
 * TODO: If "compute_bounds" is set, call compute_group_bounds_io
 * on the merged groups.  At present "compute_bounds", "kernel" and "data"
 * are not used.
 *
 * Return the updated number of groups.
 * Return -1 on error.
 */
static int group_io(struct autosa_kernel *kernel,
                    int n, struct autosa_array_ref_group **groups,
                    int (*share)(struct autosa_array_ref_group *group1,
                                 struct autosa_array_ref_group *group2),
                    int compute_bounds,
                    struct autosa_group_data *data)
{
  for (int keep = 0; keep < n; ++keep)
  {
    /* Scan the candidates from the back so that the compaction below
     * only moves entries that have already been examined.
     */
    int cand = n - 1;

    while (cand > keep)
    {
      if (share(groups[keep], groups[cand]))
      {
        int last = n - 1;

        groups[keep] = join_groups_and_free(groups[keep], groups[cand]);
        /* Fill the hole with the last group and shrink the array. */
        if (cand != last)
          groups[cand] = groups[last];
        groups[last] = NULL;
        n = last;

        if (!groups[keep])
          return -1;
      }
      --cand;
    }
  }

  return n;
}

/* Merge the groups that share the same I/O type and I/O direction
 * (as decided by share_io), without computing group bounds.
 *
 * Return the updated number of groups, as reported by group_io.
 */
static int group_share_io(struct autosa_kernel *kernel,
                          int n, struct autosa_array_ref_group **groups,
                          struct autosa_group_data *data)
{
  const int compute_bounds = 0;
  int n_merged;

  n_merged = group_io(kernel, n, groups, &share_io, compute_bounds, data);
  return n_merged;
}

/* Perform interior I/O elimination.
 * A group whose direction vector is all zero would generate interior I/O.
 * For such a group, a new data transfer direction is assigned at the PE
 * level: the first dim is set to 1 if the "int_io_dir" option is 0,
 * the last dim is set to 1 otherwise (hack used for LU).
 * TODO: make it an option.
 *
 * The io_info entries of the group's references that have the same
 * I/O type and a zero direction are updated to the new direction.
 * Groups with a non-zero direction are left untouched.
 * "kernel" and "data" are not used.
 */
static isl_stat autosa_interior_io_eliminate(
    struct autosa_kernel *kernel, struct autosa_array_ref_group *group,
    struct autosa_gen *gen, struct autosa_group_data *data)
{
  int unit_pos;

  /* Nothing to eliminate if the group already has a direction. */
  if (!isl_vec_is_zero(group->dir))
    return isl_stat_ok;

  /* Select the dimension that becomes the transfer direction. */
  if (gen->options->autosa->int_io_dir == 0)
    unit_pos = 0;
  else
    unit_pos = isl_vec_size(group->dir) - 1;
  group->dir = isl_vec_set_element_si(group->dir, unit_pos, 1);

  /* Update the array info */
  for (int r = 0; r < group->n_ref; r++)
  {
    struct autosa_stmt_access *ref = group->refs[r];

    for (int k = 0; k < ref->n_io_info; k++)
    {
      struct autosa_io_info *info = ref->io_info[k];

      if (info->io_type != group->io_type)
        continue;
      if (!isl_vec_is_zero(info->dir))
        continue;

      isl_vec_free(info->dir);
      info->dir = isl_vec_copy(group->dir);
    }
  }

  return isl_stat_ok;
}

/* The "node" points to the current space band.
 * We will cluster it using the direction "dir".
 * Specifically, following the space-time transformation using projection and 
 * scheduling vectors, we assign projection vector d = dir, scheduling vector
 * s = dir.
 * Next, we compose the new transformation matrix:
 * 
 * T = [ P
 *      ---
 *       s ]
 * where P * d^T = 0, i.e., the rows of P span the null space of d.
 * 
 * This new transformation matrix is applied to the space band: the band is
 * deleted and a new band with the transformed partial schedule is inserted
 * in its place.
 * We will return the transformation matrix in "io_trans_mat" (as an isl_mat)
 * and "io_trans_ma" (as an isl_multi_aff).  Neither is freed here.
 * The tuning iterators attached to the band members are carried over to the
 * new band, reordered according to the transformation matrix.
 */
static __isl_give isl_schedule_node *io_cluster(
    __isl_take isl_schedule_node *node,
    __isl_keep isl_vec *dir, isl_mat **io_trans_mat, isl_multi_aff **io_trans_ma)
{
  isl_multi_union_pw_aff *mupa;
  isl_mat *trans_mat, *d_mat, *null_mat;
  int space_dim;
  isl_ctx *ctx;
  isl_space *space;
  isl_multi_aff *ma;
  std::vector<TPIterator *> iters;

  mupa = isl_schedule_node_band_get_partial_schedule(node);
  space_dim = isl_schedule_node_band_n_member(node);
  ctx = isl_schedule_node_get_ctx(node);

  /* Store the tuning iters, since the band is deleted and re-created below. */
  for (int i = 0; i < isl_schedule_node_band_n_member(node); i++) {
    iters.push_back((TPIterator *)isl_schedule_node_band_member_get_iter(node, i));
    //std::cout << "io cluster: " << iters[iters.size() - 1]->name << ", " << 
    //    iters[iters.size() - 1]->space_time << std::endl;
  }

  /* Build the transformation matrix. */
  trans_mat = isl_mat_alloc(ctx, space_dim, space_dim);
  /* d_mat is the 1 x space_dim row matrix holding "dir". */
  d_mat = isl_mat_alloc(ctx, 1, space_dim);
  for (int i = 0; i < isl_vec_size(dir); i++)
  {
    d_mat = isl_mat_set_element_val(d_mat, 0, i,
                                    isl_vec_get_element_val(dir, i));
  }
  /* d_mat is not freed afterwards.
   * NOTE(review): presumably isl_mat_right_kernel takes ownership of its
   * argument - confirm against the isl version in use.
   */
  null_mat = isl_mat_right_kernel(d_mat);  

  /* The leading rows of T (the P part) are the columns of the
   * null-space matrix, i.e., null_mat is copied transposed.
   */
  for (int i = 0; i < isl_mat_cols(null_mat); i++)
    for (int j = 0; j < isl_mat_rows(null_mat); j++)
    {
      trans_mat = isl_mat_set_element_val(trans_mat, i, j,
                                          isl_mat_get_element_val(null_mat, j, i));
    }
  /* The row right after the P part is the scheduling vector s = dir.
   * NOTE(review): this assumes the null space has fewer columns than
   * space_dim (i.e., "dir" is not the zero vector); otherwise the row index
   * would be out of range - confirm callers guarantee this.
   */
  for (int i = 0; i < isl_vec_size(dir); i++)
  {
    trans_mat = isl_mat_set_element_val(trans_mat, isl_mat_cols(null_mat), i,
                                        isl_vec_get_element_val(dir, i));
  }
  *io_trans_mat = trans_mat;

  /* Convert the transformation matrix to multi_aff.
   * Start from the identity on the band space and overwrite the input
   * coefficients of each output with the corresponding matrix row.
   */
  space = isl_multi_union_pw_aff_get_space(mupa);
  space = isl_space_map_from_set(space);
  ma = isl_multi_aff_identity(space);

  for (int i = 0; i < isl_mat_rows(trans_mat); i++)
  {
    isl_aff *aff = isl_multi_aff_get_aff(ma, i);
    for (int j = 0; j < isl_mat_cols(trans_mat); j++)
    {
      isl_val *val = isl_mat_get_element_val(trans_mat, i, j);      
      aff = isl_aff_set_coefficient_si(aff, isl_dim_in, j, isl_val_get_num_si(val));      
      isl_val_free(val);
    }
    ma = isl_multi_aff_set_aff(ma, i, aff);
  }

  /* Apply the new transformation on the original partial schedule.
   * A copy of "ma" is applied; "ma" itself is returned through io_trans_ma.
   */
  mupa = isl_multi_union_pw_aff_apply_multi_aff(mupa, isl_multi_aff_copy(ma));
  *io_trans_ma = ma;

  node = isl_schedule_node_delete(node);
  /* Insert the new partial schedule. */
  node = isl_schedule_node_insert_partial_schedule(node, mupa);
  /* Add back the tuning iterators.
   * Since all the io dirs are unit vectors, which means only loop permutation is 
   * allowed, we simply swap the iter infos.
   * For each row of the matrix, the sum of (entry * column index) is used
   * as the index of the original iterator.
   * NOTE(review): this is only a valid index if each row holds a single
   * entry equal to 1 and zeros elsewhere - confirm that the null-space
   * matrix never contains other values (e.g. -1).
   */
  std::vector<int> swap_index;  
  for (int i = 0; i < isl_mat_rows(*io_trans_mat); i++) {
    int tmp = 0;
    for (int j = 0; j < isl_mat_cols(*io_trans_mat); j++) {
      isl_val *val_tmp = isl_mat_get_element_val(*io_trans_mat, i, j);
      tmp += isl_val_get_num_si(val_tmp) * j;
      isl_val_free(val_tmp);
    }
    swap_index.push_back(tmp);
  }
  // Restore the loop iterators
  for (int i = 0; i < isl_schedule_node_band_n_member(node); i++) {
    //std::cout << "swapped iter: " << iters[swap_index[i]]->name << std::endl;
    node = isl_schedule_node_band_member_set_iter(node, i, iters[swap_index[i]]);
  }

  isl_mat_free(null_mat);

  return node;
}

/* Callback for isl_set_foreach_basic_set.
 * "user" points to an isl_val * holding the running maximum.
 * Compute the maximal value of the first dimension of "bset" and
 * replace the running maximum by it if it is greater.
 * Always return isl_stat_ok.
 */
static isl_stat extract_set_max_dim(__isl_take isl_basic_set *bset, void *user)
{
  isl_val **running_max = (isl_val **)user;
  isl_val *dim_max = isl_basic_set_dim_max_val(bset, 0);

  if (!isl_val_gt(dim_max, *running_max))
  {
    /* Not an improvement: keep the current maximum. */
    isl_val_free(dim_max);
    return isl_stat_ok;
  }

  isl_val_free(*running_max);
  *running_max = dim_max;

  return isl_stat_ok;
}

/* Insert the global context for introducing the IO module identifiers.
 * The "node" points to the "kernel" mark.
 *
 * group->space_dim identifiers named with prefix "p" are generated.
 * Walking down from the "array" mark along the first child until
 * the first-level io mark is reached, each band that is met consumes the
 * next identifier and bounds it by the size of the band's schedule range.
 * The resulting context is inserted right below the "kernel" mark.
 * If the group has no space dimensions, the tree is left untouched.
 *
 * Return the node at the same position.
 */
static __isl_give isl_schedule_node *insert_io_module_context(
  __isl_take isl_schedule_node *node,
  struct autosa_array_ref_group *group,
  struct autosa_gen *gen, struct autosa_kernel *kernel)
{
  isl_id_list *io_ids;
  isl_set *context;
  int next_id = 0;

  if (group->space_dim <= 0)
    return node;

  io_ids = ppcg_scop_generate_names(gen->prog->scop, group->space_dim, "p");

  /* Update the context. */
  context = isl_set_universe(isl_set_get_space(kernel->context));
  node = autosa_tree_move_down_to_array(node, kernel->core);

  for (; !isl_schedule_node_is_io_mark(node, 1);
       node = isl_schedule_node_child(node, 0))
  {
    isl_union_map *band_sched;
    isl_union_set *band_domain;
    isl_union_pw_multi_aff *contraction;
    isl_multi_pw_aff *extent_size;
    isl_id_list *band_ids;

    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
      continue;

    /* Restrict the band schedule to the (expanded) domain of the node
     * and take the size of its range.
     */
    band_sched = isl_schedule_node_band_get_partial_schedule_union_map(node);
    band_domain = isl_schedule_node_get_domain(node);
    contraction = isl_schedule_node_get_subtree_contraction(node);
    band_domain = isl_union_set_preimage_union_pw_multi_aff(band_domain, contraction);
    band_sched = isl_union_map_intersect_domain(band_sched, band_domain);
    extent_size = ppcg_size_from_extent(
        isl_set_from_union_set(isl_union_map_range(band_sched)));

    /* Bound the next IO identifier by that size. */
    band_ids = isl_id_list_from_id(isl_id_list_get_id(io_ids, next_id));
    next_id++;
    context = add_bounded_parameters_dynamic(context, extent_size, band_ids);

    isl_id_list_free(band_ids);
    isl_multi_pw_aff_free(extent_size);
  }

  /* Insert the context right below the "kernel" mark and move back up. */
  node = autosa_tree_move_up_to_kernel(node);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_context(node, context);
  node = autosa_tree_move_up_to_kernel(node);

  isl_id_list_free(io_ids);

  return node;
}

/* Perform HBM/Multi-port DRAM optimization.
 *
 * "node" points to the band to be tiled.  The number of memory ports is
 * taken from the default settings in "auto" mode, or from the user
 * specification named "hbm_<group prefix>" otherwise.  The mode is read
 * from the "mode" member of the "hbm" object of the tuning configuration;
 * "auto" is assumed if the object or a string "mode" member is missing.
 *
 * In manual mode, if the user has not specified the factors yet, the upper
 * bounds of the tilable loops are dumped to "<output_dir>/tuning.json" and
 * the program exits.
 *
 * On success, the band is tiled, the node below the tile band is returned,
 * group->n_mem_ports and group->space_dim are updated and "*io_trans_ma"
 * is replaced by a transformation with the first output split into
 * a tile and a point dimension.
 * If the optimization cannot be applied, "node" is returned unchanged.
 */
static __isl_give isl_schedule_node *hbm_optimize(
    __isl_take isl_schedule_node *node,
    isl_multi_aff **io_trans_ma,
    struct autosa_kernel *kernel, struct autosa_array_ref_group *group,
    struct autosa_gen *gen)
{
  isl_union_set *uset;
  isl_set *set;
  isl_union_map *umap;
  isl_val *val;
  isl_ctx *ctx = gen->ctx;
  int tile_len = 1;
  int *tile_size = NULL;
  cJSON *hbm_json, *hbm_mode_json;
  /* Default in auto mode. */
  const char *hbm_mode = "auto";
  isl_printer *p_str;
  char *module_name;
  int *ubs = NULL;

  /* Parse the tuning configuration. */
  hbm_json = cJSON_GetObjectItemCaseSensitive(gen->tuning_config, "hbm");
  if (hbm_json)
  {
    hbm_mode_json = cJSON_GetObjectItemCaseSensitive(hbm_json, "mode");
    /* A missing or non-string "mode" falls back to auto mode
     * instead of dereferencing a NULL pointer.
     */
    if (hbm_mode_json && hbm_mode_json->valuestring)
      hbm_mode = hbm_mode_json->valuestring;
  }

  ubs = extract_band_upper_bounds(node);
  if (!strcmp(hbm_mode, "auto"))
  {
    /* HBM optimization is set in AUTO mode. 
     * We will pick up the tiling factors by default.
     */
    tile_size = read_default_hbm_tile_sizes(kernel, tile_len);
  }
  else
  {
    /* HBM optimization is set in MANUAL mode. 
     * We will take the user specification to select the HBM factors.
     */
    char *name;
    isl_printer *p_name;

    p_name = isl_printer_to_str(ctx);
    p_name = isl_printer_print_str(p_name, "hbm_");
    p_name = autosa_array_ref_group_print_prefix(group, p_name);
    name = isl_printer_get_str(p_name);
    isl_printer_free(p_name);

    tile_size = read_hbm_tile_sizes(kernel, tile_len, name);
    if (!tile_size)
    {
      /* User hasn't specified the tiling factors for HBM optimization yet,
       * we will dump out the number and upper bounds of the last-level IO loops
       * and exit the program.
       */
      FILE *fp;
      char *content;
      cJSON *tuning, *group_json, *loops_json;
      isl_printer *p_path;
      char *tuning_path;
      int dumped = 0;

      tuning = cJSON_CreateObject();
      group_json = cJSON_CreateObject();
      cJSON_AddItemToObject(tuning, name, group_json);
      loops_json = cJSON_CreateArray();
      cJSON_AddItemToObject(group_json, "tilable_loops", loops_json);
      for (int i = 0; i < tile_len; i++)
      {
        cJSON *loop = cJSON_CreateNumber(ubs[i]);
        cJSON_AddItemToArray(loops_json, loop);
      }
      p_path = isl_printer_to_str(ctx);
      p_path = isl_printer_print_str(p_path, kernel->options->autosa->output_dir);
      p_path = isl_printer_print_str(p_path, "/tuning.json");
      tuning_path = isl_printer_get_str(p_path);
      isl_printer_free(p_path);

      content = cJSON_Print(tuning);
      fp = tuning_path ? fopen(tuning_path, "w") : NULL;
      if (fp)
      {
        if (content && fprintf(fp, "%s", content) >= 0)
          dumped = 1;
        /* fclose flushes the file; a failure means the dump is incomplete. */
        if (fclose(fp) != 0)
          dumped = 0;
      }
      if (!dumped)
        fprintf(stderr, "[AutoSA] Error: failed to write %s\n",
                tuning_path ? tuning_path : "tuning.json");

      /* "content" is left to process exit: the allocator used by
       * cJSON_Print is not visible from here.
       */
      cJSON_Delete(tuning);
      free(tuning_path);
      free(name);
      free(ubs);
      exit(dumped ? 0 : EXIT_FAILURE);
    }
    free(name);
  }

  if (!tile_size)
  {
    /* No tiling factors available (only reachable in auto mode). */
    printf("[AutoSA] HBM optimization failed! No HBM tiling factors are available.\n");
    free(ubs);
    return node;
  }

  p_str = isl_printer_to_str(ctx);
  p_str = autosa_array_ref_group_print_prefix(group, p_str);
  module_name = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  printf("[AutoSA] #HBM port for %s: %d \n", module_name, tile_size[0]);
  free(module_name);

  /* Check if the tile factor is greater or equal than the loop bound. */
  umap = isl_schedule_node_band_get_partial_schedule_union_map(node);
  uset = isl_union_map_range(umap);
  set = isl_set_from_union_set(uset);
  val = isl_val_zero(ctx);
  isl_set_foreach_basic_set(set, &extract_set_max_dim, &val);
  isl_set_free(set);
  if (!ubs || tile_size[0] <= 0 || isl_val_get_num_si(val) <= tile_size[0])
  {
    /* The current loop bound is smaller than the tile size, 
     * no need to further tile. 
     * A missing upper bound or a non-positive port number is rejected
     * here as well, since the division below would be invalid.
     */
    // TODO: At present, we require tile factor to be greater than the loop bound.
    // This is due to the reason that we can't handle loop with bound one since
    // such loop will be degenerated. Fix it in the future.
    free(tile_size);
    free(ubs);
    isl_val_free(val);
    printf("[AutoSA] HBM optimization failed! Please try to use a smaller HBM port number.\n");
    return node;
  }
  isl_val_free(val);

  group->n_mem_ports = tile_size[0];
  group->space_dim++;

  /* From here on tile_size[0] is the tile size, i.e., the loop upper bound
   * divided by the number of ports.
   */
  tile_size[0] = ubs[0] / tile_size[0];
  node = autosa_tile_band(node, tile_size);
  node = isl_schedule_node_child(node, 0);

  /* Update the transformation function:
   * the first output x is replaced by the pair
   * (floor(x / tile), x - tile * floor(x / tile)).
   */
  isl_aff *aff = isl_multi_aff_get_aff(*io_trans_ma, 0);
  isl_aff *tile_aff, *point_aff;
  tile_aff = isl_aff_scale_down_ui(isl_aff_copy(aff), tile_size[0]);
  tile_aff = isl_aff_floor(tile_aff);
  point_aff = isl_aff_scale_down_ui(isl_aff_copy(aff), tile_size[0]);
  point_aff = isl_aff_floor(point_aff);
  point_aff = isl_aff_scale_val(point_aff, isl_val_int_from_ui(ctx, tile_size[0]));
  point_aff = isl_aff_sub(aff, point_aff);

  isl_aff_list *aff_list = isl_aff_list_from_aff(tile_aff);
  aff_list = isl_aff_list_add(aff_list, point_aff);
  /* The remaining outputs are carried over unchanged. */
  for (int n = 1; n < isl_multi_aff_dim(*io_trans_ma, isl_dim_out); n++)
  {
    aff = isl_multi_aff_get_aff(*io_trans_ma, n);
    aff_list = isl_aff_list_add(aff_list, aff);
  }

  isl_space *space = isl_multi_aff_get_space(*io_trans_ma);
  isl_multi_aff_free(*io_trans_ma);
  space = isl_space_add_dims(space, isl_dim_out, 1);
  *io_trans_ma = isl_multi_aff_from_aff_list(space, aff_list);
  free(tile_size);
  free(ubs);

  return node;
}

/* This function examines if the accessed elements of the I/O group 
 * are fully overlapped at the PE level.
 * We will create a relation "overlap"
 * 
 *  [[D -> R] -> [D' -> R']]
 * 
 * of pairs of domain iterations accessing the reference group and 
 * the domain iterations D' is lexicographically greater than D by one 
 * at the last array_part loop with PE loops equal.
 * 
 * This relation is intersected with all flow dependences to derive the 
 * pairs of iterations that overlapped due to the flow dependence.
 * 
 * Next, we construct a relation "external"
 * that contains pair of iteration domains with flow dependences that 
 * access the elements by the I/O group.
 * 
 * We subtract "overlap" from "external". If the diff is null, clearly
 * the accessed elements are overlapped between different array partitions 
 * for one PE, we will return true for this case.
 * Otherwise, we return false.
 *
 * "node" is only inspected: a private copy is navigated and freed.
 * "read" selects the direction: it is passed to the access-relation helpers
 * and decides whether the range (read) or the domain (write) of the
 * flow dependences is matched against the group.
 * Note that isl_bool_true is returned whenever the final emptiness test
 * yields a non-zero value, which includes the error case.
 */
static isl_bool internal_group_in_out_overlap(
    __isl_keep isl_schedule_node *node,
    struct autosa_kernel *kernel,
    struct autosa_array_ref_group *group, int read)
{
  int empty;
  struct autosa_prog *prog = kernel->prog;
  isl_union_pw_multi_aff *tagger;
  isl_union_map *prefix;
  isl_union_map *access, *tagged;
  isl_union_set *domain;
  isl_set *prefix_range;
  isl_map *lt;
  int n_sched_dim;
  isl_union_map *overlap;
  isl_union_map *external, *universe;
  isl_union_set *access_domain;
  isl_union_set *tag_set;
  isl_map *sched_identity;
  int pe_depth, array_depth;

  /* Collect the schedule depth at the "array" level and the prefix
   * schedule at the "pe" level, expressed on the uncontracted domain.
   * pe_depth is computed but not used further in this function.
   */
  node = isl_schedule_node_copy(node);
  node = autosa_tree_move_down_to_array(node, kernel->core);
  array_depth = isl_schedule_node_get_schedule_depth(node);
  node = autosa_tree_move_down_to_pe(node, kernel->core);
  pe_depth = isl_schedule_node_get_schedule_depth(node);
  prefix = isl_schedule_node_get_prefix_schedule_relation(node);
  prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix,
                                                            isl_union_pw_multi_aff_copy(kernel->contraction));
  isl_schedule_node_free(node);
  access = autosa_io_group_access_relation(group, kernel, read, !read);
  tagged = group_tagged_access_relation(group);

  /* Remove the local dependency first. */
  access = remove_local_accesses_group_flow(kernel, group, access, prefix, read);

  /* Tagger maps the tagged iteration domain to untagged iteration domain.
   * Iteration domain is tagged to the access function.
   * e.g. [S1[i,j,k] -> _pet_ref_1[]] -> S1[(i),(j),(k)]
   */
  tagger = isl_union_pw_multi_aff_copy(prog->scop->tagger);
  domain = isl_union_map_domain(isl_union_map_copy(tagged));
  tagger = isl_union_pw_multi_aff_intersect_domain(tagger,
                                                   isl_union_set_copy(domain));
  /* From here on "prefix" is defined on the tagged iteration domain. */
  prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix, tagger);

  prefix_range = isl_set_from_union_set(isl_union_map_range(isl_union_map_copy(prefix)));
  n_sched_dim = isl_set_dim(prefix_range, isl_dim_set);
  sched_identity = isl_set_identity(isl_set_copy(prefix_range));

  /* "lt" relates schedule points that are lexicographically smaller on the
   * first array_depth dimensions; sched_identity is only used for its space.
   */
  lt = isl_map_lex_lt_first(isl_map_get_space(sched_identity), array_depth);
  isl_map_free(sched_identity);

  /* Set the space dims equal. */
  for (int i = array_depth; i < n_sched_dim; i++)
  {
    lt = isl_map_equate(lt, isl_dim_in, i, isl_dim_out, i);
  }
  lt = isl_map_intersect_domain(lt, isl_set_copy(prefix_range));
  lt = isl_map_intersect_range(lt, prefix_range);
  /* Keep only the lexicographically smallest successor of each point. */
  lt = isl_map_lexmin(lt);

  /* Lift the schedule-space relation back to pairs of tagged iterations:
   * prefix . lt . prefix^-1
   */
  overlap = isl_union_map_apply_range(isl_union_map_copy(prefix),
                                      isl_union_map_from_map(lt));
  overlap = isl_union_map_apply_range(overlap, isl_union_map_reverse(prefix));
  overlap = isl_union_map_coalesce(overlap);

  /* Derive the overlapping set. */
  overlap = isl_union_map_intersect(overlap,
                                    isl_union_map_copy(prog->scop->tagged_dep_flow));
  empty = isl_union_map_is_empty(overlap);

  /* Restrict the flow dependences to those whose tagged reference belongs
   * to the group and whose iteration is in the domain of "access".
   */
  external = isl_union_map_copy(prog->scop->tagged_dep_flow);
  universe = isl_union_map_universe(isl_union_map_copy(access));
  access_domain = isl_union_map_domain(universe);
  domain = isl_union_set_universe(domain);
  universe = isl_union_set_unwrap(domain);
  universe = isl_union_map_intersect_domain(universe, access_domain);
  /* D -> __pet_ref_1 */
  domain = isl_union_map_wrap(universe);
  if (read)
    external = isl_union_map_intersect_range(external, domain);
  else
    external = isl_union_map_intersect_domain(external, domain);
  external = isl_union_map_intersect_params(external,
                                            isl_set_copy(prog->scop->context));
  /* external contains flow dep that are associated with the group access. */

  external = isl_union_map_subtract(external, overlap);
  /* external only contains access non-overlap RAW pairs. */

  /* Turn the remaining tagged iterations back into an access relation. */
  if (read)
  {
    tag_set = isl_union_map_range(external);
    external = wrapped_reference_to_access(tag_set, tagged);
  }
  else
  {
    tag_set = isl_union_map_domain(external);
    external = wrapped_reference_to_access(tag_set, tagged);
  }

  /* If the emptiness test of "overlap" failed, propagate the error by
   * dropping "external"; if "overlap" was empty, nothing overlaps and
   * "external" is widened to its universe.
   */
  if (empty < 0)
    external = isl_union_map_free(external);
  else if (empty)
    external = isl_union_map_universe(external);

  /* Accesses that remain after the intersection are not overlapped. */
  access = isl_union_map_intersect(access, external);
  empty = isl_union_map_is_empty(access);
  isl_union_map_free(access);

  if (empty)
    return isl_bool_true;
  else
    return isl_bool_false;
}

/* Check whether the flow dependences associated with the I/O group "group"
 * are carried by the loops above the "array" node.
 *
 * "node" is only used as a starting point: a private copy is moved down to
 * the "array" node and released again inside this function.
 * "read" selects which side of the tagged flow dependences is matched
 * against the group accesses: the range if "read" is non-zero,
 * the domain otherwise.
 *
 * The test proceeds as follows:
 * - collect the group accesses and drop the local ones
 *   (remove_local_accesses_group_flow),
 * - relate the tagged statement instances that are mapped to the same
 *   prefix schedule point at the "array" node and intersect this relation
 *   with the tagged flow dependences,
 * - subtract the result from the flow dependences that touch the group
 *   and keep the group accesses that take part in the remaining ones.
 *
 * Return isl_bool_true if some accesses are left and isl_bool_false
 * otherwise. isl_bool_false is also returned when the parent of the "array"
 * node is not a band, and when the final emptiness test reports an error
 * (a negative result is handled like "empty").
 */
static isl_bool io_group_carried_by_array_loops(
    __isl_keep isl_schedule_node *node,
    struct autosa_kernel *kernel,
    struct autosa_array_ref_group *group, int read)
{
  isl_union_map *prefix, *identity_sched;
  isl_union_map *access, *tagged;
  isl_union_pw_multi_aff *tagger;
  isl_union_set *domain, *access_domain;
  struct autosa_prog *prog = kernel->prog;
  /* NOTE(review): prefix_range, n_sched_dim and sched_identity are never used
   * in this function.
   */
  isl_set *prefix_range;
  int n_sched_dim;
  isl_map *sched_identity;
  isl_union_map *external, *universe;
  isl_union_set *tag_set;
  int empty;  

  /* Work on a private copy so that the caller's node is left untouched. */
  node = isl_schedule_node_copy(node);
  node = autosa_tree_move_down_to_array(node, kernel->core);

  /* Test if the array partition band is empty */
  node = isl_schedule_node_parent(node);
  if (isl_schedule_node_get_type(node) != isl_schedule_node_band) {
    /* No array partitioning, directly return. */
    isl_schedule_node_free(node);
    return isl_bool_false;
  }
  node = autosa_tree_move_down_to_array(node, kernel->core);

  /* Prefix schedule at the "array" node, pulled back through the kernel
   * contraction.
   */
  prefix = isl_schedule_node_get_prefix_schedule_relation(node);
  prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix,
                                                            isl_union_pw_multi_aff_copy(kernel->contraction));
  isl_schedule_node_free(node);
  access = autosa_io_group_access_relation(group, kernel, read, !read);  
  /* Remove the local dependence first. */
  access = remove_local_accesses_group_flow(kernel, group, access, prefix, read);

  /* Restrict the tagger to the tagged references of this group. */
  tagged = group_tagged_access_relation(group);
  tagger = isl_union_pw_multi_aff_copy(prog->scop->tagger);
  domain = isl_union_map_domain(isl_union_map_copy(tagged));
  tagger = isl_union_pw_multi_aff_intersect_domain(tagger,
                                                   isl_union_set_copy(domain));

  /* Express the prefix schedule on the tagged instances and pair up the
   * tagged instances that share the same prefix schedule value.
   * Only the pairs that are also tagged flow dependences are kept.
   */
  prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix, tagger);  
  identity_sched = isl_union_map_apply_range(prefix, 
                                             isl_union_map_reverse(isl_union_map_copy(prefix)));
  identity_sched = isl_union_map_intersect(identity_sched,
                                           isl_union_map_copy(prog->scop->tagged_dep_flow));
  /* Remember whether any such dependence exists; used further below. */
  empty = isl_union_map_is_empty(identity_sched);

  /* Build the set of wrapped (instance -> reference) pairs of the group,
   * limited to the statement instances that appear in "access", and use it
   * to select the tagged flow dependences that involve the group.
   */
  external = isl_union_map_copy(prog->scop->tagged_dep_flow);
  universe = isl_union_map_universe(isl_union_map_copy(access));
  access_domain = isl_union_map_domain(universe);
  domain = isl_union_set_universe(domain);
  universe = isl_union_set_unwrap(domain);
  universe = isl_union_map_intersect_domain(universe, access_domain);
  domain = isl_union_map_wrap(universe);
  if (read)
    external = isl_union_map_intersect_range(external, domain);
  else
    external = isl_union_map_intersect_domain(external, domain);
  external = isl_union_map_intersect_params(external,
                                            isl_set_copy(prog->scop->context));
  /* Drop the dependences between instances with identical prefix schedule. */
  external = isl_union_map_subtract(external, identity_sched);

  /* Convert the remaining tagged references back to an access relation.
   * "tagged" is handed over to wrapped_reference_to_access in both branches.
   */
  if (read)
  {
    tag_set = isl_union_map_range(external);
    external = wrapped_reference_to_access(tag_set, tagged);
  }
  else
  {
    tag_set = isl_union_map_domain(external);
    external = wrapped_reference_to_access(tag_set, tagged);
  }

  /* On error, drop "external" so that the operations below fail as well.
   * If no dependence with identical prefix schedule was found, replace
   * "external" by its universe so that the intersection below does not
   * remove anything from "access".
   */
  if (empty < 0)
    external = isl_union_map_free(external);
  else if (empty)
    external = isl_union_map_universe(external);

  access = isl_union_map_intersect(access, external);
  empty = isl_union_map_is_empty(access);
  isl_union_map_free(access);

  /* Any non-zero value, including the negative error value, yields false. */
  if (empty)
    return isl_bool_false;
  else
    return isl_bool_true;   
}

/* Return whether inter-PE communication is required for "group" in the
 * direction selected by "read" (non-zero: data coming into the PE,
 * zero: data leaving the PE).
 *
 * PE groups always require the communication.
 * For the other groups, the I/O infos of the group references that match the
 * I/O type and direction of the group are inspected:
 * - If all of them stem from RAR dependences
 *   - exterior I/O: both in/out PE communication is required,
 *   - interior I/O: only the in PE communication is required.
 * - If at least one of them stems from another type of dependence
 *   - exterior I/O: both in/out PE communication is required,
 *   - interior I/O: the result follows the array-level I/O direction
 *     stored in group->array_io_dir.
 *
 * "node" and "kernel" are not used.
 */
static isl_bool is_inter_pe_comm_valid(
    __isl_keep isl_schedule_node *node,
    struct autosa_kernel *kernel,
    struct autosa_array_ref_group *group, int read)
{
  int only_rar = 1;

  if (group->group_type == AUTOSA_PE_GROUP)
    return isl_bool_true;

  /* Look for a matching I/O info that is not based on a RAR dependence. */
  for (int r = 0; r < group->n_ref; r++)
  {
    struct autosa_stmt_access *acc = group->refs[r];

    for (int k = 0; k < acc->n_io_info; k++)
    {
      struct autosa_io_info *info = acc->io_info[k];

      if (info->io_type != group->io_type)
        continue;
      if (isl_vec_cmp(info->dir, group->dir))
        continue;
      if (info->dep->type == AUTOSA_DEP_RAR)
        continue;

      only_rar = 0;
      break;
    }
  }

  /* Exterior I/O needs both directions, whatever the dependence type. */
  if (group->io_type == AUTOSA_EXT_IO)
    return isl_bool_true;

  /* Interior I/O with RAR dependences only: data only flows into the PE. */
  if (only_rar)
    return read ? isl_bool_true : isl_bool_false;

  /* Interior I/O with other dependences: mirror the array-level direction. */
  if (read)
    return (group->array_io_dir == IO_IN || group->array_io_dir == IO_INOUT) ? isl_bool_true : isl_bool_false;

  return (group->array_io_dir == IO_OUT || group->array_io_dir == IO_INOUT) ? isl_bool_true : isl_bool_false;
}

/* Return whether the I/O module of "group" for the direction selected by
 * "read" (non-zero: copy-in, zero: copy-out) needs to be generated.
 *
 * - PE groups are always valid.
 * - Drain groups never have a copy-in module.
 * - Groups with an attached drain group are always valid.
 * - If all matching I/O infos of the group stem from RAR dependences,
 *   only the copy-in module is generated.
 * - Otherwise (some non-RAR dependence is involved)
 *   - if the dependence is not carried by the array loops,
 *     no copy-in/copy-out module is generated;
 *   - if it is carried by the array loops, the module is generated unless
 *     the group uses interior I/O and internal_group_in_out_overlap()
 *     reports an overlap.
 */
isl_bool is_io_module_valid(
    __isl_keep isl_schedule_node *node,
    struct autosa_kernel *kernel,
    struct autosa_array_ref_group *group, int read)
{
  int only_rar = 1;

  if (group->group_type == AUTOSA_PE_GROUP)
    return isl_bool_true;
  if (read && group->group_type == AUTOSA_DRAIN_GROUP)
    return isl_bool_false;
  if (group->attached_drain_group)
    return isl_bool_true;

  /* Look for a matching I/O info that is not based on a RAR dependence. */
  for (int r = 0; r < group->n_ref; r++)
  {
    struct autosa_stmt_access *acc = group->refs[r];

    for (int k = 0; k < acc->n_io_info; k++)
    {
      struct autosa_io_info *info = acc->io_info[k];

      if (info->io_type != group->io_type)
        continue;
      if (isl_vec_cmp(info->dir, group->dir))
        continue;
      if (info->dep->type == AUTOSA_DEP_RAR)
        continue;

      only_rar = 0;
      break;
    }
  }

  /* RAR only: the data is read-only, so there is nothing to copy out. */
  if (only_rar)
    return read ? isl_bool_true : isl_bool_false;

  /* Dependences that are not carried by the array loops need no module. */
  if (!io_group_carried_by_array_loops(node, kernel, group, read))
    return isl_bool_false;

  if (group->io_type == AUTOSA_INT_IO &&
      internal_group_in_out_overlap(node, kernel, group, read))
    return isl_bool_false;

  return isl_bool_true;
}

/* This function computes the schedule for the I/O modules that transfer
 * the data for the I/O group "group".
 * We will cluster I/O modules level by level.
 * We will first insert an "IO_L1" mark below the space loops, which indicates
 * that IO_L1 modules will be allocated beside each PE.
 * Next, to cluster IO_L1 modules, we look at the space loops above the current
 * mark. We will perform a space-time transformation to cluster the I/O modules.
 * In the current implementation, we will always use the projection vector (1,X)
 * to project all I/O modules along the direction of (1,X) together, and
 * schedule them following the direction of (1,X).
 * After one clustering, we will insert a new I/O mark below the new space loops.
 * This is done iteratively until we run out of the available space loops.
 * The transformed space band will look like:
 * "array" mark
 * |
 * "IO_LX" mark
 * |
 * X
 * |
 * "IO_LY" mark
 * |
 * Y
 * |
 * "PE" mark
 *
 * The marks are created with ids of the form "io_L<level>".
 *
 * Besides the schedule (group->io_schedule, group->io_L1_schedule), the
 * following fields of "group" are set: space_dim, io_L1_trans, io_trans,
 * io_level, copy_in, copy_out, array_io_dir, pe_io_dir and, if a copy-in or
 * copy-out module is needed, mem_port_id (group->local_array->n_mem_ports
 * is advanced accordingly).
 *
 * Always returns isl_stat_ok.
 */
static isl_stat compute_io_group_schedule(
    struct autosa_kernel *kernel, struct autosa_array_ref_group *group,
    struct autosa_gen *gen)
{
  isl_printer *p_str;
  char *io_str;
  int io_level = 0;
  /* NOTE(review): "i" is shadowed by the loop variable below and
   * "io_trans_map" is never used.
   */
  int i;
  isl_ctx *ctx = gen->ctx;
  isl_id *id;
  isl_schedule *sched;
  /* Accumulated transformation over all clustering steps, as a matrix and
   * as a multi-aff.
   */
  isl_mat *io_trans_mat = NULL;
  isl_multi_aff *io_trans_ma = NULL;
  isl_map *io_trans_map = NULL;
  isl_schedule_node *node;
  int space_dim;
  isl_schedule *schedule;

  /* Sink to the space band */
  /* The tree is edited on a duplicate of the kernel schedule. */
  schedule = isl_schedule_dup(kernel->schedule);
  node = isl_schedule_get_root(schedule);
  isl_schedule_free(schedule);

  node = autosa_tree_move_down_to_array(node, kernel->core);
  node = isl_schedule_node_child(node, 0);
  //DBGSCHDNODE(stdout, node, isl_schedule_node_get_ctx(node));
  space_dim = isl_schedule_node_band_n_member(node);  
  group->space_dim = space_dim;

  /* Insert the IO_L1 mark. */
  node = isl_schedule_node_child(node, 0);
  p_str = isl_printer_to_str(ctx);
  p_str = isl_printer_print_str(p_str, "io_L");
  p_str = isl_printer_print_int(p_str, io_level + 1);
  io_str = isl_printer_get_str(p_str);
  isl_printer_free(p_str);
  id = isl_id_alloc(ctx, io_str, NULL);
  free(io_str);
  node = isl_schedule_node_insert_mark(node, id);
  io_level++;
  /* Move back up to the space band. */
  node = isl_schedule_node_parent(node);

  /* Cluster the I/O modules from innermost space loops to outermost loops. */
  for (int i = space_dim - 1; i >= 0; i--)
  {
    /* Transformation produced by the clustering step of this iteration. */
    isl_mat *io_trans_mat_i;
    isl_multi_aff *io_trans_ma_i;
    isl_vec *dir;
    isl_mat *mat;

    /* Perform space-time transformation on the current band. */
    /* The first step uses the direction of the group itself. */
    if (i == space_dim - 1)
    {      
      dir = isl_vec_dup(group->dir);
    }
    else
    {
      /* By default, we set the first element of the direction vector as 1. */
      dir = isl_vec_zero(ctx, i + 1);
      dir = isl_vec_set_element_si(dir, 0, 1);
    }
    node = io_cluster(node, dir, &io_trans_mat_i, &io_trans_ma_i);
    isl_vec_free(dir);

    if (io_level == 1)
    {
      /* First clustering step: record the schedule and the transformation
       * at the L1 level and use them as the initial accumulated
       * transformation.
       */
      sched = isl_schedule_node_get_schedule(node);
      group->io_L1_schedule = isl_schedule_dup(sched);
      // TODO: if the space schedule is to be degenerated, we
      // will need to update the io_trans/io_L1_trans as well.
      group->io_L1_trans = isl_multi_aff_copy(io_trans_ma_i);

      isl_schedule_free(sched);
      io_trans_mat = io_trans_mat_i;
      io_trans_ma = io_trans_ma_i;
    }
    else
    {
      isl_multi_aff_free(io_trans_ma_i);
      /* Update the transformation matrix. */
      /* The transformation of this step is embedded in the top-left corner
       * of "extend_mat", which is completed with ones on the remaining
       * diagonal entries. The accumulated transformation then becomes
       * product_mat = extend_mat * io_trans_mat.
       */
      int nrow = isl_mat_rows(io_trans_mat);
      int ncol = isl_mat_cols(io_trans_mat);
      isl_mat *extend_mat = isl_mat_alloc(ctx, nrow, ncol);
      isl_mat *product_mat = isl_mat_alloc(ctx, nrow, ncol);
      /* Zero-initialize both matrices. */
      for (int r = 0; r < nrow; r++)
        for (int c = 0; c < ncol; c++)
        {
          extend_mat = isl_mat_set_element_si(extend_mat, r, c, 0);
          product_mat = isl_mat_set_element_si(product_mat, r, c, 0);
        }

      for (int r = 0; r < isl_mat_rows(io_trans_mat_i); r++)
        for (int c = 0; c < isl_mat_cols(io_trans_mat_i); c++)
        {
          extend_mat = isl_mat_set_element_val(extend_mat, r, c,
                                               isl_mat_get_element_val(io_trans_mat_i, r, c));
        }
      for (int r = isl_mat_rows(io_trans_mat_i); r < nrow; r++)
      {
        extend_mat = isl_mat_set_element_si(extend_mat, r, r, 1);
      }
      /* Plain triple-loop matrix product.
       * NOTE(review): the inner index runs up to "nrow" for both operands,
       * which assumes that the matrices are square - confirm.
       */
      for (int r = 0; r < nrow; r++)
        for (int c = 0; c < ncol; c++)
        {
          for (int k = 0; k < nrow; k++)
          {
            isl_val *v1, *v2, *v3;
            v1 = isl_mat_get_element_val(extend_mat, r, k);
            v2 = isl_mat_get_element_val(io_trans_mat, k, c);
            v3 = isl_mat_get_element_val(product_mat, r, c);
            v1 = isl_val_mul(v1, v2);
            v3 = isl_val_add(v1, v3);
            product_mat = isl_mat_set_element_val(product_mat, r, c, v3);
          }
        }
      isl_mat_free(io_trans_mat);
      isl_mat_free(extend_mat);
      isl_mat_free(io_trans_mat_i);
      io_trans_mat = product_mat;

      /* Reset the transformation function. */
      /* The input coefficients of each affine expression are overwritten
       * by the numerators of the corresponding matrix row.
       */
      for (int r = 0; r < nrow; r++)
      {
        isl_aff *aff = isl_multi_aff_get_aff(io_trans_ma, r);
        for (int c = 0; c < ncol; c++)
        {
          isl_val *val = isl_mat_get_element_val(io_trans_mat, r, c);          
          aff = isl_aff_set_coefficient_si(aff, isl_dim_in, c, isl_val_get_num_si(val));          
          isl_val_free(val);
        }
        io_trans_ma = isl_multi_aff_set_aff(io_trans_ma, r, aff);
      }
    }

    /* Split the band and insert the IO mark. */
    /* For i == 0 there is nothing to split off and the mark is inserted
     * at the current position.
     */
    if (i > 0)
    {
      node = isl_schedule_node_band_split(node, i);
      node = isl_schedule_node_child(node, 0);
    }

    /* If the multi-port DRAM/HBM is to be used, we will need to tile the loop again.
     */
    if (i == 0 && gen->options->autosa->hbm)
    {
      /* Test if this group contains both copy-in and copy-out set. 
       * At present, HBM optimization is not supported for this type of I/O group.
       * We will need to make sure the copy-in and copy-out set for each HBM channel 
       * do not overlap since we only support fixed HBM port mapping for now.
       * Therefore, for this type of I/O group, we will disable the HBM optimization.
       * TODO: Relax this constraint in the future.
       */
      printf("[AutoSA] Apply HBM optimization.\n");
      if (group->group_type == AUTOSA_IO_GROUP &&
          is_flow_dep_carried_by_array_part_loops(kernel->schedule, group, kernel))
      {
        /* Report the name of the skipped group and continue without the
         * HBM optimization.
         */
        isl_printer *p_str;
        char *module_name;
        p_str = isl_printer_to_str(ctx);
        p_str = autosa_array_ref_group_print_prefix(group, p_str);
        module_name = isl_printer_get_str(p_str);
        isl_printer_free(p_str);

        printf("[AutoSA] The flow dependence is carried by the array partitioning loops.\n");
        printf("[AutoSA] HBM optimization for the group: %s is omitted.\n", module_name);
        free(module_name);
        goto next;
      }
      /* An exterior I/O group with a single space loop is skipped as well. */
      if (group->io_type == AUTOSA_EXT_IO && i == space_dim - 1)
      {
        printf("[AutoSA] HBM optimization failed! Not enough I/O modules.\n");
        goto next;
      }
      node = hbm_optimize(node, &io_trans_ma, kernel, group, gen);
    }
  next:
    /* Insert the mark of the next I/O level and move back up to the band. */
    p_str = isl_printer_to_str(ctx);
    p_str = isl_printer_print_str(p_str, "io_L");
    p_str = isl_printer_print_int(p_str, io_level + 1);
    io_str = isl_printer_get_str(p_str);
    isl_printer_free(p_str);
    id = isl_id_alloc(ctx, io_str, NULL);
    free(io_str);
    node = isl_schedule_node_insert_mark(node, id);
    node = isl_schedule_node_parent(node);
    io_level++;
  }

  /* Only the multi-aff form of the accumulated transformation is kept. */
  isl_mat_free(io_trans_mat);  

  group->io_level = io_level;
  group->io_trans = io_trans_ma;

  /* Insert the context node for the IO ids. 
   * NOTE: We will update this again in the later IO module generation.
   */
  node = autosa_tree_move_up_to_kernel(node);
  node = insert_io_module_context(node, group, gen, kernel);

  /* Determine if the I/O module for this group could be eliminated.
   */
  /* array_io_dir is widened to IO_INOUT when both directions are needed. */
  group->copy_in = 0;
  group->copy_out = 0;
  if (is_io_module_valid(node, kernel, group, 1))
  {
    group->copy_in = 1;
    group->array_io_dir = (group->array_io_dir == IO_OUT)? IO_INOUT : IO_IN;
  }
  if (is_io_module_valid(node, kernel, group, 0))
  {
    group->copy_out = 1;
    group->array_io_dir = (group->array_io_dir == IO_IN)? IO_INOUT : IO_OUT;
  }
  /* For drain group, copy-out module is always required. */
  if (group->group_type == AUTOSA_DRAIN_GROUP) {
    group->copy_out = 1;
    group->array_io_dir = (group->array_io_dir == IO_IN)? IO_INOUT : IO_OUT;
  }

  /* Reserve the memory ports of the group on the local array. */
  if (group->copy_in || group->copy_out)
  {
    group->mem_port_id = group->local_array->n_mem_ports;
    group->local_array->n_mem_ports += group->n_mem_ports;
  }

  /* Determine if the inter-PE communication is required. */
  if (is_inter_pe_comm_valid(node, kernel, group, 1)) {
    group->pe_io_dir = (group->pe_io_dir == IO_OUT)? IO_INOUT : IO_IN;
  }
  if (is_inter_pe_comm_valid(node, kernel, group, 0)) {
    group->pe_io_dir = (group->pe_io_dir == IO_IN)? IO_INOUT : IO_OUT;
  }
  /* A drain group always sends data out of the PE. */
  if (group->group_type == AUTOSA_DRAIN_GROUP) {
    group->pe_io_dir = (group->pe_io_dir == IO_IN)? IO_INOUT : IO_OUT;
  }

  /* Store the I/O schedule. */
  sched = isl_schedule_node_get_schedule(node);
  group->io_schedule = isl_schedule_dup(sched);
  isl_schedule_free(sched);
  isl_schedule_node_free(node);

  return isl_stat_ok;
}

/* Express the access relation "access" in terms of the schedule at "node".
 *
 * The prefix schedule at "node" (including equalities, as computed by
 * prefix_with_equalities) is expanded through the subtree contraction of
 * "node" and then applied to the domain of a copy of "access".
 * The result is returned as a single isl_map.
 *
 * "access" and "node" are not modified; "kernel" and "group" are not used.
 */
static __isl_give isl_map *local_access_io_at_node(struct autosa_kernel *kernel,
                                                   struct autosa_array_ref_group *group,
                                                   __isl_keep isl_union_map *access, __isl_keep isl_schedule_node *node)
{
  isl_union_pw_multi_aff *contraction;
  isl_union_map *prefix, *mapped;

  prefix = prefix_with_equalities(node);
  // TODO: fix the contraction
  contraction = isl_schedule_node_get_subtree_contraction(node);
  prefix = expand(prefix, contraction);
  isl_union_pw_multi_aff_free(contraction);

  mapped = isl_union_map_apply_domain(isl_union_map_copy(access), prefix);

  return isl_map_from_union_map(mapped);
}

/* Compute the local memory tile of the drain group "group" at the schedule
 * position "node" and store it in buffer->tile.
 *
 * Nothing is done (and isl_stat_ok is returned) if the user requested not to
 * use local memory, if the array is a read-only scalar, if the writes of the
 * group are not exact or if the group is marked as "slice".
 *
 * Otherwise a fresh tile is created and can_tile() is asked to fill it in
 * from the group accesses expressed at "node". If the accesses cannot be
 * tiled, the tile is freed again (buffer->tile is set to the value returned
 * by autosa_array_tile_free).
 *
 * Return isl_stat_ok on success and isl_stat_error if can_tile() fails.
 */
isl_stat compute_group_bounds_drain_at_node(struct autosa_kernel *kernel,
                                            struct autosa_array_ref_group *group, __isl_keep isl_schedule_node *node,
                                            struct autosa_io_buffer *buffer)
{
  isl_ctx *ctx = isl_space_get_ctx(group->array->space);
  isl_union_map *group_access;
  isl_map *local_access;
  isl_bool tileable;
  isl_stat status = isl_stat_ok;

  if (!kernel->options->autosa->use_local_memory)
    return isl_stat_ok;
  if (autosa_array_is_read_only_scalar(group->array) ||
      !group->exact_write || group->slice)
    return isl_stat_ok;

  /* Collect the accesses of the group (selection flags 0, 1). */
  group_access = autosa_array_ref_group_access_relation(group, 0, 1);

  /* Create the tile and try to derive its bounds from the accesses
   * mapped to the outer schedule dimensions.
   */
  buffer->tile = autosa_array_tile_create(ctx, group->array->n_index);
  local_access = local_access_io_at_node(kernel, group, group_access, node);
  tileable = can_tile(local_access, buffer->tile);
  if (tileable < 0)
    status = isl_stat_error;
  else if (!tileable)
    buffer->tile = autosa_array_tile_free(buffer->tile);

  isl_map_free(local_access);
  isl_union_map_free(group_access);

  return status;
}

/* Select the memory that the array reference group "group" is mapped to:
 * global memory if the array is a read-only scalar,
 * local memory in all other cases.
 */
enum autosa_group_access_type autosa_array_ref_group_type(
    struct autosa_array_ref_group *group)
{
  return autosa_array_is_read_only_scalar(group->array) ? AUTOSA_ACCESS_GLOBAL
                                                        : AUTOSA_ACCESS_LOCAL;
}

/* Return the array tile that is effectively used by "group":
 * group->local_tile if the group is mapped to local memory,
 * NULL otherwise.
 */
struct autosa_array_tile *autosa_array_ref_group_tile(
    struct autosa_array_ref_group *group)
{
  if (autosa_array_ref_group_type(group) == AUTOSA_ACCESS_LOCAL)
    return group->local_tile;

  return NULL;
}

/* Select the memory that the array reference group "group" is mapped to:
 * local memory if the group has a local tile, global memory otherwise.
 */
enum autosa_group_access_type autosa_cpu_array_ref_group_type(
    struct autosa_array_ref_group *group)
{
  return group->local_tile ? AUTOSA_ACCESS_LOCAL : AUTOSA_ACCESS_GLOBAL;
}

/* Given a description of an array tile "tile" and the "space"
 *
 *	{ D -> A }
 *
 * where D represents the first tile->depth schedule dimensions
 * and A represents the array, construct an isl_multi_aff
 *
 *	{ [D[i] -> A[a]] -> A'[a'] }
 *
 * with A' a scaled down copy of A according to the shifts and strides
 * in "tile".  In particular,
 *
 *	a' = (a + shift(i))/stride
 *
 * "insert_array" represents
 *
 *	{ [D -> A] -> D }
 *
 * and is used to insert A into the domain of functions that only
 * reference D.
 *
 * "space" and "insert_array" are only read; the caller keeps ownership.
 * The stride and shift of every dimension of "tile" are copied
 * unconditionally, so each bound is expected to carry both.
 */
static __isl_give isl_multi_aff *strided_tile(
    struct autosa_array_tile *tile, __isl_keep isl_space *space,
    __isl_keep isl_multi_aff *insert_array)
{
  int i;
  isl_multi_aff *shift;
  isl_multi_val *stride;
  isl_multi_aff *tiling;

  /* The strides live in the array space A, the shifts in { D -> A }. */
  stride = isl_multi_val_zero(isl_space_range(isl_space_copy(space)));
  shift = isl_multi_aff_zero(isl_space_copy(space));

  for (i = 0; i < tile->n; ++i)
  {
    struct autosa_array_bound *bound = &tile->bound[i];

    stride = isl_multi_val_set_val(stride, i, isl_val_copy(bound->stride));
    shift = isl_multi_aff_set_aff(shift, i, isl_aff_copy(bound->shift));
  }

  /* Make the shifts a function of [D -> A] instead of D. */
  shift = isl_multi_aff_pullback_multi_aff(shift,
                                           isl_multi_aff_copy(insert_array));

  /* a' = (a + shift(i)) / stride */
  tiling = isl_multi_aff_range_map(isl_space_copy(space));
  tiling = isl_multi_aff_add(tiling, shift);
  tiling = isl_multi_aff_scale_down_multi_val(tiling, stride);

  return tiling;
}

/* Print the name of the copy of the array that is used by the array
 * reference group "group" to "p" and return the updated printer.
 *
 * If the group is mapped to global memory, the plain array name is printed.
 * Otherwise the name is "local_" followed by the array name. The suffix
 * "_<nr>" (with <nr> equal to group->nr) is appended if the group is an
 * I/O group and the array has more than one I/O group, or if it is a
 * PE group and the array has more than one PE group.
 */
__isl_give isl_printer *autosa_array_ref_group_print_name(
    struct autosa_array_ref_group *group, __isl_take isl_printer *p)
{
  int is_local = autosa_array_ref_group_type(group) == AUTOSA_ACCESS_LOCAL;
  int needs_suffix;

  if (is_local)
    p = isl_printer_print_str(p, "local_");
  p = isl_printer_print_str(p, group->array->name);

  if (!is_local)
    return p;

  needs_suffix =
      (group->group_type == AUTOSA_IO_GROUP && group->local_array->n_io_group > 1) ||
      (group->group_type == AUTOSA_PE_GROUP && group->local_array->n_pe_group > 1);
  if (needs_suffix)
  {
    p = isl_printer_print_str(p, "_");
    p = isl_printer_print_int(p, group->nr);
  }

  return p;
}

/* Compute a tiling for the array reference group "group" and store it
 * in the "tiling" field of the tile that is used.
 *
 * The tiling is of the form
 *
 *	{ [D[i] -> A[a]] -> T[t] }
 *
 * where D represents the first tile->depth schedule dimensions,
 * A represents the global array and T represents the local memory
 * tile.  The name of T is the name of the local array, as printed by
 * autosa_array_ref_group_print_name.
 *
 * If any dimension of the tile has a shift, then the mapping is
 *
 *	t = (a + shift(i))/stride - lb(i)
 *
 * otherwise, it is simply
 *
 *	t = a - lb(i)
 *
 * The tiling is computed from "tile". If "tile" is NULL, the tile of
 * "group" (autosa_array_ref_group_tile) is used instead; if that one is
 * NULL as well, nothing is done.
 */
void autosa_array_ref_group_compute_tiling(
    struct autosa_array_tile *tile,
    struct autosa_array_ref_group *group)
{
  int dim;
  int has_shift = 0;
  isl_space *sched_to_array;
  isl_multi_aff *mapping, *lower, *to_sched;
  isl_printer *printer;
  char *tile_name;

  if (!tile)
    tile = autosa_array_ref_group_tile(group);
  if (!tile)
    return;

  /* Build D[i] -> A[a] */
  sched_to_array = isl_space_range(isl_map_get_space(group->access));
  sched_to_array = isl_space_from_range(sched_to_array);
  sched_to_array = isl_space_add_dims(sched_to_array, isl_dim_in, tile->depth);
  /* Build [D[i] -> A[a]] -> D[i] */
  to_sched = isl_multi_aff_domain_map(isl_space_copy(sched_to_array));

  /* A strided tiling is needed as soon as one dimension has a shift. */
  for (dim = 0; dim < tile->n && !has_shift; ++dim)
    has_shift = tile->bound[dim].shift != NULL;

  if (has_shift)
    mapping = strided_tile(tile, sched_to_array, to_sched);
  else
    mapping = isl_multi_aff_range_map(isl_space_copy(sched_to_array));

  /* Subtract the lower bounds, expressed on [D -> A]. */
  lower = isl_multi_aff_zero(sched_to_array);
  for (dim = 0; dim < tile->n; ++dim)
    lower = isl_multi_aff_set_aff(lower, dim,
                                  isl_aff_copy(tile->bound[dim].lb));
  lower = isl_multi_aff_pullback_multi_aff(lower, to_sched);
  mapping = isl_multi_aff_sub(mapping, lower);

  /* Name the output tuple after the local array. */
  printer = isl_printer_to_str(isl_multi_aff_get_ctx(mapping));
  printer = autosa_array_ref_group_print_name(group, printer);
  tile_name = isl_printer_get_str(printer);
  isl_printer_free(printer);
  mapping = isl_multi_aff_set_tuple_name(mapping, isl_dim_out, tile_name);
  free(tile_name);

  tile->tiling = mapping;
}

/* Compute the tile bounds of the drain group "group" at the PE level,
 * i.e., at the schedule position "node", and store the tile in
 * group->pe_tile.
 *
 * Nothing is done if local memory is disabled, if the array is a read-only
 * scalar, if the writes of the group are not exact or if the group is
 * marked as "slice".
 * If can_tile() reports that the accesses cannot be tiled, the freshly
 * created tile is freed again.
 *
 * Return isl_stat_error if can_tile() fails and isl_stat_ok otherwise.
 */
static isl_stat compute_group_bounds_drain_at_node_PE(
    struct autosa_kernel *kernel, struct autosa_array_ref_group *group,
    __isl_keep isl_schedule_node *node)
{
  isl_ctx *ctx = isl_space_get_ctx(group->array->space);
  isl_union_map *group_access;
  isl_map *local_access;
  isl_bool tileable;
  isl_stat status = isl_stat_ok;

  if (!kernel->options->autosa->use_local_memory)
    return isl_stat_ok;
  if (autosa_array_is_read_only_scalar(group->array) ||
      !group->exact_write || group->slice)
    return isl_stat_ok;

  /* Collect the accesses of the group (selection flags 0, 1). */
  group_access = autosa_array_ref_group_access_relation(group, 0, 1);

  /* Create the tile and try to derive its bounds from the accesses
   * mapped to the outer schedule dimensions.
   */
  group->pe_tile = autosa_array_tile_create(ctx, group->array->n_index);
  local_access = local_access_io_at_node(kernel, group, group_access, node);
  tileable = can_tile(local_access, group->pe_tile);
  if (tileable < 0)
    status = isl_stat_error;
  else if (!tileable)
    group->pe_tile = autosa_array_tile_free(group->pe_tile);

  isl_map_free(local_access);
  isl_union_map_free(group_access);

  return status;
}

/* Compute the drain group tiling at the PE level. */
static isl_stat compute_drain_tiling_at_PE(struct autosa_kernel *kernel,
                                           struct autosa_array_ref_group *group)
{
  isl_schedule_node *node;
  struct autosa_array_tile *tile;

  node = isl_schedule_get_root(kernel->schedule);
  node = autosa_tree_move_down_to_pe(node, kernel->core);
  compute_group_bounds_drain_at_node_PE(kernel, group, node);
  autosa_array_ref_group_compute_tiling(group->pe_tile, group);
  isl_schedule_node_free(node);

  return isl_stat_ok;
}

/* Compute the local memory tile of the I/O group "group" at the schedule
 * position "node" and store it in buffer->tile.
 *
 * Nothing is done (and isl_stat_ok is returned) if the user requested not to
 * use local memory, if the array is a read-only scalar, if the writes of the
 * group are not exact or if the group is marked as "slice".
 *
 * Otherwise a fresh tile is created and can_tile() is asked to fill it in
 * from the group accesses expressed at "node". If the accesses cannot be
 * tiled, the tile is freed again (buffer->tile is set to the value returned
 * by autosa_array_tile_free).
 *
 * Return isl_stat_ok on success and isl_stat_error if can_tile() fails.
 */
isl_stat compute_group_bounds_io_at_node(struct autosa_kernel *kernel,
                                         struct autosa_array_ref_group *group, __isl_keep isl_schedule_node *node,
                                         struct autosa_io_buffer *buffer)
{
  isl_ctx *ctx = isl_space_get_ctx(group->array->space);
  isl_union_map *group_access;
  isl_map *local_access;
  isl_bool tileable;
  isl_stat status = isl_stat_ok;

  if (!kernel->options->autosa->use_local_memory)
    return isl_stat_ok;
  if (autosa_array_is_read_only_scalar(group->array) ||
      !group->exact_write || group->slice)
    return isl_stat_ok;

  /* Collect the accesses of the group (selection flags 1, 1). */
  group_access = autosa_array_ref_group_access_relation(group, 1, 1);

  /* Create the tile and try to derive its bounds from the accesses
   * mapped to the outer schedule dimensions.
   */
  buffer->tile = autosa_array_tile_create(ctx, group->array->n_index);
  local_access = local_access_io_at_node(kernel, group, group_access, node);
  tileable = can_tile(local_access, buffer->tile);
  if (tileable < 0)
    status = isl_stat_error;
  else if (!tileable)
    buffer->tile = autosa_array_tile_free(buffer->tile);

  isl_map_free(local_access);
  isl_union_map_free(group_access);

  return status;
}

/* Compute the tile bounds of the I/O group "group" at the PE level,
 * i.e., at the schedule position "node", and store the tile in
 * group->pe_tile.
 *
 * Nothing is done if local memory is disabled, if the array is a read-only
 * scalar, if the writes of the group are not exact or if the group is
 * marked as "slice".
 * If can_tile() reports that the accesses cannot be tiled, the freshly
 * created tile is freed again.
 *
 * Return isl_stat_error if can_tile() fails and isl_stat_ok otherwise.
 */
isl_stat compute_group_bounds_io_at_node_PE(
    struct autosa_kernel *kernel,
    struct autosa_array_ref_group *group, __isl_keep isl_schedule_node *node)
{
  isl_ctx *ctx = isl_space_get_ctx(group->array->space);
  isl_union_map *group_access;
  isl_map *local_access;
  isl_bool tileable;
  isl_stat status = isl_stat_ok;

  if (!kernel->options->autosa->use_local_memory)
    return isl_stat_ok;
  if (autosa_array_is_read_only_scalar(group->array) ||
      !group->exact_write || group->slice)
    return isl_stat_ok;

  /* Collect the accesses of the group (selection flags 1, 1). */
  group_access = autosa_array_ref_group_access_relation(group, 1, 1);

  /* Create the tile and try to derive its bounds from the accesses
   * mapped to the outer schedule dimensions.
   */
  group->pe_tile = autosa_array_tile_create(ctx, group->array->n_index);
  local_access = local_access_io_at_node(kernel, group, group_access, node);
  tileable = can_tile(local_access, group->pe_tile);
  if (tileable < 0)
    status = isl_stat_error;
  else if (!tileable)
    group->pe_tile = autosa_array_tile_free(group->pe_tile);

  isl_map_free(local_access);
  isl_union_map_free(group_access);

  return status;
}

/* Create the tiling for the IO group at the PE level. */
static isl_stat compute_io_tiling_at_PE(struct autosa_kernel *kernel,
                                        struct autosa_array_ref_group *group)
{
  isl_schedule_node *node;
  struct autosa_array_tile *tile;

  node = isl_schedule_get_root(kernel->schedule);
  node = autosa_tree_move_down_to_pe(node, kernel->core);
  compute_group_bounds_io_at_node_PE(kernel, group, node);
  autosa_array_ref_group_compute_tiling(group->pe_tile, group);
  isl_schedule_node_free(node);

  return isl_stat_ok;
}

/* Insert the IO module filter ids into the schedule.
 * "node" points to the IO_L[io_level] mark.
 * Return a node that points to the same mark in the updated tree.
 *
 * space_dim - io_level + 1 ids with prefix "p" are generated; if this
 * number is not positive, "node" is returned unchanged.
 * Walking down from the "array" mark to the IO_L[io_level] mark, each band
 * that is met consumes the next id and contributes the set computed by
 * set_schedule_eq() for that band and id (presumably the instances whose
 * schedule value equals the id - confirm against set_schedule_eq).
 * The union of these sets is inserted as a single filter at the second
 * level below the "kernel" mark.
 */
static __isl_give isl_schedule_node *insert_io_module_ids(
    struct autosa_gen *gen, struct autosa_kernel *kernel,
    __isl_take isl_schedule_node *node, int space_dim, int io_level)
{
  int n_names = space_dim - io_level + 1;
  int next_id = 0;
  isl_id_list *names;
  isl_union_set *filter = NULL;

  if (n_names <= 0)
    return node;
  names = ppcg_scop_generate_names(gen->prog->scop, n_names, "p");

  /* Add the filters. */
  node = autosa_tree_move_up_to_array(node);
  for (; !isl_schedule_node_is_io_mark(node, io_level);
       node = isl_schedule_node_child(node, 0))
  {
    isl_id_list *single;
    isl_union_set *cond;

    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
      continue;

    single = isl_id_list_from_id(isl_id_list_get_id(names, next_id));
    next_id++;
    cond = set_schedule_eq(node, single);
    isl_id_list_free(single);

    filter = filter ? isl_union_set_union(filter, cond) : cond;
  }
  isl_id_list_free(names);

  /* Insert the filter. */
  node = autosa_tree_move_up_to_kernel(node);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_filter(node, filter);

  /* Go back to the mark we started from. */
  return autosa_tree_move_down_to_io_mark(node, kernel->core, io_level);
}

/* Allocate I/O buffers at each I/O level.
 * If two-level buffer is disabled, we will only allocate buffer
 * at the innermost level for each group:
 * - drain group @ io_L1
 * - io group @ io_L1 (INT_IO) | io_L2 (EXT_IO)
 * If two-level buffer is turned on, we will also allocate buffers
 * at the outermost level for each group.
 *
 * One autosa_io_buffer is appended to group->io_buffers for every I/O mark
 * found while walking up from the PE mark in group->io_schedule; levels that
 * do not get a tile keep buf->tile == NULL.
 *
 * Return isl_stat_ok on success and isl_stat_error if the buffer array cannot
 * be grown, a buffer cannot be allocated, or the walk leaves the tree before
 * all "io_level" marks have been found.
 */
static isl_stat compute_io_group_buffer(struct autosa_kernel *kernel,
                                        struct autosa_array_ref_group *group, struct autosa_gen *gen)
{
  isl_schedule_node *node;
  int io_level = group->io_level;
  int i;
  int two_level_buffer = gen->options->autosa->two_level_buffer;

  node = isl_schedule_get_root(group->io_schedule);

  /* Compute the group tiling at each I/O level. */
  node = autosa_tree_move_down_to_pe(node, kernel->core);
  i = 1;
  assert(group->io_buffers == NULL);
  assert(group->n_io_buffer == 0);
  group->io_buffers = NULL;
  group->n_io_buffer = 0;
  while (i <= io_level)
  {
    isl_schedule_node *node_cp;
    struct autosa_io_buffer *buf;
    struct autosa_io_buffer **grown;

    node = isl_schedule_node_parent(node);
    /* Moving past the root yields NULL; stop instead of looping forever. */
    if (!node)
      return isl_stat_error;
    if (!isl_schedule_node_is_io_mark(node, i))
      continue;

    /* In the automatic mode, AutoSA only computes the tiling at L1
     * for drain group and I/O group with interior I/O, and at L2 for I/O
     * group with exterior I/O.
     */
    /* Grow the array through a temporary so that the original pointer is
     * not lost if realloc fails.
     */
    grown = (struct autosa_io_buffer **)realloc(
        group->io_buffers,
        sizeof(struct autosa_io_buffer *) * (group->n_io_buffer + 1));
    if (!grown)
    {
      isl_schedule_node_free(node);
      return isl_stat_error;
    }
    group->io_buffers = grown;
    buf = autosa_io_buffer_alloc();
    if (!buf)
    {
      isl_schedule_node_free(node);
      return isl_stat_error;
    }
    group->io_buffers[group->n_io_buffer] = buf;
    (group->n_io_buffer)++;
    buf->level = i;
    buf->tile = NULL;

    node_cp = isl_schedule_node_copy(node);
    if (group->group_type == AUTOSA_DRAIN_GROUP)
    {
      if (i == 1)
      {
        /* Compute the group tiling at this level */
        compute_group_bounds_drain_at_node(kernel, group, node_cp, buf);
        autosa_array_ref_group_compute_tiling(buf->tile, group);
        compute_drain_tiling_at_PE(kernel, group);
        if (gen->options->autosa->tuning_method == 1) {
          buf->tuning_tile = TP_infer_tiled_array(gen, kernel, node, group, 0, 1);
          /* NOTE(review): "new_node" is moved to the PE mark but the
           * inference below is still given "node"; confirm which node is
           * intended for tuning_pe_tile.
           */
          isl_schedule_node *new_node = isl_schedule_get_root(kernel->schedule);
          new_node = autosa_tree_move_down_to_pe(new_node, kernel->core);
          group->tuning_pe_tile = TP_infer_tiled_array(gen, kernel, node, group, 0, 1);
          isl_schedule_node_free(new_node);
        }
      }
      else
      {
        buf->tile = NULL;
      }
    }
    else if (group->group_type == AUTOSA_IO_GROUP)
    {
      if ((group->io_type == AUTOSA_EXT_IO && i == 2) ||
          (group->io_type == AUTOSA_INT_IO && i == 1))
      {
        /* Compute the group tiling at this level. */
        compute_group_bounds_io_at_node(kernel, group, node_cp, buf);
        autosa_array_ref_group_compute_tiling(buf->tile, group);
        if (group->io_type == AUTOSA_INT_IO && i == 1)
        {
          compute_io_tiling_at_PE(kernel, group);
        }
        if (gen->options->autosa->tuning_method == 1) {
          buf->tuning_tile = TP_infer_tiled_array(gen, kernel, node, group, 1, 1);
          if (group->io_type == AUTOSA_INT_IO && i == 1) {
            /* NOTE(review): same remark as for the drain group, "new_node"
             * is not the node handed to TP_infer_tiled_array.
             */
            isl_schedule_node *new_node = isl_schedule_get_root(kernel->schedule);
            new_node = autosa_tree_move_down_to_pe(new_node, kernel->core);
            group->tuning_pe_tile = TP_infer_tiled_array(gen, kernel, node, group, 1, 1);
            isl_schedule_node_free(new_node);
          }
        }
      }
      else
      {
        buf->tile = NULL;
      }
    }
    else
    {
      buf->tile = NULL;
    }
    if (two_level_buffer)
    {
      if (i == io_level)
      {
        /* Compute the group tiling at the outermost I/O module. */
        if (group->group_type == AUTOSA_DRAIN_GROUP)
          compute_group_bounds_drain_at_node(kernel, group, node_cp, buf);
        else if (group->group_type == AUTOSA_IO_GROUP)
          compute_group_bounds_io_at_node(kernel, group, node_cp, buf);

        autosa_array_ref_group_compute_tiling(buf->tile, group);
      }
    }
    isl_schedule_node_free(node_cp);
    i++;
  }

  isl_schedule_node_free(node);

  return isl_stat_ok;
}

/* Adjust the fields of "tile" to reflect the new input dimension "depth".
 * The dimension beyond "depth" are assumed not to affect the tile,
 * so they can simply be dropped.
 */
static int tile_adjust_depth(struct autosa_array_tile *tile, int depth)
{
  int i;

  if (tile->depth == depth)
    return 0;

  for (i = 0; i < tile->n; ++i)
  {
    tile->bound[i].lb = isl_aff_drop_dims(tile->bound[i].lb,
                                          isl_dim_in, depth, tile->depth - depth);
    if (!tile->bound[i].lb)
      return -1;
    if (!tile->bound[i].shift)
      continue;
    tile->bound[i].shift = isl_aff_drop_dims(tile->bound[i].shift,
                                             isl_dim_in, depth, tile->depth - depth);
    if (!tile->bound[i].shift)
      return -1;
  }

  tile->depth = depth;

  return 0;
}

/* Compute the number of outer schedule tile dimensions that affect
 * the offset of "tile".
 * The dimensions are scanned from the innermost one (tile->depth - 1)
 * outwards, stopping at the first one that is involved in the lower bound
 * or the shift of some tile dimension.
 * If there is no such dimension, then return the index
 * of the first kernel dimension, i.e., data->kernel_depth.
 */
static int compute_tile_depth(struct autosa_group_data *data,
                              struct autosa_array_tile *tile)
{
  int depth;

  for (depth = tile->depth; depth > data->kernel_depth; --depth)
  {
    int dim = depth - 1;
    int involved = 0;

    for (int k = 0; k < tile->n; ++k)
    {
      isl_aff *shift = tile->bound[k].shift;

      if (isl_aff_involves_dims(tile->bound[k].lb, isl_dim_in, dim, 1))
      {
        involved = 1;
        break;
      }
      if (shift && isl_aff_involves_dims(shift, isl_dim_in, dim, 1))
      {
        involved = 1;
        break;
      }
    }
    if (involved)
      break;
  }

  return depth;
}

/* Set tile->depth to the number of schedule dimensions that affect the
 * offset of the local tile "tile", but never below data->kernel_depth,
 * and trim the fields of the tile so that they only refer to those
 * outer schedule dimensions.
 *
 * Return isl_stat_error if the tile could not be adjusted.
 */
static isl_stat tile_set_depth(struct autosa_group_data *data,
                               struct autosa_array_tile *tile)
{
  int depth = compute_tile_depth(data, tile);

  return tile_adjust_depth(tile, depth) < 0 ? isl_stat_error : isl_stat_ok;
}

/* Internal struct used for update_group_simd.
 * It is passed as the "user" pointer of the schedule tree traversal.
 */
struct update_group_simd_data
{
  /* The array reference group whose n_lane may be updated. */
  struct autosa_array_ref_group *group;
  /* The kernel that provides the SIMD width (simd_w). */
  struct autosa_kernel *kernel;
  /* Set to 1 by update_group_simd once group->n_lane has been updated. */
  int updated;
};

/* Examine if there is any array reference in the "group" under the SIMD loop.
 * If so, examine if the array reference has a stride of 1 under the SIMD loop.
 * If so, update the SIMD lane of the "group" to kernel->simd_w and record
 * the update in data->updated.
 *
 * This is a callback for isl_schedule_node_foreach_descendant_top_down.
 * "node" is only borrowed (__isl_keep): the child below the "simd" mark is
 * reached through a private copy so that the traversal's own node is
 * neither moved nor consumed.
 *
 * Always return isl_bool_true so that the traversal continues.
 */
static isl_bool update_group_simd(__isl_keep isl_schedule_node *node, void *user)
{
  struct update_group_simd_data *data = (struct update_group_simd_data *)user;
  struct autosa_array_ref_group *group = data->group;
  isl_schedule_node *child;
  isl_union_set *domain;
  isl_id *id;
  const char *name;
  int is_simd;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
    return isl_bool_true;

  id = isl_schedule_node_mark_get_id(node);
  name = isl_id_get_name(id);
  is_simd = name && !strcmp(name, "simd");
  isl_id_free(id);
  if (!is_simd)
    return isl_bool_true;

  /* Domain of the statement instances that reach the SIMD loop. */
  child = isl_schedule_node_child(isl_schedule_node_copy(node), 0);
  domain = isl_schedule_node_get_domain(child);
  isl_schedule_node_free(child);

  for (int i = 0; i < group->n_ref; i++)
  {
    struct autosa_stmt_access *ref = group->refs[i];

    /* Only references with unit stride can update the lane count. */
    if (ref->simd_stride != 1)
      continue;

    for (int j = 0; j < ref->n_io_info; j++)
    {
      struct autosa_io_info *info = ref->io_info[j];
      isl_map *map;
      isl_set *src, *dest;
      isl_union_set *uset;

      if (info->io_type != group->io_type || isl_vec_cmp(info->dir, group->dir))
        continue;

      /* Test if either the source or dest of the dependence associated with
       * the array reference is intersected with the current loop domain. */
      map = isl_map_from_basic_map(isl_basic_map_copy(info->dep->isl_dep));
      map = isl_map_factor_domain(map);
      src = isl_map_domain(isl_map_copy(map));
      dest = isl_map_range(map);
      uset = isl_union_set_union(isl_union_set_from_set(src),
                                 isl_union_set_from_set(dest));
      uset = isl_union_set_intersect(uset, isl_union_set_copy(domain));
      if (!isl_union_set_is_empty(uset))
      {
        group->n_lane = data->kernel->simd_w;
        data->updated = 1;
      }
      isl_union_set_free(uset);
    }
  }
  isl_union_set_free(domain);

  return isl_bool_true;
}

/* Select the data pack factor for I/O buffers. For this function, the array
 * that the I/O group is associated with is a sparse matrix.
 * The unit of data packing factor is the non_zero_num elements + one offset.
 *
 * group->n_lane is first set to the SIMD lane count required by the group
 * (see update_group_simd) divided by kernel->vec_len. Each I/O buffer of the
 * group then gets the largest lane count that
 * - is a multiple of group->n_lane,
 * - does not exceed the upper bound of its level (data_pack_sizes option,
 *   or 8/32/64 bytes by default), and
 * - divides the (vec_len-compressed) size of the last tile dimension.
 * Every buffer is also marked as sparse and given kernel->vec_len.
 *
 * "max_n_lane" is currently unused.
 *
 * The program is aborted if no valid factor can be found.
 * Return isl_stat_error if the default bounds cannot be allocated.
 */
static isl_stat compute_io_group_data_pack_sparse(
  struct autosa_kernel *kernel, struct autosa_array_ref_group *group,
  struct autosa_gen *gen, int max_n_lane)
{
  isl_schedule_node *node;
  isl_union_map *sizes;
  int *data_pack_ubs = NULL;
  struct update_group_simd_data data;
  int ele_size = group->array->size; // bytes

  group->n_lane = 1;
  node = isl_schedule_get_root(kernel->schedule);
  data.group = group;
  data.kernel = kernel;
  data.updated = 0;
  isl_schedule_node_foreach_descendant_top_down(node, &update_group_simd, &data);
  isl_schedule_node_free(node);

  /* Update the group n_lane considering the sparse information */
  if (group->n_lane % kernel->vec_len != 0) {
    printf("[AutoSA] Error: The sparse block size is not a sub-multiple of the SIMD factor. Abort!\n");
    exit(1);
  }
  group->n_lane /= kernel->vec_len;

  /* If data packing is disabled, simply update the data packing factor of
   * each I/O buffer to the SIMD lanes that are required.
   */
  if (!gen->options->autosa->data_pack) {
    for (int i = 0; i < group->io_level; i++) {
      struct autosa_io_buffer *buf = group->io_buffers[i];
      buf->n_lane = group->n_lane;
      /* Update the sparse information */
      buf->sparse = 1;
      buf->vec_len = kernel->vec_len;
    }
    return isl_stat_ok;
  }

  int cur_n_lane = group->n_lane;
  int status = false;
  /* Parse the data pack settings. */
  sizes = extract_sizes_from_str(gen->ctx, gen->options->autosa->data_pack_sizes);
  data_pack_ubs = read_data_pack_sizes_array(sizes, group->array->name);
  if (!data_pack_ubs) {
    /* Use the default numbers. */
    data_pack_ubs = isl_alloc_array(gen->ctx, int, 3);
    if (!data_pack_ubs) {
      isl_union_map_free(sizes);
      return isl_stat_error;
    }
    data_pack_ubs[0] = 8;
    data_pack_ubs[1] = 32;
    data_pack_ubs[2] = 64;
  }

  int cur_max_n_lane;
  for (int i = 0; i < group->io_level; i++) {
    struct autosa_io_buffer *buf = group->io_buffers[i];
    int ub_bytes;
    int unit_bytes;

    /* Innermost and intermediate levels pack n_nzero elements plus one
     * offset byte; the outermost level packs the elements together with
     * the meta data.
     */
    if (i == 0) {
      ub_bytes = data_pack_ubs[0];
      unit_bytes = kernel->n_nzero * ele_size + 1;
    } else if (i < group->io_level - 1) {
      ub_bytes = data_pack_ubs[1];
      unit_bytes = kernel->n_nzero * ele_size + 1;
    } else {
      ub_bytes = data_pack_ubs[2];
      unit_bytes = (kernel->n_nzero + kernel->n_meta_data) * ele_size;
    }
    cur_max_n_lane = std::max(group->n_lane, ub_bytes / unit_bytes);

    /* The last tile dimension only exists for arrays with at least one index. */
    if (buf->tile && group->array->n_index > 0) {
      int n_lane = cur_n_lane;
      isl_val *size = isl_val_copy(buf->tile->bound[group->array->n_index - 1].size);
      if (i == group->io_level - 1 && group->local_array->host_serialize) {
        /* With host serialization the whole tile is treated as one dimension. */
        for (int n = 0; n < group->array->n_index - 1; n++) {
          size = isl_val_mul(size, isl_val_copy(buf->tile->bound[n].size));
        }
      }
      size = isl_val_div(size, isl_val_int_from_si(gen->ctx, kernel->vec_len));

      while (n_lane <= cur_max_n_lane) {
        /* The lane should be multiples of SIMD lane. */
        if (n_lane % group->n_lane == 0) {
          isl_val *val = isl_val_int_from_si(gen->ctx, n_lane);
          /* The lane should be sub-multiples of the last dim of the array. */
          if (isl_val_is_divisible_by(size, val)) {
            cur_n_lane = n_lane;
            status = true;
          }
          isl_val_free(val);
        }
        n_lane += 1;
      }
      if (status) {
        buf->n_lane = cur_n_lane;
      } else {
        printf("[AutoSA] Error: Cannot find data pack factors as sub-multiples of the last dim of the local array. Abort!\n");
        printf("[AutoSA] Please try to use different tiling factors.\n");
        exit(1);
      }
      isl_val_free(size);
    } else {
      buf->n_lane = cur_n_lane;
    }
    /* Update the sparse information */
    buf->sparse = 1;
    buf->vec_len = kernel->vec_len;
  }
  isl_union_map_free(sizes);
  free(data_pack_ubs);

  return isl_stat_ok;
}

/* Select the data pack factor for I/O buffers. The data pack factor
 * should be sub-multiples of the last dimension of the local array.
 * Meanwhile, it should also be sub-multiples of the data pack factors
 * selected for the upper-level I/O buffers.
 *
 * If SIMD vectorization is enabled, and the data stored in the I/O buffer is
 * to be vectorized, the data pack factor should also be multiples of the SIMD factor.
 *
 * "max_n_lane" is the maximal number of lanes of the DRAM port; -1 selects
 * the default of 64 bytes divided by the element size.
 *
 * The program is aborted if the SIMD lane count does not divide "max_n_lane"
 * or if no valid factor can be found.
 * Return isl_stat_error if the default bounds cannot be allocated.
 */
static isl_stat compute_io_group_data_pack(struct autosa_kernel *kernel,
                                           struct autosa_array_ref_group *group,
                                           struct autosa_gen *gen,
                                           int max_n_lane)
{
  isl_schedule_node *node;
  isl_union_map *sizes;
  int *data_pack_ubs = NULL;
  struct update_group_simd_data data;
  int ele_size = group->array->size; // bytes
  /* Given the maximal DRAM port width as 64 Bytes,
   * compute the maximal data pack factor. */
  if (max_n_lane == -1)
    max_n_lane = 64 / ele_size;
  /* Parse the data pack settings. */
  /* For L1 buffers, we restrain the fifo widths to be no more than 256 bits
   * given hardware consideration (on Xilinx).
   * Specifically, for FIFOs with depth * width > 512bits, HLS will
   * use BRAM/SRL to implement FIFOs, which could potentially increase
   * the BRAM/LUT usage by a great scale and cause routing failure.
   *
   * Furthermore, for L1 buffers reside at the io_L1 level (beside PEs), we
   * further restrain the FIFO widths to be no more than 64 bits to mitigate
   * the potential routing congestion.
   */
  sizes = extract_sizes_from_str(gen->ctx, gen->options->autosa->data_pack_sizes);
  data_pack_ubs = read_data_pack_sizes_array(sizes, group->array->name);
  if (!data_pack_ubs)
  {
    /* Use the default numbers. */
    data_pack_ubs = isl_alloc_array(gen->ctx, int, 3);
    if (!data_pack_ubs)
    {
      isl_union_map_free(sizes);
      return isl_stat_error;
    }
    data_pack_ubs[0] = 16;
    data_pack_ubs[1] = 64;
    data_pack_ubs[2] = 64;
  }

  /* Examine if any of the array reference in the group is in used by SIMD loop.
   * The default SIMD lane for the group is 1.
   * If any of the array references in the group is under the SIMD loop, and
   * if the stride of reference under the loop is one. The SIMD lane of the
   * group is then updated to the SIMD lane of the loop.
   */
  group->n_lane = 1;
  node = isl_schedule_get_root(kernel->schedule);
  data.group = group;
  data.kernel = kernel;
  data.updated = 0;
  isl_schedule_node_foreach_descendant_top_down(node, &update_group_simd, &data);
  isl_schedule_node_free(node);

  if (gen->options->autosa->tuning_method == 1) {
    /* Update the data packing factor */
    for (int i = 0; i < group->io_level; i++) {
      struct autosa_io_buffer *buf = group->io_buffers[i];
      if (buf->tuning_tile && buf->tuning_tile->data_pack_factor_inter == NULL) {
        /* Inter */
        class TPParameter *dp = new TPParameter("p" + std::to_string(kernel->tuning_program->params.size()));
        dp->tune = false;
        dp->attr = "data_pack_factor";
        dp->tags.insert("auto_infer");
        dp->tags.insert("power_of_two");
        /* Update the bounds */
        /* lb */
        if (data.updated == 0) {
          dp->bounds.push_back(std::make_shared<TPExpr>("literal", new TPConst(1)));
        } else {
          /* Find the SIMD tiling factor */
          for (auto param : kernel->tuning_program->params) {
            if (param->attr == "SIMD_tiling_factor") {
              dp->bounds.push_back(std::make_shared<TPExpr>("literal", param->dup()));
              dp->multiples.push_back(std::make_shared<TPExpr>("literal", param->dup()));
            }
          }
        }
        /* ub */
        int user_max_n_lane;
        if (i == 0)
          user_max_n_lane = data_pack_ubs[0] / ele_size;
        else if (i > 0 && i < group->io_level - 1)
          user_max_n_lane = data_pack_ubs[1] / ele_size;
        else
          user_max_n_lane = data_pack_ubs[2] / ele_size;
        TPExpr *ub = buf->tuning_tile->sizes[buf->tuning_tile->sizes.size() - 1]->dup();
        ub = ub->min(new TPExpr("literal", new TPConst(user_max_n_lane)));
        ub = ub->max(dp->bounds[0]->dup());
        dp->bounds.push_back(std::shared_ptr<TPExpr>(ub));
        dp->divisors.push_back(std::shared_ptr<TPExpr>(buf->tuning_tile->sizes[buf->tuning_tile->sizes.size() - 1]->dup()));
        assert(dp->bounds.size() == 2);
        buf->tuning_tile->data_pack_factor_inter = dp;
        kernel->tuning_program->params.push_back(dp);
        kernel->tuning_program->param_map[dp->name] = dp;

        /* Intra */
        if (data.updated == 0) {
          buf->tuning_tile->data_pack_factor_intra = std::make_shared<TPExpr>("literal", new TPConst(1));
        } else {
          /* Find the SIMD tiling factor */
          for (auto param : kernel->tuning_program->params) {
            if (param->attr == "SIMD_tiling_factor") {
              buf->tuning_tile->data_pack_factor_intra = std::make_shared<TPExpr>("literal", param->dup());
            }
          }
        }

        /* Only the first buffer without an inter factor is handled. */
        break;
      }
    }
  }

  if (max_n_lane % group->n_lane != 0)
  {
    printf("[AutoSA] Error: The data is not aligned to the DRAM port. Abort!\n");
    printf("[AutoSA] Please try to use a SIMD factor as sub-multiples of %d.\n", max_n_lane);
    exit(1);
  }

  /* If data packing is disabled, simply update the data packing factor of
   * each I/O buffer to the SIMD lanes that are required.
   */
  if (!gen->options->autosa->data_pack)
  {
    for (int i = 0; i < group->io_level; i++)
    {
      struct autosa_io_buffer *buf = group->io_buffers[i];
      buf->n_lane = group->n_lane;
    }
    /* The settings parsed above are no longer needed. */
    isl_union_map_free(sizes);
    free(data_pack_ubs);
    return isl_stat_ok;
  }

  int cur_n_lane = group->n_lane;
  int status = false;
  int cur_max_n_lane;
  for (int i = 0; i < group->io_level; i++)
  {
    struct autosa_io_buffer *buf = group->io_buffers[i];
    isl_val *size = NULL;
    int ub_bytes;

    if (i == 0)
      ub_bytes = data_pack_ubs[0];
    else if (i < group->io_level - 1)
      ub_bytes = data_pack_ubs[1];
    else
      ub_bytes = data_pack_ubs[2];
    cur_max_n_lane = std::max(group->n_lane, ub_bytes / ele_size);

    /* Pick the size of the last tile dimension that the factor has to divide.
     * The last dimension only exists for arrays with at least one index.
     */
    if (group->array->n_index > 0)
    {
      int last_dim = group->array->n_index - 1;

      if (buf->tile)
      {
        size = isl_val_copy(buf->tile->bound[last_dim].size);
      }
      else if (i == group->io_level - 1 && !gen->options->autosa->host_serialize)
      {
        /* If it is the outermost loop, try to extend the data packing factor again.
         * If the host serialization is enabled, as there is a re-packing later.
         * We won't do anything here.
         */
        /* Locate the closest lower-level buffer that owns a tile. */
        for (int j = i - 1; j >= 0; j--)
        {
          struct autosa_io_buffer *nxt_buf = group->io_buffers[j];
          if (nxt_buf->tile)
          {
            size = isl_val_copy(nxt_buf->tile->bound[last_dim].size);
            break;
          }
        }
      }
    }

    /* Without a tile to align to, inherit the factor selected so far. */
    if (!size)
    {
      buf->n_lane = cur_n_lane;
      continue;
    }

    int n_lane = cur_n_lane;
    while (n_lane <= cur_max_n_lane)
    {
      /* The lane should be multiples of SIMD lane. */
      if (n_lane % group->n_lane == 0)
      {
        isl_val *val = isl_val_int_from_si(gen->ctx, n_lane);
        /* The lane should be sub-multiples of the last dim of the array. */
        if (isl_val_is_divisible_by(size, val))
        {
          cur_n_lane = n_lane;
          status = true;
        }
        isl_val_free(val);
      }
      n_lane = n_lane * 2;
    }
    if (status)
    {
      buf->n_lane = cur_n_lane;
    }
    else
    {
      printf("[AutoSA] Error: Cannot find data pack factors as sub-multiples of the last dim of the local array. Abort!\n");
      printf("[AutoSA] Please try to use different tiling factors.\n");
      exit(1);
    }
    isl_val_free(size);
  }
  isl_union_map_free(sizes);
  free(data_pack_ubs);

  return isl_stat_ok;
}

/* Lift up the L1 I/O buffer between the parallel loops and non-parallel loops
 * in the array loop band.
 * If there is no array loop band, lift up the L1 I/O buffer above the array mark.
 *
 * The L1 buffer is the first buffer of "group" that owns a tile (or the last
 * buffer if none does). Its tile is released and recomputed at the new
 * position in a copy of group->io_schedule into which the I/O module ids
 * have been inserted.
 *
 * "data" is currently unused.
 *
 * Return isl_stat_error if the group has no I/O buffer.
 */
static isl_stat hoist_L1_io_buffer_local_reduce(
  struct autosa_kernel *kernel,
  struct autosa_array_ref_group *group,
  struct autosa_gen *gen,
  struct autosa_group_data *data)
{
  struct autosa_io_buffer *cur_buffer = NULL;
  isl_schedule_node *node;

  /* Find the L1 buffer. */
  for (int i = 1; i <= group->io_level; i++)
  {
    cur_buffer = group->io_buffers[i - 1];
    if (cur_buffer->tile)
      break;
  }
  if (!cur_buffer)
    return isl_stat_error;

  /* Drop the old tile; clear the pointer so it never dangles. */
  autosa_array_tile_free(cur_buffer->tile);
  cur_buffer->tile = NULL;

  node = isl_schedule_get_root(group->io_schedule);
  node = autosa_tree_move_down_to_io_mark(node, kernel->core, cur_buffer->level);
  node = insert_io_module_ids(gen, kernel, node, group->space_dim, cur_buffer->level);
  node = autosa_tree_move_up_to_array(node);

  if (kernel->array_part_w > 0) {
    int n, pos;

    /* Look for the innermost coincident member of the array band and
     * position "node" right below it, splitting the band if needed.
     */
    node = isl_schedule_node_parent(node);
    n = isl_schedule_node_band_n_member(node);
    for (pos = n - 1; pos >= 0; pos--)
    {
      if (isl_schedule_node_band_member_get_coincident(node, pos))
        break;
    }
    if (pos != n - 1)
      node = isl_schedule_node_band_split(node, pos + 1);
    node = isl_schedule_node_child(node, 0);
  }

  if (group->group_type == AUTOSA_DRAIN_GROUP)
    compute_group_bounds_drain_at_node(kernel, group, node, cur_buffer);
  else if (group->group_type == AUTOSA_IO_GROUP)
    compute_group_bounds_io_at_node(kernel, group, node, cur_buffer);
  autosa_array_ref_group_compute_tiling(cur_buffer->tile, group);

  /* "node" was only needed to compute the bounds. */
  isl_schedule_node_free(node);

  return isl_stat_ok;
}

struct update_int_io_L1_buffer_data {
  struct autosa_array_ref_group *group;  
  struct autosa_kernel *kernel;
  bool inserted;
  bool tile_computed;
  int depth;
};

static __isl_give isl_schedule_node *update_int_io_L1_depth(__isl_take isl_schedule_node *node, void *user)
{
  struct update_int_io_L1_buffer_data *data = (struct update_int_io_L1_buffer_data *)user;
  int under_simd, n;
  struct autosa_array_ref_group *group;
  isl_schedule_node *insert_node = NULL;  
  isl_union_set *domain;
  int is_carried = 0;

  if (data->inserted)
    return node;
  /* Examine if the node is under the SIMD mark */
  under_simd = is_node_under_simd(node);
  if (under_simd)
    return node;
  
  if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
    return node;

  domain = isl_schedule_node_get_domain(node);
  if (isl_union_set_is_empty(domain)) {
    isl_union_set_free(domain);
    return node;
  }
  isl_union_set_free(domain);

//#ifdef _DEBUG
//  DBGSCHDNODE(stdout, node, isl_schedule_node_get_ctx(node));
//#endif

  n = isl_schedule_node_band_n_member(node);
  /* Examine if the dependences of the current I/O group are carreid by the current band. */
  group = data->group;
  for (int i = 0; i < n; i++) {
    isl_schedule_node *node_tmp = isl_schedule_node_copy(node);
    if (n > 1) {
      if (i > 0) {
        node_tmp = isl_schedule_node_band_split(node_tmp, i);
        node_tmp = isl_schedule_node_child(node_tmp, 0);
      }
      if (n - i - 1 > 0) {
        node_tmp = isl_schedule_node_band_split(node_tmp, 1);
      }
    }

    for (int j = 0; j < group->n_ref; j++) {
      struct autosa_stmt_access *ref = group->refs[j];
      for (int k = 0; k < ref->n_io_info; k++) {
        struct autosa_io_info *io_info = ref->io_info[k];
        if (io_info->io_type == group->io_type && 
            !isl_vec_cmp(io_info->dir, group->dir)) {
          if (is_dep_carried_by_node(io_info->dep->isl_dep, node_tmp)) {
            ///* Insert the I/O buffer below the current node */
            //insert_node = isl_schedule_node_copy(node_tmp);
            //insert_node = isl_schedule_node_child(insert_node, 0);
            is_carried = 1;
            break;
          }
        }
      }
      if (is_carried)
        break;      
    }

    if (is_carried) {
      insert_node = isl_schedule_node_copy(node_tmp);
      //insert_node = isl_schedule_node_child(insert_node, 0);
      isl_schedule_node_free(node_tmp);
      break;
    }

    isl_schedule_node_free(node_tmp);
  }

//#ifdef _DEBUG
//  DBGSCHDNODE(stdout, insert_node, isl_schedule_node_get_ctx(insert_node));
//#endif

  if (insert_node) {
    data->depth = isl_schedule_node_get_schedule_depth(insert_node);
    data->inserted = true;
    isl_schedule_node_free(insert_node);
  }
  
  return node;
}

/* Recompute the tile of the L1 I/O buffer of data->group at schedule depth
 * data->depth.
 *
 * Only non-empty band nodes are examined. If the band sits above
 * data->depth, its child is examined instead. When the examined position has
 * schedule depth data->depth, the tile of the L1 buffer (the first buffer of
 * the group that owns a tile) is released and recomputed there, and
 * data->tile_computed is set so that later invocations do nothing.
 *
 * The returned node points to the same position as the input node, as is
 * done by insert_io_L1_mark.
 */
static __isl_give isl_schedule_node *update_int_io_L1_buffer(
  __isl_take isl_schedule_node *node, void *user)
{
  struct update_int_io_L1_buffer_data *data = (struct update_int_io_L1_buffer_data *)user;
  isl_union_set *domain;
  struct autosa_array_ref_group *group;
  int moved_down = 0;

  if (data->tile_computed)
    return node;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
    return node;

  domain = isl_schedule_node_get_domain(node);
  if (isl_union_set_is_empty(domain)) {
    isl_union_set_free(domain);
    return node;
  }
  isl_union_set_free(domain);

  if (isl_schedule_node_get_schedule_depth(node) < data->depth) {
    /* Check the child node */
    node = isl_schedule_node_child(node, 0);
    moved_down = 1;
  }

  if (isl_schedule_node_get_schedule_depth(node) == data->depth) {
    /* Find the L1 buffer */
    struct autosa_io_buffer *cur_buffer = NULL;
    group = data->group;
    for (int i = 1; i <= group->io_level; i++) {
      cur_buffer = group->io_buffers[i - 1];
      if (cur_buffer->tile)
        break;
    }

    if (cur_buffer) {
      /* Drop the old tile; clear the pointer so it never dangles. */
      autosa_array_tile_free(cur_buffer->tile);
      cur_buffer->tile = NULL;
      if (group->group_type == AUTOSA_DRAIN_GROUP)
        compute_group_bounds_drain_at_node(data->kernel, group, node, cur_buffer);
      else if (group->group_type == AUTOSA_IO_GROUP)
        compute_group_bounds_io_at_node(data->kernel, group, node, cur_buffer);
      autosa_array_ref_group_compute_tiling(cur_buffer->tile, group);

      data->tile_computed = true;
    }
  }

  /* Hand back the position we were given. */
  if (moved_down)
    node = isl_schedule_node_parent(node);

  return node;
}

//static __isl_give isl_schedule_node *update_int_io_L1_buffer(__isl_take isl_schedule_node *node, void *user)
//{
//  struct update_int_io_L1_buffer_data *data = (struct update_int_io_L1_buffer_data *)user;
//  int under_simd, n;
//  struct autosa_array_ref_group *group;
//  isl_schedule_node *insert_node = NULL;  
//  isl_union_set *domain;
//  int is_carried = 0;
//
//  if (data->inserted)
//    return node;
//  /* Examine if the node is under the SIMD mark */
//  under_simd = is_node_under_simd(node);
//  if (under_simd)
//    return node;
//  
//  if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
//    return node;
//
//  domain = isl_schedule_node_get_domain(node);
//  if (isl_union_set_is_empty(domain)) {
//    isl_union_set_free(domain);
//    return node;
//  }
//  isl_union_set_free(domain);
//
//  n = isl_schedule_node_band_n_member(node);
//  /* Examine if the dependences of the current I/O group are carreid by the current band. */
//  group = data->group;
//  for (int i = 0; i < n; i++) {
//    isl_schedule_node *node_tmp = isl_schedule_node_copy(node);
//    if (n > 1) {
//      if (i > 0) {
//        node_tmp = isl_schedule_node_band_split(node_tmp, i);
//        node_tmp = isl_schedule_node_child(node_tmp, 0);
//      }
//      if (n - i - 1 > 0) {
//        node_tmp = isl_schedule_node_band_split(node_tmp, 1);
//      }
//    }
//
//    for (int j = 0; j < group->n_ref; j++) {
//      struct autosa_stmt_access *ref = group->refs[j];
//      for (int k = 0; k < ref->n_io_info; k++) {
//        struct autosa_io_info *io_info = ref->io_info[k];
//        if (io_info->io_type == group->io_type && 
//            !isl_vec_cmp(io_info->dir, group->dir)) {
//          if (is_dep_carried_by_node(io_info->dep->isl_dep, node_tmp)) {
//            ///* Insert the I/O buffer below the current node */
//            //insert_node = isl_schedule_node_copy(node_tmp);
//            //insert_node = isl_schedule_node_child(insert_node, 0);
//            is_carried = 1;
//            break;
//          }
//        }
//      }
//      if (is_carried)
//        break;      
//    }
//
//    if (!is_carried) {
//      insert_node = isl_schedule_node_copy(node_tmp);
//      insert_node = isl_schedule_node_child(insert_node, 0);
//      break;
//    }
//
//    isl_schedule_node_free(node_tmp);
//  }
//
//  if (insert_node) {      
////#ifdef _DEBUG
////    DBGSCHDNODE(stdout, insert_node, isl_schedule_node_get_ctx(insert_node));
////#endif
//
//    /* Find the L1 buffer */
//    struct autosa_io_buffer *cur_buffer;
//    for (int i = 1; i < group->io_level; i++) {
//      cur_buffer = group->io_buffers[i - 1];
//      if (cur_buffer->tile)
//        break;
//    }
//    autosa_array_tile_free(cur_buffer->tile);
//    if (group->group_type == AUTOSA_DRAIN_GROUP)
//      compute_group_bounds_drain_at_node(data->kernel, group, insert_node, cur_buffer);
//    else if (group->group_type == AUTOSA_IO_GROUP)
//      compute_group_bounds_io_at_node(data->kernel, group, insert_node, cur_buffer);
//    autosa_array_ref_group_compute_tiling(cur_buffer->tile, group);
//
////#ifdef _DEBUG    
////    printf("%d\n", cur_buffer->tile->depth);
////#endif
//
//    isl_schedule_node_free(insert_node);
//    data->inserted = true;
//  }
//  
//  return node;
//}

/* Insert an "io_L1" mark right below "node" if "node" is a band node whose
 * schedule depth equals the integer that "user" points to.
 * The returned node points to the same position as the input node.
 */
static __isl_give isl_schedule_node *insert_io_L1_mark(
  __isl_take isl_schedule_node *node, void *user)
{
  const int target_depth = *(int *)user;
  isl_id *mark_id;

  if (isl_schedule_node_get_schedule_depth(node) != target_depth)
    return node;
  if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
    return node;

  mark_id = isl_id_alloc(isl_schedule_node_get_ctx(node), "io_L1", NULL);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_insert_mark(node, mark_id);

  /* Move back up to the band. */
  return isl_schedule_node_parent(node);
}

/* Generate a new I/O schedule in which the L1 I/O buffer is lowered.
 * Working on a duplicate of "schedule", the existing io_L1 mark is deleted,
 * the band above it is sunk to schedule depth (depth - 1), and a fresh
 * "io_L1" mark is inserted below every band node at that depth.
 *
 * This function assumes that the entire schedule tree is fully permutable.
 * The legality should be checked before calling this function.
 *
 * "schedule" itself is left untouched; the caller owns the returned schedule.
 */
static __isl_give isl_schedule *generate_io_L1_lower_schedule(
  __isl_keep isl_schedule *schedule,
  struct autosa_kernel *kernel,
  int depth)
{
  isl_schedule *result;
  isl_schedule_node *node;
  int mark_depth = depth - 1;

  /* Obtain a root node that belongs to a private copy of the schedule. */
  result = isl_schedule_dup(schedule);
  node = isl_schedule_get_root(result);
  isl_schedule_free(result);

  /* Remove the current io_L1 mark and step up to the L1 band. */
  node = autosa_tree_move_down_to_io_mark(node, kernel->core, 1);
  node = isl_schedule_node_delete(node);
  node = isl_schedule_node_parent(node);

  /* Sink the L1 band to (depth - 1) */
  node = autosa_node_sink_to_depth(node, mark_depth);

  /* Insert the io_L1 mark */
  node = isl_schedule_node_map_descendant_bottom_up(node, &insert_io_L1_mark, &mark_depth);

  result = isl_schedule_node_get_schedule(node);
  isl_schedule_node_free(node);

  return result;
}

/* Try to lower the L1 buffer of an interior I/O module that serves an
 * external array, in order to reduce the memory resource usage.
 *
 * Groups that are not interior I/O, or whose array is not an external
 * array, are left alone and isl_stat_ok is returned immediately.
 *
 * Otherwise the function works in two passes:
 *  1. On a copy of the group's I/O schedule, the I/O group domain filter is
 *     inserted below the kernel mark and update_int_io_L1_depth() is run
 *     over the descendants. That callback (defined elsewhere) reports its
 *     result through "data"; per the original description it looks for the
 *     boundary between non-parallel and parallel loops from innermost.
 *  2. If the callback flagged data.inserted, the lowered schedule is
 *     generated at data.depth, stored in group->io_L1_lower_schedule, and
 *     update_int_io_L1_buffer() is run over it to update the L1 buffer.
 *
 * Always returns isl_stat_ok.
 */
static isl_stat lower_int_io_L1_buffer(
  struct autosa_kernel *kernel,
  struct autosa_array_ref_group *group,
  struct autosa_gen *gen)
{
  if (group->io_type != AUTOSA_INT_IO ||
      group->local_array->array_type != AUTOSA_EXT_ARRAY)
    return isl_stat_ok;

  struct update_int_io_L1_buffer_data data = {group, kernel, false, false, -1};

  /* Pass 1: determine the depth at which the buffer should be inserted. */
  isl_schedule_node *probe = isl_schedule_get_root(group->io_schedule);
  probe = autosa_tree_move_down_to_kernel(probe);
  /* Insert the domain filter for the current I/O group.
   * Note from the original author: this only works for copy-in modules.
   */
  probe = insert_io_group_domain(probe, group, kernel, gen, 1);
  probe = isl_schedule_node_map_descendant_bottom_up(probe, &update_int_io_L1_depth, &data);
  isl_schedule_node_free(probe);

  if (!data.inserted)
    return isl_stat_ok;

  /* Pass 2: generate the new I/O schedule and update the L1 buffer on it. */
  group->io_L1_lower_schedule =
    generate_io_L1_lower_schedule(group->io_schedule, kernel, data.depth);

  isl_schedule_node *lowered_root = isl_schedule_get_root(group->io_L1_lower_schedule);
  lowered_root = isl_schedule_node_map_descendant_bottom_up(lowered_root, &update_int_io_L1_buffer, &data);
  isl_schedule_node_free(lowered_root);

  return isl_stat_ok;
}

/* Used when lowering of the I/O L1 buffer is enabled.
 * An extra second-level buffer is inserted to increase the effective
 * DRAM bandwidth.
 *
 * Only interior I/O groups of external arrays are handled; any other group
 * is skipped. For a handled group, the buffer at the outermost I/O level
 * (io_buffers[io_level - 1]) gets its bounds recomputed at the array node
 * of group->io_L1_lower_schedule, followed by a recomputation of its tiling.
 *
 * Always returns isl_stat_ok.
 */
static isl_stat insert_L2_io_buffer(
  struct autosa_kernel *kernel,
  struct autosa_array_ref_group *group,
  struct autosa_gen *gen
){
  if (group->io_type != AUTOSA_INT_IO ||
      group->local_array->array_type != AUTOSA_EXT_ARRAY)
    return isl_stat_ok;

  isl_schedule_node *array_node = isl_schedule_get_root(group->io_L1_lower_schedule);
  array_node = autosa_tree_move_down_to_array(array_node, kernel->core);

  struct autosa_io_buffer *top_buffer = group->io_buffers[group->io_level - 1];

  switch (group->group_type) {
  case AUTOSA_DRAIN_GROUP:
    compute_group_bounds_drain_at_node(kernel, group, array_node, top_buffer);
    break;
  case AUTOSA_IO_GROUP:
    compute_group_bounds_io_at_node(kernel, group, array_node, top_buffer);
    break;
  default:
    /* Other group types: bounds are not recomputed. */
    break;
  }

  autosa_array_ref_group_compute_tiling(top_buffer->tile, group);
  isl_schedule_node_free(array_node);

  return isl_stat_ok;
}

/* This function hoists the L1 I/O buffer to save the data communication.
 * It tries to hoist up the buffer if the local buffer size is irrelevant to
 * the outer loop.
 *
 * The buffer examined is the one at the highest I/O level (searching from
 * group->io_level downwards) that already owns a tile. Its tile is never
 * modified: it is saved up front and restored before returning. Only the
 * fields "hoist_depth" and "hoist_domain" of that buffer are written:
 *  - hoist_depth:  schedule depth of the outermost split position at which
 *                  the recomputed tile sizes still equal the original ones,
 *                  or -1 if no such position was found.
 *  - hoist_domain: the union set computed for that position (see below),
 *                  or NULL if no such position was found.
 *
 * "data" is not used by this function.
 * Always returns isl_stat_ok.
 */
static isl_stat hoist_L1_io_buffer(
  struct autosa_kernel *kernel, 
  struct autosa_array_ref_group *group,
  struct autosa_gen *gen,
  struct autosa_group_data *data  
) {
  struct autosa_io_buffer *cur_buffer;
  int io_level = group->io_level;
  isl_schedule_node *node, *node_cp;
  int n, i;  
  /* NOTE(review): cur_dims is never used. */
  std::vector<isl_val *> cur_dims;
  /* Sizes of the original tile; the pointers are borrowed from that tile. */
  std::vector<isl_val *> prev_dims;
  isl_union_set *L1_io_buffer_domain = NULL;
  int L1_io_buffer_depth = -1;
  
  struct autosa_array_tile *cur_tile;

  /* Pick the buffer at the highest I/O level that has a tile.
   * NOTE(review): if io_level < 1, cur_buffer stays uninitialized, and if no
   * buffer has a tile the code below dereferences a NULL tile - confirm that
   * callers guarantee at least one tiled buffer.
   */
  for (int i = io_level; i >= 1; i--) {
    cur_buffer = group->io_buffers[i - 1];
    if (cur_buffer->tile)
      break;
  }

  for (int i = 0; i < cur_buffer->tile->n; i++) {
    prev_dims.push_back(cur_buffer->tile->bound[i].size);
  }    
  /* Remember the original tile so that it can be restored at the end. */
  cur_tile = cur_buffer->tile;

  node = isl_schedule_get_root(group->io_schedule);
  //DBGSCHDNODE(stdout, node, gen->ctx);  
  /* Insert the filter ids. */
  node = autosa_tree_move_down_to_io_mark(node, kernel->core, cur_buffer->level);  
  node = insert_io_module_ids(gen, kernel, node, group->space_dim, cur_buffer->level);  
  /* Move to the parent of the array mark; the code below treats it as a band. */
  node = autosa_tree_move_up_to_array(node);
  node = isl_schedule_node_parent(node);
  //DBGSCHDNODE(stdout, node, gen->ctx);  
  n = isl_schedule_node_band_n_member(node);  
  /* Try split positions from innermost (n - 1) to outermost (1). Each
   * iteration works on a fresh copy of "node", so "node" itself is never
   * split.
   */
  for (i = n - 1; i > 0; i--) {
    node_cp = isl_schedule_node_copy(node);
    node_cp = isl_schedule_node_band_split(node_cp, i);
    node_cp = isl_schedule_node_child(node_cp, 0);
    /* Recompute the buffer bounds at this position. Presumably these
     * helpers store a newly allocated tile in cur_buffer->tile (it is freed
     * a few lines below) - TODO confirm against their definitions.
     */
    if (group->group_type == AUTOSA_DRAIN_GROUP)
      compute_group_bounds_drain_at_node(kernel, group, node_cp, cur_buffer);
    else if (group->group_type == AUTOSA_IO_GROUP)
      compute_group_bounds_io_at_node(kernel, group, node_cp, cur_buffer);
    autosa_array_ref_group_compute_tiling(cur_buffer->tile, group);
    /* Test if any dimension of the tile differs from the original tile. */    
    bool is_equal = true;
    for (int d = 0; d < cur_buffer->tile->n; d++) {
      if (!isl_val_eq(cur_buffer->tile->bound[d].size, prev_dims[d])) {
        //DBGVAL(stdout, cur_buffer->tile->bound[d].size, gen->ctx);
        //DBGVAL(stdout, prev_dims[d], gen->ctx);
        is_equal = false;
        break;
      }
    }    
    /* The trial tile is no longer needed. cur_buffer->tile is stale from
     * here until it is overwritten by the next iteration or restored after
     * the loop.
     */
    autosa_array_tile_free(cur_buffer->tile);    
    if (!is_equal) {            
      /* The tile size changed: stop, keeping the last accepted position. */
      isl_schedule_node_free(node_cp);
      break;
    } else {      
      /* Same tile size: accept this position and replace any domain
       * recorded for a previously accepted (deeper) position.
       */
      L1_io_buffer_depth = isl_schedule_node_get_schedule_depth(node_cp);
      L1_io_buffer_domain = isl_union_set_free(L1_io_buffer_domain);
      /* Compute the domain. */      
      isl_union_map *partial = isl_schedule_node_band_get_partial_schedule_union_map(node_cp);
      /* Delete the module id filter */
      node_cp = autosa_tree_move_up_to_kernel(node_cp);
      node_cp = isl_schedule_node_child(node_cp, 0); 
      node_cp = isl_schedule_node_child(node_cp, 0); 
      node_cp = isl_schedule_node_delete(node_cp);
      node_cp = autosa_tree_move_down_to_array(node_cp, kernel->core);
      node_cp = isl_schedule_node_parent(node_cp);
      /* Restrict the partial schedule to the domain reaching this node, then
       * keep only the domain elements mapped to the lexicographic minimum
       * of the schedule range.
       */
      isl_union_set *domain = isl_schedule_node_get_domain(node_cp);
      partial = isl_union_map_intersect_domain(partial, domain);
      isl_union_set *range = isl_union_map_range(isl_union_map_copy(partial));      
      range = isl_union_set_lexmin(range);      
      partial = isl_union_map_intersect_range(partial, range);      
      L1_io_buffer_domain = isl_union_map_domain(partial);
      isl_schedule_node_free(node_cp);
    }
  }  
  isl_schedule_node_free(node);
  /* Restore the original tile and publish the hoisting result. */
  cur_buffer->tile = cur_tile;
  cur_buffer->hoist_depth = L1_io_buffer_depth;
  cur_buffer->hoist_domain = L1_io_buffer_domain;

  return isl_stat_ok;
}

/* This function tries to hoist the L2 I/O buffer to increase the memory
 * coalescing.
 *
 * "Current" buffer: io_buffers[io_level - 1] (the L2 buffer).
 * "Next" buffer:    the buffer at the highest lower I/O level that owns a
 *                   tile (the L1 buffer).
 *
 * Steps:
 *  1. Compare the last tile dimension of the current and the next buffer.
 *  2. If they are equal, free the current tile and walk the split positions
 *     of the band above the array mark from innermost to outermost,
 *     recomputing the tile at each one, until the last dimension differs
 *     from that of the next buffer.
 *     If no split position changes the last dimension, the tile is
 *     recomputed once more at the child of the unsplit band.
 *  3. Call tile_set_depth() and check whether the tile depth changed.
 *     - Last dimension still equal and depth unchanged: the buffer is only
 *       kept if (last dimension * element size) is a multiple of 64 bytes
 *       (512 bits, the maximal DRAM port width); otherwise the tile is
 *       freed and set to NULL, i.e. no L2 buffer is generated. This matters
 *       because on Xilinx FPGAs the on-chip fifo width is limited to
 *       256 bits, so repacking to 512 bits at the L2 buffer can improve the
 *       effective DRAM bandwidth.
 *     - Otherwise, if the depth changed, the tiling is recomputed.
 *
 * If the group has no L2 tile, or no lower-level buffer with a tile to
 * compare against, nothing is changed.
 *
 * "gen" is only forwarded to insert_io_module_ids().
 * Always returns isl_stat_ok.
 */
static isl_stat hoist_L2_io_buffer(
  struct autosa_kernel *kernel,
  struct autosa_array_ref_group *group, 
  struct autosa_gen *gen,
  struct autosa_group_data *data)
{
  struct autosa_io_buffer *cur_buffer;
  struct autosa_io_buffer *nxt_buffer = NULL;
  int io_level = group->io_level;
  bool is_last_dim_equal = false;
  isl_val *cur_last_dim, *nxt_last_dim;
  isl_schedule_node *node, *node_cp;
  int i, n;
  int old_depth, new_depth;

  /* Without at least one I/O level there is no L2 buffer to look at. */
  if (io_level < 1)
    return isl_stat_ok;

  cur_buffer = group->io_buffers[io_level - 1];
  for (int level = io_level - 1; level >= 1; level--)
  {
    nxt_buffer = group->io_buffers[level - 1];
    if (nxt_buffer->tile)
      break;
  }

  /* Both tiles are dereferenced below; bail out instead of reading an
   * uninitialized pointer or a NULL tile when either one is missing.
   */
  if (!cur_buffer || !cur_buffer->tile || !nxt_buffer || !nxt_buffer->tile)
    return isl_stat_ok;

  /* Compare if the last dimension of the current buffer
   * and the next buffer equals.
   */
  cur_last_dim = cur_buffer->tile->bound[cur_buffer->tile->n - 1].size;
  nxt_last_dim = nxt_buffer->tile->bound[nxt_buffer->tile->n - 1].size;
  is_last_dim_equal = isl_val_eq(cur_last_dim, nxt_last_dim);

  if (is_last_dim_equal)
  {
    /* Try to hoist the io buffer until the last dimension is increased. */
    autosa_array_tile_free(cur_buffer->tile);
    node = isl_schedule_get_root(group->io_schedule);
    /* Insert the filter ids. */
    node = autosa_tree_move_down_to_io_mark(node, kernel->core, io_level);
    node = insert_io_module_ids(gen, kernel, node, group->space_dim, io_level);    
    node = autosa_tree_move_up_to_array(node);    
    node = isl_schedule_node_parent(node);
    n = isl_schedule_node_band_n_member(node);
    /* Each trial works on a copy, so "node" itself is never split. */
    for (i = n - 1; i > 0; i--)
    {
      node_cp = isl_schedule_node_copy(node);
      node_cp = isl_schedule_node_band_split(node_cp, i);
      node_cp = isl_schedule_node_child(node_cp, 0);
      if (group->group_type == AUTOSA_DRAIN_GROUP)
        compute_group_bounds_drain_at_node(kernel, group, node_cp, cur_buffer);
      else if (group->group_type == AUTOSA_IO_GROUP)
        compute_group_bounds_io_at_node(kernel, group, node_cp, cur_buffer);
      autosa_array_ref_group_compute_tiling(cur_buffer->tile, group);
      /* Test if the last dim is increased. */
      cur_last_dim = cur_buffer->tile->bound[cur_buffer->tile->n - 1].size;      
      is_last_dim_equal = isl_val_eq(cur_last_dim, nxt_last_dim);
      isl_schedule_node_free(node_cp);
      if (!is_last_dim_equal)
      {
        /* Keep the tile computed at this position. */
        break;
      }
      autosa_array_tile_free(cur_buffer->tile);
    }
    if (i == 0)
    {
      /* In this case, none of the second level array part loops helps 
       * increase the burst length. We will allocate the buffer again 
       * below the unsplit band and try to hoist up the buffer 
       * to save the communication. 
       */
      node = isl_schedule_node_child(node, 0);
      if (group->group_type == AUTOSA_DRAIN_GROUP)
        compute_group_bounds_drain_at_node(kernel, group, node, cur_buffer);
      else if (group->group_type == AUTOSA_IO_GROUP)
        compute_group_bounds_io_at_node(kernel, group, node, cur_buffer);
      autosa_array_ref_group_compute_tiling(cur_buffer->tile, group);
    }
    isl_schedule_node_free(node);
  }
  /* Test if the buffer position could be further hoisted. */
  old_depth = cur_buffer->tile->depth;
  tile_set_depth(data, cur_buffer->tile);
  new_depth = cur_buffer->tile->depth;
  if (is_last_dim_equal && new_depth == old_depth)
  {
    /* In this case, the buffer couldn't be hoisted up, and it doesn't 
     * increase the burst length. 
     * We will test if the last dimension is a multiple of 512 bits (64 bytes).
     */
    cur_last_dim = cur_buffer->tile->bound[cur_buffer->tile->n - 1].size;
    long dim_val = isl_val_get_num_si(cur_last_dim);
    if ((dim_val * group->array->size) % 64 != 0)
    {
      /* There is no benefit to generate the 
       * second-level buffer. We will free up the tile.
       */
      autosa_array_tile_free(cur_buffer->tile);
      cur_buffer->tile = NULL;
    }
  }
  else
  {
    if (new_depth != old_depth)
    {
      /* The depth changed, so the old tiling no longer applies. */
      isl_multi_aff_free(cur_buffer->tile->tiling);
      autosa_array_ref_group_compute_tiling(cur_buffer->tile, group);
    }
  }

  return isl_stat_ok;
}

/* Return the prefix I/O schedule of "sched" at the io mark of level "level".
 *
 * The autosa_kernel is recovered from the user pointer of the kernel mark
 * id found in the schedule tree. The prefix schedule computed at the io
 * mark is expanded with the kernel's contraction before being returned.
 */
static __isl_give isl_union_map *get_io_schedule_at_level(
    __isl_keep isl_schedule *sched, int level)
{
  isl_schedule_node *pos = isl_schedule_get_root(sched);
  pos = autosa_tree_move_down_to_kernel(pos);

  /* The kernel mark's id carries the kernel object. */
  isl_id *kernel_id = isl_schedule_node_mark_get_id(pos);
  struct autosa_kernel *kernel = (struct autosa_kernel *)isl_id_get_user(kernel_id);
  isl_id_free(kernel_id);

  pos = autosa_tree_move_down_to_io_mark(pos, kernel->core, level);
  isl_union_map *prefix = expand(prefix_with_equalities(pos), kernel->contraction);
  isl_schedule_node_free(pos);

  return prefix;
}

/* Map the domain of "access" to the prefix I/O schedule of "group".
 *
 * Exterior I/O groups use the schedule at the io_L2 level, interior I/O
 * groups the schedule at the io_L1 level. For any other I/O type the
 * access relation is converted as is.
 * "access" itself is not consumed; "data" is not used.
 */
static __isl_give isl_map *local_access_io(struct autosa_array_ref_group *group,
                                           __isl_keep isl_union_map *access, struct autosa_group_data *data)
{
  /* 0 means "do not apply any I/O schedule". */
  int sched_level = 0;
  if (group->io_type == AUTOSA_EXT_IO)
    sched_level = 2;
  else if (group->io_type == AUTOSA_INT_IO)
    sched_level = 1;

  isl_union_map *mapped = isl_union_map_copy(access);
  if (sched_level != 0)
  {
    isl_union_map *io_sched = get_io_schedule_at_level(group->io_schedule, sched_level);
    mapped = isl_union_map_apply_domain(mapped, io_sched);
  }

  return isl_map_from_union_map(mapped);
}

/* Compute the local memory tile for the array reference group "group".
 * Return isl_stat_ok on success and isl_stat_error on error.
 *
 * Nothing is done (and isl_stat_ok is returned) if the user requested not
 * to use local memory, if the array is a read-only scalar, if the group
 * has no exact write, or if the group is a slice.
 *
 * Otherwise a tile is created in group->local_tile and can_tile() is run
 * on the group's accesses mapped through local_access_io(): for an interior
 * I/O group that is the io_L1 level, for an exterior I/O group the io_L2
 * level. If the accesses cannot be tiled, the tile is freed again.
 */
static isl_stat compute_group_bounds_core_io(struct autosa_kernel *kernel,
                                             struct autosa_array_ref_group *group,
                                             struct autosa_group_data *data)
{
  isl_ctx *ctx = isl_space_get_ctx(group->array->space);

  if (!kernel->options->autosa->use_local_memory ||
      autosa_array_is_read_only_scalar(group->array) ||
      !group->exact_write ||
      group->slice)
    return isl_stat_ok;

  /* Collect all accesses in the group.
   * TODO: Overapproximation */
  isl_union_map *all_access = autosa_array_ref_group_access_relation(group, 1, 1);

  /* Create the tile and collect its shift and scale factors from the
   * accesses mapped to the outer scheduling dimensions.
   */
  group->local_tile = autosa_array_tile_create(ctx, group->array->n_index);
  isl_map *sched_access = local_access_io(group, all_access, data);
  isl_bool tileable = can_tile(sched_access, group->local_tile);
  isl_map_free(sched_access);
  isl_union_map_free(all_access);

  if (tileable < 0)
    return isl_stat_error;
  if (!tileable)
    group->local_tile = autosa_array_tile_free(group->local_tile);

  return isl_stat_ok;
}

/* Compute the local memory tile for the I/O array reference group "group"
 * by delegating to compute_group_bounds_core_io().
 * Return 0 on success and -1 on error (including a NULL "group").
 */
static int compute_group_bounds_io(struct autosa_kernel *kernel,
                                   struct autosa_array_ref_group *group,
                                   struct autosa_group_data *data)
{
  if (group && compute_group_bounds_core_io(kernel, group, data) >= 0)
    return 0;

  return -1;
}

/* Set array->n_io_group and array->io_groups to "n" and "groups".
 *
 * Additionally, number the groups: the "nr" field of each group is set to
 * its position in "groups".
 */
static void set_array_groups_io(struct autosa_local_array_info *array,
                                int n, struct autosa_array_ref_group **groups)
{
  array->io_groups = groups;
  array->n_io_group = n;

  for (int idx = 0; idx < n; idx++)
  {
    groups[idx]->nr = idx;
  }
}

/* Group the array references of "local" that share I/O modules and store
 * the result as the I/O groups of "local".
 * Return 0 on success and -1 if the group array cannot be allocated.
 *
 * Per the grouping helpers called here, references end up in one group
 * when they share
 * - the I/O direction "dir", and
 * - the I/O type "io_type",
 * and they should all be under the SIMD loop or not.
 *
 * After grouping, interior I/O elimination is run on every group.
 */
static int group_array_references_io(struct autosa_kernel *kernel,
                                     struct autosa_local_array_info *local, struct autosa_group_data *data)
{
  isl_ctx *ctx = isl_union_map_get_ctx(data->pe_sched);

  /* Upper bound on the number of groups: one per communication pair
   * (io_info) whose dependence is RAW or RAR.
   */
  int n_group = 0;
  for (int r = 0; r < local->array->n_ref; r++)
  {
    struct autosa_stmt_access *ref = local->array->refs[r];
    for (int k = 0; k < ref->n_io_info; k++)
    {
      struct autosa_io_info *info = ref->io_info[k];
      if (info->dep->type != AUTOSA_DEP_RAW && info->dep->type != AUTOSA_DEP_RAR)
        continue;
      n_group++;
    }
  }

  struct autosa_array_ref_group **groups =
      (struct autosa_array_ref_group **)calloc(n_group,
                                               sizeof(struct autosa_array_ref_group *));
  if (!groups)
    return -1;

  /* Populate the groups, then merge those that share the same I/O
   * direction and I/O type. Both steps return the new group count.
   */
  n_group = populate_array_references_io(local, groups, data);
  n_group = group_share_io(kernel, n_group, groups, data);

  /* Perform interior I/O elimination. */
  for (int g = 0; g < n_group; g++)
    autosa_interior_io_eliminate(kernel, groups[g], data->gen, data);

  set_array_groups_io(local, n_group, groups);

  return 0;
}

/* Internal struct used for extract_access_waw_domain. */
struct extract_access_waw_domain_data
{
  /* Access whose ref_id is matched against the source of each dependence. */
  struct autosa_stmt_access *ref;
  /* Running drain domain; the source domain of every matching dependence
   * is subtracted from it.
   */
  isl_set *drain_domain;
};

/* Check whether the dependence "dep" originates from the access stored in
 * "user" (a struct extract_access_waw_domain_data). If so, shrink the
 * write-out (drain) domain:
 *   drain_domain = drain_domain - source domain of "dep"
 *
 * The source reference is identified by the output tuple id of the
 * unwrapped domain space of "dep"; it is compared by pointer against
 * data->ref->ref_id. "dep" itself is not consumed.
 */
static void extract_access_waw_domain(__isl_keep isl_basic_map *dep, void *user)
{
  struct extract_access_waw_domain_data *data =
      (struct extract_access_waw_domain_data *)(user);

  /* Extract the id of the source reference of the dependence. */
  isl_space *dep_space = isl_basic_map_get_space(dep);
  isl_space *unwrapped = isl_space_unwrap(isl_space_domain(dep_space));
  isl_id *src_ref_id = isl_space_get_tuple_id(unwrapped, isl_dim_out);
  isl_space_free(unwrapped);

  const bool from_this_ref = (src_ref_id == data->ref->ref_id);
  isl_id_free(src_ref_id);
  if (!from_this_ref)
    return;

  /* Source iteration domain of the dependence. */
  isl_map *dep_map = isl_map_from_basic_map(isl_basic_map_copy(dep));
  dep_map = isl_map_factor_domain(dep_map);
  isl_set *src_iter_domain = isl_map_domain(dep_map);

  data->drain_domain = isl_set_subtract(data->drain_domain, src_iter_domain);
}

/* Callback for isl_union_map_every_map(): feed every basic map of the
 * dependence map "map" to extract_access_waw_domain(), which updates the
 * write-out domain stored in "user".
 * Always returns isl_bool_true so that the traversal visits all maps.
 */
static isl_bool extract_access_waw_domain_wrap(__isl_keep isl_map *map, void *user)
{
  isl_basic_map_list *pieces = isl_map_get_basic_map_list(map);
  const int n_piece = isl_map_n_basic_map(map);

  for (int k = 0; k < n_piece; ++k)
  {
    isl_basic_map *piece = isl_basic_map_list_get_basic_map(pieces, k);
    extract_access_waw_domain(piece, user);
    isl_basic_map_free(piece);
  }

  isl_basic_map_list_free(pieces);
  return isl_bool_true;
}

/* Compute the local memory tile for the drain array reference group
 * "group". Return isl_stat_ok on success and isl_stat_error on error.
 *
 * Nothing is done (and isl_stat_ok is returned) if the user requested not
 * to use local memory, if the array is a read-only scalar, if the group
 * has no exact write, or if the group is a slice.
 *
 * Otherwise a tile is created in group->local_tile and can_tile() is run
 * on the group's accesses mapped through local_access_io(). If the accesses
 * cannot be tiled, the tile is freed again.
 */
static isl_stat compute_group_bounds_core_drain(struct autosa_kernel *kernel,
                                                struct autosa_array_ref_group *group, struct autosa_group_data *data)
{
  isl_ctx *ctx = isl_space_get_ctx(group->array->space);

  if (!kernel->options->autosa->use_local_memory ||
      autosa_array_is_read_only_scalar(group->array) ||
      !group->exact_write ||
      group->slice)
    return isl_stat_ok;

  /* Collect all accesses in the group. This is overapproximated. */
  isl_union_map *all_access = autosa_array_ref_group_access_relation(group, 0, 1);

  /* Create the tile and collect its shift and scale factors from the
   * accesses mapped to the outer scheduling dimensions.
   */
  group->local_tile = autosa_array_tile_create(ctx, group->array->n_index);
  isl_map *sched_access = local_access_io(group, all_access, data);
  isl_bool tileable = can_tile(sched_access, group->local_tile);
  isl_map_free(sched_access);
  isl_union_map_free(all_access);

  if (tileable < 0)
    return isl_stat_error;
  if (!tileable)
    group->local_tile = autosa_array_tile_free(group->local_tile);

  return isl_stat_ok;
}

/* Compute the local memory tile for the drain array reference group
 * "group" by delegating to compute_group_bounds_core_drain().
 * Return 0 on success and -1 on error (including a NULL "group").
 */
static int compute_group_bounds_drain(struct autosa_kernel *kernel,
                                      struct autosa_array_ref_group *group, struct autosa_group_data *data)
{
  if (group && compute_group_bounds_core_drain(kernel, group, data) >= 0)
    return 0;

  return -1;
}

/* Group array references together if they are associated with WAW dep and need 
 * to be drained out.
 * Return -1 on error.
 *
 * For every write reference of the array, the drain domain is computed as
 * the reference's statement domain (taken from kernel->expanded_domain)
 * minus the source domains of all WAW dependences that start at this
 * reference. If that domain is non-empty, a drain group is created for the
 * reference. All drain groups of the array are finally joined into a single
 * group, which is stored in local->drain_group (NULL if there is none, or
 * if the array is local).
 */
static int group_array_references_drain(struct autosa_kernel *kernel,
                                        struct autosa_local_array_info *local, struct autosa_group_data *data)
{
  local->drain_group = NULL;
  /* Arrays flagged as local get no drain group. */
  if (local->array->local)
    return 0;

  /* NOTE(review): "j" is unused, and "i" is shadowed by the loop variable
   * of the population loop below; only the join loop uses this "i".
   */
  int i, j;
  int n;
  isl_ctx *ctx = isl_union_map_get_ctx(data->pe_sched);
  struct autosa_array_ref_group **groups = NULL;
  isl_union_map *dep_waw = kernel->scop->tagged_dep_waw;  

  /* Populate the groups. */
  n = 0;
  for (int i = 0; i < local->array->n_ref; ++i)
  {
    struct autosa_stmt_access *access = local->array->refs[i];    
    /* Only write references can be drained. */
    if (!access->write)
      continue;
    /* Start from the full domain of the statement containing the access. */
    isl_set *domain = isl_map_domain(isl_map_copy(access->access));
    isl_set *access_domain = isl_union_set_extract_set(
        kernel->expanded_domain,
        isl_set_get_space(domain));
    isl_set_free(domain);
    
    /* Subtract the source domains of the WAW dependences of this access.
     * Ownership of access_domain moves into drain_data.drain_domain.
     */
    struct extract_access_waw_domain_data drain_data = {access, access_domain};
    isl_union_map_every_map(dep_waw, &extract_access_waw_domain_wrap, &drain_data);    
    if (!isl_set_is_empty(drain_data.drain_domain))
    {
      isl_map *map;
      isl_union_map *umap;

      /* Express the access relation in terms of the PE schedule. */
      map = isl_map_copy(access->access);
      umap = isl_union_map_from_map(map);
      umap = isl_union_map_apply_domain(umap,
                                        isl_union_map_copy(data->pe_sched));

      map = isl_map_from_union_map(umap);
      map = isl_map_detect_equalities(map);

      /* Add this access relation to the group. */
      //struct autosa_array_ref_group *group =
      //    isl_calloc_type(ctx, struct autosa_array_ref_group);
      /* The group is allocated with "new" because it holds C++ members
       * (tuning_refs is used with push_back below).
       */
      struct autosa_array_ref_group *group = new autosa_array_ref_group;
      group = autosa_array_ref_group_init(group);
      if (!group)
      {
        /* NOTE(review): "groups" and the groups created so far are not
         * released on this path.
         */
        isl_map_free(map);
        isl_set_free(drain_data.drain_domain);
        return -1;
      }

      group->local_array = local;
      group->array = local->array;
      group->access = map;
      group->write = access->write;
      group->exact_write = access->exact_write;
      group->slice = access->n_index < local->array->n_index;
      group->refs = &local->array->refs[i];
      group->n_ref = 1;
      group->io_type = AUTOSA_INT_IO;
      group->dir = isl_vec_zero(ctx, kernel->n_sa_dim);
      group->old_dir = isl_vec_zero(ctx, kernel->n_sa_dim);
      /* Perform interior I/O elimination by default.
       * The direction vector gets a single 1: in the first element when
       * int_io_dir is 0, in the last element otherwise.
       */
      if (kernel->options->autosa->int_io_dir == 0)
        group->dir = isl_vec_set_element_si(group->dir, 0, 1);
      else
        group->dir = isl_vec_set_element_si(group->dir, isl_vec_size(group->dir) - 1, 1);
      group->group_type = AUTOSA_DRAIN_GROUP;
      group->pe_io_dir = IO_OUT;
      group->array_io_dir = IO_OUT;
      group->io_pe_expr = NULL;
      group->io_L1_pe_expr = NULL;
      group->n_io_buffer = 0;
      group->io_buffers = NULL;
      group->copy_schedule = NULL;
      group->pe_tile = NULL;
      group->local_tile = NULL;
      group->n_mem_ports = 1;
      group->tuning_refs.push_back(std::shared_ptr<TPArrayRef>(local->array->tuning_refs[i]));
      group->tuning_pe_tile = NULL;

      //groups = (struct autosa_array_ref_group **)realloc(groups, (++n) *
      //                                                               sizeof(struct autosa_array_ref_group *));      
      /* Grow the group array by one: allocate a new array, copy the old
       * entries over, release the old array and append the new group.
       */
      struct autosa_array_ref_group **groups_tmp = isl_calloc_array(ctx, struct autosa_array_ref_group *, ++n);
      for (int g = 0; g < n - 1; g++) {
        groups_tmp[g] = groups[g];
      }
      free(groups);
      groups = groups_tmp;      
      groups[n - 1] = group;
    }
    isl_set_free(drain_data.drain_domain);
  }

  /* Join all references together. */
  for (i = 1; i < n; ++i)
  {
    groups[0] = join_groups_and_free(groups[0], groups[i]);
  }
  if (n > 1)
    n = 1;

  /* Set the group. */
  if (n > 0)
  {
    groups[0]->nr = 0;
    local->drain_group = groups[0];
  }
  else
  {
    local->drain_group = NULL;
  }
  /* Only the pointer array is released; the surviving group is now
   * referenced by local->drain_group.
   */
  free(groups);

  return 0;
}

/* Return the greatest common divisor of "n1" and "n2".
 *
 * The remainder-based Euclidean algorithm is applied to the absolute
 * values of the arguments, so the loop terminates for every input:
 *  - gcd(0, k) and gcd(k, 0) are |k|,
 *  - gcd(0, 0) is 0,
 *  - the result is never negative.
 * INT_MIN is not supported because its absolute value is not
 * representable as an int.
 */
static int gcd(int n1, int n2)
{
  if (n1 < 0)
    n1 = -n1;
  if (n2 < 0)
    n2 = -n2;

  while (n2 != 0)
  {
    int rem = n1 % n2;
    n1 = n2;
    n2 = rem;
  }

  return n1;
}

/* Compute a tiling for all the PE array reference groups in "kernel".
 * The tiling helper is called with a NULL tile for every group.
 */
static void compute_group_tilings_pe(struct autosa_kernel *kernel)
{
  for (int a = 0; a < kernel->n_array; a++)
  {
    struct autosa_local_array_info *local = &kernel->array[a];

    for (int g = 0; g < local->n_pe_group; g++)
    {
      autosa_array_ref_group_compute_tiling(NULL, local->pe_groups[g]);
    }
  }
}

/* Compute a tiling for all the I/O array reference groups in "kernel".
 * The tiling helper is called with a NULL tile for every group.
 */
static void compute_group_tilings_io(struct autosa_kernel *kernel)
{
  for (int a = 0; a < kernel->n_array; a++)
  {
    struct autosa_local_array_info *local = &kernel->array[a];

    for (int g = 0; g < local->n_io_group; g++)
    {
      autosa_array_ref_group_compute_tiling(NULL, local->io_groups[g]);
    }
  }
}

/* Compute a tiling for the drain group of every array in "kernel".
 * Arrays without a drain group are skipped. The tiling helper is called
 * with a NULL tile.
 */
static void compute_group_tilings_drain(struct autosa_kernel *kernel)
{
  for (int a = 0; a < kernel->n_array; a++)
  {
    struct autosa_local_array_info *local = &kernel->array[a];

    if (local->drain_group)
      autosa_array_ref_group_compute_tiling(NULL, local->drain_group);
  }
}

/* Update the I/O schedules by I/O module clustering.
 *
 * For every array of "kernel", the I/O schedule is computed for each of
 * its I/O groups and, if present, for its drain group.
 * "data" is not used. Always returns isl_stat_ok.
 */
static isl_stat autosa_io_clustering(struct autosa_kernel *kernel,
                                     struct autosa_gen *gen, struct autosa_group_data *data)
{
  for (int a = 0; a < kernel->n_array; a++)
  {
    struct autosa_local_array_info *arr = &kernel->array[a];

    for (int g = 0; g < arr->n_io_group; g++)
      compute_io_group_schedule(kernel, arr->io_groups[g], gen);

    if (!arr->drain_group)
      continue;
    compute_io_group_schedule(kernel, arr->drain_group, gen);
  }

  return isl_stat_ok;
}

/* Allocate I/O buffers inside I/O modules.
 *
 * For every I/O group of every array in "kernel":
 *  1. the I/O buffers are computed;
 *  2. unless L1 lowering is enabled, the L1 buffer is hoisted when the
 *     kernel has array_part_w > 0 (internal arrays are skipped when local
 *     reduce is enabled);
 *  3. with two-level buffering, the L2 buffer is hoisted;
 *  4. a std::runtime_error is thrown if local reduce and two-level
 *     buffering are both enabled for a group with an attached drain group;
 *  5. with L1 lowering enabled, the L1 buffer is lowered and a second-level
 *     buffer is inserted.
 * The drain group of each array, if any, gets its buffers computed and,
 * with two-level buffering, its L2 buffer hoisted.
 *
 * Always returns isl_stat_ok.
 */
static isl_stat autosa_io_buffer_allocate(struct autosa_kernel *kernel,
                                          struct autosa_gen *gen, struct autosa_group_data *data)
{
  for (int a = 0; a < kernel->n_array; a++)
  {
    struct autosa_local_array_info *local = &kernel->array[a];

    for (int g = 0; g < local->n_io_group; g++)
    {
      struct autosa_array_ref_group *group = local->io_groups[g];

      compute_io_group_buffer(kernel, group, gen);

      /* Hoist the L1 I/O buffer.
       * Do not touch internal arrays when local reduce is enabled.
       */
      if (!gen->options->autosa->lower_int_io_L1_buffer &&
          !(gen->options->autosa->local_reduce && local->array_type == AUTOSA_INT_ARRAY) &&
          kernel->array_part_w > 0)
        hoist_L1_io_buffer(kernel, group, gen, data);

      /* Seek the opportunity to hoist up the L2 I/O buffers. */
      if (gen->options->autosa->two_level_buffer)
        hoist_L2_io_buffer(kernel, group, gen, data);

      /* At present, two-level buffer and local reduce can't be enabled
       * at the same time.
       */
      if (gen->options->autosa->local_reduce && group->attached_drain_group &&
          gen->options->autosa->two_level_buffer)
        throw std::runtime_error("[AutoSA] Error: Two-level buffer and local reduce can't be used at the same time.");

      if (gen->options->autosa->lower_int_io_L1_buffer)
      {
        /* Lower the L1 buffer for interior I/O module if possible. */
        lower_int_io_L1_buffer(kernel, group, gen);
        /* Enable the second-level buffer for this array. */
        insert_L2_io_buffer(kernel, group, gen);
      }
    }

    if (!local->drain_group)
      continue;

    compute_io_group_buffer(kernel, local->drain_group, gen);
    if (gen->options->autosa->two_level_buffer)
      hoist_L2_io_buffer(kernel, local->drain_group, gen, data);
  }

  return isl_stat_ok;
}

/* Compute data packing factors.
 *
 * First pass: reset the packing-related fields of every I/O buffer of
 * every I/O group and drain group (sparse = 0, vec_len = 0, and serialize
 * set to 1 exactly when the host_serialize option equals 1).
 * Second pass: compute the data packing for every I/O group and drain
 * group, using the sparse variant for arrays flagged as sparse.
 *
 * "data" is not used. Always returns isl_stat_ok.
 */
static isl_stat autosa_io_data_pack(struct autosa_kernel *kernel,
                                    struct autosa_gen *gen, struct autosa_group_data *data)
{
  /* Initialize the I/O buffers. */
  for (int a = 0; a < kernel->n_array; a++)
  {
    struct autosa_local_array_info *arr = &kernel->array[a];

    for (int g = 0; g < arr->n_io_group; g++)
    {
      struct autosa_array_ref_group *io_group = arr->io_groups[g];
      for (int lvl = 0; lvl < io_group->io_level; lvl++)
      {
        struct autosa_io_buffer *io_buf = io_group->io_buffers[lvl];
        io_buf->sparse = 0;
        io_buf->vec_len = 0;
        io_buf->serialize = (gen->options->autosa->host_serialize == 1) ? 1 : 0;
      }
    }

    if (!arr->drain_group)
      continue;

    struct autosa_array_ref_group *drain = arr->drain_group;
    for (int lvl = 0; lvl < drain->io_level; lvl++)
    {
      struct autosa_io_buffer *io_buf = drain->io_buffers[lvl];
      io_buf->sparse = 0;
      io_buf->vec_len = 0;
      io_buf->serialize = (gen->options->autosa->host_serialize == 1) ? 1 : 0;
    }
  }

  /* Compute the packing factors. */
  for (int a = 0; a < kernel->n_array; a++)
  {
    struct autosa_local_array_info *arr = &kernel->array[a];

    for (int g = 0; g < arr->n_io_group; g++)
    {
      if (arr->is_sparse)
        compute_io_group_data_pack_sparse(kernel, arr->io_groups[g], gen, -1);
      else
        compute_io_group_data_pack(kernel, arr->io_groups[g], gen, -1);
    }

    if (!arr->drain_group)
      continue;

    if (arr->is_sparse)
      compute_io_group_data_pack_sparse(kernel, arr->drain_group, gen, -1);
    else
      compute_io_group_data_pack(kernel, arr->drain_group, gen, -1);
  }

  return isl_stat_ok;
}

/* Build the map domain_space -> domain_space that adds one to the
 * coordinate at position "pos" and copies every other coordinate.
 *
 * "domain_space" is consumed; the returned map is owned by the caller.
 */
static __isl_give isl_map *next(__isl_take isl_space *domain_space, int pos)
{
  /* Start from the identity on the space ... */
  isl_multi_aff *shift = isl_multi_aff_identity(isl_space_map_from_set(domain_space));
  /* ... and replace the "pos"-th output by itself plus one. */
  isl_aff *bumped = isl_aff_add_constant_si(isl_multi_aff_get_aff(shift, pos), 1);

  shift = isl_multi_aff_set_aff(shift, pos, bumped);
  return isl_map_from_multi_aff(shift);
}

/* This function generates different possible loop orderings for the array partitioning loop band.
 * For I/O groups with external array, we will select the loops that do not appear in the 
 * array indices, and select them as the innermost loop in the generated loop ordering.
 * There are several considerations here.
 * 1. Why consider loop index, but not data dependence?
 * For external groups, we don't handle overlapping reuse. For example, when the 
 * reuse factor is (1,-1). In the next iteration, we will only load part of the data
 * and reuse some data left in the previous iteration. However, this brings additional 
 * hardware overheads for indexing the new data and rearranging the old data. 
 * Therefore, such overlapping reuse is not considered. In other words, only reuse 
 * vectors that are in the form of unit vectors are considered. 
 * Therefore, we will only look for loop indices not showing the array index.
 * 2. Why put them innermost?
 * Placing reuse loops innermost maximizes the locality and minimizes the data communication.
 * The relative order between reuse loops and non-reuse loops doesn't matter as overlapped reuse 
 * is not supported. Therefore, we will only randomly pick one loop order for this group.
 * 
 * For I/O groups with internal array, simply, we choose to examine the array indexs.
 * And select the loops that doesn't appear in these indices. This is due to the same 
 * reason as argued above for the simplification of the architecture.
 * 
 * Drain I/O groups are not considered as the data communication volumn is fixed and 
 * is not affected by the loop permutation.
 */
static void explore_loop_permute(struct autosa_kernel *kernel, struct autosa_gen *gen) {
  /* Count the number of possible loop permutation. */
  int n_order = 0;
  std::vector<std::unordered_set<int>> loop_orderings;
  isl_schedule_node *node = isl_schedule_get_root(kernel->schedule);
  node = autosa_tree_move_down_to_array(node, kernel->core);
  isl_union_map *prefix = isl_schedule_node_get_prefix_schedule_relation(node);
  isl_schedule_node_free(node);
  
  for (int i = 0; i < kernel->n_array; i++) {
    struct autosa_local_array_info *local = &kernel->array[i];
    for (int j = 0; j < local->n_io_group; j++) {
      struct autosa_array_ref_group *group = local->io_groups[j];      
      for (int r = 0; r < group->n_ref; r++) {
        struct autosa_stmt_access *ref = group->refs[r];
        isl_map *acc = isl_map_from_union_map(isl_union_map_apply_domain(
                          isl_union_map_from_map(isl_map_copy(ref->access)),
                          isl_union_map_copy(prefix)));
        int n_dim = isl_map_dim(acc, isl_dim_in);
        std::unordered_set<int> reuse_loops;
        for (int d = 0; d < n_dim; d++) {
          /* We will test if the array elements accessed by the iterations that increased 
           * at position "d" is the same as the original array elements.
           */
          isl_space *space = isl_map_get_space(acc);
          space = isl_space_domain(space);
          isl_map *next_iter = next(space, d);                    
          isl_map *map = isl_map_apply_domain(next_iter, isl_map_copy(acc));
          map = isl_map_apply_range(map, isl_map_copy(acc));
          map = isl_map_coalesce(map);          
          isl_set *domain = isl_map_domain(isl_map_copy(map));
          isl_set *range = isl_map_range(isl_map_copy(map));          
          isl_map_free(map);
          if (isl_set_is_subset(domain, range) && isl_set_is_subset(range, domain)) {            
            //std::cout << d << std::endl;
            reuse_loops.insert(d);
          }          
          isl_set_free(domain);
          isl_set_free(range);
        }        
        isl_map_free(acc);
        if (reuse_loops.size() > 0) {
          // Prune the duplicated ordering.
          int d = 0;
          for (d = 0; d < loop_orderings.size(); d++) {
            if (loop_orderings[d] == reuse_loops) 
              break;
          }
          if (d == loop_orderings.size())
            loop_orderings.push_back(reuse_loops);
        }
      }
    }
  }  
  isl_union_map_free(prefix);
  n_order = loop_orderings.size();

  /* When there is more than one loop permutation found,
   * We will print a temp file named by the sequence of the next loop ordering to the 
   * temporary directory. For example, when there are two orderings, 
   * in the first-time compilation, we print a file named "permute_1" to the tmp directory.
   * AutoSA wrapper script will then call the compilation again serving this index 
   * as the next loop ordering to be selected.
   * This process is iterated until all the orderings are explored.
   * In the last ordering, we print "permute_done" instead, which will instruct the 
   * wrapper script to stop calling the program.
   */
  if (n_order == 1)
    return;

  int cur_n_order = gen->options->autosa->loop_permute_order;
  if (gen->options->autosa->tuning_method == 1) {
    /* Print the tmp file. */  
    isl_printer *p_str = isl_printer_to_str(gen->ctx);
    p_str = isl_printer_print_str(p_str, gen->options->autosa->output_dir);
    p_str = isl_printer_print_str(p_str, "/permute_");
    if (cur_n_order == n_order - 1) {
      p_str = isl_printer_print_str(p_str, "done");
    } else {
      p_str = isl_printer_print_int(p_str, cur_n_order + 1);
    }
    char *file_name = isl_printer_get_str(p_str);
    isl_printer_free(p_str);
    FILE *fp = fopen(file_name, "w");
    fclose(fp);    

    kernel->tuning_program->id2 = cur_n_order;
  }

  /* Modify the loop ordering. */
  std::unordered_set<int> order = loop_orderings[cur_n_order];
  node = isl_schedule_get_root(kernel->schedule);
  node = autosa_tree_move_down_to_array(node, kernel->core);
  node = isl_schedule_node_parent(node);
  int n_dim = isl_schedule_node_band_n_member(node);
  std::unordered_map<int, int> pos_map;
  for (int p = 0; p < n_dim; p++) {
    pos_map[p] = p;
  }
  int n_processed = 0;
  for (auto o : order) {
    //std::cout << o << std::endl;
    /* Move the "o"-th loop inside */    
    node = loop_interchange_at_node(node, pos_map[o], n_dim - 1 - n_processed);
    pos_map[n_dim - 1 - n_processed] = pos_map[o];
    pos_map[o] = n_dim - 1 - n_processed;
    n_processed++;
  }
  //DBGSCHDNODE(stdout, node, gen->ctx);
  isl_schedule_free(kernel->schedule);
  kernel->schedule = isl_schedule_node_get_schedule(node);
  isl_schedule_node_free(node);

  // TODO: Test if we need to update anything else
}

/* Group the array references of all arrays in "kernel" and optimize the
 * resulting I/O network.
 *
 * Each array is associated with three types of groups:
 * PE group: Assign the local buffers inside PEs.
 * I/O group: Assign the I/O modules for transferring data between
 *   PEs and the external memory
 * Drain group: Assign the I/O modules for transferring out the results from
 *   PEs to the external memory.
 *
 * After the grouping, the function performs, in this order:
 * - optional exploration of the loop ordering (explore_loop_permute option),
 * - I/O module clustering,
 * - attaching the drain group to an exterior I/O group (local_reduce option),
 * - consistency checks on the host_serialize option (this may clear
 *   host_serialize and two_level_buffer in "gen"->options, and terminates the
 *   process if host serialization and HBM are both enabled),
 * - I/O buffer allocation and data packing,
 * - unification of the outermost data packing factor per array,
 * - computation of the tilings of the PE, I/O and drain groups.
 *
 * As written, the function returns isl_stat_ok on every path that returns.
 */
isl_stat sa_io_construct_optimize(struct autosa_kernel *kernel, struct autosa_gen *gen)
{
  int r = 0;
  struct autosa_group_data data;
  isl_schedule_node *node;
  isl_union_pw_multi_aff *contraction;

  node = isl_schedule_get_root(kernel->schedule);
  node = autosa_tree_move_down_to_kernel(node);

  /* Set autosa_group_data.
   * The host schedule and depth are taken at the kernel node, the PE schedule
   * and depth at the PE node. "copy_sched" is a copy of the PE schedule taken
   * before it is passed through expand() with the kernel contraction.
   * "full_sched" is the expanded PE schedule extended with the subtree
   * schedule below the PE node.
   */
  data.scop = kernel->prog->scop;
  data.gen = gen;
  data.kernel_depth = isl_schedule_node_get_schedule_depth(node);
  data.host_sched = isl_schedule_node_get_prefix_schedule_relation(node);
  node = autosa_tree_move_down_to_pe(node, kernel->core);
  data.pe_depth = isl_schedule_node_get_schedule_depth(node);
  data.pe_sched = prefix_with_equalities(node);
  contraction = isl_union_pw_multi_aff_copy(kernel->contraction);
  data.host_sched = expand(data.host_sched, contraction);
  data.copy_sched = isl_union_map_copy(data.pe_sched);
  data.pe_sched = expand(data.pe_sched, contraction);
  isl_union_pw_multi_aff_free(contraction);
  data.full_sched = isl_union_map_copy(data.pe_sched);
  data.full_sched = isl_union_map_flat_range_product(data.full_sched,
                                                     isl_schedule_node_get_subtree_schedule_union_map(node));
  /* Borrowed pointer: the schedule itself stays owned by "kernel". */
  data.schedule = kernel->schedule;

  /* NOTE(review): in the four grouping loops below, a negative "r" only
   * terminates the current loop. "r" is not examined afterwards, so the
   * following steps still run and the function still returns isl_stat_ok -
   * confirm whether a failure should be propagated instead.
   */

  /* Create the default array reference groups (PPCG heritage). */
  for (int i = 0; i < kernel->n_array; i++)
  {
    r = group_array_references_default(kernel, &kernel->array[i], &data);
    if (r < 0)
      break;
  }

  /* Group the array references for the PE.
   * These groups will be used to allocate local buffers inside PEs.
   */
  for (int i = 0; i < kernel->n_array; i++)
  {
    r = group_array_references_pe(kernel, &kernel->array[i], &data);
    if (r < 0)
      break;
  }

  /* Group the array references for the I/O modules. */
  for (int i = 0; i < kernel->n_array; i++)
  {
    r = group_array_references_io(kernel, &kernel->array[i], &data);
    if (r < 0)
      break;
  }

  /* Group the array references for the drain data */
  for (int i = 0; i < kernel->n_array; i++)
  {
    r = group_array_references_drain(kernel, &kernel->array[i], &data);
    if (r < 0)
      break;
  }

  if (kernel->scop->options->autosa->explore_loop_permute == 1) {
      /* Explore different loop orderings of the array partitioning band. */
    explore_loop_permute(kernel, gen);
  }

  /* Perform I/O Optimization */  
  /* I/O module clustering */
  autosa_io_clustering(kernel, gen, &data);

  /* Local reduce */
  if (gen->options->autosa->local_reduce) 
  {
    printf("[AutoSA] Warning: Local reduction is enabled. The legality should be guaranteed by users.\n");
    // TODO: In the future, add a legality check for this optimization.
    /* Check if there is exactly one exterior I/O group of an internal array.
     * Then attach the drain group to this I/O group and set the drain group to NULL.
     */
    for (int i = 0; i < kernel->n_array; i++)
    {
      int ext_group_cnt = 0;
      int group_id = -1;
      struct autosa_local_array_info *local = &kernel->array[i];
      for (int j = 0; j < local->n_io_group; j++)
      {
        if (local->io_groups[j]->io_type == AUTOSA_EXT_IO &&
            local->array_type == AUTOSA_INT_ARRAY) {
          ext_group_cnt++;
          group_id = j;
        }
      }
      if (local->drain_group && ext_group_cnt == 1) {
        /* The I/O group takes over the drain group: it now only copies data
         * out, and the array is marked as using a single memory port.
         */
        local->io_groups[group_id]->attached_drain_group = local->drain_group;
        local->io_groups[group_id]->copy_out = 1;
        local->drain_group = NULL;
        local->io_groups[group_id]->copy_in = 0;
        local->n_mem_ports = 1;
      }    
    }
  }

  if (gen->options->autosa->host_serialize)
  {
    /* Check if there is only one I/O/drain group for each array.
     * Otherwise, we will disable the host serialize.
     * A group is counted once for copy-in and once for copy-out.
     */
    for (int i = 0; i < kernel->n_array; i++)
    {
      int module_cnt = 0;
      struct autosa_local_array_info *local = &kernel->array[i];
      for (int j = 0; j < local->n_io_group; j++)
      {
        if (local->io_groups[j]->copy_in)
          module_cnt++;
        if (local->io_groups[j]->copy_out)
          module_cnt++;
      }
      if (local->drain_group)
      {
        if (local->drain_group->copy_out)
          module_cnt++;
      }
      if (module_cnt > 1) {
        /* The option is global, so one offending array disables host
         * serialization for all arrays.
         */
        gen->options->autosa->host_serialize = 0;
        // TODO: In the future, we should separate this check for each array.
        printf("[AutoSA] Warning: More than one IO/drain group found for array: %s. Host data serialization is disabled.\n", local->array->name);
      }
    }
  }
  if (gen->options->autosa->host_serialize)
  {
    /* Disable the two-level buffering when host data serialization is enabled.
     * The warning is printed whether or not two-level buffering was requested.
     */
    gen->options->autosa->two_level_buffer = 0;
    printf("[AutoSA] Warning: Two-level buffering is disabled because host data serialization is enabled.\n");
  }
  if (gen->options->autosa->host_serialize && gen->options->autosa->hbm)
  {
    /* Unsupported combination: terminate the process. */
    printf("[AutoSA] Error: Host serialization and HBM can't be enabled at the same time!\n");
    exit(1);
  }

  /* Print the IO grouping information */
  print_io_grouping_info(stdout, kernel);

  /* Test if there is any IO group with internal array that needs copy-in. 
   * Such designs can't run due to the HLS limitation. 
   * Code generation will proceed as usual only for tuning purpose.
   */
  bool is_safe = true;
  for (int i = 0; i < kernel->n_array; i++) {
    struct autosa_local_array_info *local = &kernel->array[i];
    for (int j = 0; j < local->n_io_group; j++) {
      struct autosa_array_ref_group *group = local->io_groups[j];
      if (group->copy_in && local->array_type == AUTOSA_INT_ARRAY) {
        is_safe = false;
      }
    }
  }
  if (!is_safe) {
    printf("[AutoSA] Warning: The generated program contains feedback loops and can't be synthesized by HLS.\n");
    printf("                  The compilation flow will proceed as usual.\n");
  }
    
  /* I/O buffer allocation */
  autosa_io_buffer_allocate(kernel, gen, &data);  
  /* I/O module data pack */
  autosa_io_data_pack(kernel, gen, &data);    

  /* Since different I/O groups of the same array will access the DRAM with the 
   * same global array pointer, we need to make sure the outermost 
   * data packing factors are the same across these groups.
   * Here we examine if they are the same.
   * If not, we repack the I/O groups to make them equal. 
   */
  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *local_array = &kernel->array[i];
    /* Running gcd of the outermost lane counts seen so far; -1 means
     * that no group has been examined yet.
     */
    int n_lane = -1;
    bool repack = false;
    for (int j = 0; j < local_array->n_io_group; j++)
    {      
      struct autosa_array_ref_group *group = local_array->io_groups[j];
      /* Groups that neither copy in nor copy out are ignored here. */
      if (!(group->copy_in || group->copy_out))
        continue;
      int cur_n_lane = group->io_buffers[group->n_io_buffer - 1]->n_lane;
      if (n_lane == -1)
        n_lane = cur_n_lane;
      else
        n_lane = gcd(n_lane, cur_n_lane);
      if (n_lane != cur_n_lane)
      {
        repack = true;
      }
    }
    if (local_array->drain_group)
    {      
      struct autosa_array_ref_group *group = local_array->drain_group;
      int cur_n_lane = group->io_buffers[group->n_io_buffer - 1]->n_lane;
      if (n_lane == -1)
        n_lane = cur_n_lane;
      else
        n_lane = gcd(n_lane, cur_n_lane);
      if (n_lane != cur_n_lane)
      {
        repack = true;
      }
    }

    if (repack)
    {
      /* We need to repack the data for each I/O buffers, this time passing
       * the common lane count instead of -1.
       * NOTE(review): the repacking always uses the non-sparse routine and
       * also covers groups without copy-in/copy-out - confirm this is intended.
       */
      for (int j = 0; j < local_array->n_io_group; j++)
      {
        struct autosa_array_ref_group *group = local_array->io_groups[j];
        compute_io_group_data_pack(kernel, group, gen, n_lane);
      }
      if (local_array->drain_group)
      {
        struct autosa_array_ref_group *group = local_array->drain_group;
        compute_io_group_data_pack(kernel, group, gen, n_lane);
      }
    }

    /* Record the common lane count, using 1 when no group was examined. */
    local_array->n_lane = std::max(1, n_lane);
    local_array->array->n_lane = std::max(1, n_lane);
  }

  /* Release the schedules collected in "data" and the schedule node. */
  isl_union_map_free(data.host_sched);
  isl_union_map_free(data.copy_sched);
  isl_union_map_free(data.full_sched);
  isl_union_map_free(data.pe_sched);
  isl_schedule_node_free(node);

  /* Compute a tiling for all the array reference groups in "kernel". */
  compute_group_tilings_pe(kernel);
  compute_group_tilings_io(kernel);
  compute_group_tilings_drain(kernel);

  return isl_stat_ok;
}

/* Return the part of the access relation of array reference "ref" that is
 * communicated through the I/O group "group".
 *
 * Only the io_info entries of "ref" whose I/O type and direction match those
 * of "group" contribute. For each such entry, the access of "ref" is
 * restricted to a set of statement instances derived from the dependence:
 * - RAR: the union of the source and destination domains of the dependence,
 * - RAW: the destination domain if "ref" is a read,
 *        the source domain otherwise,
 * - any other dependence type contributes nothing.
 *
 * The "read" and "write" arguments are not consulted by this function.
 * The result is owned by the caller.
 */
__isl_give isl_union_map *autosa_io_group_ref_access_relation(
    struct autosa_array_ref_group *group,
    struct autosa_stmt_access *ref,
    int read, int write)
{
  isl_union_map *result = isl_union_map_empty(isl_map_get_space(ref->access));

  for (int idx = 0; idx < ref->n_io_info; idx++)
  {
    struct autosa_io_info *info = ref->io_info[idx];

    /* Skip the entries that belong to a different I/O group. */
    if (info->io_type != group->io_type ||
        isl_vec_cmp(info->dir, group->dir))
      continue;

    /* Source and destination statement instances of the dependence. */
    isl_map *dep_map = isl_map_from_basic_map(
        isl_basic_map_copy(info->dep->isl_dep));
    dep_map = isl_map_factor_domain(dep_map);
    isl_set *src = isl_map_domain(isl_map_copy(dep_map));
    isl_set *dst = isl_map_range(dep_map);

    isl_set *selected;
    if (info->dep->type == AUTOSA_DEP_RAR)
    {
      selected = isl_set_coalesce(isl_set_union(src, dst));
    }
    else if (info->dep->type == AUTOSA_DEP_RAW)
    {
      /* Keep one end of the dependence and drop the other. */
      selected = ref->read ? dst : src;
      isl_set_free(ref->read ? src : dst);
    }
    else
    {
      isl_set_free(src);
      isl_set_free(dst);
      continue;
    }

    isl_map *piece = isl_map_intersect_domain(isl_map_copy(ref->access), selected);
    result = isl_union_map_union(result, isl_union_map_from_map(piece));
  }

  return result;
}

/* Return the part of the access relation of array reference "ref" that is
 * communicated through the drain group "group".
 *
 * The statement instances of "ref" are taken from "domain". From these, the
 * source domain of every WAW dependence whose source reference is "ref"
 * itself is removed. The access of "ref" restricted to the remaining
 * instances is returned.
 *
 * "domain" is only read. The "read" and "write" arguments are not consulted
 * by this function. The result is owned by the caller.
 */
__isl_give isl_union_map *autosa_drain_group_ref_access_relation(
    struct autosa_array_ref_group *group,
    struct autosa_stmt_access *ref,
    int read, int write, __isl_keep isl_union_set *domain)
{
  isl_union_map *result = isl_union_map_empty(isl_map_get_space(group->access));

  /* Instances of the statement of "ref" that appear in "domain". */
  isl_space *stmt_space = isl_space_domain(isl_map_get_space(ref->access));
  isl_set *remaining = isl_union_set_extract_set(domain, stmt_space);

  for (int idx = 0; idx < ref->n_io_info; idx++)
  {
    struct autosa_io_info *info = ref->io_info[idx];

    if (info->dep->type != AUTOSA_DEP_WAW)
      continue;

    /* The dependence domain is a wrapped [statement -> reference] pair;
     * extract the reference id of the source.
     */
    isl_space *src_space = isl_space_unwrap(isl_space_domain(
        isl_basic_map_get_space(info->dep->isl_dep)));
    isl_id *src_id = isl_space_get_tuple_id(src_space, isl_dim_out);
    isl_space_free(src_space);
    int same_ref = (src_id == ref->ref_id);
    isl_id_free(src_id);
    if (!same_ref)
      continue;

    /* Drop the instances that are the source of this WAW dependence. */
    isl_map *dep_map = isl_map_factor_domain(isl_map_from_basic_map(
        isl_basic_map_copy(info->dep->isl_dep)));
    remaining = isl_set_subtract(remaining, isl_map_domain(dep_map));
  }

  isl_map *kept = isl_map_intersect_domain(isl_map_copy(ref->access), remaining);
  return isl_union_map_union(result, isl_union_map_from_map(kept));
}

/* Return the union of the access relations of the references in "group"
 * that need to be communicated by the group.
 *
 * A reference is considered if it is a read and "read" is set, or if it is
 * a write and "write" is set. For an I/O group, the contribution of a
 * reference is computed by autosa_io_group_ref_access_relation; for a drain
 * group, by autosa_drain_group_ref_access_relation using the expanded domain
 * of "kernel". Groups of any other type contribute nothing.
 *
 * The result is coalesced and owned by the caller.
 */
__isl_give isl_union_map *autosa_io_group_access_relation(
  struct autosa_array_ref_group *group, 
  struct autosa_kernel *kernel,
  int read, int write)
{
  isl_union_map *result = isl_union_map_empty(isl_map_get_space(group->access));

  for (int idx = 0; idx < group->n_ref; ++idx)
  {
    struct autosa_stmt_access *ref = group->refs[idx];
    int wanted = (read && ref->read) || (write && ref->write);

    if (!wanted)
      continue;

    isl_union_map *part;
    if (group->group_type == AUTOSA_IO_GROUP)
      part = autosa_io_group_ref_access_relation(group, ref, read, write);
    else if (group->group_type == AUTOSA_DRAIN_GROUP)
      part = autosa_drain_group_ref_access_relation(group, ref, read, write,
                                                    kernel->expanded_domain);
    else
      continue;

    result = isl_union_map_union(result, part);
  }

  /* Simplify the access relation. */
  return isl_union_map_coalesce(result);
}

/* Collect the tagged access relations of all references in "group"
 * into a single union map, which is owned by the caller.
 */
__isl_give isl_union_map *group_tagged_access_relation(
    struct autosa_array_ref_group *group)
{
  isl_union_map *tagged_union = isl_union_map_empty(isl_map_get_space(group->access));

  for (int idx = 0; idx < group->n_ref; ++idx)
  {
    isl_map *tagged_ref = isl_map_copy(group->refs[idx]->tagged_access);

    tagged_union = isl_union_map_union(tagged_union,
                                       isl_union_map_from_map(tagged_ref));
  }

  return tagged_union;
}

/* Convert the set of wrapped references "ref" into the corresponding
 * access relations, using the tagged access relations "tagged".
 *
 * Elements of "ref" have the form
 *
 *	[D -> R]
 *
 * where D is an iteration domain and R a reference, while elements
 * of "tagged" have the form
 *
 *	[D -> R] -> A
 *
 * where A is an array.
 *
 * "tagged" is first extended so that the iteration domain also shows up
 * in the range, i.e.,
 *
 *	[D -> R] -> [D -> A]
 *
 * This relation is applied to "ref" and the resulting set is unwrapped,
 * which yields relations of the form
 *
 *	D -> A
 *
 * Both arguments are consumed; the result is owned by the caller.
 */
__isl_give isl_union_map *wrapped_reference_to_access(
    __isl_take isl_union_set *ref, __isl_take isl_union_map *tagged)
{
  /* Universe of the tagged domain, viewed as D -> R. */
  isl_union_set *tagged_dom = isl_union_map_domain(
      isl_union_map_universe(isl_union_map_copy(tagged)));
  isl_union_map *dom_to_ref = isl_union_set_unwrap(tagged_dom);

  /* Construct [D -> R] -> D */
  isl_union_map *drop_ref = isl_union_map_domain_map(dom_to_ref);

  /* Construct [D -> R] -> [D -> A] */
  isl_union_map *to_access = isl_union_map_range_product(drop_ref, tagged);

  isl_union_set *wrapped = isl_union_set_apply(isl_union_set_coalesce(ref), to_access);

  return isl_union_set_unwrap(wrapped);
}

/* Given an access relation "access" from one or more array reference groups,
 * remove those reads if ("read" is 1) or writes (if "read" is 0)
 * that are only needed to communicate data within
 * the same iteration of "sched".
 * The domain of "sched" corresponds to the original statement instances,
 * i.e., those that appear in the domains of the access relations.
 * "tagged" contains all tagged access relations to all
 * the array reference groups accessed by "access" from statement
 * instances scheduled by "sched".
 *
 * If the access is a read then it is either an element of
 *
 *	live_in union (range flow)
 *
 * where live_in and flow may be overapproximations, or
 * it reads an uninitialized value (that is not live-in because
 * there is an intermediate kill) or it reads a value that was
 * written within the same (compound) statement instance.
 * If the access is a write then it is either an element of
 *
 *	live_out union (domain flow)
 *
 * or it writes a value that is never read (and is not live-out
 * because of an intermediate kill) or only
 * within the same (compound) statement instance.
 * In both cases, the access relation is also a subset of
 * the group access relation.
 *
 * The cases where an uninitialized value is read or a value is written
 * that is never read or where the dataflow occurs within a statement
 * instance are also considered local and may also be removed.
 *
 * Essentially, we compute the intersection of "access" with either
 *
 *	live_in union (range non-local-flow)
 *
 * or
 *
 *	live_out union (domain non-local-flow)
 *
 * We first construct a relation "local"
 *
 *	[[D -> R] -> [D' -> R']]
 *
 * of pairs of domain iterations accessing the reference group
 * and references in the group that are coscheduled by "sched".
 *
 * If this relation does not intersect the dataflow dependences,
 * then there is nothing we can possibly remove, unless the dataflow
 * dependences themselves only relate a subset of the accesses.
 * In particular, the accesses may not be involved in any dataflow
 * dependences, either because they are uninitialized reads/dead writes
 * or because the dataflow occurs inside a statement instance.
 *
 * Since the computation below may break up the access relation
 * into smaller pieces, we only perform the intersection with
 * the non-local dependent accesses if the local pairs
 * intersect the dataflow dependences. Otherwise, we intersect
 * with the universe of the non-local dependent accesses.
 * This should at least remove accesses from statements that
 * do not participate in any dependences.
 *
 * In particular, we remove the "local" dataflow dependences from
 * the set of all dataflow dependences, or at least those
 * that may contribute to a domain/range that intersects
 * the domain of "access".
 * Note that if the potential dataflow dependences are an overapproximation
 * of the actual dataflow dependences, then the result remains an
 * overapproximation of the non-local dataflow dependences.
 * Copying to/from global memory is only needed for the references
 * in the domain/range of the result or for accesses that are live out/in
 * for the entire scop.
 *
 * We therefore map the domain/range of the "external" relation
 * to the corresponding access relation and take the union with
 * the live out/in relation.
 */
__isl_give isl_union_map *remove_local_accesses(
    struct autosa_prog *prog, __isl_take isl_union_map *tagged,
    __isl_take isl_union_map *access, __isl_take isl_union_map *sched,
    int read)
{
  /* Nothing to filter: release the other inputs and hand "access" back. */
  if (isl_union_map_is_empty(access))
  {
    isl_union_map_free(sched);
    isl_union_map_free(tagged);
    return access;
  }

  /* Restrict the tagger, which maps tagged statement instances to untagged
   * ones, e.g., [S1[i,j,k]->_pet_ref_1[]] -> S1[(i),(j),(k)], to the tagged
   * instances of interest and use it to express "sched" on tagged instances.
   */
  isl_union_set *tagged_dom = isl_union_map_domain(isl_union_map_copy(tagged));
  isl_union_pw_multi_aff *untag = isl_union_pw_multi_aff_intersect_domain(
      isl_union_pw_multi_aff_copy(prog->scop->tagger),
      isl_union_set_copy(tagged_dom));
  sched = isl_union_map_preimage_domain_union_pw_multi_aff(sched, untag);

  /* Pairs [[D -> R] -> [D' -> R']] that are coscheduled by "sched",
   * restricted to the dataflow dependences.
   */
  isl_union_map *sched_inv = isl_union_map_reverse(isl_union_map_copy(sched));
  isl_union_map *same_iter = isl_union_map_apply_range(sched, sched_inv);
  same_iter = isl_union_map_intersect(same_iter,
                                      isl_union_map_copy(prog->scop->tagged_dep_flow));

  int no_local_flow = isl_union_map_is_empty(same_iter);

  /* Universe of the wrapped references whose statement appears in "access". */
  isl_union_set *stmt_universe = isl_union_map_domain(
      isl_union_map_universe(isl_union_map_copy(access)));
  isl_union_map *ref_universe = isl_union_set_unwrap(
      isl_union_set_universe(tagged_dom));
  ref_universe = isl_union_map_intersect_domain(ref_universe, stmt_universe);
  isl_union_set *ref_filter = isl_union_map_wrap(ref_universe);

  /* Dataflow dependences touching these references, minus the local ones. */
  isl_union_map *nonlocal = isl_union_map_copy(prog->scop->tagged_dep_flow);
  nonlocal = read ? isl_union_map_intersect_range(nonlocal, ref_filter)
                  : isl_union_map_intersect_domain(nonlocal, ref_filter);
  nonlocal = isl_union_map_intersect_params(nonlocal,
                                            isl_set_copy(prog->scop->context));
  nonlocal = isl_union_map_subtract(nonlocal, same_iter);

  /* Turn the relevant end of the dependences into accesses and add the
   * live-in (for reads) or live-out (for writes) accesses.
   */
  isl_union_set *tag_set = read ? isl_union_map_range(nonlocal)
                                : isl_union_map_domain(nonlocal);
  isl_union_map *keep = wrapped_reference_to_access(tag_set, tagged);
  keep = isl_union_map_union(keep,
                             isl_union_map_copy(read ? prog->scop->live_in
                                                     : prog->scop->live_out));

  /* On error, poison the result; without local dataflow, only filter by
   * the universe of the accesses that were kept.
   */
  if (no_local_flow < 0)
    keep = isl_union_map_free(keep);
  else if (no_local_flow)
    keep = isl_union_map_universe(keep);

  return isl_union_map_intersect(access, keep);
}

/* Extended from remove_local_accesses.
 * Given an access relation "access" from one or more array reference groups,
 * remove those reads (if "read" is 1) or writes (if "read" is 0)
 * that are only needed to communicate data within
 * the same iteration of "sched".
 * Unlike remove_local_accesses, the live-in and live-out accesses are not
 * added back to the external accesses (the corresponding unions are
 * commented out below).
 * This function only considers RAW (flow) dependences, taken from
 * prog->scop->tagged_dep_flow.
 *
 * "tagged" contains all tagged accesses in the group.
 * "access" contains the accesses of interest for the current group.
 * "sched" is the prefix schedule. The depth of the scheduling domain is where
 * the copy statements are inserted.
 *
 * "tagged", "access" and "sched" are consumed by this function.
 * The filtered version of "access" is returned.
 */
__isl_give isl_union_map *remove_local_accesses_flow(
  struct autosa_prog *prog, __isl_take isl_union_map *tagged,
  __isl_take isl_union_map *access, __isl_take isl_union_map *sched,
  int read)
{
  int empty;
  isl_union_pw_multi_aff *tagger;
  isl_union_set *domain, *access_domain;
  isl_union_map *local, *external, *universe;
  isl_union_set *tag_set;

  /* Nothing to filter. Note that an isl error (isl_bool_error, a negative
   * value) also takes this branch, since any non-zero value is true here.
   */
  if (isl_union_map_is_empty(access))
  {
    isl_union_map_free(sched);
    isl_union_map_free(tagged);
    return access;
  }

  /* Tagger maps the tagged iteration domain to untagged iteration domain. 
   * Iteration domain is tagged to the access function.
   * e.g., [S1[i,j,k]->_pet_ref_1[]] -> S1[(i),(j),(k)]
   * The tagger is restricted to the tagged instances of this group and
   * then used to express "sched" on the tagged iteration domain.
   */
  tagger = isl_union_pw_multi_aff_copy(prog->scop->tagger);
  domain = isl_union_map_domain(isl_union_map_copy(tagged));
  tagger = isl_union_pw_multi_aff_intersect_domain(tagger,
                                                   isl_union_set_copy(domain));
  sched = isl_union_map_preimage_domain_union_pw_multi_aff(sched, tagger);

  /* Construct the relation "local"
   * [[D -> R] -> [D' -> R']]
   * D contains all the iteration domains accessing the elements in the group.
   * It pairs up the tagged instances that are mapped to the same "sched"
   * iteration.
   */
  local = isl_union_map_apply_range(sched,
                                    isl_union_map_reverse(isl_union_map_copy(sched)));
  /* Derive the local dependence set. */
  local = isl_union_map_intersect(local,
                                  isl_union_map_copy(prog->scop->tagged_dep_flow));
  /* Remember whether there is any local flow dependence at all;
   * a negative value signals an isl error and is handled further down.
   */
  empty = isl_union_map_is_empty(local);

  /* Restrict the flow dependences to those whose sink (for reads) or
   * source (for writes) lies in a tagged space of this group whose
   * statement also appears in the domain of "access".
   */
  external = isl_union_map_copy(prog->scop->tagged_dep_flow);
  universe = isl_union_map_universe(isl_union_map_copy(access));
  access_domain = isl_union_map_domain(universe);
  domain = isl_union_set_universe(domain);
  universe = isl_union_set_unwrap(domain);
  universe = isl_union_map_intersect_domain(universe, access_domain);
  domain = isl_union_map_wrap(universe);
  if (read)
    external = isl_union_map_intersect_range(external, domain);
  else
    external = isl_union_map_intersect_domain(external, domain);
  external = isl_union_map_intersect_params(external,
                                            isl_set_copy(prog->scop->context));
  external = isl_union_map_subtract(external, local);
  /* So far "external" contains only iteration elements accessing the 
   * non-local RAW pairs. */

  /* Turn the tagged instances back into an access relation. */
  if (read)
  {
    tag_set = isl_union_map_range(external);
    external = wrapped_reference_to_access(tag_set, tagged);
    /* Temporarily disabled, live-in accesses are not considered so far:
     *   external = isl_union_map_union(external,
     *       isl_union_map_copy(prog->scop->live_in));
     */
  }
  else
  {
    tag_set = isl_union_map_domain(external);
    external = wrapped_reference_to_access(tag_set, tagged);
    /* Temporarily disabled, live-out accesses are not considered so far:
     *   external = isl_union_map_union(external,
     *       isl_union_map_copy(prog->scop->live_out));
     */
  }

  /* On error, propagate NULL. If there are no local dependences at all,
   * nothing can be removed, so replace "external" by its universe such that
   * the intersection below keeps "access" in the spaces of "external".
   */
  if (empty < 0)
    external = isl_union_map_free(external);
  else if (empty)
    external = isl_union_map_universe(external);

  access = isl_union_map_intersect(access, external);

  return access;
}

/* Filter the access relation "access" of "group": drop the reads
 * (if "read" is 1) or the writes (if "read" is 0) that only carry data
 * between statement instances of the same iteration of the schedule
 * "prefix", i.e., the schedule at the position where the copying of the
 * group is inserted (its output dimension equals tile->depth).
 * The domain of "prefix" consists of the original statement instances,
 * the same ones that appear in the domains of the access relations.
 *
 * "access" is consumed, "prefix" is only copied.
 * An empty "access" is handed back untouched. Otherwise the tagged access
 * relation of "group" is extracted and the work is delegated to
 * remove_local_accesses_flow.
 */
__isl_give isl_union_map *remove_local_accesses_group_flow(
  struct autosa_kernel *kernel, struct autosa_array_ref_group *group,
  __isl_take isl_union_map *access, __isl_keep isl_union_map *prefix,
  int read)
{
  isl_union_map *group_tagged;
  isl_union_map *prefix_copy;

  if (isl_union_map_is_empty(access))
    return access;

  group_tagged = group_tagged_access_relation(group);
  prefix_copy = isl_union_map_copy(prefix);

  return remove_local_accesses_flow(kernel->prog, group_tagged, access,
                                    prefix_copy, read);
}

/* Filter the access relation "access" of "group": drop the reads
 * (if "read" is 1) or the writes (if "read" is 0) that only carry data
 * between statement instances of the same iteration of the schedule
 * "prefix", i.e., the schedule at the position where the copying of the
 * group is inserted (its output dimension equals tile->depth).
 * The domain of "prefix" consists of the original statement instances,
 * the same ones that appear in the domains of the access relations.
 *
 * "access" is consumed, "prefix" is only copied.
 * An empty "access" is handed back untouched. Otherwise the tagged access
 * relation of "group" is extracted and the work is delegated to
 * remove_local_accesses.
 */
__isl_give isl_union_map *remove_local_accesses_group(
    struct autosa_kernel *kernel, struct autosa_array_ref_group *group,
    __isl_take isl_union_map *access, __isl_keep isl_union_map *prefix,
    int read)
{
  isl_union_map *group_tagged;
  isl_union_map *prefix_copy;

  if (isl_union_map_is_empty(access))
    return access;

  group_tagged = group_tagged_access_relation(group);
  prefix_copy = isl_union_map_copy(prefix);

  return remove_local_accesses(kernel->prog, group_tagged, access,
                               prefix_copy, read);
}

/* Compute the access relation for the access "ref" of "group".
 * Return the map D -> [S -> A]
 * where D is the iteration domain, S is the scheduling domains with the depth
 * of "node".
 *
 * "read" selects the read accesses (1) or the write accesses (0).
 * For internal arrays (AUTOSA_INT_ARRAY), accesses that only communicate data
 * within one iteration of the prefix schedule are removed first.
 * For an I/O group with an attached drain group, the drain accesses of "ref"
 * are added to the write accesses.
 *
 * If "group" is neither an I/O group nor a drain group, the access relation
 * is empty, which matches what io_comm_access computes for such a group.
 */
__isl_give isl_union_map *io_comm_access_ref(
    struct autosa_kernel *kernel, __isl_keep isl_schedule_node *node,
    struct autosa_array_ref_group *group,
    struct autosa_stmt_access *ref,
    int read)
{
  isl_union_map *prefix;
  isl_union_map *access;

  /* Prefix schedule at "node", expressed on the original (uncontracted)
   * statement instances.
   */
  prefix = isl_schedule_node_get_prefix_schedule_relation(node);
  prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix,
                                                            isl_union_pw_multi_aff_copy(kernel->contraction));
  if (group->group_type == AUTOSA_IO_GROUP) {
    access = autosa_io_group_ref_access_relation(group, ref, read, !read);
  } else if (group->group_type == AUTOSA_DRAIN_GROUP) {
    access = autosa_drain_group_ref_access_relation(
        group, ref, read, !read, kernel->expanded_domain);
  } else {
    /* Do not leave "access" uninitialized for other group types. */
    access = isl_union_map_empty(isl_map_get_space(group->access));
  }

  if (group->local_array->array_type == AUTOSA_INT_ARRAY)
    access = remove_local_accesses_group_flow(kernel, group, access, prefix, read);

  if (group->group_type == AUTOSA_IO_GROUP && group->attached_drain_group && !read) {
    // TODO: temporary solution. We assume the io group and attached drain group
    // always share the same access. Could be buggy.
    access = isl_union_map_union(access, 
                                 autosa_drain_group_ref_access_relation(group->attached_drain_group, ref, read, !read, kernel->expanded_domain));
  }

  /* D -> [S -> A] */
  access = isl_union_map_range_product(prefix, access);

  return access;
}

/* Collect the access relation of every reference in "group".
 * The result is the map D -> [S -> A], with D the iteration domain and
 * S the schedule dimensions down to the depth of "node".
 *
 * "read" selects the read accesses (1) or the write accesses (0).
 * The accesses of an attached drain group, if any, are included as well.
 * For internal arrays (AUTOSA_INT_ARRAY), accesses that only communicate
 * data within one iteration of the prefix schedule are removed.
 */
__isl_give isl_union_map *io_comm_access(
    struct autosa_kernel *kernel, __isl_keep isl_schedule_node *node,
    struct autosa_array_ref_group *group, int read)
{
  isl_union_map *sched_prefix;
  isl_union_map *collected;
  struct autosa_array_ref_group *drain;

  /* Prefix schedule at "node" on the original statement instances. */
  sched_prefix = isl_schedule_node_get_prefix_schedule_relation(node);
  sched_prefix = isl_union_map_preimage_domain_union_pw_multi_aff(
      sched_prefix, isl_union_pw_multi_aff_copy(kernel->contraction));

  /* Accumulate the per-reference relations of the group itself. */
  collected = isl_union_map_empty(isl_map_get_space(group->access));
  for (int r = 0; r < group->n_ref; r++)
  {
    struct autosa_stmt_access *cur = group->refs[r];
    isl_union_map *piece;

    switch (group->group_type)
    {
    case AUTOSA_IO_GROUP:
      piece = autosa_io_group_ref_access_relation(group, cur, read, !read);
      collected = isl_union_map_union(collected, piece);
      break;
    case AUTOSA_DRAIN_GROUP:
      piece = autosa_drain_group_ref_access_relation(
          group, cur, read, !read, kernel->expanded_domain);
      collected = isl_union_map_union(collected, piece);
      break;
    default:
      break;
    }
  }

  /* Add the references of the attached drain group, if there is one. */
  drain = group->attached_drain_group;
  if (drain)
  {
    for (int r = 0; r < drain->n_ref; r++)
    {
      isl_union_map *piece = autosa_drain_group_ref_access_relation(
          drain, drain->refs[r], read, !read, kernel->expanded_domain);
      collected = isl_union_map_union(collected, piece);
    }
  }

  if (group->local_array->array_type == AUTOSA_INT_ARRAY)
    collected = remove_local_accesses_group_flow(kernel, group, collected,
                                                 sched_prefix, read);

  return isl_union_map_range_product(sched_prefix, collected);
}

/* Release a "struct autosa_array_ref_group_pair" that is passed as an
 * opaque pointer. Only the pair itself is released, nothing it refers to.
 */
void free_group_pair(void *user)
{
  free((struct autosa_array_ref_group_pair *)user);
}

/* Create a register tiling at the "node" for array reference "ref".
 *
 * A new tile with group->array->n_index dimensions is created. The access
 * relation of "ref" is expressed in terms of the prefix schedule at "node"
 * (restricted to the domain of "node") and handed to can_tile together with
 * the new tile. Finally, the tiling of the tile is computed by
 * autosa_array_ref_group_compute_tiling.
 *
 * The returned tile is owned by the caller.
 *
 * NOTE(review): the result of can_tile is stored in "ok" but never checked,
 * so the tile is returned even if can_tile reports failure or an error.
 * Callers presumably only invoke this function when tiling is known to
 * succeed - confirm.
 */
struct autosa_array_tile *create_register_tiling(
    isl_schedule_node *node,
    struct autosa_array_ref_group *group,
    struct autosa_stmt_access *ref)
{
  isl_union_map *access;
  isl_map *access_i;
  isl_ctx *ctx;
  isl_union_map *sched;
  isl_bool ok;
  struct autosa_array_tile *tile;
  isl_union_set *domain;

  ctx = isl_schedule_node_get_ctx(node);
  access = isl_union_map_from_map(isl_map_copy(ref->access));
  tile = autosa_array_tile_create(ctx, group->array->n_index);
  /* Prefix schedule at "node", restricted to the instances that reach it. */
  sched = isl_schedule_node_get_prefix_schedule_union_map(node);
  domain = isl_schedule_node_get_domain(node);
  sched = isl_union_map_intersect_domain(sched, domain);
  /* Replace the statement instances by their schedule coordinates. */
  access = isl_union_map_apply_domain(access, sched);
  access_i = isl_map_from_union_map(access);
  /* presumably fills in the bounds of "tile" - the result is not checked. */
  ok = can_tile(access_i, tile);
  isl_map_free(access_i);
  autosa_array_ref_group_compute_tiling(tile, group);

  return tile;
}

/* Rebuild the extent of "array" from its per-dimension bounds.
 * The result lives in the space of array->extent and constrains every
 * index i to 0 <= i < bound_i; it may be simpler than the original extent.
 */
static __isl_give isl_set *array_extent(struct autosa_array_info *array)
{
  isl_id *array_id = isl_set_get_tuple_id(array->extent);
  isl_space *extent_space = isl_set_get_space(array->extent);
  isl_set *result = isl_set_universe(isl_space_copy(extent_space));
  isl_local_space *index_ls = isl_local_space_from_space(extent_space);

  for (int dim = 0; dim < array->n_index; ++dim)
  {
    isl_pw_aff *idx;
    isl_pw_aff *upper;

    /* index >= 0 */
    result = isl_set_lower_bound_si(result, isl_dim_set, dim, 0);

    /* The index itself, as a function on the array space. */
    idx = isl_pw_aff_from_aff(
        isl_aff_var_on_domain(isl_local_space_copy(index_ls),
                              isl_dim_set, dim));

    /* The (parametric) size, lifted to a function on the array space. */
    upper = isl_multi_pw_aff_get_pw_aff(array->bound, dim);
    upper = isl_pw_aff_from_range(upper);
    upper = isl_pw_aff_add_dims(upper, isl_dim_in, array->n_index);
    upper = isl_pw_aff_set_tuple_id(upper, isl_dim_in,
                                    isl_id_copy(array_id));

    /* index < size */
    result = isl_set_intersect(result, isl_pw_aff_lt_set(idx, upper));
  }

  isl_local_space_free(index_ls);
  isl_id_free(array_id);

  return result;
}

/* Relate the first group->local_tile->depth dimensions of the computed
 * schedule to the part of the array in global memory that is covered by
 * the local memory copy of "group".
 *
 * The result is a map
 *
 *	{ D[i] -> A[a] }
 *
 * subject to
 *
 *	tile_offset(i) <= a <= tile_offset(i) + tile_size - 1		(1)
 *
 * and
 *
 *	0 <= a <= array_size - 1					(2)
 *
 * If a stride was detected (group->local_tile->bound[i].shift is set),
 * then a in (1) is the shifted and scaled down version.
 *
 * (1) comes from pulling the box 0 <= a' <= tile_size - 1 on the local
 * memory tile back through the tiling function.
 * (2) comes from the (recomputed) extent of the array.
 */
__isl_give isl_map *group_tile(struct autosa_array_ref_group *group)
{
  const int n_dims = group->array->n_index;
  isl_set *box;
  isl_map *result;
  isl_set *array_box;

  /* Box 0 <= a' <= size - 1 in the space of the local memory copy. */
  box = isl_set_universe(
      isl_space_range(isl_multi_aff_get_space(group->local_tile->tiling)));
  for (int dim = 0; dim < n_dims; ++dim)
  {
    isl_val *last = isl_val_sub_ui(
        isl_val_copy(group->local_tile->bound[dim].size), 1);

    box = isl_set_lower_bound_si(box, isl_dim_set, dim, 0);
    box = isl_set_upper_bound_val(box, isl_dim_set, dim, last);
  }

  /* Constraints (1): pull the box back to [D -> A] and unwrap it. */
  box = isl_set_preimage_multi_aff(
      box, isl_multi_aff_copy(group->local_tile->tiling));
  result = isl_set_unwrap(box);

  /* Constraints (2): stay inside the array. */
  array_box = array_extent(group->array);
  result = isl_map_intersect_range(result, array_box);

  return result;
}

/* Relate the first tile->depth dimensions of the computed schedule to the
 * part of the array of "group" in global memory that is covered by the
 * local memory copy described by "tile".
 *
 * The result is a map
 *
 *	{ D[i] -> A[a] }
 *
 * subject to
 *
 *	tile_offset(i) <= a <= tile_offset(i) + tile_size - 1		(1)
 *
 * and
 *
 *	0 <= a <= array_size - 1					(2)
 *
 * If a stride was detected (tile->bound[i].shift is set),
 * then a in (1) is the shifted and scaled down version.
 *
 * (1) comes from pulling the box 0 <= a' <= tile_size - 1 on the local
 * memory tile back through tile->tiling.
 * (2) comes from the (recomputed) extent of the array.
 */
__isl_give isl_map *group_tile_buffer(struct autosa_array_ref_group *group,
                                      struct autosa_array_tile *tile)
{
  const int n_dims = group->array->n_index;
  isl_set *box;
  isl_map *result;
  isl_set *array_box;

  /* Box 0 <= a' <= size - 1 in the space of the local memory copy. */
  box = isl_set_universe(
      isl_space_range(isl_multi_aff_get_space(tile->tiling)));
  for (int dim = 0; dim < n_dims; ++dim)
  {
    isl_val *last = isl_val_sub_ui(isl_val_copy(tile->bound[dim].size), 1);

    box = isl_set_lower_bound_si(box, isl_dim_set, dim, 0);
    box = isl_set_upper_bound_val(box, isl_dim_set, dim, last);
  }

  /* Constraints (1): pull the box back to [D -> A] and unwrap it. */
  box = isl_set_preimage_multi_aff(box, isl_multi_aff_copy(tile->tiling));
  result = isl_set_unwrap(box);

  /* Constraints (2): stay inside the array. */
  array_box = array_extent(group->array);
  result = isl_map_intersect_range(result, array_box);

  return result;
}

/* Return the data packing factor used to transfer the data of "group" across
 * "module".
 *
 * For anything that is not a PE (no "dummy_module" and "module" is not of
 * type PE_MODULE), the factor is module->data_pack_inter.
 * For a PE module or a PE dummy module:
 * - external array: the SIMD lane of the group (group->n_lane);
 * - internal array, drain group: group->n_lane;
 * - internal array, I/O group with exterior I/O (AUTOSA_EXT_IO): group->n_lane;
 * - otherwise: the packing factor of the first I/O buffer
 *   (group->io_buffers[0]->n_lane).
 *
 * At least one of "module" and "dummy_module" must be non-NULL, since
 * "module" is dereferenced whenever the PE case does not apply.
 */
int get_io_group_n_lane(struct autosa_hw_module *module,
                        struct autosa_pe_dummy_module *dummy_module,
                        struct autosa_array_ref_group *group)
{
  const int inside_pe = (module && module->type == PE_MODULE) || dummy_module;

  if (!inside_pe)
    return module->data_pack_inter;

  if (group->local_array->array_type == AUTOSA_EXT_ARRAY)
    return group->n_lane;
  if (group->group_type == AUTOSA_DRAIN_GROUP)
    return group->n_lane;
  if (group->io_type == AUTOSA_EXT_IO)
    return group->n_lane;

  return group->io_buffers[0]->n_lane;
}

/* Given a description of an array tile "tile" and the "space"
 *
 *	{ D -> A }
 *
 * where D represents the first "depth" schedule dimensions
 * and A represents the array, construct an isl_multi_aff
 *
 *	{ [D[i] -> A[a]] -> A'[a'] }
 *
 * with A' a scaled down copy of A according to the shifts and strides
 * in "tile".  In particular,
 *
 *	a' = (a + shift(i))/stride
 *
 * Array dimensions for which no shift was recorded in "tile"
 * (bound->shift is NULL) are left untouched, i.e., they get a zero shift
 * and a stride of one. Without this fallback a NULL element would be
 * stored in the multi-expressions, turning the entire result into NULL.
 *
 * "insert_array" represents
 *
 *	{ [D -> A] -> D }
 *
 * and is used to insert A into the domain of functions that only
 * reference D.
 *
 * "space" and "insert_array" are not consumed.
 */
static __isl_give isl_multi_aff *strided_tile_depth(
    struct autosa_array_tile *tile, __isl_keep isl_space *space,
    __isl_keep isl_multi_aff *insert_array, int depth)
{
  int i;
  isl_ctx *ctx;
  isl_multi_aff *shift;
  isl_multi_val *stride;
  isl_space *space2;
  isl_local_space *ls;
  isl_multi_aff *tiling;

  ctx = isl_space_get_ctx(space);
  /* Local space of D, used for the zero shifts. */
  space2 = isl_space_domain(isl_space_copy(space));
  ls = isl_local_space_from_space(space2);
  space2 = isl_space_range(isl_space_copy(space));
  stride = isl_multi_val_zero(space2);
  shift = isl_multi_aff_zero(isl_space_copy(space));

  for (i = 0; i < tile->n; ++i)
  {
    struct autosa_array_bound *bound = &tile->bound[i];
    isl_val *stride_i;
    isl_aff *shift_i;

    if (bound->shift)
    {
      stride_i = isl_val_copy(bound->stride);
      shift_i = isl_aff_copy(bound->shift);
      /* The recorded shift is extended from tile->depth to "depth"
       * input dimensions by inserting the extra dimensions at the end.
       */
      shift_i = isl_aff_insert_dims(shift_i, isl_dim_in, tile->depth, depth - tile->depth);
    }
    else
    {
      /* No stride detected in this dimension: identity scaling.
       * "ls" already has "depth" dimensions, so nothing is inserted.
       */
      stride_i = isl_val_one(ctx);
      shift_i = isl_aff_zero_on_domain(isl_local_space_copy(ls));
    }

    stride = isl_multi_val_set_val(stride, i, stride_i);
    shift = isl_multi_aff_set_aff(shift, i, shift_i);
  }
  isl_local_space_free(ls);

  /* { D -> A } shifts become functions on [D -> A]. */
  shift = isl_multi_aff_pullback_multi_aff(shift,
                                           isl_multi_aff_copy(insert_array));

  /* a' = (a + shift(i)) / stride */
  tiling = isl_multi_aff_range_map(isl_space_copy(space));
  tiling = isl_multi_aff_add(tiling, shift);
  tiling = isl_multi_aff_scale_down_multi_val(tiling, stride);

  return tiling;
}

/* Recompute the tiling function of "tile" for a schedule domain that is
 * extended to "depth" dimensions.
 *
 * The result maps [D[i] -> A[a]] to the position inside the local copy:
 * the (possibly shifted and scaled) array index minus the lower bound of
 * the tile. The output tuple is named after "group".
 * Shifts and lower bounds of "tile" are extended from tile->depth to "depth"
 * input dimensions.
 *
 * Return NULL if "tile" is NULL.
 */
__isl_give isl_multi_aff *autosa_array_ref_group_recompute_tiling(
    struct autosa_array_tile *tile,
    struct autosa_array_ref_group *group,
    int depth)
{
  isl_space *map_space;
  isl_multi_aff *result, *lower, *drop_array;
  isl_printer *printer;
  char *name;
  int has_shift = 0;

  if (!tile)
    return NULL;

  /* Build D[i] -> A[a] */
  map_space = isl_space_range(isl_map_get_space(group->access));
  map_space = isl_space_from_range(map_space);
  map_space = isl_space_add_dims(map_space, isl_dim_in, depth);
  /* Build [D[i] -> A[a]] -> D[i] */
  drop_array = isl_multi_aff_domain_map(isl_space_copy(map_space));

  /* Is there any dimension with a detected stride? */
  for (int k = 0; k < tile->n && !has_shift; ++k)
    has_shift = tile->bound[k].shift != NULL;

  if (has_shift)
    result = strided_tile_depth(tile, map_space, drop_array, depth);
  else
    result = isl_multi_aff_range_map(isl_space_copy(map_space));

  /* Lower bounds of the tile, extended to "depth" input dimensions. */
  lower = isl_multi_aff_zero(map_space);
  for (int k = 0; k < tile->n; ++k)
  {
    isl_aff *lower_k = isl_aff_insert_dims(isl_aff_copy(tile->bound[k].lb),
                                           isl_dim_in, tile->depth,
                                           depth - tile->depth);
    lower = isl_multi_aff_set_aff(lower, k, lower_k);
  }
  lower = isl_multi_aff_pullback_multi_aff(lower, drop_array);

  result = isl_multi_aff_sub(result, lower);

  /* Name the output tuple after the group. */
  printer = isl_printer_to_str(isl_multi_aff_get_ctx(result));
  printer = autosa_array_ref_group_print_name(group, printer);
  name = isl_printer_get_str(printer);
  isl_printer_free(printer);
  result = isl_multi_aff_set_tuple_name(result, isl_dim_out, name);
  free(name);

  return result;
}

void print_io_grouping_info(FILE *fp, struct autosa_kernel *kernel)
{
  const char *io_types[] = {"AUTOSA_INT_IO", "AUTOSA_EXT_IO", "AUTOSA_UNKNOWN_IO"};

  fprintf(fp, "================= IO Grouping Information =================\n");
  for (int i = 0; i < kernel->n_array; i++) {
    struct autosa_local_array_info *local = &kernel->array[i];
    fprintf(fp, "===================================================\n");
    fprintf(fp, "Array: %s\n", local->array->name);
    fprintf(fp, "===================================================\n");
    fprintf(fp, "local: %d\n", local->array->local);
    fprintf(fp, "n_io_groups: %d\n", local->n_io_group);
    fprintf(fp, "n_drain_groups: %d\n", local->drain_group? 1 : 0);
    for (int j = 0; j < local->n_io_group; j++) {
      struct autosa_array_ref_group *group = local->io_groups[j];
      fprintf(fp, "------------------------------\n");
      fprintf(fp, "IO Group: %d\n", j);
      fprintf(fp, "------------------------------\n");
      fprintf(fp, "copy_in: %d\n", group->copy_in);
      fprintf(fp, "copy_out: %d\n", group->copy_out);
      fprintf(fp, "io_type: %s\n", io_types[group->io_type]);
      char *vec_str = isl_vec_to_str(group->dir);
      fprintf(fp, "io_dir: %s\n", vec_str);
      free(vec_str);
    }
    if (local->drain_group) {
      struct autosa_array_ref_group *group = local->drain_group;
      fprintf(fp, "------------------------------\n");
      fprintf(fp, "Drain Group\n");
      fprintf(fp, "------------------------------\n");
      fprintf(fp, "copy_in: %d\n", group->copy_in);
      fprintf(fp, "copy_out: %d\n", group->copy_out);
      fprintf(fp, "io_type: %s\n", io_types[group->io_type]);
      char *vec_str = isl_vec_to_str(group->dir);
      fprintf(fp, "io_dir: %s\n", vec_str);
      free(vec_str);      
    }
  }
  fprintf(fp, "================= IO Grouping Information =================\n");
}

================================================
FILE: src/autosa_comm.h
================================================
/* Declarations of the helpers that operate on array reference groups:
 * access relations of groups, removal of local accesses, tiling helpers
 * and the I/O grouping report.
 */
#ifndef _AUTOSA_COMM_H
#define _AUTOSA_COMM_H

#include "autosa_common.h"

#if defined(__cplusplus)
extern "C" {
#endif   

isl_stat sa_io_construct_optimize(struct autosa_kernel *kernel, struct autosa_gen *gen);
enum autosa_group_access_type autosa_array_ref_group_type(
	struct autosa_array_ref_group *group);
enum autosa_group_access_type autosa_cpu_array_ref_group_type(
	struct autosa_array_ref_group *group);	
struct autosa_array_tile *autosa_array_ref_group_tile(
	struct autosa_array_ref_group *group);  
__isl_give isl_printer *autosa_array_ref_group_print_name(
	struct autosa_array_ref_group *group, __isl_take isl_printer *p);
__isl_give isl_union_map *autosa_io_group_ref_access_relation(
  struct autosa_array_ref_group *group,
  struct autosa_stmt_access *ref,
  int read, int write);
__isl_give isl_union_map *autosa_array_ref_group_access_relation(
	struct autosa_array_ref_group *group, int read, int write);	
__isl_give isl_union_map *autosa_io_group_access_relation(
  struct autosa_array_ref_group *group, 
  struct autosa_kernel *kernel,
  int read, int write);
__isl_give isl_union_map *autosa_drain_group_ref_access_relation(
  struct autosa_array_ref_group *group,
  struct autosa_stmt_access *ref,
  int read, int write, __isl_keep isl_union_set *domain);	
__isl_give isl_union_map *group_tagged_access_relation(
	struct autosa_array_ref_group *group);
/* Remove from "access" the reads ("read" is 1) or writes ("read" is 0) that
 * only communicate data within one iteration of "sched", considering RAW
 * dependences only and without adding back live-in/live-out accesses.
 */
__isl_give isl_union_map *remove_local_accesses_flow(
	struct autosa_prog *prog, __isl_take isl_union_map *tagged,
	__isl_take isl_union_map *access, __isl_take isl_union_map *sched,
	int read);	
__isl_give isl_union_map *wrapped_reference_to_access(
	__isl_take isl_union_set *ref, __isl_take isl_union_map *tagged);	
__isl_give isl_union_map *remove_local_accesses(
	struct autosa_prog *prog, __isl_take isl_union_map *tagged,
	__isl_take isl_union_map *access, __isl_take isl_union_map *sched,
	int read);	
/* Wrapper around remove_local_accesses_flow that extracts the tagged access
 * relation from "group" and copies "prefix".
 */
__isl_give isl_union_map *remove_local_accesses_group_flow(
	struct autosa_kernel *kernel, struct autosa_array_ref_group *group,
	__isl_take isl_union_map *access, __isl_keep isl_union_map *prefix,
	int read);
/* Wrapper around remove_local_accesses that extracts the tagged access
 * relation from "group" and copies "prefix".
 */
__isl_give isl_union_map *remove_local_accesses_group(
	struct autosa_kernel *kernel, struct autosa_array_ref_group *group,
	__isl_take isl_union_map *access, __isl_keep isl_union_map *prefix,
	int read);	
/* Access relation D -> [S -> A] of the single reference "ref" of "group",
 * with S the schedule dimensions down to the depth of "node".
 */
__isl_give isl_union_map *io_comm_access_ref(
  struct autosa_kernel *kernel, __isl_keep isl_schedule_node *node,
  struct autosa_array_ref_group *group, 
  struct autosa_stmt_access *ref,
  int read);	
/* Access relation D -> [S -> A] of all references of "group",
 * with S the schedule dimensions down to the depth of "node".
 */
__isl_give isl_union_map *io_comm_access(
  struct autosa_kernel *kernel, __isl_keep isl_schedule_node *node,
  struct autosa_array_ref_group *group, int read);	
/* Release a "struct autosa_array_ref_group_pair" passed as "user". */
void free_group_pair(void *user);
/* Create a tile for "ref" based on the prefix schedule at "node". */
struct autosa_array_tile *create_register_tiling(
  isl_schedule_node *node,
  struct autosa_array_ref_group *group,
  struct autosa_stmt_access *ref);
/* Map from the schedule dimensions of group->local_tile to the array
 * elements covered by the local memory copy.
 */
__isl_give isl_map *group_tile(struct autosa_array_ref_group *group);	
/* Same as group_tile, but for the explicitly given "tile". */
__isl_give isl_map *group_tile_buffer(struct autosa_array_ref_group *group,
  struct autosa_array_tile *tile);
/* Data packing factor used to transfer the data of "group" across "module"
 * (or across a PE dummy module if "dummy_module" is set).
 */
int get_io_group_n_lane(struct autosa_hw_module *module, 
  struct autosa_pe_dummy_module *dummy_module,
  struct autosa_array_ref_group *group);
/* Tiling function of "tile" with the schedule domain extended to "depth"
 * dimensions; NULL if "tile" is NULL.
 */
__isl_give isl_multi_aff *autosa_array_ref_group_recompute_tiling(
  struct autosa_array_tile *tile,
  struct autosa_array_ref_group *group,
  int depth);  
isl_bool is_io_module_valid(
  __isl_keep isl_schedule_node *node,  
  struct autosa_kernel *kernel, 
  struct autosa_array_ref_group *group, int read);  
/* Print the I/O grouping of every array of "kernel" to "fp". */
void print_io_grouping_info(FILE *fp, struct autosa_kernel *kernel);

#if defined(__cplusplus)
}
#endif 

#endif

================================================
FILE: src/autosa_common.cpp
================================================
/* Defines functions used for AutoSA structs. */

#include <isl/id.h>
#include <cJSON/cJSON.h>

#include "autosa_common.h"
#include "autosa_utils.h"
#include "autosa_print.h"

/****************************************************************
 * AutoSA kernel
 ****************************************************************/
/* Free the AutoSA kernel struct. */
void *autosa_kernel_free(struct autosa_kernel *kernel)
{
  if (!kernel)
    return NULL;

  isl_schedule_free(kernel->schedule);
  isl_ast_node_free(kernel->tree);
  isl_union_map_free(kernel->sizes);
  isl_union_map_free(kernel->used_sizes);
  isl_union_set_free(kernel->core);
  isl_set_free(kernel->context);
  isl_multi_pw_aff_free(kernel->sa_grid_size);
  isl_union_set_free(kernel->arrays);
  isl_union_pw_multi_aff_free(kernel->copy_schedule);
  isl_space_free(kernel->space);
  isl_id_list_free(kernel->block_ids);
  isl_id_list_free(kernel->thread_ids);
  isl_id_list_free(kernel->pe_ids);
  isl_union_set_free(kernel->pe_filter);
  isl_multi_pw_aff_free(kernel->grid_size);
  isl_ast_expr_free(kernel->grid_size_expr);
  isl_union_pw_multi_aff_free(kernel->contraction);
  isl_union_set_free(kernel->expanded_domain);
  isl_set_free(kernel->host_domain);
  isl_union_set_free(kernel->domain);
  for (int i = 0; i < kernel->n_array; ++i)
  {
    struct autosa_local_array_info *array = &kernel->array[i];
    for (int j = 0; j < array->n_group; ++j)
      autosa_array_ref_group_free(array->groups[j]);
    free(array->groups);    
    for (int j = 0; j < array->n_pe_group; ++j)
      autosa_array_ref_group_free(array->pe_groups[j]);
    free(array->pe_groups);
    for (int j = 0; j < array->n_io_group; ++j)
      autosa_array_ref_group_free(array->io_groups[j]);
    free(array->io_groups);
    autosa_array_ref_group_free(array->drain_group);

    isl_multi_pw_aff_free(array->bound);
    isl_ast_expr_free(array->bound_expr);
    
    isl_pw_qpolynomial_free(array->serialize_bound);
  }
  if (kernel->array) {
    delete[] kernel->array;
    //free(kernel->array);
  }

  for (int i = 0; i < kernel->n_var; i++)
  {
    free(kernel->var[i].name);
    isl_vec_free(kernel->var[i].size);
  }
  free(kernel->var);
  delete kernel->tuning_program;

  free(kernel);
  return NULL;
}

/* Create a copy of the AutoSA kernel struct "kernel".
 *
 * The isl objects are copied (reference counted), the scalar fields are
 * copied by value. The fields "ctx", "scop", "prog" and "options" refer to
 * the same objects as in "kernel".
 *
 * NOTE: "array", "var" and "tuning_program" are shared with "kernel"
 * (shallow copy). Since autosa_kernel_free releases these fields, calling it
 * on both the original and the copy releases them twice.
 *
 * Return NULL if "kernel" is NULL or if the allocation fails.
 */
struct autosa_kernel *autosa_kernel_copy(struct autosa_kernel *kernel)
{
  struct autosa_kernel *kernel_dup;

  if (!kernel)
    return NULL;

  /* Zero-initialize such that no field is left with an indeterminate value. */
  kernel_dup = (struct autosa_kernel *)calloc(1, sizeof(struct autosa_kernel));
  if (!kernel_dup)
    return NULL;

  kernel_dup->ctx = kernel->ctx;
  kernel_dup->schedule = isl_schedule_copy(kernel->schedule);
  kernel_dup->scop = kernel->scop;
  /* "prog" used to be left uninitialized in the copy. */
  kernel_dup->prog = kernel->prog;
  kernel_dup->options = kernel->options;
  kernel_dup->n_sa_dim = kernel->n_sa_dim;
  for (int i = 0; i < kernel->n_sa_dim; i++)
  {
    kernel_dup->sa_dim[i] = kernel->sa_dim[i];
  }
  kernel_dup->array_part_w = kernel->array_part_w;
  kernel_dup->space_w = kernel->space_w;
  kernel_dup->time_w = kernel->time_w;
  kernel_dup->type = kernel->type;
  kernel_dup->sa_grid_size = isl_multi_pw_aff_copy(kernel->sa_grid_size);
  kernel_dup->sizes = isl_union_map_copy(kernel->sizes);
  kernel_dup->used_sizes = isl_union_map_copy(kernel->used_sizes);
  kernel_dup->id = kernel->id;
  kernel_dup->space_time_id = kernel->space_time_id;
  kernel_dup->core = isl_union_set_copy(kernel->core);
  kernel_dup->arrays = isl_union_set_copy(kernel->arrays);
  kernel_dup->n_array = kernel->n_array;
  /* Shared with the original, see the note above. */
  kernel_dup->array = kernel->array;
  kernel_dup->copy_schedule = isl_union_pw_multi_aff_copy(kernel->copy_schedule);
  kernel_dup->copy_schedule_dim = kernel->copy_schedule_dim;
  kernel_dup->space = isl_space_copy(kernel->space);
  kernel_dup->tree = isl_ast_node_copy(kernel->tree);
  kernel_dup->n_var = kernel->n_var;
  /* Shared with the original, see the note above. */
  kernel_dup->var = kernel->var;
  kernel_dup->block_ids = isl_id_list_copy(kernel->block_ids);
  kernel_dup->thread_ids = isl_id_list_copy(kernel->thread_ids);
  kernel_dup->pe_ids = isl_id_list_copy(kernel->pe_ids);
  kernel_dup->pe_filter = isl_union_set_copy(kernel->pe_filter);
  kernel_dup->n_grid = kernel->n_grid;
  kernel_dup->n_block = kernel->n_block;
  for (int i = 0; i < kernel->n_grid; i++)
  {
    kernel_dup->grid_dim[i] = kernel->grid_dim[i];
  }
  for (int i = 0; i < kernel->n_block; i++)
  {
    kernel_dup->block_dim[i] = kernel->block_dim[i];
  }
  kernel_dup->grid_size = isl_multi_pw_aff_copy(kernel->grid_size);
  kernel_dup->grid_size_expr = isl_ast_expr_copy(kernel->grid_size_expr);
  kernel_dup->context = isl_set_copy(kernel->context);
  kernel_dup->contraction = isl_union_pw_multi_aff_copy(kernel->contraction);
  kernel_dup->expanded_domain = isl_union_set_copy(kernel->expanded_domain);
  kernel_dup->host_domain = isl_set_copy(kernel->host_domain);
  kernel_dup->domain = isl_union_set_copy(kernel->domain);
  kernel_dup->single_statement = kernel->single_statement;
  kernel_dup->sparse = kernel->sparse;
  kernel_dup->vec_len = kernel->vec_len;
  kernel_dup->n_nzero = kernel->n_nzero;
  kernel_dup->compress_ratio = kernel->compress_ratio;
  kernel_dup->n_meta_data = kernel->n_meta_data;
  kernel_dup->eff_compress_ratio = kernel->eff_compress_ratio;

  // TODO: Deep-copy
  kernel_dup->tuning_program = kernel->tuning_program;

  return kernel_dup;
}

/* Allocate a new AutoSA kernel struct with the given schedule.
 *
 * The kernel takes ownership of "schedule". All other fields are
 * zero/NULL, except for "ctx", which is taken from the schedule, and
 * "copy_schedule_dim", which is set to -1.
 *
 * Return NULL if the allocation fails; "schedule" is freed in that case.
 */
struct autosa_kernel *autosa_kernel_from_schedule(__isl_take isl_schedule *schedule)
{
  /* calloc instead of malloc such that fields that are not explicitly
   * assigned below (e.g. space_time_id, sa_dim, grid_dim, block_dim)
   * do not hold indeterminate values.
   */
  struct autosa_kernel *kernel = (struct autosa_kernel *)calloc(
      1, sizeof(struct autosa_kernel));
  if (!kernel)
  {
    isl_schedule_free(schedule);
    return NULL;
  }

  kernel->ctx = isl_schedule_get_ctx(schedule);
  kernel->schedule = schedule;
  kernel->scop = NULL;
  kernel->prog = NULL;
  kernel->options = NULL;
  kernel->n_sa_dim = 0;
  kernel->array_part_w = 0;
  kernel->space_w = 0;
  kernel->time_w = 0;
  kernel->type = 0;
  kernel->sa_grid_size = NULL;
  kernel->sizes = NULL;
  kernel->used_sizes = NULL;
  kernel->id = 0;
  kernel->core = NULL;
  kernel->arrays = NULL;
  kernel->n_array = 0;
  kernel->array = NULL;
  kernel->copy_schedule = NULL;
  kernel->copy_schedule_dim = -1;
  kernel->space = NULL;
  kernel->tree = NULL;
  kernel->n_var = 0;
  kernel->var = NULL;
  kernel->block_ids = NULL;
  kernel->thread_ids = NULL;
  kernel->pe_ids = NULL;
  kernel->pe_filter = NULL;
  kernel->n_grid = 0;
  kernel->n_block = 0;
  kernel->grid_size = NULL;
  kernel->grid_size_expr = NULL;
  kernel->context = NULL;
  kernel->contraction = NULL;
  kernel->expanded_domain = NULL;
  kernel->host_domain = NULL;
  kernel->domain = NULL;
  kernel->single_statement = 0;
  kernel->sparse = 0;
  kernel->vec_len = 0;
  kernel->n_nzero = 0;
  kernel->compress_ratio = 0;
  kernel->n_meta_data = 0;
  kernel->eff_compress_ratio = 0;
  kernel->tuning_program = NULL;

  return kernel;
}

/* Allocate an AutoSA kernel struct for "scop" in the isl context "ctx".
 *
 * The struct is allocated zero-initialized; "ctx" and "scop" are recorded,
 * "copy_schedule_dim" is set to -1 and every other field is explicitly
 * reset to 0/NULL.
 *
 * Return NULL if "scop" is NULL or if the allocation fails.
 */
struct autosa_kernel *autosa_kernel_alloc(isl_ctx *ctx, struct ppcg_scop *scop)
{
  struct autosa_kernel *kernel;

  if (scop == NULL)
    return NULL;

  kernel = isl_calloc_type(ctx, struct autosa_kernel);
  if (kernel == NULL)
    return NULL;

  /* Context of the kernel. */
  kernel->ctx = ctx;
  kernel->scop = scop;
  kernel->prog = NULL;
  kernel->options = NULL;

  /* Systolic array shape and band widths. */
  kernel->n_sa_dim = 0;
  kernel->array_part_w = 0;
  kernel->space_w = 0;
  kernel->time_w = 0;
  kernel->type = 0;
  kernel->sa_grid_size = NULL;

  /* Tile sizes and identification. */
  kernel->sizes = NULL;
  kernel->used_sizes = NULL;
  kernel->id = 0;

  /* Statement instances and arrays. */
  kernel->core = NULL;
  kernel->arrays = NULL;
  kernel->n_array = 0;
  kernel->array = NULL;

  /* Copy schedule; -1 marks the dimension as not yet determined. */
  kernel->copy_schedule = NULL;
  kernel->copy_schedule_dim = -1;

  kernel->space = NULL;
  kernel->tree = NULL;

  /* Kernel variables. */
  kernel->n_var = 0;
  kernel->var = NULL;

  /* Identifiers and grid description. */
  kernel->block_ids = NULL;
  kernel->thread_ids = NULL;
  kernel->pe_ids = NULL;
  kernel->pe_filter = NULL;
  kernel->n_grid = 0;
  kernel->n_block = 0;
  kernel->grid_size = NULL;
  kernel->grid_size_expr = NULL;

  /* Domains. */
  kernel->context = NULL;
  kernel->contraction = NULL;
  kernel->expanded_domain = NULL;
  kernel->host_domain = NULL;
  kernel->domain = NULL;
  kernel->single_statement = 0;

  /* Sparsity information. */
  kernel->sparse = 0;
  kernel->vec_len = 0;
  kernel->n_nzero = 0;
  kernel->compress_ratio = 0;
  kernel->n_meta_data = 0;
  kernel->eff_compress_ratio = 0;

  kernel->tuning_program = NULL;

  return kernel;
}

/****************************************************************
 * AutoSA access
 ****************************************************************/
/* Construct the identity map from "domain_space" to "domain_space".
 * "domain_space" is consumed.
 */
static __isl_give isl_map *same(__isl_take isl_space *domain_space)
{
  isl_space *map_space = isl_space_map_from_set(domain_space);

  return isl_map_from_multi_aff(isl_multi_aff_identity(map_space));
}

/* Construct a map from "domain_space" to "domain_space" that adds one
 * to the dimension at position "pos" and keeps every other dimension
 * unchanged.
 */
static __isl_give isl_map *next(__isl_take isl_space *domain_space, int pos)
{
  isl_multi_aff *shift;
  isl_aff *coord;

  /* Start from the identity and bump the selected coordinate. */
  shift = isl_multi_aff_identity(isl_space_map_from_set(domain_space));
  coord = isl_aff_add_constant_si(isl_multi_aff_get_aff(shift, pos), 1);
  shift = isl_multi_aff_set_aff(shift, pos, coord);

  return isl_map_from_multi_aff(shift);
}

/* Check if "access" is a stride-0 access, i.e., if advancing the last
 * input (schedule) dimension by one keeps accessing the same element.
 * The access is already transformed to scheduling domains.
 *
 * We first create an identity mapping "same_element" on the array space
 * (or an empty map if the array is zero-dimensional).
 * If no accessed element is related to an accessed element by
 * "same_element", the result of that emptiness test is returned directly.
 * Otherwise, we create a mapping "pairs" from the array elements accessed
 * by the current iteration to the elements accessed by the next iteration
 * and test whether "pairs" is a subset of "same_element".
 *
 * Note that "pos" is not used by this function.
 */
isl_bool access_is_stride_zero(__isl_keep isl_map *access, int pos)
{
  isl_space *range_space, *domain_space;
  isl_map *same_element, *pairs, *step;
  isl_set *touched;
  isl_bool empty, zero;
  int n_index, last;

  range_space = isl_space_range(isl_map_get_space(access));
  n_index = isl_space_dim(range_space, isl_dim_set);
  if (n_index != 0)
    same_element = same(range_space);
  else
    same_element = isl_map_empty(isl_space_map_from_set(range_space));

  /* Restrict the identity to the elements that are actually accessed. */
  touched = isl_map_range(isl_map_copy(access));
  pairs = isl_map_intersect_domain(isl_map_copy(same_element),
                                   isl_set_copy(touched));
  pairs = isl_map_intersect_range(pairs, touched);
  empty = isl_map_is_empty(pairs);
  isl_map_free(pairs);

  if (empty != isl_bool_false)
  {
    /* Either an error occurred or the restricted map is empty. */
    isl_map_free(same_element);
    return empty;
  }

  /* Relate the elements of one iteration to those of the next one. */
  domain_space = isl_space_domain(isl_map_get_space(access));
  last = isl_map_dim(access, isl_dim_in) - 1;
  step = next(domain_space, last);
  pairs = isl_map_apply_domain(step, isl_map_copy(access));
  pairs = isl_map_apply_range(pairs, isl_map_copy(access));
  zero = isl_map_is_subset(pairs, same_element);

  isl_map_free(pairs);
  isl_map_free(same_element);

  return zero;
}

/* Check if "access" is a stride-1 access along array dimension "pos".
 * The access is already transformed to scheduling domains.
 *
 * We first create a mapping "next_element" that maps each array element
 * to the element one position further along dimension "pos"
 * (or an empty map if the array is zero-dimensional).
 * If no accessed element is related to an accessed element by
 * "next_element", the result of that emptiness test is returned directly.
 * Otherwise, we create a mapping "map" from the array elements accessed
 * by the current iteration to the elements accessed by the next iteration
 * of the last input (schedule) dimension.
 * If "map" is empty, the access is not considered stride-1.
 * Otherwise, the access is stride-1 if "map" is a subset of "next_element".
 *
 * Return isl_bool_error if any of the isl tests fails.
 */
isl_bool access_is_stride_one(__isl_keep isl_map *access, int pos)
{
  isl_space *space;
  int dim;
  isl_map *next_element, *map, *next_iter;
  isl_set *accessed;
  isl_bool empty, coalesced;

  space = isl_map_get_space(access);
  space = isl_space_range(space);
  dim = isl_space_dim(space, isl_dim_set);
  if (dim == 0)
    next_element = isl_map_empty(isl_space_map_from_set(space));
  else
    next_element = next(space, pos);

  accessed = isl_map_range(isl_map_copy(access));
  map = isl_map_copy(next_element);
  map = isl_map_intersect_domain(map, isl_set_copy(accessed));
  map = isl_map_intersect_range(map, accessed);
  empty = isl_map_is_empty(map);
  isl_map_free(map);

  if (empty < 0 || empty)
  {
    isl_map_free(next_element);
    return empty;
  }

  space = isl_map_get_space(access);
  space = isl_space_domain(space);
  next_iter = next(space, isl_map_dim(access, isl_dim_in) - 1);
  map = isl_map_apply_domain(next_iter, isl_map_copy(access));
  map = isl_map_apply_range(map, isl_map_copy(access));

  /* isl_map_is_empty may return isl_bool_error (-1), which must not be
   * mistaken for "empty"; report the error to the caller instead.
   */
  empty = isl_map_is_empty(map);
  if (empty < 0 || empty)
  {
    isl_map_free(next_element);
    isl_map_free(map);
    return empty < 0 ? isl_bool_error : isl_bool_false;
  }
  coalesced = isl_map_is_subset(map, next_element);

  isl_map_free(next_element);
  isl_map_free(map);

  return coalesced;
}

/* Release "acc" together with the isl objects it holds.
 * A NULL "acc" is accepted.  Always return NULL.
 */
void *autosa_acc_free(struct autosa_acc *acc)
{
  if (acc)
  {
    isl_map_free(acc->tagged_map);
    isl_map_free(acc->map);
    isl_space_free(acc->id);
    free(acc);
  }

  return NULL;
}

/* Allocate a new autosa_io_buffer with malloc and set its fields
 * to "unset" markers (NULL for pointers, -1 for integers).
 * Return NULL if the allocation fails.
 */
struct autosa_io_buffer *autosa_io_buffer_alloc()
{
  struct autosa_io_buffer *io_buffer;

  io_buffer = (struct autosa_io_buffer *)malloc(sizeof(struct autosa_io_buffer));
  /* Do not write through a failed allocation. */
  if (!io_buffer)
    return NULL;

  io_buffer->tile = NULL;
  io_buffer->level = -1;
  io_buffer->n_lane = -1;
  io_buffer->serialize = -1;
  io_buffer->sparse = -1;
  io_buffer->vec_len = -1;
  io_buffer->tuning_tile = NULL;
  io_buffer->hoist_depth = -1;
  io_buffer->hoist_domain = NULL;

  return io_buffer;
}

/****************************************************************
 * AutoSA dep
 ****************************************************************/
/* Release the dependence "dep" together with the isl objects it holds.
 * A NULL "dep" is accepted.  Always return NULL.
 *
 * The isl free functions accept NULL arguments, so the individual
 * fields do not need to be tested before they are released.
 */
void *autosa_dep_free(__isl_take struct autosa_dep *dep)
{
  if (dep)
  {
    isl_id_free(dep->src);
    isl_id_free(dep->dest);
    isl_vec_free(dep->disvec);
    isl_set_free(dep->src_sched_domain);
    isl_set_free(dep->dest_sched_domain);
    isl_basic_map_free(dep->isl_dep);
    free(dep);
  }

  return NULL;
}

/****************************************************************
 * AutoSA iterator
 ****************************************************************/

/* Release "iter" together with its name strings and bound expressions.
 * A NULL "iter" is accepted.  Always return NULL.
 */
__isl_null struct autosa_iter *autosa_iter_free(struct autosa_iter *iter)
{
  if (iter)
  {
    free(iter->name);
    free(iter->ts_name);
    isl_aff_free(iter->lb);
    isl_aff_free(iter->ub);
    free(iter);
  }

  return NULL;
}

/****************************************************************
 * AutoSA array
 ****************************************************************/

static void free_array_info(struct autosa_prog *prog)
{
  int i;

  for (i = 0; i < prog->n_array; ++i)
  {
    free(prog->array[i].type);
    free(prog->array[i].name);
    isl_multi_pw_aff_free(prog->array[i].bound);
    isl_ast_expr_free(prog->array[i].bound_expr);
    isl_space_free(prog->array[i].space);
    isl_set_free(prog->array[i].declared_extent);
    isl_set_free(prog->array[i].extent);
    isl_ast_expr_free(prog->array[i].declared_size);
    free(prog->array[i].refs);
    isl_union_map_free(prog->array[i].dep_order);
  }
  //free(prog->array);
  delete[] prog->array;
}

/* Is the array "array" being extracted a read-only scalar?
 *
 * That is, is "array" a scalar that is never possibly written to.
 * An array containing structures is never considered to be a scalar.
 */
static int is_read_only_scalar(struct autosa_array_info *array,
                               struct autosa_prog *prog)
{
  isl_set *space;
  isl_union_map *write;
  int empty;

  if (array->has_compound_element)
    return 0;
  if (array->n_index != 0)
    return 0;

  write = isl_union_map_copy(prog->may_write);
  space = isl_set_universe(isl_space_copy(array->space));
  write = isl_union_map_intersect_range(write,
                                        isl_union_set_from_set(space));
  empty = isl_union_map_is_empty(write);
  isl_union_map_free(write);

  return empty;
}

/* Compute and return the extent of "array", taking into account the set of
 * accessed elements "accessed".
 *
 * The extent in the outer dimension is taken from "accessed",
 * while the extents in the remaining dimensions are taken
 * from array->extent.  The result carries the tuple identifier
 * of "accessed".
 *
 * The outer dimension is not taken from array->extent because that may be
 * unbounded and, even if it is bounded, it may be larger than the piece
 * of the array that is being accessed.
 *
 * If "accessed" is zero-dimensional, a copy of array->extent is returned.
 */
static __isl_give isl_set *compute_extent(struct pet_array *array,
                                          __isl_keep isl_set *accessed)
{
  isl_set *result, *first_dim;
  int n_dims;

  n_dims = isl_set_dim(accessed, isl_dim_set);
  result = isl_set_copy(array->extent);
  if (n_dims == 0)
    return result;

  /* Outer dimension of the accessed elements. */
  first_dim = isl_set_project_out(isl_set_copy(accessed),
                                  isl_dim_set, 1, n_dims - 1);
  /* Inner dimensions of the declared extent. */
  result = isl_set_project_out(result, isl_dim_set, 0, 1);
  result = isl_set_flat_product(first_dim, result);

  return isl_set_set_tuple_id(result, isl_set_get_tuple_id(accessed));
}

/* Return the name of the outer array (of structs) accessed by "access".
 *
 * The range space of "access" is repeatedly unwrapped, each time keeping
 * the domain part, until it is no longer a wrapping space.
 * The tuple name of the resulting space is returned.
 */
static const char *get_outer_array_name(__isl_keep isl_map *access)
{
  isl_space *sp;
  const char *outer_name;

  for (sp = isl_space_range(isl_map_get_space(access));
       sp && isl_space_is_wrapping(sp);
       sp = isl_space_domain(isl_space_unwrap(sp)))
    ;

  outer_name = isl_space_get_tuple_name(sp, isl_dim_set);
  isl_space_free(sp);

  return outer_name;
}

/* Collect all references to the given array and store pointers to them
 * in array->refs, with their number in array->n_ref.
 *
 * A reference belongs to "array" if the name of the outer array it
 * accesses is equal to array->name.
 * The statements are traversed twice: first to count the matching
 * references and then to store them.
 *
 * Return isl_stat_error if the array of references cannot be allocated.
 */
static isl_stat collect_references(struct autosa_prog *prog,
                                   struct autosa_array_info *array)
{
  struct autosa_stmt_access *acc;
  int count = 0;
  int s;

  /* First pass: count the matching references. */
  for (s = 0; s < prog->n_stmts; ++s)
    for (acc = prog->stmts[s].accesses; acc; acc = acc->next)
    {
      const char *outer = get_outer_array_name(acc->access);

      if (outer && strcmp(array->name, outer) == 0)
        ++count;
    }

  array->refs = isl_alloc_array(prog->ctx, struct autosa_stmt_access *, count);
  if (!array->refs)
    return isl_stat_error;
  array->n_ref = count;

  /* Second pass: store the matching references in order. */
  count = 0;
  for (s = 0; s < prog->n_stmts; ++s)
    for (acc = prog->stmts[s].accesses; acc; acc = acc->next)
    {
      const char *outer = get_outer_array_name(acc->access);

      if (outer && strcmp(array->name, outer) == 0)
        array->refs[count++] = acc;
    }

  return isl_stat_ok;
}

/* Is "array" only accessed as individual, fixed elements?
 * That is, is the fixed_element field set for every reference
 * in array->refs?
 */
static isl_bool only_fixed_element_accessed(struct autosa_array_info *array)
{
  int idx = 0;

  /* Stop at the first reference that is not to a fixed element. */
  while (idx < array->n_ref && array->refs[idx]->fixed_element)
    ++idx;

  return idx < array->n_ref ? isl_bool_false : isl_bool_true;
}

/* Compute bounds on the host array "pa" based on the corresponding
 * accessed elements in "arrays"
 * and collect all references to the array.
 * Store the results in "info".
 *
 * If the array is zero-dimensional and does not contain structures,
 * i.e., if the array is a scalar, we check whether it is read-only.
 * We also check whether the array is accessed at all.
 *
 * All fields that are only computed later (or elsewhere) are reset
 * before anything else is done.  "info" may point into storage that
 * was not zero-initialized and free_array_info releases these fields
 * even if this function returns early with isl_stat_error.
 */
static isl_stat extract_array_info(struct autosa_prog *prog,
                                   struct autosa_array_info *info, struct pet_array *pa,
                                   __isl_keep isl_union_set *arrays)
{
  int empty;
  const char *name;
  int n_index;
  isl_multi_pw_aff *bounds;
  isl_set *accessed, *extent;

  /* Defaults for everything an early return would otherwise leave unset. */
  info->bound = NULL;
  info->bound_expr = NULL;
  info->declared_size = NULL;
  info->refs = NULL;
  info->n_ref = 0;
  info->dep_order = NULL;
  info->declare_local = 0;
  info->global = 0;

  /* AutoSA Extended */
  info->n_lane = 0;
  info->local_array = NULL;
  info->copy_in = 0;
  info->copy_out = 0;
  /* AutoSA Extended */

  n_index = isl_set_dim(pa->extent, isl_dim_set);
  name = isl_set_get_tuple_name(pa->extent);

  info->space = isl_set_get_space(pa->extent);
  info->name = strdup(name);
  info->n_index = n_index;
  info->linearize = prog->scop->options->linearize_device_arrays;

  info->type = strdup(pa->element_type);
  info->size = pa->element_size;
  info->local = pa->declared && !pa->exposed;
  info->has_compound_element = pa->element_is_record;
  info->read_only_scalar = is_read_only_scalar(info, prog);

  info->declared_extent = isl_set_copy(pa->extent);
  accessed = isl_union_set_extract_set(arrays,
                                       isl_space_copy(info->space));
  empty = isl_set_is_empty(accessed);
  extent = compute_extent(pa, accessed);
  isl_set_free(accessed);
  info->extent = extent;
  if (empty < 0)
    return isl_stat_error;
  info->accessed = !empty;
  bounds = ppcg_size_from_extent(isl_set_copy(extent));
  bounds = isl_multi_pw_aff_gist(bounds, isl_set_copy(prog->context));
  if (!bounds)
    return isl_stat_error;
  /* info->linearize keeps the value of the linearize_device_arrays option
   * regardless of whether the bounds are constant; the previous
   * re-assignment of that same value for non-constant bounds was a no-op.
   */
  info->bound = bounds;

  if (collect_references(prog, info) < 0)
    return isl_stat_error;
  info->only_fixed_element = only_fixed_element_accessed(info);

  return isl_stat_ok;
}

/* Remove independences from the order constraints "order" on array "array".
 * Since the pairs of iterations in the filter relation of an independence
 * are guaranteed to be completely independent by the user, there is
 * no need to ensure that live ranges are ordered along those pairs.
 * We make an exception for local variables, though, as the independence
 * guarantee does not apply to those: an independence whose "local" set
 * contains the space of "array" is skipped.
 *
 * The order constraints are used in two places.
 * Those on scalars are used in check_scalar_live_ranges to check if
 * we need to force the scalar to be private.  Any non-local scalar
 * should not be forced scalar if it only appears in independent loops.
 * Those on non-scalars are added to the coincidence constraints
 * in compute_schedule because we do not support any array expansion.
 * Accesses to non-local arrays should not prevent a loop from being
 * considered coincident so we should indeed remove those constraints
 * from the order constraints.
 */
static __isl_give isl_union_map *remove_independences(struct autosa_prog *prog,
                                                      struct autosa_array_info *array, __isl_take isl_union_map *order)
{
  const int n_indep = prog->scop->pet->n_independence;
  int k;

  for (k = 0; k < n_indep; ++k)
  {
    struct pet_independence *indep = prog->scop->pet->independences[k];

    if (!isl_union_set_contains(indep->local, array->space))
      order = isl_union_map_subtract(order,
                                     isl_union_map_copy(indep->filter));
  }

  return order;
}

/* Can "array" be mapped to private memory?
 * That is, is its only_fixed_element field set, meaning that it is
 * only accessed as individual elements with constant index expressions?
 * Return isl_bool_error if "array" is NULL.
 */
static isl_bool autosa_array_can_be_private(struct autosa_array_info *array)
{
  if (array == NULL)
    return isl_bool_error;
  if (array->only_fixed_element)
    return isl_bool_true;
  return isl_bool_false;
}

/* For each array in "prog", store the (untagged) order dependences
 * derived from the array in array->dep_order.
 * In particular, consider all references that access the given array
 * and take the order dependences that have one of these references
 * as source.  (Since an order dependence relates two references to
 * the same array, the target of these order dependences will also
 * be one of these references.)
 * Additionally, store the union of these array->dep_order relations
 * for all arrays that cannot be mapped to private memory in prog->array_order.
 */
static void collect_order_dependences(struct autosa_prog *prog)
{
  int i;
  isl_space *space;
  isl_union_map *accesses;

  space = isl_union_map_get_space(prog->read);
  prog->array_order = isl_union_map_empty(space);

  accesses = isl_union_map_copy(prog->scop->tagged_reads);
  accesses = isl_union_map_union(accesses,
                                 isl_union_map_copy(prog->scop->tagged_may_writes));
  accesses = isl_union_map_universe(accesses);
  accesses = isl_union_map_apply_range(accesses,
                                       isl_union_map_copy(prog->to_outer));

  for (i = 0; i < prog->n_array; ++i)
  {
    struct autosa_array_info *array = &prog->array[i];
    isl_set *set;
    isl_union_set *uset;
    isl_union_map *order;

    set = isl_set_universe(isl_space_copy(array->space));
    uset = isl_union_set_from_set(set);
    uset = isl_union_map_domain(
        isl_union_map_intersect_range(isl_union_map_copy(accesses),
                                      uset));
    order = isl_union_map_copy(prog->scop->tagged_dep_order);
    order = isl_union_map_intersect_domain(order, uset);
    order = isl_union_map_zip(order);
    order = isl_union_set_unwrap(isl_union_map_domain(order));
    order = remove_independences(prog, array, order);
    array->dep_order = order;

    if (autosa_array_can_be_private(array))
      continue;

    prog->array_order = isl_union_map_union(prog->array_order,
                                            isl_union_map_copy(array->dep_order));
  }

  isl_union_map_free(accesses);
}

/* Construct an autosa_array_info for each array referenced by prog->scop and
 * collect them in prog->array.
 *
 * The sizes are based on the extents and the set of possibly accessed
 * elements by "prog".
 * If there are any member accesses involved, then they are first mapped
 * to the outer arrays of structs.
 * Only extract autosa_array_info entries for these outer arrays.
 *
 * If we are allowing live range reordering, then also set
 * the dep_order field.  Otherwise leave it NULL.
 *
 * Return isl_stat_error if any of the arrays could not be extracted.
 */
isl_stat collect_array_info(struct autosa_prog *prog)
{
  int i;
  isl_stat r = isl_stat_ok;
  isl_union_set *arrays;

  prog->n_array = 0;
  /* Value-initialize the entries (note the trailing "()") such that
   * free_array_info never sees indeterminate pointers in an entry
   * that was only partially filled in because of an error.
   */
  prog->array = new autosa_array_info[prog->scop->pet->n_array]();
  if (!prog->array)
    return isl_stat_error;

  /* All possibly accessed elements, mapped to the outer arrays. */
  arrays = isl_union_map_range(isl_union_map_copy(prog->read));
  arrays = isl_union_set_union(arrays,
                               isl_union_map_range(isl_union_map_copy(prog->may_write)));

  arrays = isl_union_set_apply(arrays,
                               isl_union_map_copy(prog->to_outer));

  arrays = isl_union_set_coalesce(arrays);

  for (i = 0; i < prog->scop->pet->n_array; ++i)
  {
    isl_bool field;

    /* Arrays with a wrapping extent are fields of structs; skip them. */
    field = isl_set_is_wrapping(prog->scop->pet->arrays[i]->extent);
    if (field < 0)
      break;
    if (field)
      continue;
    if (extract_array_info(prog, &prog->array[prog->n_array++],
                           prog->scop->pet->arrays[i], arrays) < 0)
      r = isl_stat_error;
  }
  /* The loop was left early because of an error. */
  if (i < prog->scop->pet->n_array)
    r = isl_stat_error;

  isl_union_set_free(arrays);

  if (prog->scop->options->live_range_reordering)
    collect_order_dependences(prog);

  return r;
}

/* Is "array" a read-only scalar?
 * This simply reports the read_only_scalar field of "array".
 */
int autosa_array_is_read_only_scalar(struct autosa_array_info *array)
{
  const int read_only = array->read_only_scalar;

  return read_only;
}

/* Check if an autosa array is a scalar.  A scalar is a value that is not
 * stored as an array or through a pointer reference, but as a single data
 * element.
 * At the moment, scalars are represented as zero-dimensional arrays,
 * so the test is whether array->n_index is zero.
 * Note that the single data element may be an entire structure.
 */
int autosa_array_is_scalar(struct autosa_array_info *array)
{
  if (array->n_index != 0)
    return 0;
  return 1;
}

/* Does "kernel" need to be passed an argument corresponding to array "i"?
 *
 * The argument is only needed if the kernel accesses this device memory,
 * which is recorded in the "global" field of kernel->array[i].
 */
int autosa_kernel_requires_array_argument(struct autosa_kernel *kernel, int i)
{
  const int is_global = kernel->array[i].global;

  return is_global;
}

/* If group->n_ref == 1, then group->refs was set by
 * populate_array_references to point directly into
 * group->array->refs and should not be freed.
 * If group->n_ref > 1, then group->refs was set by join_groups
 * to point to a newly allocated array.
 */
struct autosa_array_ref_group *autosa_array_ref_group_free(
    struct autosa_array_ref_group *group)
{
  if (!group)
    return NULL;
  autosa_array_tile_free(group->local_tile); // TODO: fix it
  autosa_array_tile_free(group->pe_tile);
  isl_map_free(group->access);
  if (group->n_ref > 1)
    free(group->refs);
  isl_vec_free(group->dir);
  isl_vec_free(group->old_dir);
  isl_multi_aff_free(group->io_trans);
  isl_multi_aff_free(group->io_L1_trans);
  isl_ast_expr_free(group->io_pe_expr);
  isl_ast_expr_free(group->io_L1_pe_expr);
  isl_ast_expr_free(group->io_pe_expr_boundary);
  isl_ast_expr_free(group->io_L1_pe_expr_boundary);
  /* Free io buffers */
  for (int i = 0; i < group->n_io_buffer; i++)
  {        
    autosa_array_tile_free(group->io_buffers[i]->tile);
    isl_union_set_free(group->io_buffers[i]->hoist_domain);
    if (group->io_buffers[i]->tuning_tile) {      
      delete group->io_buffers[i]->tuning_tile;      
    }
    free(group->io_buffers[i]);
  }
  free(group->io_buffers);
  isl_schedule_free(group->io_schedule);
  if (group->io_L1_schedule)
    isl_schedule_free(group->io_L1_schedule);
  isl_schedule_free(group->io_L1_lower_schedule);
  isl_union_pw_multi_aff_free(group->copy_schedule);
  if (group->attached_drain_group)
    autosa_array_ref_group_free(group->attached_drain_group);
  group->tuning_refs.clear();
  delete group->tuning_pe_tile;
  delete group->tuning_local_tile;
  //free(group);
  delete group;

  return NULL;
}

/* Reset the fields of "group" to their default values and return "group".
 *
 * Pointer fields are set to NULL, counters to zero and the fields
 * nr, write, exact_write, slice and min_depth to -1.
 * In particular, every tile pointer that autosa_array_ref_group_free
 * releases (local_tile and pe_tile) is set to NULL, such that
 * a freshly initialized group can safely be freed.
 */
struct autosa_array_ref_group *autosa_array_ref_group_init(
    struct autosa_array_ref_group *group)
{
  group->local_array = NULL;
  group->array = NULL;
  group->nr = -1;
  group->access = NULL;
  group->write = -1;
  group->exact_write = -1;
  group->slice = -1;
  group->min_depth = -1;
  group->shared_tile = NULL;
  group->private_tile = NULL;
  group->local_tile = NULL;
  /* pe_tile is released by autosa_array_ref_group_free and therefore
   * needs a well-defined initial value as well.
   */
  group->pe_tile = NULL;
  group->n_ref = 0;
  group->refs = NULL;
  group->io_buffers = NULL;
  group->n_io_buffer = 0;
  group->io_type = AUTOSA_UNKNOWN_IO;
  group->pe_io_dir = IO_UNKNOWN;
  group->array_io_dir = IO_UNKNOWN;
  group->dir = NULL;
  group->old_dir = NULL;
  group->io_trans = NULL;
  group->io_L1_trans = NULL;
  group->io_pe_expr = NULL;
  group->io_L1_pe_expr = NULL;
  group->io_pe_expr_boundary = NULL;
  group->io_L1_pe_expr_boundary = NULL;
  group->io_schedule = NULL;
  group->io_L1_schedule = NULL;
  group->io_L1_lower_schedule = NULL;
  group->io_level = 0;
  group->space_dim = 0;
  group->n_lane = 0;
  group->copy_schedule_dim = 0;
  group->copy_schedule = NULL;
  group->attached_drain_group = NULL;
  group->tuning_pe_tile = NULL;
  group->tuning_local_tile = NULL;

  return group;
}

struct autosa_array_tile *autosa_array_tile_free(struct autosa_array_tile *tile)
{
  int j;

  if (!tile)
    return NULL;

  for (j = 0; j < tile->n; ++j)
  {
    isl_val_free(tile->bound[j].size);
    isl_val_free(tile->bound[j].stride);
    isl_aff_free(tile->bound[j].lb);
    isl_aff_free(tile->bound[j].shift);
  }
  free(tile->bound);
  isl_multi_aff_free(tile->tiling);
  free(tile);

  return NULL;
}

/* Create an autosa_array_tile for an array of dimension "n_index".
 * The size, lower bound, stride and shift of each dimension
 * are initialized to NULL.
 * Return NULL if one of the allocations fails.
 */
struct autosa_array_tile *autosa_array_tile_create(isl_ctx *ctx, int n_index)
{
  struct autosa_array_tile *result;
  int d;

  result = isl_calloc_type(ctx, struct autosa_array_tile);
  if (!result)
    return NULL;

  result->ctx = ctx;
  result->bound = isl_alloc_array(ctx, struct autosa_array_bound, n_index);
  if (!result->bound)
    return autosa_array_tile_free(result);

  /* Only record the number of bounds once they have been allocated. */
  result->n = n_index;

  for (d = 0; d < n_index; ++d)
  {
    struct autosa_array_bound *b = &result->bound[d];

    b->size = NULL;
    b->lb = NULL;
    b->stride = NULL;
    b->shift = NULL;
  }

  return result;
}

/* Compute the size of the tile specified by "tile"
 * in number of elements and return the result,
 * i.e., the product of the sizes in all dimensions.
 * Return NULL if "tile" is NULL.
 */
__isl_give isl_val *autosa_array_tile_size(struct autosa_array_tile *tile)
{
  isl_val *total;
  int d;

  if (!tile)
    return NULL;

  total = isl_val_one(tile->ctx);
  for (d = 0; d < tile->n; ++d)
  {
    isl_val *extent = isl_val_copy(tile->bound[d].size);

    total = isl_val_mul(total, extent);
  }

  return total;
}

/****************************************************************
 * AutoSA statement
 ****************************************************************/
/* Release "io_info" together with its dependence and direction vectors.
 * "io_info" is dereferenced unconditionally.  Always return NULL.
 */
static void *free_autosa_io_info(struct autosa_io_info *io_info)
{
  struct autosa_io_info *info = io_info;

  autosa_dep_free(info->dep);
  isl_vec_free(info->dir);
  isl_vec_free(info->old_dir);
  free(info);

  return NULL;
}

/* Release the first "n" statements of "stmts", including the linked list
 * of accesses of each statement, and then the array "stmts" itself.
 * A NULL "stmts" is accepted.  Always return NULL.
 */
static void *free_stmts(struct autosa_stmt *stmts, int n)
{
  if (stmts == NULL)
    return NULL;

  for (int s = 0; s < n; ++s)
  {
    struct autosa_stmt_access *cur = stmts[s].accesses;

    while (cur)
    {
      /* Remember the successor before the node is released. */
      struct autosa_stmt_access *following = cur->next;
      int k;

      isl_id_free(cur->ref_id);
      isl_map_free(cur->access);
      isl_map_free(cur->tagged_access);

      for (k = 0; k < cur->n_io_info; k++)
        free_autosa_io_info(cur->io_info[k]);
      free(cur->io_info);

      free(cur);
      cur = following;
    }

    isl_id_free(stmts[s].id);
  }
  free(stmts);

  return NULL;
}

/* Has statement "stmt" been killed from "scop"?
 * That is, is the instance set of "scop" free from any
 * instances of "stmt"?
 * Return isl_bool_error if "scop" or "stmt" is NULL.
 */
static isl_bool is_stmt_killed(struct ppcg_scop *scop, struct pet_stmt *stmt)
{
  isl_set *remaining;
  isl_bool none_left;

  if (scop == NULL || stmt == NULL)
    return isl_bool_error;

  /* Instances of "stmt" that are still part of the scop domain. */
  remaining = isl_union_set_extract_set(scop->domain,
                                        isl_set_get_space(stmt->domain));
  none_left = isl_set_plain_is_empty(remaining);
  isl_set_free(remaining);

  return none_left;
}

/* Given a tagged access relation to a single array "tagged", extract it
 * as a map, taking into account that the input may be empty.
 * If the access relation is empty, then it does not contain
 * any space information, so we try to recover it from the index
 * expression.
 * The space of the index expression is of the form I -> A,
 * with I the statement instances and A the array, or [I -> F] -> A,
 * with F the filters corresponding to arguments.
 * We first drop F, if present, obtaining I -> A.
 * Then we construct I -> R, with R the reference tag,
 * combine the two into I -> [R -> A] and uncurry to obtain
 * the final result [I -> R] -> A.
 * Note that the index expression may have a lower dimension
 * than that of the array, but this dimension is not used
 * if the access relation is empty.
 *
 * Return NULL if the emptiness of "tagged" cannot be determined.
 */
static __isl_give isl_map *extract_single_tagged_access(
    __isl_take isl_union_map *tagged, __isl_keep pet_expr *expr)
{
  int empty;
  isl_id *id;
  isl_space *space, *space2;
  isl_multi_pw_aff *index;

  empty = isl_union_map_is_empty(tagged);
  if (empty < 0)
    goto error;
  if (!empty)
    return isl_map_from_union_map(tagged);
  isl_union_map_free(tagged);

  index = pet_expr_access_get_index(expr);
  space = isl_multi_pw_aff_get_space(index);
  isl_multi_pw_aff_free(index);
  if (isl_space_domain_is_wrapping(space))
    space = isl_space_domain_factor_domain(space);
  /* Derive I -> R from a copy of I -> A; "space" itself is still needed
   * below, so it is the copy "space2" that has to be consumed here.
   */
  space2 = isl_space_copy(space);
  space2 = isl_space_from_domain(isl_space_domain(space2));
  id = pet_expr_access_get_ref_id(expr);
  space2 = isl_space_set_tuple_id(space2, isl_dim_out, id);
  space = isl_space_range_product(space2, space);
  space = isl_space_uncurry(space);

  return isl_map_empty(space);
error:
  isl_union_map_free(tagged);
  return NULL;
}

/* Does the index expression "index" of "expr" represent an access
 * to a single element?
 * That is, is "index" completely specified?
 *
 * If "expr" accesses elements from different spaces (i.e., fields
 * of a structure), then it does not access a single element.
 * Otherwise, if the single space of the access matches the space
 * of "index", then the index expression is completely specified
 * (no pointer to a lower-dimensional slice of the accessed array)
 * and a single element is being accessed.
 */
static isl_bool complete_index(__isl_keep pet_expr *expr,
                               __isl_keep isl_multi_pw_aff *index)
{
  isl_union_map *accesses;
  isl_map *single;
  isl_space *access_space, *index_space;
  isl_bool same_range;

  /* All elements that may be read or written by "expr". */
  accesses = isl_union_map_union(pet_expr_access_get_may_read(expr),
                                 pet_expr_access_get_may_write(expr));
  if (!accesses)
    return isl_bool_error;
  if (isl_union_map_n_map(accesses) != 1)
  {
    isl_union_map_free(accesses);
    return isl_bool_false;
  }

  single = isl_map_from_union_map(accesses);
  access_space = isl_map_get_space(single);
  isl_map_free(single);

  /* Compare the accessed array space with that of the index expression. */
  index_space = isl_multi_pw_aff_get_space(index);
  same_range = isl_space_tuple_is_equal(access_space, isl_dim_out,
                                        index_space, isl_dim_out);
  isl_space_free(index_space);
  isl_space_free(access_space);

  return same_range;
}

/* Does "expr" access a single, fixed element (independently of the statement
 * instance)?
 * That is, does it have a completely specified constant index expression?
 *
 * Every dimension of the index expression needs to consist of
 * a single piece that is constant.
 * Note that it is not sufficient for the index expression to be
 * piecewise constant.  isl_multi_pw_aff_is_cst can therefore not be used.
 *
 * Return isl_bool_error if the index expression cannot be obtained
 * or inspected.
 */
static isl_bool accesses_fixed_element(__isl_keep pet_expr *expr)
{
  int i, n;
  isl_multi_pw_aff *index;
  isl_bool fixed = isl_bool_true;

  index = pet_expr_access_get_index(expr);
  if (!index)
    return isl_bool_error;
  n = isl_multi_pw_aff_dim(index, isl_dim_out);
  if (n < 0)
    fixed = isl_bool_error;
  for (i = 0; i < n; ++i)
  {
    isl_pw_aff *pa;

    /* Inspect dimension "i" (not just the first dimension). */
    pa = isl_multi_pw_aff_get_pw_aff(index, i);
    fixed = (isl_pw_aff_n_piece(pa) == 1) ? isl_bool_true : isl_bool_false;
    if (fixed)
      fixed = isl_pw_aff_is_cst(pa);
    isl_pw_aff_free(pa);
    if (fixed < 0 || !fixed)
      break;
  }
  if (fixed >= 0 && fixed)
    fixed = complete_index(expr, index);
  isl_multi_pw_aff_free(index);

  return fixed;
}

/* Extract an autosa_stmt_access from "expr", append it to the list
 * that ends in *data->next_access and update the end of the list.
 * If the access expression performs a write, then it is considered
 * exact only if it appears in a single expression statement and
 * if its may access relation is equal to its must access relation.
 *
 * The combined set of may accesses may be a union if member accesses
 * are involved, but the entire set is derived from a single reference and
 * therefore from a single index expression.  These accesses therefore
 * all map to the same outer array.
 *
 * The new access is appended to the list before the final error check,
 * so it remains reachable from the list even if -1 is returned.
 */
static int extract_access(__isl_keep pet_expr *expr, void *user)
{
  struct ppcg_extract_access_data *data;
  struct autosa_stmt_access *acc;
  isl_union_map *tagged;
  isl_multi_pw_aff *index;

  data = (struct ppcg_extract_access_data *)user;
  acc = isl_alloc_type(pet_expr_get_ctx(expr), struct autosa_stmt_access);
  if (!acc)
    return -1;

  acc->next = NULL;
  acc->read = pet_expr_access_is_read(expr);
  acc->write = pet_expr_access_is_write(expr);

  /* Tagged may-accesses, mapped to the outer arrays. */
  tagged = isl_union_map_union(pet_expr_access_get_tagged_may_read(expr),
                               pet_expr_access_get_tagged_may_write(expr));
  tagged = isl_union_map_apply_range(tagged,
                                     isl_union_map_copy(data->any_to_outer));

  if (acc->write && data->single_expression)
  {
    /* Exact if the untagged may-accesses equal the must-writes. */
    isl_union_map *may_acc, *must_wr;

    may_acc = isl_union_map_domain_factor_domain(isl_union_map_copy(tagged));
    must_wr = pet_expr_access_get_must_write(expr);
    acc->exact_write = isl_union_map_is_equal(must_wr, may_acc);
    isl_union_map_free(must_wr);
    isl_union_map_free(may_acc);
  }
  else
  {
    /* Pure reads count as exact; other writes do not. */
    acc->exact_write = acc->write ? 0 : 1;
  }

  index = pet_expr_access_get_index(expr);
  acc->n_index = isl_multi_pw_aff_dim(index, isl_dim_out);
  isl_multi_pw_aff_free(index);

  acc->ref_id = pet_expr_access_get_ref_id(expr);
  acc->tagged_access = extract_single_tagged_access(tagged, expr);
  acc->access = isl_map_domain_factor_domain(
      isl_map_copy(acc->tagged_access));
  acc->fixed_element = accesses_fixed_element(expr);

  /* AutoSA Extended */
  acc->n_io_info = 0;
  acc->io_info = NULL;
  acc->layout_trans = -1;
  acc->simd_dim = -1;
  acc->simd_stride = -1;
  /* AutoSA Extended */

  /* Append to the list and move the end-of-list pointer. */
  *data->next_access = acc;
  data->next_access = &acc->next;

  if (acc->fixed_element < 0 || !acc->access)
    return -1;

  return 0;
}

/* Build the linked list stmt->accesses by visiting every access
 * expression in the body of the statement.
 * "any_to_outer" is handed unchanged to the per-access callback.
 * Return whatever pet_tree_foreach_access_expr reports.
 */
static int pet_stmt_extract_accesses(struct autosa_stmt *stmt,
                                     __isl_keep isl_union_map *any_to_outer)
{
  struct ppcg_extract_access_data walk;
  int body_is_expr;

  body_is_expr = pet_tree_get_type(stmt->stmt->body) == pet_tree_expr;

  /* Start from an empty list; the callback appends through next_access. */
  stmt->accesses = NULL;
  walk.any_to_outer = any_to_outer;
  walk.single_expression = body_is_expr;
  walk.next_access = &stmt->accesses;

  return pet_tree_foreach_access_expr(stmt->stmt->body,
                                      &extract_access, &walk);
}

/* Build an array with one autosa_stmt per statement of "scop".
 * Each entry records the statement's domain tuple id and the pet statement.
 * Accesses are only collected for statements that is_stmt_killed does not
 * report as killed.
 * Return NULL if the array cannot be allocated or if any step fails
 * (the partially filled array is then handed to free_stmts).
 */
struct autosa_stmt *extract_stmts(isl_ctx *ctx, struct ppcg_scop *scop,
                                  __isl_keep isl_union_map *any_to_outer)
{
  struct autosa_stmt *stmts;

  stmts = isl_calloc_array(ctx, struct autosa_stmt, scop->pet->n_stmt);
  if (stmts == NULL)
    return NULL;

  for (int idx = 0; idx < scop->pet->n_stmt; ++idx)
  {
    struct autosa_stmt *entry = stmts + idx;
    isl_bool killed;

    entry->stmt = scop->pet->stmts[idx];
    entry->id = isl_set_get_tuple_id(entry->stmt->domain);

    killed = is_stmt_killed(scop, entry->stmt);
    /* Fail on an error from the kill test, or on a failed access
     * extraction for a statement that is still live.
     */
    if (killed < 0 ||
        (!killed && pet_stmt_extract_accesses(entry, any_to_outer) < 0))
      return (struct autosa_stmt *)free_stmts(stmts, idx + 1);
  }

  return stmts;
}

/* Release an autosa_kernel_stmt together with the members owned by the
 * union variant selected through its "type" field.
 * "user" is the statement as an untyped pointer; NULL is ignored.
 */
void autosa_kernel_stmt_free(void *user)
{
  struct autosa_kernel_stmt *kstmt = (struct autosa_kernel_stmt *)user;

  if (kstmt == NULL)
    return;

  switch (kstmt->type)
  {
  /* Nothing to release beyond the struct itself. */
  case AUTOSA_KERNEL_STMT_SYNC:
  case AUTOSA_KERNEL_STMT_FIFO_DECL:
    break;

  case AUTOSA_KERNEL_STMT_DOMAIN:
    isl_id_to_ast_expr_free(kstmt->u.d.ref2expr);
    break;

  case AUTOSA_KERNEL_STMT_COPY:
    isl_ast_expr_free(kstmt->u.c.index);
    isl_ast_expr_free(kstmt->u.c.local_index);
    break;

  /* All I/O flavours share the "i" variant. */
  case AUTOSA_KERNEL_STMT_IO:
  case AUTOSA_KERNEL_STMT_IO_TRANSFER:
  case AUTOSA_KERNEL_STMT_IO_TRANSFER_BUF:
  case AUTOSA_KERNEL_STMT_IO_DRAM:
    isl_ast_expr_free(kstmt->u.i.index);
    isl_ast_expr_free(kstmt->u.i.local_index);
    free(kstmt->u.i.in_fifo_name);
    free(kstmt->u.i.out_fifo_name);
    free(kstmt->u.i.reduce_op);
    break;

  case AUTOSA_KERNEL_STMT_MODULE_CALL:
  case AUTOSA_KERNEL_STMT_EXT_MODULE:
    free(kstmt->u.m.module_name);
    break;

  case AUTOSA_KERNEL_STMT_DRAIN_MERGE:
    isl_ast_expr_free(kstmt->u.dm.index);
    break;

  case AUTOSA_KERNEL_STMT_HOST_SERIALIZE:
    isl_ast_expr_free(kstmt->u.s.index);
    break;
  }

  free(kstmt);
}

/* Look up the entry of prog->stmts whose "id" field is the very same
 * isl_id object as "id" (the comparison is on pointers).
 * Return NULL if there is no such entry.
 */
struct autosa_stmt *find_stmt(struct autosa_prog *prog, __isl_keep isl_id *id)
{
  for (int k = 0; k < prog->n_stmts; ++k)
  {
    struct autosa_stmt *candidate = &prog->stmts[k];

    if (candidate->id == id)
      return candidate;
  }

  return NULL;
}

/****************************************************************
 * AutoSA prog
 ****************************************************************/
/* Compute the set of inner array elements whose values may be preserved
 * by "prog".
 * The starting point is the extent of every array that is not marked local,
 * restricted to the context and mapped through prog->to_inner.
 * From that, the ranges of prog->tagged_must_kill and prog->must_write
 * are removed.
 */
static __isl_give isl_union_set *compute_may_persist(struct autosa_prog *prog)
{
  isl_union_set *persist;
  isl_union_set *killed_elts, *written_elts;

  persist = isl_union_set_empty(isl_set_get_space(prog->context));
  for (int a = 0; a < prog->n_array; ++a)
  {
    if (!prog->array[a].local)
      persist = isl_union_set_add_set(persist,
                                      isl_set_copy(prog->array[a].extent));
  }

  persist = isl_union_set_intersect_params(persist,
                                           isl_set_copy(prog->context));
  persist = isl_union_set_apply(persist,
                                isl_union_map_copy(prog->to_inner));

  killed_elts = isl_union_map_range(
      isl_union_map_copy(prog->tagged_must_kill));
  written_elts = isl_union_map_range(
      isl_union_map_copy(prog->must_write));
  killed_elts = isl_union_set_union(killed_elts, written_elts);

  return isl_union_set_subtract(persist, killed_elts);
}

/* Allocate an autosa_prog for "scop" and fill it in from the scop:
 * context, statements, access relations and the outer/inner array maps.
 * Return NULL if "scop" is NULL, if allocation fails, if the statements
 * cannot be extracted or if collect_array_info fails; in the latter two
 * cases the partially constructed prog is released through autosa_prog_free.
 */
struct autosa_prog *autosa_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop)
{
  struct autosa_prog *prog;
  isl_space *space;
  isl_map *id;

  if (!scop)
    return NULL;

  /* Zero-initialized, so fields not set below start out as 0/NULL. */
  prog = isl_calloc_type(ctx, struct autosa_prog);
  if (!prog)
    return NULL;

  prog->ctx = ctx;
  prog->scop = scop;
  prog->context = isl_set_copy(scop->context);
  prog->n_stmts = scop->pet->n_stmt;
  /* any_to_outer is the reverse of pet's outer-to-any relation ... */
  prog->any_to_outer = pet_scop_compute_outer_to_any(scop->pet);
  prog->any_to_outer = isl_union_map_reverse(prog->any_to_outer);
  /* ... extended with the identity map on an unnamed
   * one-dimensional space.
   */
  space = isl_union_map_get_space(prog->any_to_outer);
  space = isl_space_set_from_params(space);
  space = isl_space_add_dims(space, isl_dim_set, 1);
  space = isl_space_map_from_set(space);
  id = isl_map_identity(space);
  prog->any_to_outer = isl_union_map_add_map(prog->any_to_outer, id);
  /* Statement extraction needs the completed any_to_outer map. */
  prog->stmts = extract_stmts(ctx, scop, prog->any_to_outer);
  prog->read = isl_union_map_copy(scop->reads);
  prog->may_write = isl_union_map_copy(scop->may_writes);
  prog->must_write = isl_union_map_copy(scop->must_writes);
  prog->tagged_must_kill = isl_union_map_copy(scop->tagged_must_kills);
  prog->to_inner = pet_scop_compute_outer_to_inner(scop->pet);
  prog->to_outer = isl_union_map_copy(prog->to_inner);
  prog->to_outer = isl_union_map_reverse(prog->to_outer);

  /* The check is deferred until here, after the remaining fields are set. */
  if (!prog->stmts)
    return (struct autosa_prog *)autosa_prog_free(prog);

  if (collect_array_info(prog) < 0)
    return (struct autosa_prog *)autosa_prog_free(prog);
  /* Computed last: it reads prog->array, to_inner, must_write and
   * tagged_must_kill.
   */
  prog->may_persist = compute_may_persist(prog); // TODO

  return prog;
}

/* Release "prog" and everything it owns.
 * A NULL "prog" is accepted.  Always return NULL so that callers can
 * write "return autosa_prog_free(prog);" on error paths.
 */
void *autosa_prog_free(struct autosa_prog *prog)
{
  if (prog == NULL)
    return NULL;

  /* Array and statement bookkeeping. */
  free_array_info(prog);
  free_stmts(prog->stmts, prog->n_stmts);

  /* isl objects held by the program. */
  isl_set_free(prog->context);
  isl_union_set_free(prog->may_persist);
  isl_union_map_free(prog->array_order);
  isl_union_map_free(prog->tagged_must_kill);
  isl_union_map_free(prog->must_write);
  isl_union_map_free(prog->may_write);
  isl_union_map_free(prog->read);
  isl_union_map_free(prog->to_inner);
  isl_union_map_free(prog->to_outer);
  isl_union_map_free(prog->any_to_outer);

  free(prog);
  return NULL;
}

/****************************************************************
 * AutoSA hw module
 ****************************************************************/
struct autosa_hw_module *autosa_hw_module_alloc(struct autosa_gen *gen)
{
  struct autosa_hw_module *module = (struct autosa_hw_module *)malloc(
      sizeof(struct autosa_hw_module));
  module->options = gen->options;
  module->name = NULL;
  module->tree = NULL;
  module->device_tree = NULL;
  module->inst_ids = NULL;
  module->n_var = 0;
  module->var = NULL;
  module->kernel = NULL;
  module->n_io_group = 0;
  module->io_groups = NULL;
  module->to_pe = 0;
  module->to_mem = 0;
  module->double_buffer = 0;
  module->is_filter = 0;
  module->is_buffer = 0;
  module->outer_sched = NULL;
  module->inter_sched = NULL;
  module->intra_sched = NULL;
  module->inter_space = NULL;
  module->intra_space = NULL;
  module->space = NULL;
  module->inter_tree = NULL;
  module->intra_tree = NULL;
  module->credit = 0;
  module->boundary_sched = NULL;
  module->boundary_tree = NULL;
  module->boundary = 0;
  module->boundary_outer_sched = NULL;
  module->boundary_inter_sched = NULL;
  module->boundary_outer_tree = NULL;
  module->boundary_inter_tree = NULL;
  module->n_pe_dummy_modules = 0;
  module->pe_dummy_modules = NULL;
  module->n_array_ref = 0;
  module->serialize_sched = NULL;
  module->serialize_tree = NULL;
  module->coalesce_bound = -1;
  module->is_serialized = 0;
  module->use_FF = 0;
  module->in = -1;
  module->pipeline_at_default_func = 0;
  module->pipeline_at_filter_func[0] = 0;
  module->pipeline_at_filter_func[1] = 0;
  module->pipeline_at_filter_func[2] = 0;

  module->n_fifo_serialize = 0;
  module->fifo_bounds_serialize = NULL;
  module->fifo_names_serialize = NULL;
  module->n_fifo_default = 0;
  module->fifo_names_default = NULL;
  module->fifo_bounds_default = NULL;
  module->n_fifo_inter = 0;
  module->fifo_names_inter = NULL;
  module->fifo_bounds_inter = NULL;
  module->n_fifo_intra = 0;
  module->fifo_names_intra = NULL;
  module->fifo_bounds_intra = NULL;

  module->tuning_sched = NULL;
  module->tuning_outer_sched = NULL;
  module->tuning_inter_sched = NULL;
  module->tuning_intra_sched = NULL;
  module->tuning_tree = NULL;
  module->tuning_device_tree = NULL;
  module->tuning_intra_tree = NULL;
  module->tuning_inter_tree = NULL;

  module->tuning_num_sched = NULL;
  module->tuning_num_outer_sched = NULL;
  module->tuning_num_inter_sched = NULL;
  module->tuning_num_intra_sched = NULL;
  module->tuning_num_tree = NULL;
  module->tuning_num_device_tree = NULL;
  module->tuning_num_intra_tree = NULL;
  module->tuning_num_inter_tree = NULL;  

  return module;
}

void *autosa_hw_module_free(struct autosa_hw_module *module)
{
  if (!module)
    return NULL;

  free(module->name);

  isl_ast_node_free(module->tree);
  isl_ast_node_free(module->device_tree);
  isl_ast_node_free(module->inter_tree);
  isl_ast_node_free(module->intra_tree);
  isl_ast_node_free(module->boundary_tree);
  isl_ast_node_free(module->boundary_outer_tree);
  isl_ast_node_free(module->boundary_inter_tree);
  isl_ast_node_free(module->serialize_tree);

  isl_space_free(module->inter_space);
  isl_space_free(module->intra_space);
  isl_space_free(module->space);

  isl_id_list_free(module->inst_ids);
  for (int i = 0; i < module->n_var; i++)
  {
    free(module->var[i].name);
    isl_vec_free(module->var[i].size);
  }
  free(module->var);
  free(module->io_groups);  
  for (int i = 0; i < module->n_pe_dummy_modules; i++)
  {
    autosa_pe_dummy_module_free(module->pe_dummy_modules[i]);
  }
  free(module->pe_dummy_modules);

  if (module->n_fifo_serialize > 0) {
    for (int i = 0; i < module->n_fifo_serialize; i++) {
      free(module->fifo_names_serialize[i]);
      isl_pw_qpolynomial_free(module->fifo_bounds_serialize[i]);
    }
    free(module->fifo_bounds_serialize);
    free(module->fifo_names_serialize);
  }
  if (module->n_fifo_default > 0) {
    for (int i = 0; i < module->n_fifo_default; i++) {
      free(module->fifo_names_default[i]);
      isl_pw_qpolynomial_free(module->fifo_bounds_default[i]);
    }
    free(module->fifo_bounds_default);
    free(module->fifo_names_default);
  }
  if (module->n_fifo_inter > 0) {
    for (int i = 0; i < module->n_fifo_inter; i++) {
      free(module->fifo_names_inter[i]);
      isl_pw_qpolynomial_free(module->fifo_bounds_inter[i]);
    }
    free(module->fifo_bounds_inter);
    free(module->fifo_names_inter);
  }
  if (module->n_fifo_intra > 0) {
    for (int i = 0; i < module->n_fifo_intra; i++) {
      free(module->fifo_names_intra[i]);
      isl_pw_qpolynomial_free(module->fifo_bounds_intra[i]);
    }
    free(module->fifo_bounds_intra);
    free(module->fifo_names_intra);
  }

  isl_ast_node_free(module->tuning_tree);
  isl_ast_node_free(module->tuning_device_tree);
  isl_ast_node_free(module->tuning_inter_tree);
  isl_ast_node_free(module->tuning_intra_tree);

  isl_ast_node_free(module->tuning_num_tree);
  isl_ast_node_free(module->tuning_num_device_tree);
  isl_ast_node_free(module->tuning_num_inter_tree);
  isl_ast_node_free(module->tuning_num_intra_tree);

  free(module);

  return NULL;
}

/* Allocate an autosa_hw_top_module with all counters set to 0 and all
 * pointers set to NULL.
 * Return NULL if the allocation fails.
 */
struct autosa_hw_top_module *autosa_hw_top_module_alloc()
{
  struct autosa_hw_top_module *module = (struct autosa_hw_top_module *)malloc(
      sizeof(struct autosa_hw_top_module));
  /* Do not write through a failed allocation. */
  if (!module)
    return NULL;

  module->n_module_calls = 0;
  module->n_fifo_decls = 0;
  module->module_call_scheds = NULL;
  module->fifo_decl_scheds = NULL;
  module->module_call_trees = NULL;
  module->fifo_decl_trees = NULL;
  module->fifo_decl_names = NULL;

  module->n_module_call_wrapped = 0;
  module->n_fifo_decl_wrapped = 0;
  module->module_call_wrapped_trees = NULL;
  module->fifo_decl_wrapped_trees = NULL;

  module->kernel = NULL;
  module->hw_modules = NULL;
  module->n_hw_modules = 0;

  module->n_ext_module = 0;
  module->ext_module_scheds = NULL;
  module->ext_module_trees = NULL;
  module->n_ext_module_wrapped = 0;
  module->ext_module_wrapped_trees = NULL;

  return module;
}

/* Release "module": the AST trees stored in each of its tree arrays,
 * the FIFO declaration names, and then the arrays themselves.
 * A NULL "module" is accepted.  Always return NULL.
 */
void *autosa_hw_top_module_free(struct autosa_hw_top_module *module)
{
  int k;

  if (module == NULL)
    return NULL;

  /* Each tree array is only walked when it has been allocated. */
  if (module->module_call_trees != NULL)
    for (k = 0; k < module->n_module_calls; k++)
      isl_ast_node_free(module->module_call_trees[k]);

  /* FIFO names are released alongside the matching declaration tree. */
  if (module->fifo_decl_trees != NULL)
    for (k = 0; k < module->n_fifo_decls; k++)
    {
      isl_ast_node_free(module->fifo_decl_trees[k]);
      free(module->fifo_decl_names[k]);
    }

  if (module->module_call_wrapped_trees != NULL)
    for (k = 0; k < module->n_module_call_wrapped; k++)
      isl_ast_node_free(module->module_call_wrapped_trees[k]);

  if (module->fifo_decl_wrapped_trees != NULL)
    for (k = 0; k < module->n_fifo_decl_wrapped; k++)
      isl_ast_node_free(module->fifo_decl_wrapped_trees[k]);

  if (module->ext_module_trees != NULL)
    for (k = 0; k < module->n_ext_module; k++)
      isl_ast_node_free(module->ext_module_trees[k]);

  if (module->ext_module_wrapped_trees != NULL)
    for (k = 0; k < module->n_ext_module_wrapped; k++)
      isl_ast_node_free(module->ext_module_wrapped_trees[k]);

  /* The container arrays; the schedule arrays are freed as plain memory. */
  free(module->module_call_scheds);
  free(module->fifo_decl_scheds);
  free(module->ext_module_scheds);
  free(module->module_call_trees);
  free(module->fifo_decl_trees);
  free(module->ext_module_trees);
  free(module->module_call_wrapped_trees);
  free(module->fifo_decl_wrapped_trees);
  free(module->ext_module_wrapped_trees);
  free(module->fifo_decl_names);

  free(module);
  return NULL;
}

/* Allocate an autosa_pe_dummy_module with all pointer fields set to NULL.
 * Return NULL if the allocation fails.
 */
struct autosa_pe_dummy_module *autosa_pe_dummy_module_alloc()
{
  struct autosa_pe_dummy_module *module = (struct autosa_pe_dummy_module *)malloc(
      sizeof(struct autosa_pe_dummy_module));
  /* Do not write through a failed allocation. */
  if (!module)
    return NULL;

  module->module = NULL;
  module->io_group = NULL;
  module->sched = NULL;
  module->tree = NULL;
  module->device_tree = NULL;

  return module;
}

/* Release "module" and its two AST trees.  The "module", "io_group"
 * and "sched" members are left untouched.
 * A NULL "module" is accepted.  Always return NULL.
 */
void *autosa_pe_dummy_module_free(struct autosa_pe_dummy_module *module)
{
  if (module != NULL)
  {
    isl_ast_node_free(module->device_tree);
    isl_ast_node_free(module->tree);
    free(module);
  }

  return NULL;
}

/* Allocate an autosa_drain_merge_func with all pointer fields set to NULL.
 * "gen" is currently not used.
 * Return NULL if the allocation fails.
 */
struct autosa_drain_merge_func *autosa_drain_merge_func_alloc(struct autosa_gen *gen)
{
  struct autosa_drain_merge_func *func = (struct autosa_drain_merge_func *)
      malloc(sizeof(struct autosa_drain_merge_func));
  /* Do not write through a failed allocation. */
  if (!func)
    return NULL;

  func->group = NULL;
  func->kernel = NULL;
  func->inst_ids = NULL;
  func->sched = NULL;
  func->tree = NULL;
  func->device_tree = NULL;

  return func;
}

/* Release "func" together with its instance id list and its two AST
 * trees.  The "group", "kernel" and "sched" members are left untouched.
 * A NULL "func" is accepted.  Always return NULL.
 */
void *autosa_drain_merge_func_free(struct autosa_drain_merge_func *func)
{
  if (func != NULL)
  {
    isl_ast_node_free(func->device_tree);
    isl_ast_node_free(func->tree);
    isl_id_list_free(func->inst_ids);
    free(func);
  }

  return NULL;
}

/****************************************************************
 * AutoSA AST node
 ****************************************************************/
/* Allocate an autosa_ast_node_userinfo with all flags and counters
 * cleared and all pointer fields set to NULL.
 * Return NULL if the allocation fails.
 */
struct autosa_ast_node_userinfo *alloc_ast_node_userinfo()
{
  struct autosa_ast_node_userinfo *info =
      (struct autosa_ast_node_userinfo *)malloc(sizeof(
          struct autosa_ast_node_userinfo));
  /* Do not write through a failed allocation. */
  if (!info)
    return NULL;

  info->is_pipeline = 0;
  info->is_unroll = 0;
  info->is_outermost_for = 0;
  info->is_infinitize_legal = 0;
  info->is_first_infinitizable_loop = 0;
  info->is_dep_free = 0;
  info->n_coalesce_loop = 0;
  info->visited = 0;

  info->is_guard_start = 0;
  info->is_guard_end = 0;
  info->n_fifo = 0;
  info->fifo_names = NULL;
  info->bounds = NULL;
  info->module_name = NULL;

  return info;
}

/* Release the autosa_ast_node_userinfo passed as an untyped pointer.
 * Only the struct itself is freed; the members it points to are not.
 */
void free_ast_node_userinfo(void *ptr)
{
  free(ptr);
}

/****************************************************************
 * AutoSA PE opt
 ****************************************************************/
/* Internal data structure for extract_size_of_type.
 * "type" specifies the name of the space that we want to extract.
 * "res" is used to store the subset of that space; it stays NULL
 * as long as no set with a matching tuple name has been seen.
 */
struct autosa_extract_size_data
{
  const char *type; /* tuple name to look for */
  isl_set *res;     /* matching set, owned by whoever reads it out */
};

/* Callback for isl_union_set_foreach_set.
 * If the tuple name of "size" equals data->type, keep the set in
 * data->res and return isl_stat_error, which is used here as the way
 * to stop visiting further sets.  Otherwise free the set and continue.
 */
static isl_stat extract_size_of_type(__isl_take isl_set *size, void *user)
{
  struct autosa_extract_size_data *data = (struct autosa_extract_size_data *)user;
  const char *name = isl_set_get_tuple_name(size);

  if (!name || strcmp(name, data->type) != 0)
  {
    isl_set_free(size);
    return isl_stat_ok;
  }

  /* Ownership of "size" moves to the caller through data->res. */
  data->res = size;
  return isl_stat_error;
}

/* Given a union map { kernel[] -> *[...] },
 * return the range in the space called "type".
 * The domain used for the lookup is the universe of a zero-dimensional
 * set named "kernel"; no kernel sequence number is involved.
 * Return NULL if "sizes" is NULL or if no range space is called "type".
 */
__isl_give isl_set *extract_sa_sizes(__isl_keep isl_union_map *sizes,
                                     const char *type)
{
  struct autosa_extract_size_data data = {type, NULL};
  isl_space *kernel_space;
  isl_union_set *kernel_dom, *ranges;

  if (sizes == NULL)
    return NULL;

  kernel_space = isl_space_set_from_params(isl_union_map_get_space(sizes));
  kernel_space = isl_space_set_tuple_name(kernel_space, isl_dim_set, "kernel");
  kernel_dom = isl_union_set_from_set(isl_set_universe(kernel_space));

  ranges = isl_union_set_apply(kernel_dom, isl_union_map_copy(sizes));
  /* The callback stores the matching set, if any, in data.res. */
  isl_union_set_foreach_set(ranges, &extract_size_of_type, &data);
  isl_union_set_free(ranges);

  return data.res;
}

/* Given a singleton set, extract the first "len" elements of the single
 * integer tuple into "sizes".
 *
 * The original note on this function says that an element value of "-1"
 * means the loop at the same position is not tiled; that convention is
 * not checked here and is left to the callers.
 *
 * If "set" is NULL, then the "sizes" array is not updated and
 * isl_stat_ok is returned.
 * "set" is always consumed, also when an error is reported because the
 * tuple has fewer than "len" elements or an element cannot be obtained.
 */
static isl_stat read_sa_sizes_from_set(__isl_take isl_set *set, int *sizes, int len)
{
  int i;
  int dim;

  if (!set)
    return isl_stat_ok;

  dim = isl_set_dim(set, isl_dim_set);
  /* Leave through the common error exit so that "set" is released. */
  if (dim < len)
    isl_die(isl_set_get_ctx(set), isl_error_invalid,
            "fewer sa_sizes than required", goto error);

  for (i = 0; i < len; ++i)
  {
    isl_val *v;

    v = isl_set_plain_get_val_if_fixed(set, isl_dim_set, i);
    if (!v)
      goto error;
    sizes[i] = isl_val_get_num_si(v);
    isl_val_free(v);
  }

  isl_set_free(set);
  return isl_stat_ok;
error:
  isl_set_free(set);
  return isl_stat_error;
}

/* Given a union map { kernel[] -> *[...] },
 * return the range in the space called "type" for the kernel.
 * The lookup is the same one extract_sa_sizes performs, so the work is
 * handed to that function.
 * Return NULL if "sizes" is NULL or if no range space is called "type".
 */
static __isl_give isl_set *extract_config_sizes(__isl_keep isl_union_map *sizes,
  const char *type)
{
  return extract_sa_sizes(sizes, type);
}

/* Given a singleton set, extract the first "len" elements of the single
 * integer tuple into "sizes".
 *
 * If "set" is NULL, then the "sizes" array is not updated and
 * isl_stat_ok is returned.
 * "set" is always consumed, also when an error is reported because the
 * tuple has fewer than "len" elements or an element cannot be obtained.
 */
static isl_stat read_config_sizes_from_set(__isl_take isl_set *set, 
  int *sizes, int len)
{
  int i;
  int dim;

  if (!set)
    return isl_stat_ok;

  dim = isl_set_dim(set, isl_dim_set);
  /* Leave through the common error exit so that "set" is released. */
  if (dim < len)
    isl_die(isl_set_get_ctx(set), isl_error_invalid,
            "fewer sizes than required", goto error);

  for (i = 0; i < len; ++i)
  {
    isl_val *v;

    v = isl_set_plain_get_val_if_fixed(set, isl_dim_set, i);
    if (!v)
      goto error;
    sizes[i] = isl_val_get_num_si(v);
    isl_val_free(v);
  }

  isl_set_free(set);
  return isl_stat_ok;
error:
  isl_set_free(set);
  return isl_stat_error;
}

/* Intended to add the map { kernel[id] -> type[sizes] } to the used
 * sizes if the option debug->dump_sa_sizes is set.
 *
 * NOTE: this is currently an empty stub; calling it has no effect and
 * none of the arguments are read.
 */
static void set_sa_used_sizes(struct autosa_kernel *sa, const char *type, int id,
                              int *sizes, int len)
{
  // TODO: fix it
}

/* Read "tile_len" sizes from the tuple called "name" in sa->sizes.
 * Return a newly allocated array with the values; ownership passes to
 * the caller.
 * Return NULL if the array cannot be allocated, if the dimension check
 * on the extracted set reports fewer than "tile_len" values, or if a
 * value cannot be read.  No default sizes are substituted here.
 */
int *read_hbm_tile_sizes(struct autosa_kernel *sa, int tile_len, char *name)
{
  int *tile_size;
  isl_set *size;

  tile_size = isl_alloc_array(sa->ctx, int, tile_len);
  if (tile_size == NULL)
    return NULL;

  size = extract_sa_sizes(sa->sizes, name);
  if (isl_set_dim(size, isl_dim_set) < tile_len)
  {
    /* The set has not been handed off yet, so release it here. */
    isl_set_free(size);
    free(tile_size);
    return NULL;
  }

  /* read_sa_sizes_from_set() consumes "size". */
  if (read_sa_sizes_from_set(size, tile_size, tile_len) < 0)
  {
    free(tile_size);
    return NULL;
  }

  set_sa_used_sizes(sa, name, sa->id, tile_size, tile_len);
  return tile_size;
}

/* Look up the tuple called "name" in "port_map" and return its single
 * value.
 * Return -1 if the extracted set does not have exactly one dimension
 * or if the value cannot be read.
 */
int read_mem_port_map(__isl_keep isl_union_map *port_map, char *name)
{
  int port;
  isl_set *entry = extract_sa_sizes(port_map, name);

  if (isl_set_dim(entry, isl_dim_set) != 1)
  {
    isl_set_free(entry);
    return -1;
  }

  /* read_sa_sizes_from_set() consumes "entry". */
  return read_sa_sizes_from_set(entry, &port, 1) < 0 ? -1 : port;
}

/* Return a newly allocated array of "tile_len" entries, each set to the
 * "n_hbm_port" option, or NULL if the array cannot be allocated.
 */
int *read_default_hbm_tile_sizes(struct autosa_kernel *sa, int tile_len)
{
  int *defaults = isl_alloc_array(sa->ctx, int, tile_len);

  if (defaults == NULL)
    return NULL;

  for (int pos = 0; pos < tile_len; ++pos)
    defaults[pos] = sa->scop->options->autosa->n_hbm_port;

  return defaults;
}

/* Extract user specified data pack sizes for array "name".
 * The tuple called "name" in "sizes" must have exactly three entries.
 * Return a newly allocated array of three ints; ownership passes to
 * the caller.
 * Return NULL if there is no such three-entry tuple, if the array
 * cannot be allocated, or if a value cannot be read.
 */
int *read_data_pack_sizes_array(__isl_keep isl_union_map *sizes, char *name)
{
  const int n_pack = 3;
  isl_set *size;
  int *data_pack_sizes;

  size = extract_sa_sizes(sizes, name);
  if (isl_set_dim(size, isl_dim_set) != n_pack) {
    isl_set_free(size);
    return NULL;
  }

  data_pack_sizes = (int *)malloc(n_pack * sizeof(int));
  if (!data_pack_sizes) {
    /* Without a destination the set cannot be handed off; release it. */
    isl_set_free(size);
    return NULL;
  }

  /* read_sa_sizes_from_set() consumes "size". */
  if (read_sa_sizes_from_set(size, data_pack_sizes, n_pack) < 0) {
    free(data_pack_sizes);
    return NULL;
  }

  return data_pack_sizes;
}

/* Read "tile_len" values from the "data_pack" tuple in "sizes".
 * Return a newly allocated array with the values; ownership passes to
 * the caller.
 * Return NULL if the array cannot be allocated, if the dimension check
 * on the extracted set reports fewer than "tile_len" values, or if a
 * value cannot be read.  No default values are filled in by this function.
 */
int *read_data_pack_sizes(__isl_keep isl_union_map *sizes, int tile_len)
{
  isl_ctx *ctx = isl_union_map_get_ctx(sizes);
  int *tile_size;
  isl_set *size;

  tile_size = isl_alloc_array(ctx, int, tile_len);
  if (tile_size == NULL)
    return NULL;

  size = extract_config_sizes(sizes, "data_pack");
  if (isl_set_dim(size, isl_dim_set) < tile_len)
  {
    /* The set has not been handed off yet, so release it here. */
    isl_set_free(size);
    free(tile_size);
    return NULL;
  }

  /* read_config_sizes_from_set() consumes "size". */
  if (read_config_sizes_from_set(size, tile_size, tile_len) < 0)
  {
    free(tile_size);
    return NULL;
  }

  return tile_size;
}

/* Read "tile_len" sizes from the "array_part" tuple in sa->sizes.
 * Return a newly allocated array with the values; ownership passes to
 * the caller.
 * Return NULL if the array cannot be allocated, if the dimension check
 * on the extracted set reports fewer than "tile_len" values, or if a
 * value cannot be read.  No default sizes are substituted here; see
 * read_default_array_part_tile_sizes for those.
 */
int *read_array_part_tile_sizes(struct autosa_kernel *sa, int tile_len)
{
  int *tile_size;
  isl_set *size;

  tile_size = isl_alloc_array(sa->ctx, int, tile_len);
  if (tile_size == NULL)
    return NULL;

  size = extract_sa_sizes(sa->sizes, "array_part");
  if (isl_set_dim(size, isl_dim_set) < tile_len)
  {
    /* The set has not been handed off yet, so release it here. */
    isl_set_free(size);
    free(tile_size);
    return NULL;
  }

  /* read_sa_sizes_from_set() consumes "size". */
  if (read_sa_sizes_from_set(size, tile_size, tile_len) < 0)
  {
    free(tile_size);
    return NULL;
  }

  set_sa_used_sizes(sa, "array_part", sa->id, tile_size, tile_len);
  return tile_size;
}

/* Return a newly allocated array of "tile_len" entries, each set to the
 * "sa_tile_size" option, or NULL if the array cannot be allocated.
 */
int *read_default_array_part_tile_sizes(struct autosa_kernel *sa, int tile_len)
{
  int *defaults = isl_alloc_array(sa->ctx, int, tile_len);

  if (defaults == NULL)
    return NULL;

  for (int pos = 0; pos < tile_len; ++pos)
    defaults[pos] = sa->scop->options->autosa->sa_tile_size;

  return defaults;
}

/* Read "tile_len" sizes from the "latency" tuple in sa->sizes.
 * Return a newly allocated array with the values; ownership passes to
 * the caller.
 * Return NULL if the array cannot be allocated, if the dimension check
 * on the extracted set reports fewer than "tile_len" values, or if a
 * value cannot be read.  No default sizes are substituted here; see
 * read_default_latency_tile_sizes for those.
 */
int *read_latency_tile_sizes(struct autosa_kernel *sa, int tile_len)
{
  int *tile_size;
  isl_set *size;

  tile_size = isl_alloc_array(sa->ctx, int, tile_len);
  if (tile_size == NULL)
    return NULL;

  size = extract_sa_sizes(sa->sizes, "latency");
  if (isl_set_dim(size, isl_dim_set) < tile_len)
  {
    /* The set has not been handed off yet, so release it here. */
    isl_set_free(size);
    free(tile_size);
    return NULL;
  }

  /* read_sa_sizes_from_set() consumes "size". */
  if (read_sa_sizes_from_set(size, tile_size, tile_len) < 0)
  {
    free(tile_size);
    return NULL;
  }

  set_sa_used_sizes(sa, "latency", sa->id, tile_size, tile_len);
  return tile_size;
}

/* Return a newly allocated array of "tile_len" entries, each set to
 * half of the "sa_tile_size" option (integer division), or NULL if the
 * array cannot be allocated.
 */
int *read_default_latency_tile_sizes(struct autosa_kernel *sa, int tile_len)
{
  int *defaults = isl_alloc_array(sa->ctx, int, tile_len);

  if (defaults == NULL)
    return NULL;

  for (int pos = 0; pos < tile_len; ++pos)
    defaults[pos] = sa->scop->options->autosa->sa_tile_size / 2;

  return defaults;
}

/* Read "tile_len" sizes from the "simd" tuple in sa->sizes.
 * Return a newly allocated array with the values; ownership passes to
 * the caller.
 * Return NULL if the array cannot be allocated, if the dimension check
 * on the extracted set reports fewer than "tile_len" values, or if a
 * value cannot be read.
 */
int *read_simd_tile_sizes(struct autosa_kernel *sa, int tile_len)
{
  int *tile_size;
  isl_set *size;

  tile_size = isl_alloc_array(sa->ctx, int, tile_len);
  if (tile_size == NULL)
    return NULL;

  size = extract_sa_sizes(sa->sizes, "simd");
  if (isl_set_dim(size, isl_dim_set) < tile_len)
  {
    /* The set has not been handed off yet, so release it here. */
    isl_set_free(size);
    free(tile_size);
    return NULL;
  }

  /* read_sa_sizes_from_set() consumes "size". */
  if (read_sa_sizes_from_set(size, tile_size, tile_len) < 0)
  {
    free(tile_size);
    return NULL;
  }

  set_sa_used_sizes(sa, "simd", sa->id, tile_size, tile_len);
  return tile_size;
}

/* Return a newly allocated array of "tile_len" entries, each set to
 * half of the "sa_tile_size" option (integer division), or NULL if the
 * array cannot be allocated.
 */
int *read_default_simd_tile_sizes(struct autosa_kernel *sa, int tile_len)
{
  int *defaults = isl_alloc_array(sa->ctx, int, tile_len);

  if (defaults == NULL)
    return NULL;

  for (int pos = 0; pos < tile_len; ++pos)
    defaults[pos] = sa->scop->options->autosa->sa_tile_size / 2;

  return defaults;
}

/* Return the first value of the "space_time" tuple in "sizes".
 * Return -1 if there is no such tuple, if the tuple has no entries,
 * or if the value cannot be read.
 */
int read_space_time_kernel_id(__isl_keep isl_union_map *sizes)
{
  isl_set *size;
  int kernel_id = -1;
  int dim;

  size = extract_sa_sizes(sizes, "space_time");
  if (!size)
    return -1;

  dim = isl_set_dim(size, isl_dim_set);
  if (dim < 1)
  {
    /* Nothing to read; the extracted set still has to be released. */
    isl_set_free(size);
    return -1;
  }

  /* read_sa_sizes_from_set() consumes "size"; on failure "kernel_id"
   * holds no meaningful value, so report -1 instead.
   */
  if (read_sa_sizes_from_set(size, &kernel_id, 1) < 0)
    return -1;

  return kernel_id;
}

/* Read "tile_len" sizes from the "array_part_L2" tuple in sa->sizes.
 * Return a newly allocated array with the values; ownership passes to
 * the caller.
 * Return NULL if the array cannot be allocated, if the dimension check
 * on the extracted set reports fewer than "tile_len" values, or if a
 * value cannot be read.
 */
int *read_array_part_L2_tile_sizes(struct autosa_kernel *sa, int tile_len)
{
  int *tile_size;
  isl_set *size;

  tile_size = isl_alloc_array(sa->ctx, int, tile_len);
  if (tile_size == NULL)
    return NULL;

  size = extract_sa_sizes(sa->sizes, "array_part_L2");
  if (isl_set_dim(size, isl_dim_set) < tile_len)
  {
    /* The set has not been handed off yet, so release it here. */
    isl_set_free(size);
    free(tile_size);
    return NULL;
  }

  /* read_sa_sizes_from_set() consumes "size". */
  if (read_sa_sizes_from_set(size, tile_size, tile_len) < 0)
  {
    free(tile_size);
    return NULL;
  }

  set_sa_used_sizes(sa, "array_part_L2", sa->id, tile_size, tile_len);
  return tile_size;
}

/* Return a newly allocated array of "tile_len" entries, each set to the
 * "sa_tile_size" option, or NULL if the array cannot be allocated.
 */
int *read_default_array_part_L2_tile_sizes(struct autosa_kernel *sa, int tile_len)
{
  int *defaults = isl_alloc_array(sa->ctx, int, tile_len);

  if (defaults == NULL)
    return NULL;

  for (int pos = 0; pos < tile_len; ++pos)
    defaults[pos] = sa->scop->options->autosa->sa_tile_size;

  return defaults;
}

/****************************************************************
 * AutoSA latency and resource estimation
 ****************************************************************/
/* Carrier for the cJSON structure used while extracting loop information.
 * NOTE(review): presumably passed as the "user" pointer of an AST
 * traversal callback -- confirm against the users of this struct.
 */
struct extract_loop_info_data
{
  cJSON *loop_struct; /* JSON node the loop structure is recorded in */
};

/* Extract the loop info containing: iterator, lower bound,
 * upper bound, and stride.
 * Return the pointer to the loop child.
 */
static cJSON *extract_isl_ast_node_for(__isl_keep isl_ast_node *node, cJSON *loop,
                                       isl_bool degenerate)
{
  cJSON *loop_info = cJSON_CreateObject();
  cJSON *loop_child = cJSON_CreateObject();
  isl_printer *p_str = NULL;
  isl_ctx *ctx = isl_ast_node_get_ctx(node);
  char *str = NULL;

  /* Extract the loop info */
  isl_ast_expr *init, *cond, *inc, *iterator, *arg;
  init = isl_ast_node_for_get_init(node);
  cond = isl_ast_node_for_get_cond(node);
  inc = isl_ast_node_for_get_inc(node);
  iterator = isl_ast_node_for_get_iterator(node);

  /* iterator */
  p_str = isl_printer_to_str(ctx);
  p_str = isl_printer_set_output_format(p_str, ISL_FORMAT_C);
  p_str = isl_printer_print_ast_expr(p_str, iterator);
  str = isl_printer_get_str(p_str);
  cJSON_AddStringToObject(loop_info, "iter", str);
  isl_printer_free(p_str);
  free(str);
  isl_ast_expr_free(iterator);

  /* lower bound */
  p_str = isl_printer_to_str(ctx);
  p_str = isl_printer_set_output_format(p_str, ISL_FORMAT_C);
  p_str = isl_printer_print_ast_expr(p_str, init);
  str = isl_printer_get_str(p_str);
  cJSON_AddStringToObject(loop_info, "lb", str);
  isl_printer_free(p_str);
  free(str);
  isl_ast_expr_free(init);

  if (!degenerate)
  {
    /* upper bound */
    p_str = isl_printer_to_str(ctx);
    p_str = isl_printer_set_output_format(p_str, ISL_FORMAT_C);
    arg = isl_ast_expr_op_get_arg(cond, 1);
    p_str = isl_printer_print_ast_expr(p_str, arg);
    str = isl_printer_get_str(p_str);
    cJSON_AddStringToObject(loop_info, "ub", str);
    isl_printer_free(p_str);
    free(str);
    isl_ast_expr_free(arg);

    /* stride */
    p_str = isl_printer_to_str(ctx);
    p_str = isl_printer_set_output_format(p_str, ISL_FORMAT_C);
    p_str = isl_printer_print_ast_expr(p_str, inc);
    str = isl_printer_get_str(p_str);
    cJSON_AddStringToObject(loop_info, "stride", str);
    isl_printer_free(p_str);
    free(str);
  }
  else
  {
    const cJSON *lb;

    lb = cJSON_GetObjectItemCaseSensitive(loop_info, "lb");
    cJSON_AddStringToObject(loop_info, "ub", lb->valuestring);
    cJSON_AddStringToObject(loop_info, "stride", "1");
  }
  isl_ast_expr_free(cond);
  isl_ast_expr_free(inc);

  cJSON_AddItemToObject(loop, "loop_info", loop_info);
  cJSON_AddItemToObject(loop, "child", loop_child);

  return loop_child;
}

/* Give the JSON entry "block" an empty "child" array and return that array.
 * The AST node "node" itself is not inspected here.
 */
static cJSON *extract_isl_ast_node_block(__isl_keep isl_ast_node *node, cJSON *block)
{
  cJSON *children = cJSON_CreateArray();

  cJSON_AddItemToObject(block, "child", children);
  return children;
}

/* Record the name of the identifier attached to the mark node "node" in the
 * JSON entry "mark" as "mark_name", give "mark" an empty "child" object and
 * return that object.
 */
static cJSON *extract_isl_ast_node_mark(__isl_keep isl_ast_node *node, cJSON *mark)
{
  cJSON *mark_child = cJSON_CreateObject();
  isl_id *id = isl_ast_node_mark_get_id(node);
  /* The name string belongs to "id". Keep "id" alive until the string has
   * been handed to cJSON_AddStringToObject, and only release it afterwards.
   */
  const char *name = isl_id_get_name(id);

  cJSON_AddStringToObject(mark, "mark_name", name);
  isl_id_free(id);
  cJSON_AddItemToObject(mark, "child", mark_child);

  return mark_child;
}

/* Print the expression of the user node "node" in C format and store the
 * resulting text in the JSON entry "user" as "user_expr".
 * Returns "user".
 */
static cJSON *extract_isl_ast_node_user(__isl_keep isl_ast_node *node, cJSON *user)
{
  isl_ast_expr *user_call = isl_ast_node_user_get_expr(node);
  isl_printer *printer = isl_printer_to_str(isl_ast_node_get_ctx(node));
  char *text;

  printer = isl_printer_set_output_format(printer, ISL_FORMAT_C);
  printer = isl_printer_print_ast_expr(printer, user_call);
  text = isl_printer_get_str(printer);
  cJSON_AddStringToObject(user, "user_expr", text);

  /* Release the temporaries. */
  free(text);
  isl_printer_free(printer);
  isl_ast_expr_free(user_call);

  return user;
}

static cJSON *extract_loop_info_at_ast_node(__isl_keep isl_ast_node *node,
                                            cJSON *loop_struct)
{
  enum isl_ast_node_type type;
  isl_ctx *ctx = isl_ast_node_get_ctx(node);
  type = isl_ast_node_get_type(node);

  switch (type)
  {
  case isl_ast_node_for:
  {
    isl_bool degenerate = isl_ast_node_for_is_degenerate(node);
    /* Extract the loop information and insert it into the loop struct */
    cJSON *loop = cJSON_CreateObject();
    cJSON *loop_child = extract_isl_ast_node_for(node, loop, degenerate);
    if (cJSON_IsObject(loop_struct))
    {
      cJSON_AddItemToObject(loop_struct, "loop", loop);
    }
    else if (cJSON_IsArray(loop_struct))
    {
      cJSON *item = cJSON_CreateObject();
      cJSON_AddItemToObject(item, "loop", loop);
      cJSON_AddItemToArray(loop_struct, item);
    }
    isl_ast_node *child_node;
    /* Update the JSON pointer */
    child_node = isl_ast_node_for_get_body(node);
    extract_loop_info_at_ast_node(child_node, loop_child);
    isl_ast_node_free(child_node);

    break;
  }
  case isl_ast_node_block:
  {
    /* Extract the block information and insert it into the loop struct */
    isl_ast_node_list *child_list = isl_ast_node_block_get_children(node);
    int n_child = isl_ast_node_list_n_ast_node(child_list);
    cJSON *block = cJSON_CreateObject();
    cJSON *block_child = extract_isl_ast_node_block(node, block);
    if (cJSON_IsObject(loop_struct))
    {
      cJSON_AddItemToObject(loop_struct, "block", block);
    }
    else if (cJSON_IsArray(loop_struct))
    {
      cJSON *item = cJSON_CreateObject();
      cJSON_AddItemToObject(item, "block", block);
      cJSON_AddItemToArray(loop_struct, item);
    }

    isl_ast_node *child_node;
    for (int i = 0; i < n_child; i++)
    {
      cJSON *child_struct;
      child_node = isl_ast_node_list_get_ast_node(child_list, i);
      extract_loop_info_at_ast_node(child_node, block_child);
      isl_ast_node_free(child_node);
    }
    isl_ast_node_list_free(child_list);

    break;
  }
  case isl_ast_node_user:
  {
    /* Extract the user information and insert it into the loop struct */
    cJSON *user = cJSON_CreateObject();
    user = extract_isl_ast_node_user(node, user);

    if (cJSON_IsObject(loop_struct))
    {
      cJSON_AddItemToObject(loop_struct, "user", user);
    }
    else if (cJSON_IsArray(loop_struct))
    {
      cJSON *item = cJSON_CreateObject();
      cJSON_AddItemToObject(item, "user", user);
      cJSON_AddItemToArray(loop_struct, item);
    }

    break;
  }
  case isl_ast_node_if:
  {
    cJSON *if_struct = cJSON_CreateObject();
    cJSON *then_struct = cJSON_CreateObject();
    cJSON *else_struct = NULL;
    if (cJSON_IsObject(loop_struct))
    {
      cJSON_AddItemToObject(loop_struct, "if", if_struct);
    }
    else if (cJSON_IsArray(loop_struct))
    {
      cJSON *item = cJSON_CreateObject();
      cJSON_AddItemToObject(item, "if", if_struct);
      cJSON_AddItemToArray(loop_struct, item);
    }

    isl_ast_node *child_node;
    child_node = isl_ast_node_if_get_then_node(node);
    cJSON_AddItemToObject(if_struct, "then", then_struct);
    extract_loop_info_at_ast_node(child_node, then_struct);
    isl_ast_node_free(child_node);

    child_node = isl_ast_node_if_get_else_node(node);
    if (child_node)
    {
      else_struct = cJSON_CreateObject();
      cJSON_AddItemToObject(if_struct, "else", else_struct);
      extract_loop_info_at_ast_node(child_node, else_struct);
      isl_ast_node_free(child_node);
    }

    break;
  }
  case isl_ast_node_mark:
  {
    /* Extract the mark id and insert it into the loop struct */
    cJSON *mark = cJSON_CreateObject();
    cJSON *mark_child = extract_isl_ast_node_mark(node, mark);
    if (cJSON_IsObject(loop_struct))
    {
      cJSON_AddItemToObject(loop_struct, "mark", mark);
    }
    else if (cJSON_IsArray(loop_struct))
    {
      cJSON *item = cJSON_CreateObject();
      cJSON_AddItemToObject(item, "mark", mark);
      cJSON_AddItemToArray(loop_struct, item);
    }

    isl_ast_node *child_node;
    child_node = isl_ast_node_mark_get_node(node);
    extract_loop_info_at_ast_node(child_node, mark_child);
    isl_ast_node_free(child_node);

    break;
  }
  default:
    break;
  }

  return NULL;
}

/* Extract the loop structure and detailed information of the hardware module
 * with AST "tree" into a JSON struct. The JSON holds "module_name", a
 * "module_prop" object with "double_buffer" and "in", and the loop structure
 * extracted from "tree".
 *
 * If "tree" is NULL, nothing is done and NULL is returned.
 * If "print" is not set, the serialized JSON is returned as a string owned
 * by the caller (NULL if serialization failed).
 * If "print" is set, the JSON is written to
 * "<output_dir>/latency_est/<module_name>_loop_info.json" and NULL is
 * returned. The program exits if this file cannot be opened.
 */
static char *extract_loop_info_from_module(
    struct autosa_gen *gen, __isl_keep isl_ast_node *tree,
    char *module_name, int double_buffer, int in,
    int print)
{
  if (!tree)
    return NULL;

  cJSON *loop_struct = cJSON_CreateObject();
  cJSON *module_props = cJSON_CreateObject();

  cJSON_AddStringToObject(loop_struct, "module_name", module_name);
  cJSON_AddNumberToObject(module_props, "double_buffer", double_buffer);
  cJSON_AddNumberToObject(module_props, "in", in);
  cJSON_AddItemToObject(loop_struct, "module_prop", module_props);

  extract_loop_info_at_ast_node(tree, loop_struct);

  /* Serialize the JSON; the tree is no longer needed afterwards */
  char *json_str = cJSON_Print(loop_struct);
  cJSON_Delete(loop_struct);

  if (!print)
    return json_str;

  if (!json_str)
  {
    /* Do not hand a NULL string to fprintf below */
    printf("[AutoSA] Error: Cannot serialize loop info of module: %s\n", module_name);
    return NULL;
  }

  /* Build the output file path */
  isl_printer *p_str = isl_printer_to_str(gen->ctx);
  p_str = isl_printer_print_str(p_str, gen->options->autosa->output_dir);
  p_str = isl_printer_print_str(p_str, "/latency_est/");
  p_str = isl_printer_print_str(p_str, module_name);
  p_str = isl_printer_print_str(p_str, "_loop_info.json");
  char *file_name = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  FILE *fp = fopen(file_name, "w");
  if (!fp)
  {
    printf("[AutoSA] Error: Cannot open file: %s\n", file_name);
    exit(1);
  }
  free(file_name);
  fprintf(fp, "%s", json_str);
  fclose(fp);
  free(json_str);

  return NULL;
}

/* Dump the loop information of every AST that belongs to "module":
 * - for modules that are both filter and buffer: the intra trans tree, the
 *   inter trans tree and, for boundary modules, the boundary inter trans tree
 * - the default device tree
 * - for boundary modules: the boundary tree
 * - the device tree of every PE dummy module
 * Each tree is handed to extract_loop_info_from_module() with printing
 * enabled. Always returns isl_stat_ok.
 */
isl_stat sa_extract_loop_info(struct autosa_gen *gen, struct autosa_hw_module *module)
{
  isl_ctx *ctx = gen->ctx;
  char *name = NULL;

  if (module->is_filter && module->is_buffer)
  {
    /* intra trans module */
    name = concat(ctx, module->name, "intra_trans");
    extract_loop_info_from_module(gen, module->intra_tree, name,
                                  module->double_buffer, module->in, 1);
    free(name);

    /* inter trans module */
    name = concat(ctx, module->name, "inter_trans");
    extract_loop_info_from_module(gen, module->inter_tree, name,
                                  module->double_buffer, module->in, 1);
    free(name);

    /* boundary variant of the inter trans module */
    if (module->boundary)
    {
      name = concat(ctx, module->name, "inter_trans_boundary");
      extract_loop_info_from_module(gen, module->boundary_inter_tree, name,
                                    module->double_buffer, module->in, 1);
      free(name);
    }
  }

  /* default module */
  extract_loop_info_from_module(gen, module->device_tree, module->name,
                                module->double_buffer, module->in, 1);

  /* boundary module */
  if (module->boundary)
  {
    name = concat(ctx, module->name, "boundary");
    extract_loop_info_from_module(gen, module->boundary_tree, name,
                                  module->double_buffer, module->in, 1);
    free(name);
  }

  /* PE dummy modules */
  for (int d = 0; d < module->n_pe_dummy_modules; d++)
  {
    struct autosa_pe_dummy_module *dummy = module->pe_dummy_modules[d];
    isl_printer *printer = isl_printer_to_str(gen->ctx);

    /* The name is the group prefix followed by "_PE_dummy" */
    printer = autosa_array_ref_group_print_prefix(dummy->io_group, printer);
    printer = isl_printer_print_str(printer, "_PE_dummy");
    name = isl_printer_get_str(printer);
    isl_printer_free(printer);

    extract_loop_info_from_module(gen, dummy->device_tree, name, 0, 0, 1);
    free(name);
  }

  return isl_stat_ok;
}

/* Write the per-array type information used for latency estimation to
 * "<output_dir>/latency_est/array_info.json".
 * Every array of "kernel" contributes an object keyed by the array name with
 * "n_lane" (data pack factor), "ele_type" (element type) and "ele_size"
 * (element size). The program exits if the file cannot be opened.
 */
isl_stat sa_extract_array_info(struct autosa_kernel *kernel)
{
  cJSON *root = cJSON_CreateObject();

  for (int idx = 0; idx < kernel->n_array; idx++)
  {
    struct autosa_local_array_info *local = &kernel->array[idx];
    cJSON *entry = cJSON_CreateObject();

    cJSON_AddItemToObject(entry, "n_lane", cJSON_CreateNumber(local->n_lane));
    cJSON_AddStringToObject(entry, "ele_type", local->array->type);
    cJSON_AddItemToObject(entry, "ele_size", cJSON_CreateNumber(local->array->size));
    cJSON_AddItemToObject(root, local->array->name, entry);
  }

  /* Serialize the JSON and assemble the output path */
  char *text = cJSON_Print(root);
  isl_printer *printer = isl_printer_to_str(kernel->ctx);
  printer = isl_printer_print_str(printer, kernel->options->autosa->output_dir);
  printer = isl_printer_print_str(printer, "/latency_est/array_info.json");
  char *path = isl_printer_get_str(printer);
  isl_printer_free(printer);

  FILE *out = fopen(path, "w");
  if (!out)
  {
    printf("[AutoSA] Error: Cannot open file: %s\n", path);
    exit(1);
  }
  free(path);

  fprintf(out, "%s", text);
  fclose(out);

  free(text);
  cJSON_Delete(root);

  return isl_stat_ok;
}

/* Pass the tuning ASTs of "module" to the tuning program.
 * The tuning device tree is always passed; modules that are both filter and
 * buffer additionally pass their tuning intra and inter trees, in that order.
 */
isl_stat TP_extract_loop_info(struct autosa_gen *gen, struct autosa_hw_module *module) {
  std::vector<isl_ast_node *> asts;

  asts.push_back(module->tuning_device_tree);
  if (module->is_filter && module->is_buffer) {
    asts.push_back(module->tuning_intra_tree);
    asts.push_back(module->tuning_inter_tree);
  }

  gen->kernel->tuning_program->extract_module_loop_info(
      std::string(module->name), asts);

  return isl_stat_ok;
}

/* Register the attributes of "module" with the tuning program.
 * The attributes are recorded under the module name; for modules that are
 * both filter and buffer the same attributes are also recorded under
 * "<name>_inter" and "<name>_intra".
 */
isl_stat TP_extract_module_attr(struct autosa_gen *gen, struct autosa_hw_module *module) {
  /* Forward the (name-independent) attributes of "module" under "name". */
  auto record_attr = [&](const std::string &name) {
    gen->kernel->tuning_program->extract_module_attr(
        name, module->double_buffer, module->in,
        (module->type == IO_MODULE || module->type == DRAIN_MODULE) ? 1 : 0,
        module->to_mem, module->is_serialized, module->to_pe, module->is_filter);
  };

  record_attr(std::string(module->name));
  if (module->is_filter && module->is_buffer) {
    record_attr(std::string(module->name) + std::string("_inter"));
    record_attr(std::string(module->name) + std::string("_intra"));
  }

  return isl_stat_ok;
}

/* Extract the memory (BRAM) and computation (DSP) information that will be used for 
 * resource estimation in the auto-tuner.
 *
 * Three kinds of information are forwarded to the tuning program:
 * - memory:  for buffered I/O and drain modules, every I/O buffer of the
 *            first I/O group that has a tuning tile; for PE modules, every
 *            PE group of every kernel array that has a tuning local tile.
 * - compute: for PE modules, using the element type of the first I/O group.
 * - io:      for I/O and drain modules.
 * Filter modules pass the tuning_num inter tree in addition to the
 * tuning_num device tree.
 */
isl_stat TP_extract_resource_info(struct autosa_gen *gen, struct autosa_hw_module *module) {
  const bool is_io_module = (module->type == IO_MODULE || module->type == DRAIN_MODULE);

  /* memory */
  if (is_io_module && module->is_buffer) {
    int double_buffer = module->double_buffer;
    struct autosa_array_ref_group *group = module->io_groups[0];
    for (int i = 0; i < group->n_io_buffer; i++) {
      if (!group->io_buffers[i]->tuning_tile)
        continue;
      std::vector<isl_ast_node *> asts;
      asts.push_back(module->tuning_num_device_tree);
      if (module->is_filter)
        asts.push_back(module->tuning_num_inter_tree);
      gen->kernel->tuning_program->extract_module_memory_info(
          std::string(module->name), double_buffer, group->io_buffers[i]->tuning_tile, asts);
    }
  } else if (module->type == PE_MODULE) {
    for (int i = 0; i < gen->kernel->n_array; i++) {
      struct autosa_local_array_info *array = &(gen->kernel->array[i]);
      for (int j = 0; j < array->n_pe_group; j++) {
        struct autosa_array_ref_group *group = array->pe_groups[j];
        if (!group->tuning_local_tile)
          continue;
        std::vector<isl_ast_node *> asts;
        asts.push_back(module->tuning_num_device_tree);
        gen->kernel->tuning_program->extract_module_memory_info(
            std::string(module->name), 0, group->tuning_local_tile, asts);
      }
    }
  }

  /* compute */
  if (module->type == PE_MODULE) {
    std::string ele_type = std::string(module->io_groups[0]->array->type);
    gen->kernel->tuning_program->extract_module_compute_info(
        std::string(module->name), ele_type, module->tuning_num_device_tree);
  }

  /* io */
  if (is_io_module) {
    std::vector<isl_ast_node *> asts;
    asts.push_back(module->tuning_num_device_tree);
    if (module->is_filter)
      asts.push_back(module->tuning_num_inter_tree);
    gen->kernel->tuning_program->extract_module_io_info(
      std::string(module->name), module->level, asts);
  }

  return isl_stat_ok;
}

/* Extract the array references in the prog and build a mapping in the tuning program. 
 *
 * For every array of the program a TPArray is created and appended to
 * kernel->tuning_program->arrays. Every reference of the array is converted
 * with build_array_ref() using the access relation of the reference and the
 * kernel schedule; the resulting reference is shared between the TPArray and
 * array->tuning_refs.
 * array->tuning_refs is expected to be empty on entry.
 */
isl_stat TP_extract_array_info(struct autosa_gen *gen, struct autosa_kernel *kernel) {
  struct autosa_prog *prog = gen->prog;  
  isl_schedule *schedule = kernel->schedule;

  for (int i = 0; i < prog->n_array; i++) {
    struct autosa_array_info *array = &(prog->array[i]);
    TPArray *tp_arr = new TPArray(std::string(array->name));
    assert(array->tuning_refs.size() == 0);
    /* Keeps the list empty as well when asserts are compiled out. */
    array->tuning_refs.clear();
    for (int j = 0; j < array->n_ref; j++) {
      struct autosa_stmt_access *ref = array->refs[j];
      /* Build the tuning program array access representation. */
      std::shared_ptr<TPArrayRef> tp_ref = kernel->tuning_program->build_array_ref(
          std::string(array->name), ref->access, schedule);

      tp_arr->refs.push_back(tp_ref);
      array->tuning_refs.push_back(tp_ref);
    }
    kernel->tuning_program->arrays.push_back(tp_arr);    
  }  

  return isl_stat_ok;
}

/* Generate a tiled array reference.
 *
 * The references of "group" that match the requested direction ("read"
 * and/or "write") are collected, together with the iterators attached to the
 * band members found on the path from "node" up to (but excluding) the root
 * of the schedule tree. Both are handed to infer_tiled_array_bounds() to
 * fill in a newly allocated TPArrayTile, which is then labelled with the
 * name, element type and element size of the array of "group".
 * The returned tile is owned by the caller.
 */
TPArrayTile *TP_infer_tiled_array(
  struct autosa_gen *gen, struct autosa_kernel *kernel, 
  __isl_keep struct isl_schedule_node *node,
  struct autosa_array_ref_group *group,
  int read, int write)
{
  /* References matching the requested access direction */
  std::vector<std::shared_ptr<TPArrayRef>> selected_refs;
  for (int r = 0; r < group->n_ref; r++) {
    const bool wanted = (read && group->refs[r]->read) ||
                        (write && group->refs[r]->write);
    if (wanted)
      selected_refs.push_back(group->tuning_refs[r]);
  }

  /* Iterators of the bands between "node" and the root */
  std::vector<TPIterator *> outer_iters;
  isl_schedule_node *cursor = isl_schedule_node_copy(node);
  while (isl_schedule_node_has_parent(cursor)) {
    if (isl_schedule_node_get_type(cursor) == isl_schedule_node_band) {
      for (int m = 0; m < isl_schedule_node_band_n_member(cursor); m++) {
        TPIterator *member_iter =
            (TPIterator *)isl_schedule_node_band_member_get_iter(cursor, m);
        if (!member_iter) {
          std::cout << "not found" << std::endl;
          continue;
        }
        outer_iters.push_back(member_iter);
      }
    }
    cursor = isl_schedule_node_parent(cursor);
  }
  isl_schedule_node_free(cursor);

  /* Infer the tile bounds and label the tile */
  TPArrayTile *tile = new TPArrayTile();
  tile = kernel->tuning_program->infer_tiled_array_bounds(tile, selected_refs, outer_iters);
  tile->name = std::string(group->array->name);
  tile->type = std::string(group->array->type);
  tile->ele_size = group->array->size;

  return tile;
}

/* Extract the memory type of the local array "var" inside "module".
 * Return value: 0: FF, 1: LUTRAM, 2: BRAM, 3: URAM.
 *
 * Policy implemented below:
 * - If the module is not a PE module and is connected to DRAM
 *   (to_mem == 1), use URAM if "uram" is set, otherwise use BRAM.
 * - Otherwise, if the buffer uses primitive type (n_lane == 1) and
 *   #ele <= 8, use FF, otherwise use BRAM.
 * LUTRAM is never selected by this policy.
 *
 * As a side effect, module->use_FF is set to 1 whenever FF is selected.
 */
int extract_memory_type(struct autosa_hw_module *module,
                        struct autosa_kernel_var *var, int uram)
{
  int use_memory;
  int var_size = 1;

  /* #ele: product of the sizes of all dimensions of the buffer */
  for (int i = 0; i < isl_vec_size(var->size); ++i)
  {
    isl_val *v = isl_vec_get_element_val(var->size, i);
    var_size *= isl_val_get_num_si(v);
    isl_val_free(v);
  }

  if (module->type != PE_MODULE && module->to_mem == 1)
  {
    use_memory = uram ? 3 : 2;
  }
  else
  {
    if (var->n_lane == 1 && var_size <= 8)
      use_memory = 0;
    else
      use_memory = 2;
  }

  if (use_memory == 0)
    module->use_FF = 1;

  return use_memory;
}

/* Describe the local buffer "var" of "module" as a JSON object with
 * - "buffer_name":      the variable name, combined with "suffix" through
 *                       concat() if "suffix" is not NULL
 * - "port_width":       n_lane * element size (in bytes)
 * - "buffer_depth":     product of the sizes of all buffer dimensions
 * - "partition_number": var->n_part
 * - "mem_type":         "FF", "LUTRAM", "BRAM" or "URAM", as selected by
 *                       extract_memory_type()
 */
static cJSON *extract_buffer_info_from_module(struct autosa_gen *gen,
                                              struct autosa_hw_module *module,
                                              struct autosa_kernel_var *var, const char *suffix)
{
  cJSON *entry = cJSON_CreateObject();

  /* Buffer name */
  if (suffix)
  {
    char *full_name = concat(gen->ctx, var->name, suffix);
    cJSON_AddStringToObject(entry, "buffer_name", full_name);
    free(full_name);
  }
  else
  {
    cJSON_AddStringToObject(entry, "buffer_name", var->name);
  }

  /* Port width in bytes */
  int lanes = var->n_lane;
  int bytes_per_ele = var->array->size;
  cJSON_AddItemToObject(entry, "port_width", cJSON_CreateNumber(lanes * bytes_per_ele));

  /* Buffer depth */
  int depth = 1;
  for (int d = 0; d < isl_vec_size(var->size); d++)
  {
    isl_val *extent = isl_vec_get_element_val(var->size, d);
    int extent_int = isl_val_get_num_si(extent);
    isl_val_free(extent);
    depth *= extent_int;
  }
  cJSON_AddItemToObject(entry, "buffer_depth", cJSON_CreateNumber(depth));

  /* Partition number */
  cJSON_AddItemToObject(entry, "partition_number", cJSON_CreateNumber(var->n_part));

  /* Memory type */
  const char *mem_name;
  switch (extract_memory_type(module, var, gen->options->autosa->uram))
  {
  case 0:
    mem_name = "FF";
    break;
  case 1:
    mem_name = "LUTRAM";
    break;
  case 2:
    mem_name = "BRAM";
    break;
  default:
    mem_name = "URAM";
    break;
  }
  cJSON_AddStringToObject(entry, "mem_type", mem_name);

  return entry;
}

/* Build the JSON description of "module" used for resource estimation.
 * PE modules report "unroll" (SIMD factor), "latency_hide_len" and
 * "fifo_lanes" (n_lane of every I/O group). All other modules report
 * "data_pack_inter", "data_pack_intra", the element type and size of the
 * array of the first I/O group and "access_mem" (1 if the module accesses
 * the DRAM, 0 otherwise).
 * If "buffer" is set, the local buffers are added as "local_buffers";
 * double-buffered modules get a "ping" and a "pong" entry per variable.
 * "module_name" is not used by this function.
 */
static cJSON *extract_design_info_from_module(struct autosa_gen *gen,
                                              struct autosa_hw_module *module, char *module_name, int buffer)
{
  cJSON *info = cJSON_CreateObject();
  int is_double_buffered = module->double_buffer;

  if (module->type == PE_MODULE)
  {
    cJSON_AddItemToObject(info, "unroll", cJSON_CreateNumber(gen->kernel->simd_w));
    cJSON_AddItemToObject(info, "latency_hide_len",
                          cJSON_CreateNumber(gen->kernel->lat_hide_len));

    /* One lane count per I/O group */
    int *lanes = (int *)malloc(module->n_io_group * sizeof(int));
    for (int g = 0; g < module->n_io_group; g++)
      lanes[g] = module->io_groups[g]->n_lane;
    cJSON_AddItemToObject(info, "fifo_lanes",
                          cJSON_CreateIntArray(lanes, module->n_io_group));
    free(lanes);
  }
  else
  {
    struct autosa_array_info *array = module->io_groups[0]->array;

    /* Input and output data lanes and width */
    cJSON_AddItemToObject(info, "data_pack_inter",
                          cJSON_CreateNumber(module->data_pack_inter));
    cJSON_AddItemToObject(info, "data_pack_intra",
                          cJSON_CreateNumber(module->data_pack_intra));
    cJSON_AddStringToObject(info, "ele_type", array->type);
    cJSON_AddItemToObject(info, "ele_size", cJSON_CreateNumber(array->size));

    /* Mark the module accessing the DRAM */
    cJSON_AddNumberToObject(info, "access_mem", module->to_mem ? 1 : 0);
  }

  if (!buffer)
    return info;

  /* Local buffers */
  cJSON *buffer_list = cJSON_CreateArray();
  for (int v = 0; v < module->n_var; ++v)
  {
    struct autosa_kernel_var *var = &module->var[v];

    if (is_double_buffered)
    {
      cJSON_AddItemToArray(buffer_list,
                           extract_buffer_info_from_module(gen, module, var, "ping"));
      cJSON_AddItemToArray(buffer_list,
                           extract_buffer_info_from_module(gen, module, var, "pong"));
    }
    else
    {
      cJSON_AddItemToArray(buffer_list,
                           extract_buffer_info_from_module(gen, module, var, NULL));
    }
  }
  cJSON_AddItemToObject(info, "local_buffers", buffer_list);

  return info;
}

/* Build the JSON description of the serialize module that accompanies
 * "module": "data_pack_inter" is taken from module->data_pack_serialize,
 * "data_pack_intra" from module->data_pack_intra, and the element type and
 * size from the array of the first I/O group.
 * "module_name" is not used by this function.
 */
static cJSON *extract_design_info_from_serialize_module(struct autosa_gen *gen,
                                                        struct autosa_hw_module *module, char *module_name)
{
  struct autosa_array_info *array = module->io_groups[0]->array;
  cJSON *info = cJSON_CreateObject();

  /* Input and output data lanes and width */
  cJSON_AddItemToObject(info, "data_pack_inter",
                        cJSON_CreateNumber(module->data_pack_serialize));
  cJSON_AddItemToObject(info, "data_pack_intra",
                        cJSON_CreateNumber(module->data_pack_intra));

  /* Element type and size */
  cJSON_AddStringToObject(info, "ele_type", array->type);
  cJSON_AddItemToObject(info, "ele_size", cJSON_CreateNumber(array->size));

  return info;
}

/* Extract the data packing factor "n_lane" for the PE dummy module and
 * report it as "unroll".
 * The n_lane of the I/O group is used for external arrays, drain groups and
 * exterior I/O. In the remaining case (internal array with interior I/O),
 * the n_lane of the first I/O buffer of the group is used instead.
 * "gen" and "module_name" are not used by this function.
 */
static cJSON *extract_design_info_from_pe_dummy_module(struct autosa_gen *gen,
                                                       struct autosa_pe_dummy_module *module, char *module_name)
{
  struct autosa_array_ref_group *group = module->io_group;
  cJSON *info = cJSON_CreateObject();
  int n_lane;

  if (group->local_array->array_type == AUTOSA_EXT_ARRAY ||
      group->group_type == AUTOSA_DRAIN_GROUP ||
      group->io_type == AUTOSA_EXT_IO)
    n_lane = group->n_lane;
  else
    n_lane = group->io_buffers[0]->n_lane;

  cJSON_AddItemToObject(info, "unroll", cJSON_CreateNumber(n_lane));

  return info;
}

/* Extract the design information into a JSON struct for resource estimation
 * and write it to "<output_dir>/resource_est/design_info.json".
 * The JSON holds the "kernel_id" and a "modules" object with one entry per
 * generated module:
 * - for modules that are both filter and buffer: entries for the intra
 *   trans, inter trans and (for boundary modules) boundary inter trans
 *   modules, without buffer information
 * - the default module and, for boundary modules, the boundary module,
 *   both including the local buffer information
 * - every PE dummy module
 * - the serialize module, if the module is serialized
 * For I/O modules, extract:
 * - input and output data lanes and width
 * For PE modules, extract:
 * - simd factor if any
 * The program exits if the output file cannot be opened.
 */
isl_stat sa_extract_design_info(struct autosa_gen *gen)
{
  cJSON *design_info = cJSON_CreateObject();
  char *json_str = NULL;
  FILE *fp;
  isl_ctx *ctx = gen->ctx;
  isl_printer *p_str;
  char *file_path;

  /* kernel id */
  cJSON *kernel_id = cJSON_CreateNumber(gen->kernel->id);
  cJSON_AddItemToObject(design_info, "kernel_id", kernel_id);

  /* module */
  cJSON *modules = cJSON_CreateObject();
  cJSON_AddItemToObject(design_info, "modules", modules);
  for (int i = 0; i < gen->n_hw_modules; i++)
  {
    struct autosa_hw_module *module = gen->hw_modules[i];
    char *module_name = NULL;
    cJSON *info;

    if (module->is_filter && module->is_buffer)
    {
      /* intra_trans */
      module_name = concat(ctx, module->name, "intra_trans");
      info = extract_design_info_from_module(gen, module, module_name, 0);
      cJSON_AddItemToObject(modules, module_name, info);
      free(module_name);

      /* inter_trans */
      module_name = concat(ctx, module->name, "inter_trans");
      info = extract_design_info_from_module(gen, module, module_name, 0);
      cJSON_AddItemToObject(modules, module_name, info);
      free(module_name);

      if (module->boundary)
      {
        module_name = concat(ctx, module->name, "inter_trans_boundary");
        info = extract_design_info_from_module(gen, module, module_name, 0);
        cJSON_AddItemToObject(modules, module_name, info);
        free(module_name);
      }
    }

    /* default module: pass its own name rather than a stale pointer */
    info = extract_design_info_from_module(gen, module, module->name, 1);
    cJSON_AddItemToObject(modules, module->name, info);

    /* boundary module */
    if (module->boundary)
    {
      module_name = concat(ctx, module->name, "boundary");
      info = extract_design_info_from_module(gen, module, module_name, 1);
      cJSON_AddItemToObject(modules, module_name, info);
      free(module_name);
    }

    /* PE dummy modules */
    for (int j = 0; j < module->n_pe_dummy_modules; j++)
    {
      struct autosa_pe_dummy_module *dummy_module = module->pe_dummy_modules[j];
      struct autosa_array_ref_group *group = dummy_module->io_group;
      char *dummy_name;
      /* Generate module name */
      isl_printer *p_name = isl_printer_to_str(ctx);
      p_name = isl_printer_print_str(p_name, group->array->name);
      if (group->group_type == AUTOSA_IO_GROUP)
      {
        if (group->local_array->n_io_group > 1)
        {
          p_name = isl_printer_print_str(p_name, "_");
          p_name = isl_printer_print_int(p_name, group->nr);
        }
      }
      else if (group->group_type == AUTOSA_DRAIN_GROUP)
      {
        p_name = isl_printer_print_str(p_name, "_");
        p_name = isl_printer_print_str(p_name, "drain");
      }
      p_name = isl_printer_print_str(p_name, "_PE_dummy");
      if (dummy_module->in)
        p_name = isl_printer_print_str(p_name, "_in");
      else
        p_name = isl_printer_print_str(p_name, "_out");
      dummy_name = isl_printer_get_str(p_name);
      isl_printer_free(p_name);
      info = extract_design_info_from_pe_dummy_module(gen, dummy_module, dummy_name);
      cJSON_AddItemToObject(modules, dummy_name, info);
      free(dummy_name);
    }

    /* serialize module */
    if (module->is_serialized)
    {
      if (module->boundary)
        module_name = concat(ctx, module->name, "boundary_serialize");
      else
        module_name = concat(ctx, module->name, "serialize");
      info = extract_design_info_from_serialize_module(gen, module, module_name);
      cJSON_AddItemToObject(modules, module_name, info);
      free(module_name);
    }
  }

  json_str = cJSON_Print(design_info);
  p_str = isl_printer_to_str(gen->ctx);
  p_str = isl_printer_print_str(p_str, gen->options->autosa->output_dir);
  p_str = isl_printer_print_str(p_str, "/resource_est/design_info.json");
  file_path = isl_printer_get_str(p_str);
  fp = fopen(file_path, "w");
  if (!fp)
  {
    /* Writing through a NULL stream is undefined; stop here like the
     * other JSON writers in this file do.
     */
    printf("[AutoSA] Error: Cannot open file: %s\n", file_path);
    exit(1);
  }
  fprintf(fp, "%s", json_str);
  fclose(fp);
  free(file_path);
  isl_printer_free(p_str);
  cJSON_Delete(design_info);
  free(json_str);

  return isl_stat_ok;
}

/* The sparse info is provided in the format of 
 * kernel[]->block_sparse[n_non_zero_num, vec_len]
 * Extract this information and compute the extra meta information.
 *
 * Every array for which a size specification is found is marked as sparse.
 * If several arrays carry a specification, the last one found is used.
 * From the two values (n_nzero, vec_len) the function derives:
 * - compress_ratio     = vec_len / n_nzero
 * - n_meta_data        = number of extra elements needed to pad the
 *                        n_nzero data elements plus 8 bits up to the next
 *                        32/64/128/256/512-bit boundary
 * - eff_compress_ratio = vec_len / (n_nzero + n_meta_data)
 * and copies them to every sparse array.
 *
 * Returns isl_stat_error if no array has a sparse specification, if the
 * specification has fewer than two dimensions or if it cannot be read.
 * Throws std::runtime_error if the arrays have different element sizes, if
 * vec_len is greater than 8 or if the aligned data exceeds 512 bits.
 */
isl_stat autosa_kernel_extract_sparse_info(struct autosa_kernel *kernel, 
  struct autosa_gen *gen)
{
  isl_union_map *sparse_info;
  isl_set *size = NULL;
  int *ratios;
  int array_size;

  ratios = isl_alloc_array(kernel->ctx, int, 2);
  if (!ratios) {
    return isl_stat_error;
  }

  sparse_info = extract_sizes_from_str(kernel->ctx, gen->options->autosa->block_sparse_ratio);
  for (int i = 0; i < kernel->n_array; i++) {
    struct autosa_local_array_info *local_array = &kernel->array[i];
    isl_set *tmp_size = extract_sa_sizes(sparse_info, local_array->array->name);
    if (tmp_size) {
      local_array->is_sparse = 1;
      /* Only the last specification is used; release any earlier one. */
      isl_set_free(size);
      size = tmp_size;
    }
  }
  isl_union_map_free(sparse_info);

  /* No specification at all, or one with too few entries. */
  if (!size || isl_set_dim(size, isl_dim_set) < 2) {
    isl_set_free(size);
    free(ratios);
    return isl_stat_error;
  }

  /* NOTE(review): "size" is not released after this call, as in the original
   * code; presumably read_sa_sizes_from_set takes ownership - confirm.
   */
  if (read_sa_sizes_from_set(size, ratios, 2) < 0) {
    free(ratios);
    return isl_stat_error;
  }

  kernel->sparse = 1;
  kernel->vec_len = ratios[1];
  kernel->n_nzero = ratios[0];
  free(ratios);
  kernel->compress_ratio = (float)kernel->vec_len / kernel->n_nzero;

  /* Get the data type, we assume that all arrays are in the same precisions. */
  array_size = -1; // in bytes
  for (int i = 0; i < kernel->n_array; i++) {
    struct autosa_local_array_info *local_array = &kernel->array[i];
    if (array_size == -1)
      array_size = local_array->array->size;
    else if (array_size != local_array->array->size) {
      throw std::runtime_error("[AutoSA] Error: Arrays with different data types are not supported for the block sparsity.");
    }
  }
  /* Currently we only support vec_len no greater than 8. */
  if (kernel->vec_len > 8) {
    throw std::runtime_error("[AutoSA] Error: Block size greater than 8 is not supported for the block sparsity.");
  }

  /* For Xilinx HLS, data needs to be aligned with 32/64/128/256/512-bit boundary.
   * Pick the smallest boundary that holds the data plus 8 extra bits and
   * fill the remaining space with meta data elements.
   */
  static const int aligned_bits[] = {32, 64, 128, 256, 512};
  const int n_aligned = sizeof(aligned_bits) / sizeof(aligned_bits[0]);
  const int data_bytes = array_size * kernel->n_nzero;
  int k;
  for (k = 0; k < n_aligned; k++) {
    if (data_bytes * 8 + 8 <= aligned_bits[k]) {
      kernel->n_meta_data = (aligned_bits[k] / 8 - data_bytes) / array_size;
      break;
    }
  }
  if (k == n_aligned) {
    throw std::runtime_error("[AutoSA] Error: The requested aligned sparse data is longer than 512-bit.");
  }
  kernel->eff_compress_ratio = (float)kernel->vec_len / (kernel->n_nzero + kernel->n_meta_data);

  /* Update the local array */
  for (int i = 0; i < kernel->n_array; i++) {
    struct autosa_local_array_info *local_array = &kernel->array[i];
    if (!local_array->is_sparse)
      continue;
    local_array->vec_len = kernel->vec_len;
    local_array->n_nzero = kernel->n_nzero;
    local_array->compress_ratio = kernel->compress_ratio;
    local_array->n_meta_data = kernel->n_meta_data;
    local_array->eff_compress_ratio = kernel->eff_compress_ratio;
  }

  return isl_stat_ok;
}

================================================
FILE: src/autosa_common.h
================================================
#ifndef _AUTOSA_COMMON_H
#define _AUTOSA_COMMON_H

#include <assert.h>
#include <limits.h>
#include <string.h>
#include <iostream>
#include <vector>
#include <utility>
#include <stdexcept>

#include <isl/aff.h>
#include <isl/aff_type.h>
#include <isl/id.h>
#include <isl/ctx.h>
#include <isl/flow.h>
#include <isl/map.h>
#include <isl/map_type.h>
#include <isl/space.h>
#include <isl/ast_build.h>
#include <isl/schedule.h>
#include <isl/schedule_node.h>
#include <isl/val.h>
#include <isl/polynomial.h>

#include <cJSON/cJSON.h>

#include "ppcg.h"
#include "schedule.h"
#include "util.h"
#include "autosa_tuning.h"

#ifdef _DEBUG
#define D(x) x
#else
#define D(x)
#endif

#if defined(__cplusplus)
extern "C" {
#endif  

//#define min(a, b) (((a) < (b)) ? (a) : (b))
//#define max(a, b) (((a) > (b)) ? (a) : (b))

/* If enabled, use the default ISL sink API. */
//#define ISL_SINK
/* If enabled, the loop tiling factors should be reversed as well. 
 * The tiled point loops will have a reverse order compared to the original loops.
 */
//#define REVERSE_ORDER

/* Access category recorded in the "type" field of struct autosa_kernel_var.
 * The numeric values are spelled out; they are the same values the
 * enumerators received implicitly before.
 */
enum autosa_group_access_type
{
  AUTOSA_ACCESS_GLOBAL = 0,
  AUTOSA_ACCESS_LOCAL = 1,
  AUTOSA_ACCESS_SHARED = 2,
  AUTOSA_ACCESS_PRIVATE = 3
};

/* Discriminator stored in the "type" field of struct autosa_kernel_stmt.
 * The numeric values are spelled out; they are the same values the
 * enumerators received implicitly before.
 */
enum autosa_kernel_stmt_type
{
  AUTOSA_KERNEL_STMT_COPY = 0,
  AUTOSA_KERNEL_STMT_DOMAIN = 1,
  AUTOSA_KERNEL_STMT_SYNC = 2,
  AUTOSA_KERNEL_STMT_IO = 3,
  AUTOSA_KERNEL_STMT_IO_TRANSFER = 4,
  AUTOSA_KERNEL_STMT_IO_TRANSFER_BUF = 5,
  AUTOSA_KERNEL_STMT_IO_DRAM = 6,
  AUTOSA_KERNEL_STMT_FIFO_DECL = 7,
  AUTOSA_KERNEL_STMT_MODULE_CALL = 8,
  AUTOSA_KERNEL_STMT_EXT_MODULE = 9,
  AUTOSA_KERNEL_STMT_DRAIN_MERGE = 10,
  AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTER_TRANS = 11,
  AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTRA_TRANS = 12,
  AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTER_INTRA = 13,
  AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTRA_INTER = 14,
  AUTOSA_KERNEL_STMT_IO_MODULE_CALL_STATE_HANDLE = 15,
  AUTOSA_KERNEL_STMT_HOST_SERIALIZE = 16
};

/* Kind of dependence, stored in the "type" field of struct autosa_dep.
 * NOTE(review): RAW/RAR/WAR/WAW presumably abbreviate read-after-write,
 * read-after-read, write-after-read and write-after-write - confirm.
 * The numeric values are spelled out and unchanged.
 */
enum autosa_dep_type
{
  AUTOSA_DEP_RAW = 0,
  AUTOSA_DEP_RAR = 1,
  AUTOSA_DEP_WAR = 2,
  AUTOSA_DEP_WAW = 3,
  AUTOSA_DEP_UNKNOWN = 4
};

/* I/O type (interior/exterior I/O) of an access or array reference group;
 * see the "io_type" fields of struct autosa_io_info and
 * struct autosa_array_ref_group.
 * The numeric values are spelled out and unchanged.
 */
enum autosa_io_type
{
  AUTOSA_INT_IO = 0,
  AUTOSA_EXT_IO = 1,
  AUTOSA_UNKNOWN_IO = 2
};

/* I/O direction; used for the "pe_io_dir" and "array_io_dir" fields of
 * struct autosa_array_ref_group.
 * The numeric values are spelled out and unchanged.
 */
enum autosa_io_dir
{
  IO_IN = 0,
  IO_OUT = 1,
  IO_INOUT = 2,
  IO_NULL = 3,
  IO_UNKNOWN = 4
};

/* Kind of hardware module, stored in the "type" field of
 * struct autosa_hw_module.
 * The numeric values are spelled out and unchanged.
 */
enum autosa_module_type
{
  PE_MODULE = 0,
  IO_MODULE = 1,
  DRAIN_MODULE = 2
};

/* Group type (I/O, PE or drain group), stored in the "group_type" field of
 * struct autosa_array_ref_group.
 * The numeric values are spelled out and unchanged.
 */
enum autosa_group_type
{
  AUTOSA_IO_GROUP = 0,
  AUTOSA_PE_GROUP = 1,
  AUTOSA_DRAIN_GROUP = 2,
  AUTOSA_UNKNOWN_GROUP = 3
};

/* Array classification, stored in the "array_type" field of
 * struct autosa_local_array_info.
 * The numeric values are spelled out and unchanged.
 */
enum autosa_array_type
{
  AUTOSA_EXT_ARRAY = 0,
  AUTOSA_INT_ARRAY = 1,
  AUTOSA_UNKNOWN_ARRAY = 2
};

/* Code generation target, stored in the "target" field of struct hls_info.
 * The numeric values are spelled out and unchanged.
 */
enum platform
{
  INTEL_HW = 0,
  XILINX_HW = 1,
  CATAPULT_HW = 2,
  TAPA_HW = 3
};

/* Description of one dependence.
 * "type" is the kind of dependence and "isl_dep" holds the dependence
 * as an isl basic map.
 * NOTE(review): "src"/"dest" presumably identify the source and destination
 * of the dependence and "disvec" its distance vector - confirm
 * against the code that fills in this structure.
 */
struct autosa_dep
{
  isl_id *src;
  isl_id *dest;
  isl_vec *disvec;
  enum autosa_dep_type type;
  isl_basic_map *isl_dep;

  /* Iteration domain in scheduling dimensions. */
  isl_set *src_sched_domain;
  isl_set *dest_sched_domain;
};

/* A sequence of "n" names of types.
 */
struct autosa_types
{
  int n;
  char **name;
};

/* Description of a named iterator with affine bounds and an integer stride.
 * NOTE(review): "lb"/"ub" are presumably the lower and upper bound of the
 * iterator; the meaning of "ts_name" is not visible in this header - confirm
 * against the users of this structure.
 */
struct autosa_iter
{
  char *name;
  isl_aff *lb;
  isl_aff *ub;
  int stride;
  char *ts_name;
};

/* Representation of a local variable in a kernel.
 * "array" is the array the variable refers to, "type" its access category,
 * "name" the variable name and "size" a vector of sizes.
 */
struct autosa_kernel_var
{
  struct autosa_array_info *array;
  enum autosa_group_access_type type;
  char *name;
  isl_vec *size;
  /* Data packing factor */
  int n_lane;
  /* Array partition factor */
  int n_part;
  /* Set if the variable needs to be initialized */
  int init_required;
};

/* Everything recorded about one kernel: the isl schedule and scop it is
 * derived from, the user-specified and effectively used sizes, per-array
 * local information, the grid/block description, and the block sparsity
 * settings.
 */
struct autosa_kernel
{
  isl_ctx *ctx;
  isl_schedule *schedule;
  struct ppcg_scop *scop;
  struct autosa_prog *prog;
  struct ppcg_options *options;

  int n_sa_dim;
  int sa_dim[3];
  int space_parallel[3];
  int space_time_id;
  int array_part_w;
  int space_w;
  int time_w;
  int simd_w;
  int lat_hide_len;

  int type; // AUTOSA_SA_TYPE_ASYNC | AUTOSA_SA_TYPE_SYNC

  isl_multi_pw_aff *sa_grid_size;
  /* User specified (array_part/latency_hiding/simd) sizes for each kernel. */
  isl_union_map *sizes;
  /* Effectively used (array_part/latency_hiding/simd) sizes for each kernel. */
  isl_union_map *used_sizes;

  /* Identifier of the kernel. */
  int id;
  /* The spaces of the statement domains that form the core computation of the
   * kernel.
   */
  isl_union_set *core;
  /* The set of possibly accessed outer array elements. */
  isl_union_set *arrays;
  /* "n_array" is the total number of arrays in the input program and also
   * the number of elements in "array".
   * "array" contains information about each array that is local to the current
   * kernel. If an array is not used in a kernel, then the corresponding
   * entry does not contain any information.
   */
  int n_array;
  struct autosa_local_array_info *array;

  /* "copy_schedule" corresponds to the schedule dimensions of the
   * (tiled) schedule for this kernel that have been taken into account
   * for computing private/shared memory tiles.
   * "copy_schedule_dim" is the dimension of this schedule.
   */
  isl_union_pw_multi_aff *copy_schedule;
  int copy_schedule_dim;

  /* "space" is the schedule space of the AST context. That is, it represents
   * the loops of the generated host code containing the kernel launch.
   */
  isl_space *space;
  isl_ast_node *tree;

  /* Local variables in a kernel. */
  int n_var;
  struct autosa_kernel_var *var;

  /* Contains the list of block identifiers for this kernel. */
  isl_id_list *block_ids;
  /* Contains the list of thread identifiers for this kernel. */
  isl_id_list *thread_ids;
  /* Contains the list of PE identifiers for this kernel. */
  isl_id_list *pe_ids;

  /* Contains constraints on the domain elements in the kernel
   * that encode the mapping to PE identifiers, where the PE identifiers
   * are represented by "space_w" parameters with the names as the elements
   * of "pe_ids".
   */
  isl_union_set *pe_filter;

  /* The first n_grid elements of grid_dim represent the specified size of
   * the grid.
   * The first n_block elements of block_dim represent the specified or
   * effective size of the block.
   * Note that in the input file, the sizes of the grid and the blocks
   * are specified in the order x, y, z, but internally, the sizes
   * are stored in reverse order, so that the last element always refers
   * to the x dimension.
   *
   * grid_size reflects the effective grid size.
   * grid_size_expr contains a corresponding access AST expression, built within
   * the context where the launch appears.
   */
  int n_grid;
  int n_block;
  int grid_dim[2];
  int block_dim[3];

  isl_multi_pw_aff *grid_size;
  isl_ast_expr *grid_size_expr;

  /* Contains the values of the parameters and outer schedule dimensions
   * for which any statement instance in this kernel needs to be executed.
   */
  isl_set *context;

  /* Contraction maps those original statement instances to the statement
   * instances that are active at the point in the schedule tree where
   * the kernel is created.
   */
  isl_union_pw_multi_aff *contraction;
  /* Contains the original statement instances,
   * i.e., those that appear in the domains of access relations,
   * that are involved in the kernel.
   */
  isl_union_set *expanded_domain;
  isl_union_set *domain;

  isl_set *host_domain;
  int single_statement;

  /* Data structures for block sparsity.
   * vec_len is the vector length of each sparse block.
   * n_nzero is the number of non-zero elements in the block.
   * compress_ratio is calculated as vec_len / n_nzero.
   * Each sparse block is stored as [data, data, offset].
   * The offset is an 8-bit unsigned char that stores a mask
   * indicating the position of the non-zero elements.
   * The block is also padded to align with a 32/64/128/256/512-bit boundary
   * as required by Xilinx HLS.
   * n_meta_data stores the size of the padded elements plus the offset together,
   * counted in terms of the size of the data elements.
   * eff_compress_ratio is calculated as vec_len / (n_nzero + n_meta_data).
   */
  int sparse;
  int vec_len;
  int n_nzero;
  float compress_ratio;
  int n_meta_data;
  float eff_compress_ratio;

  /* Tuning program */
  TuningProgram *tuning_program;
};

/* I/O information attached to an access (see the "io_info" array of
 * struct autosa_stmt_access): the I/O type, the dependence it is
 * associated with and the data transfer direction.
 */
struct autosa_io_info
{
  enum autosa_io_type io_type;
  struct autosa_dep *dep;
  /* Data transfer direction */
  isl_vec *dir;
  /* Old data transfer direction before interior I/O elimination */
  isl_vec *old_dir;
};

/* An access to an outer array element or an iterator.
 * Accesses to iterators have an access relation that maps to an unnamed space.
 * An access may be both read and write.
 * If the access relation is empty, then the output dimension may
 * not be equal to the dimension of the corresponding array.
 * Accesses are chained into a singly linked list through "next".
 */
struct autosa_stmt_access
{
  /* Access reads elements */
  int read;
  /* Access writes elements */
  int write;
  /* All writes are definite writes. */
  int exact_write;
  /* Is a single, fixed element being accessed? */
  isl_bool fixed_element;
  /* The number of index expressions specified in the access. */
  int n_index;

  /* May access relation */
  isl_map *access;
  /* May access relation with as domain a mapping from iteration domain
   * to a reference identifier.
   */
  isl_map *tagged_access;
  /* The reference id of the corresponding pet_expr. */
  isl_id *ref_id;

  /* AutoSA extended */
  /* "io_info" holds "n_io_info" pointers to I/O information records. */
  struct autosa_io_info **io_info;
  int n_io_info;
  /* Indicates if layout transformation is required for SIMD */
  int layout_trans;
  /* Indicates which array dimension should be permuted innermost for SIMD */
  int simd_dim;
  /* Indicates the stride pattern under the SIMD loop.
   * Default value as -1. 0 if stride-0 and 1 if stride-1 */
  int simd_stride;
  /* AutoSA extended */

  /* Next access in the linked list. */
  struct autosa_stmt_access *next;
};

/* Internal data structure for extract_access.
 * "next_access" points to the end of a linked list that is extended
 * by extract_access.
 * "single_expression" is set if the access expressions belong to
 * an expression statement (i.e., a statement without internal control).
 * "any_to_outer" maps all intermediate arrays to their outer arrays.
 */
struct ppcg_extract_access_data
{
  /* End of the access list being built. */
  struct autosa_stmt_access **next_access;
  /* Non-zero for an expression statement. */
  int single_expression;
  /* Intermediate array -> outer array mapping. */
  isl_union_map *any_to_outer;
};

/* A representation of a user statement.
 * "stmt" points to the corresponding pet statement.
 * "id" is the identifier of the instance set of the statement.
 * "accesses" is a linked list of accesses performed by the statement.
 * If the statement has been killed, i.e., if it will not be scheduled,
 * then this linked list may be empty even if the actual statement does
 * perform accesses.
 */
struct autosa_stmt
{
  /* Identifier of the instance set. */
  isl_id *id;
  /* The pet statement. */
  struct pet_stmt *stmt;

  /* Head of the linked list of accesses. */
  struct autosa_stmt_access *accesses;
};

/* Represents an outer array possibly accessed by an autosa_prog.
 */
struct autosa_array_info
{
  /* The array data space. */
  isl_space *space;
  /* Element type. */
  char *type;
  /* Element size. */
  int size;
  /* Name of the array. */
  char *name;
  /* Declared extent of original array. */
  isl_set *declared_extent;
  /* AST expression for declared size of original array. */
  isl_ast_expr *declared_size;
  /* Extent of the array that needs to be copied. */
  isl_set *extent;
  /* Number of indices. */
  unsigned n_index;
  /* For each index, a bound on "extent" in that direction. */
  isl_multi_pw_aff *bound;
  /* The corresponding access AST expression, if the array needs
   * to be allocated on the device.
   */
  isl_ast_expr *bound_expr;

  /* All references to this array; point to elements of a linked list. */
  int n_ref;
  struct autosa_stmt_access **refs;

  /* Is this array accessed at all by the program? */
  int accessed;

  /* Is this a scalar that is read-only within the entire program? */
  int read_only_scalar;

  /* Are the elements of the array structures? */
  int has_compound_element;

  /* Are the elements only accessed through constant index expressions? */
  int only_fixed_element;

  /* Is the array local to the scop? */
  int local;
  /* Is the array local and should it be declared on the host? */
  int declare_local;

  /* Is the corresponding global device memory accessed in any way? */
  int global;

  /* Should the array be linearized? */
  int linearize;

  /* Order dependences on this array.
   * Only used if live_range_reordering option is set.
   * It is set to NULL otherwise.
   */
  isl_union_map *dep_order;

  /* AutoSA Extended */
  /* NOTE(review): presumably the data packing factor, as for the other
   * "n_lane" fields in this header - confirm.
   */
  int n_lane;
  /* Since in AutoSA we only have a single kernel,
   * "local_array" can safely point to the local array inside the kernel.
   */
  struct autosa_local_array_info *local_array;
  /* Is the array to be copied in to the device memory? */
  int copy_in;
  /* Is the array to be copied out from the device memory? */
  int copy_out;
  /* Tuning array refs */
  std::vector<std::shared_ptr<TPArrayRef>> tuning_refs;
  /* AutoSA Extended */
};

/* An I/O buffer inserted at one I/O level of an array reference group
 * (see the "io_buffers" array of struct autosa_array_ref_group).
 */
struct autosa_io_buffer
{
  /* The local buffer tile, NULL if none. */
  struct autosa_array_tile *tile;
  /* The buffer is located at io_L"level". */
  int level;
  /* The data packing factor */
  int n_lane;
  /* Is the buffer data serialized at the host side? */
  int serialize;
  /* Is the buffer data sparse? */
  int sparse;
  /* NOTE(review): presumably the vector length of each sparse block,
   * as in struct autosa_kernel - confirm.
   */
  int vec_len;
  /* Tuning array tile */
  TPArrayTile *tuning_tile;
  /* Used for hoisting buffer */
  int hoist_depth;
  isl_union_set *hoist_domain;
};

/* A group of array references in a kernel that should be handled together.
 */
struct autosa_array_ref_group
{
  /* The references in this group access this local array. */
  struct autosa_local_array_info *local_array;
  /* This is the corresponding array. */
  struct autosa_array_info *array;
  /* Position of this group in the list of reference groups of array. */
  int nr;

  /* The following fields are used during the construction of the groups.
   * access is the combined access relation relative to the private
   * memory tiling.  In particular, the domain of the map corresponds
   * to the first thread_depth dimensions of the kernel schedule.
   * write is set if any access in the group is a write.
   * exact_write is set if all writes are definite writes.
   * slice is set if there is at least one access in the group
   * that refers to more than one element.
   * "min_depth" is the minimum of the tile depths and thread_depth.
   */
  isl_map *access;
  int write;
  int exact_write;
  int slice;
  int min_depth;

  /* The shared memory tile, NULL if none. */
  struct autosa_array_tile *shared_tile;

  /* The private memory tile, NULL if none. */
  struct autosa_array_tile *private_tile;

  /* The local memory tile, NULL if none. */
  struct autosa_array_tile *local_tile;

  /* References in this group; point to elements of a linked list. */
  int n_ref;
  struct autosa_stmt_access **refs;

  /* AutoSA Extended */
  /* The local memory tile inside PEs. This is for internal array with interior I/O */
  struct autosa_array_tile *pe_tile;
  /* I/O buffers inserted at each IO level */
  struct autosa_io_buffer **io_buffers;
  int n_io_buffer;
  /* I/O type: interior/exterior I/O */
  enum autosa_io_type io_type;
  /* I/O direction at the PE level (after interior I/O elimination) */
  isl_vec *dir;
  /* I/O direction at the PE level (before interior I/O elimination) */
  isl_vec *old_dir;
  /* Group type: I/O/drain/PE group */
  enum autosa_group_type group_type;
  /* I/O direction at the PE level */
  enum autosa_io_dir pe_io_dir;
  /* I/O direction at the array level */
  enum autosa_io_dir array_io_dir;
  /* Maps PE identifiers to I/O identifiers */
  isl_multi_aff *io_trans;    /* pe ids -> io ids */
  isl_multi_aff *io_L1_trans; /* pe ids -> L1 io ids */
  /* AST expression maps L1 I/O identifiers to PE identifiers */
  isl_ast_expr *io_pe_expr;    /* io ids -> pe ids */
  isl_ast_expr *io_L1_pe_expr; /* L1 io ids -> pe ids */
  isl_ast_expr *io_pe_expr_boundary;
  isl_ast_expr *io_L1_pe_expr_boundary;
  /* I/O schedule */
  isl_schedule *io_schedule;
  isl_schedule *io_L1_schedule;
  isl_schedule *io_L1_lower_schedule;
  /* Number of I/O levels */
  int io_level;
  /* Dims of space band */
  int space_dim;
  /* Data pack factor inside PEs */
  int n_lane;
  /* Copy schedule for PE group */
  int copy_schedule_dim;
  isl_union_pw_multi_aff *copy_schedule;
  /* Number of DRAM ports that this group is connected to. */
  int n_mem_ports;
  /* The starting offset of external memory port id for this group. */
  int mem_port_id;
  /* Does copy-in module exist? */
  int copy_in;
  /* Does copy-out module exist? */
  int copy_out;
  /* Attached drain group */
  struct autosa_array_ref_group *attached_drain_group;
  /* Tuning array refs */
  std::vector<std::shared_ptr<TPArrayRef>> tuning_refs;
  TPArrayTile *tuning_pe_tile;
  TPArrayTile *tuning_local_tile;
  /* AutoSA Extended */
};

/* Pairs a local array reference group with an I/O group, together with a
 * tile, a tagged access relation and a SIMD depth.
 * NOTE(review): the exact meaning of "in_use" and "simd_depth" is not
 * visible in this header - confirm against the users of this structure.
 */
struct autosa_array_ref_group_pair
{
  struct autosa_array_ref_group *local_group;
  struct autosa_array_ref_group *io_group;
  struct autosa_array_tile *local_tile; /* Compute the local tile */
  int in_use;
  isl_map *tagged_access;
  int simd_depth;
};

/* Represents an outer array accessed by an autosa_kernel, localized
 * to the context of this kernel.
 *
 * "array" points to the corresponding array in the autosa_prog.
 * The "n_group" "groups" are the reference groups associated to the array.
 * If "force_private" is set, then the array (in practice a scalar)
 * must be mapped to a register.
 * "global" is set if the global device memory corresponding
 * to this array is accessed by the kernel.
 * "bound" is equal to array->bound specialized to the current kernel.
 * "bound_expr" is the corresponding access AST expression.
 */
struct autosa_local_array_info
{
  struct autosa_array_info *array;

  /* PE groups */
  int n_pe_group;
  struct autosa_array_ref_group **pe_groups;

  /* IO groups */
  int n_io_group;
  struct autosa_array_ref_group **io_groups;

  /* Drain groups */
  struct autosa_array_ref_group *drain_group;

  /* Number of different I/O modules that access the array.
   * Due to the limitation of Xilinx HLS, we will need to
   * allocate separate pointers for each group.
   */
  int n_io_group_refs;
  /* Number of external memory ports that this array is allocated. */
  int n_mem_ports;
  /* Map from io_group_ref to mem_port. */
  std::vector<int> group_ref_mem_port_map;

  /* Default groups */
  int n_group;
  struct autosa_array_ref_group **groups;

  /* Is the array serialized at the host side? */
  int host_serialize;
  isl_pw_qpolynomial *serialize_bound;

  enum autosa_array_type array_type;
  int n_lane;

  int force_private;
  int global;

  unsigned n_index;
  isl_multi_pw_aff *bound;
  isl_ast_expr *bound_expr;

  /* Is this the sparse matrix in the block sparsity?
   * The remaining fields mirror the block sparsity fields of
   * struct autosa_kernel and are copied from the kernel for sparse arrays.
   */
  int is_sparse;
  int vec_len;
  int n_nzero;
  float compress_ratio;
  int n_meta_data;
  float eff_compress_ratio;
};

/* Program-level information: the scop, the access relations of the entire
 * program, the statements and the arrays.
 *
 * "read", "may_write" and "must_write" contain the original access relations,
 * possibly involving member accesses.
 *
 * The elements of "array", as well as the ranges of "copy_in" and "copy_out"
 * only refer to the outer arrays of any possible member accesses.
 */
struct autosa_prog
{
  isl_ctx *ctx;

  struct ppcg_scop *scop;

  /* Set of parameter values */
  isl_set *context;

  /* All potential read accesses in the entire program */
  isl_union_map *read;

  /* All potential write accesses in the entire program */
  isl_union_map *may_write;
  /* All definite write accesses in the entire program */
  isl_union_map *must_write;
  /* All tagged definite kills in the entire program */
  isl_union_map *tagged_must_kill;

  /* The set of inner array elements that may be preserved. */
  isl_union_set *may_persist;

  /* A mapping from all innermost arrays to their outer arrays. */
  isl_union_map *to_outer;
  /* A mapping from all the outer arrays to all corresponding inner arrays */
  isl_union_map *to_inner;
  /* A mapping from all intermediate arrays to their outer arrays,
   * including an identity mapping from the anonymous 1D space to itself.
   */
  isl_union_map *any_to_outer;

  /* Order dependences on non-scalars. */
  isl_union_map *array_order;

  /* Array of statements; "n_stmts" is its length. */
  int n_stmts;
  struct autosa_stmt *stmts;

  /* Array of arrays; "n_array" is its length. */
  int n_array;
  struct autosa_array_info *array;
};

/* The top-level hardware module: schedules and ASTs for the FIFO
 * declarations and module calls, plus pointers to the hardware modules
 * and the kernel.
 */
struct autosa_hw_top_module
{
  int n_fifo_decls;
  int n_module_calls;
  isl_schedule **fifo_decl_scheds;
  isl_schedule **module_call_scheds;
  isl_ast_node **fifo_decl_trees;
  isl_ast_node **module_call_trees;
  char **fifo_decl_names;

  /* Wrapped AST */
  int n_fifo_decl_wrapped;
  int n_module_call_wrapped;
  isl_ast_node **fifo_decl_wrapped_trees;
  isl_ast_node **module_call_wrapped_trees;

  int n_hw_modules;
  struct autosa_hw_module **hw_modules;
  struct autosa_kernel *kernel;

  /* For Intel devices */
  int n_ext_module;
  isl_schedule **ext_module_scheds;
  isl_ast_node **ext_module_trees;
  int n_ext_module_wrapped;
  isl_ast_node **ext_module_wrapped_trees;
};

/* A PE dummy module.  struct autosa_hw_module keeps a list of these
 * "for collecting data at boundary PEs".
 * "module" and "io_group" are the hardware module and I/O group the dummy
 * module is associated with; "sched", "tree" and "device_tree" hold its
 * schedule and ASTs.
 * NOTE(review): "in" presumably distinguishes copy-in from copy-out, as in
 * struct autosa_hw_module - confirm.
 */
struct autosa_pe_dummy_module
{
  struct autosa_hw_module *module;
  struct autosa_array_ref_group *io_group;
  isl_schedule *sched;
  isl_ast_node *tree;
  isl_ast_node *device_tree;
  int in;
};

/* A drain merge function: the array reference group and kernel it belongs
 * to, a list of instance identifiers, and its schedule and ASTs.
 */
struct autosa_drain_merge_func
{
  struct autosa_array_ref_group *group;
  struct autosa_kernel *kernel;
  isl_id_list *inst_ids;
  isl_schedule *sched;
  isl_ast_node *tree;
  isl_ast_node *device_tree;
};

/* A hardware module (PE, I/O or drain module, see "type") together with
 * its schedules, ASTs and code generation attributes.
 */
struct autosa_hw_module
{
  struct ppcg_options *options;

  enum autosa_module_type type;
  /* Module name */
  char *name;

  isl_id_list *inst_ids;
  /* Local variables of the module; "n_var" is the length of "var". */
  int n_var;
  struct autosa_kernel_var *var;

  /* Module function schedule */
  isl_schedule *sched;

  /* Module function AST */
  isl_ast_node *tree;
  isl_ast_node *device_tree;

  /* Array reference group for I/O or drain module */
  struct autosa_array_ref_group **io_groups;
  int n_io_group;

  /* I/O module level */
  int level;
  /* I/O module copy-in/out */
  int in;
  /* Connect to external memory */
  int to_mem;
  /* Connect to PE */
  int to_pe;
  /* Contains buffer */
  int is_buffer;
  /* Filter module */
  int is_filter;
  /* Is the DRAM data serialized */
  int is_serialized;

  /* Serialization schedule */
  isl_schedule *serialize_sched;
  isl_ast_node *serialize_tree;

  /* Module function schedule for buffer_filter modules */
  isl_schedule *outer_sched; /* Outer loops */
  isl_schedule *inter_sched; /* Inter transfer */
  isl_schedule *intra_sched; /* Intra transfer */

  isl_schedule *boundary_outer_sched; /* Outer loops in boundary module */
  isl_schedule *boundary_inter_sched; /* Inter transfer in boundary module */

  isl_space *inter_space;
  isl_space *intra_space;
  isl_space *space;

  isl_ast_node *inter_tree;
  isl_ast_node *intra_tree;

  isl_ast_node *boundary_outer_tree;
  isl_ast_node *boundary_inter_tree;

  /* Module function schedule for filter modules at the boundary */
  isl_schedule *boundary_sched;
  isl_ast_node *boundary_tree;
  int boundary;

  /* Dummy modules for collecting data at boundary PEs */
  int n_pe_dummy_modules;
  struct autosa_pe_dummy_module **pe_dummy_modules;

  int double_buffer;

  /* Generate credit control */
  int credit;

  /* Data pack factor */
  int data_pack_inter;
  int data_pack_intra;
  int data_pack_serialize;

  /* For I/O module, local array ref index */
  int n_array_ref;

  /* Coalesce bound
   * Used for I/O module that connects to the DRAM.
   * Indicates the loop extent of the memory coalesce loop.
   */
  int coalesce_bound;

  /* The module uses FF to implement arrays. */
  int use_FF;

  struct autosa_kernel *kernel;

  /* For Catapult HLS */
  /* Pipeline the whole function. */
  int pipeline_at_default_func;
  int pipeline_at_filter_func[3]; // outer, intra, inter
  /* Fifo guards information.
   * One (count, names, bounds) triple each for the serialize, default,
   * inter and intra cases.
   */
  int n_fifo_serialize;
  char** fifo_names_serialize;
  isl_pw_qpolynomial **fifo_bounds_serialize;
  int n_fifo_default;
  char **fifo_names_default;
  isl_pw_qpolynomial **fifo_bounds_default;
  int n_fifo_inter;
  char **fifo_names_inter;
  isl_pw_qpolynomial **fifo_bounds_inter;
  int n_fifo_intra;
  char **fifo_names_intra;
  isl_pw_qpolynomial **fifo_bounds_intra;

  /* Tuning purpose */
  /* Latency */
  isl_schedule *tuning_sched;
  isl_schedule *tuning_outer_sched;
  isl_schedule *tuning_inter_sched;
  isl_schedule *tuning_intra_sched;

  isl_ast_node *tuning_tree;
  isl_ast_node *tuning_device_tree;
  isl_ast_node *tuning_intra_tree;
  isl_ast_node *tuning_inter_tree;

  /* Counting module numbers */
  isl_schedule *tuning_num_sched;
  isl_schedule *tuning_num_outer_sched;
  isl_schedule *tuning_num_inter_sched;
  isl_schedule *tuning_num_intra_sched;

  isl_ast_node *tuning_num_tree;
  isl_ast_node *tuning_num_device_tree;
  isl_ast_node *tuning_num_intra_tree;
  isl_ast_node *tuning_num_inter_tree;
};

/* Top-level code generation state: the options, the print callback,
 * the program and kernel, the generated hardware modules, and the
 * tuning configuration.
 */
struct autosa_gen
{
  isl_ctx *ctx;
  struct ppcg_options *options;

  /* Callback for printing of AST in appropriate format.
   * "print_user" is the user pointer kept alongside the callback.
   */
  __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
                                   struct autosa_prog *prog, __isl_keep isl_ast_node *tree,
                                   struct autosa_hw_module **modules, int n_modules,
                                   struct autosa_hw_top_module *top_module,
                                   struct autosa_drain_merge_func **drain_merge_funcs, int n_drain_merge_funcs,
                                   struct autosa_types *types, void *user);
  void *print_user;

  struct autosa_prog *prog;
  struct autosa_kernel *kernel;
  /* The default AST */
  isl_ast_node *tree;

  /* The default schedule */
  isl_schedule *schedule;

  /* The hardware modules, the top module and the drain merge functions */
  struct autosa_hw_module **hw_modules;
  int n_hw_modules;
  struct autosa_hw_top_module *hw_top_module;
  struct autosa_drain_merge_func **drain_merge_funcs;
  int n_drain_merge_funcs;

  /* The sequence of types for which a definition has been printed. */
  struct autosa_types types;

  /* User specified tile sizes for each kernel. */
  isl_union_map *sizes;

  /* Effectively used tile sizes for each kernel. */
  isl_union_map *used_sizes;

  /* Identifier of the next kernel. */
  int kernel_id;

  /* Tuning configuration */
  cJSON *tuning_config;

  /* Tuning programs */
  std::vector<TuningProgram *> tuning_progs;
};

/* Representation of special statements, in particular copy statements,
 * __syncthreads statements, and I/O statements, inside a kernel.
 *
 * type represents the kind of statement; it selects which member of
 * the union "u" is meaningful.
 *
 * for autosa_kernel_copy statements (member "c") we have
 *
 * read is set if the statement should copy data from global memory
 * to shared memory or registers.
 *
 * index expresses an access to the array element that needs to be copied
 * local_index expresses the corresponding element in the tile
 *
 * array refers to the original array being copied
 * local_array is a pointer to the appropriate element in the "array"
 *  array of the autosa_kernel to which this copy access belongs
 *
 *
 * for autosa_kernel_domain statements (member "d") we have
 *
 * stmt is the corresponding input statement
 *
 * ref2expr maps reference identifiers to AST expressions
 *
 * for autosa_kernel_io statements (member "i") we have
 *
 * in is set if the statement should read data from fifo
 * to local array or registers.
 *
 * local_index expresses the corresponding element in the tile
 *
 * array refers to the original array being transferred
 * local_array is a pointer to the appropriate element in the "array"
 *  array of the autosa_kernel to which this copy access belongs
 */
struct autosa_kernel_stmt
{
  enum autosa_kernel_stmt_type type;

  union {
    /* Copy statement */
    struct
    {
      int read;
      isl_ast_expr *index;
      isl_ast_expr *local_index;
      struct autosa_array_info *array;
      struct autosa_local_array_info *local_array;
    } c;
    /* Domain statement */
    struct
    {
      struct autosa_stmt *stmt;
      isl_id_to_ast_expr *ref2expr;
    } d;
    /* I/O statement */
    struct
    {
      int in;
      int buf;
      //int filter;
      //int lower;
      int boundary;
      int dummy;
      int serialize;
      int reduce;
      char *in_fifo_name;
      char *out_fifo_name;
      char *fifo_type;
      char *reduce_op;
      int filter_sched_depth;
      int filter_param_id;
      int data_pack;
      int reg;
      int nxt_data_pack;
      isl_ast_expr *local_index;
      isl_ast_expr *index;
      int coalesce_depth;
      int coalesce_bound;
      struct autosa_array_info *array;
      struct autosa_local_array_info *local_array;
      struct autosa_array_ref_group *group;
      struct autosa_hw_module *module;
      int simd_depth;
      int if_depth;
    } i;
    /* NOTE(review): presumably used for module call statements - confirm. */
    struct
    {
      struct autosa_hw_module *module;
      struct autosa_pe_dummy_module *pe_dummy_module;
      struct autosa_array_ref_group *group;
      int boundary;
      int dummy;
      int upper;
      int lower;
      int lower_sched_val;
      int serialize;
      char *module_name;
    } m;
    /* NOTE(review): presumably used for FIFO declaration statements - confirm. */
    struct
    {
      struct autosa_hw_module *module;
      int boundary;
    } f;
    /* Refers to a drain merge function. */
    struct
    {
      struct autosa_drain_merge_func *func;
      isl_ast_expr *index;
    } dm;
    /* NOTE(review): presumably used for host serialize statements - confirm. */
    struct
    {
      isl_ast_expr *index;
      struct autosa_array_ref_group *group;
      int in;
    } s;
  } u;
};

/* An access described by a tagged map, a plain map and a space,
 * plus a flag telling whether it is a read or a write.
 */
struct autosa_acc
{
  isl_map *tagged_map;
  isl_map *map;
  isl_space *id;

  int rw; // 0 - read 1 - write
};

/* Properties of a schedule band node.
 * Returned by extract_node_band_prop() and taken by
 * restore_node_band_prop() and autosa_node_band_prop_free().
 * "iter" has a fixed capacity of 20 entries.
 * NOTE(review): the pointer fields are presumably per-member arrays of
 * length "n_member" - confirm against extract_node_band_prop().
 */
struct autosa_node_band_prop
{
  int permutable;
  int *coincident;
  enum autosa_loop_type *pe_opt;
  enum autosa_loop_type *space_time;
  int *sched_pos;
  void *iter[20];
  int n_member;
  isl_multi_union_pw_aff *mupa;
};

/* User information kept for an AST node: a set of flags, followed by
 * fields used only for Catapult code generation.
 */
struct autosa_ast_node_userinfo
{
  int is_pipeline;
  int is_unroll;
  int is_outermost_for;
  int is_infinitize_legal;
  int is_first_infinitizable_loop;
  int is_dep_free;
  int n_coalesce_loop;
  /* Temporary variable used in AST traversal. */
  bool visited;
  /* Variables for Catapult codegen. */
  int is_guard_start;
  int is_guard_end;
  int n_fifo;
  char **fifo_names;
  isl_pw_qpolynomial **bounds;
  int double_buffer;
  int inter;
  int read;
  char *module_name;
  char *buf_name;
};

/* Bound information for one index of an array tile.
 *
 * The current index is such that if you add "shift",
 * then the result is always a multiple of "stride",
 * where "stride" may be equal to 1.
 * Let D represent the initial tile->depth dimensions of the computed schedule.
 * The spaces of "lb" and "shift" are of the form
 *
 *  D -> [b]
 */
struct autosa_array_bound
{
  isl_val *size;
  isl_aff *lb;

  isl_val *stride;
  isl_aff *shift;
};

/* A tile of an outer array.
 *
 * requires_unroll is set if the schedule dimensions that are mapped
 * to threads need to be unrolled for this (private) tile to be used.
 *
 * "depth" reflects the number of schedule dimensions that affect the tile.
 * The copying into and/or out of the tile is performed at that depth.
 *
 * n is the dimension of the array.
 * bound is an array of size "n" representing the lower bound
 *  and size for each index.
 *
 * tiling maps a tile in the global array to the corresponding
 * local memory tile and is of the form
 *
 *  { [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] }
 *
 * where D represents the initial "depth" dimensions
 * of the computed schedule.
 */
struct autosa_array_tile
{
  isl_ctx *ctx;
  int requires_unroll;
  int depth;
  int n;
  struct autosa_array_bound *bound;
  isl_multi_aff *tiling;
};

/* Output files and settings used while printing the generated code. */
struct hls_info
{
  FILE *host_c;    /* OpenCL host. */
  FILE *host_h;    /* OpenCL host header. */
  FILE *kernel_c;  /* Definition of hardware modules. */
  FILE *kernel_h;  /* Declaration of hardware modules. */
  FILE *top_gen_c; /* Prints out the top module that connects the hardware modules. */
  FILE *top_gen_h;
  FILE *tcl;       /* Catapult TCL. */

  enum platform target; /* Hardware platform */
  int hls;          /* Generate HLS host instead of OpenCL host */
  char *output_dir; /* Output directory */
  char *kernel_prefix; /* Kernel file prefix */
  isl_ctx *ctx;
  bool hcl; /* Set to true if the generated code is integrated with HeteroCL. */
  FILE *hcl_decl;
};

/* Band node
 *
 * Helpers that inspect or transform isl schedule nodes and schedules.
 * The annotations follow the isl convention: a __isl_take argument is
 * consumed by the callee, a __isl_keep argument is only borrowed and
 * a __isl_give result is owned by the caller.
 */
__isl_give isl_multi_val *construct_band_tile_sizes(
    __isl_keep isl_schedule_node *node, int *tile_size);
struct autosa_node_band_prop *extract_node_band_prop(__isl_keep isl_schedule_node *node);
struct autosa_node_band_prop *autosa_node_band_prop_free(
    __isl_take struct autosa_node_band_prop *prop);
isl_bool is_permutable_node(__isl_keep isl_schedule_node *node);
isl_bool has_single_permutable_node(__isl_keep isl_schedule *schedule);
/* Dependence uniformity checks. The trailing "void *user" parameters
 * suggest that these are used as isl callbacks -- TODO confirm.
 */
isl_bool is_dep_uniform_at_node(__isl_keep isl_schedule_node *node, void *user);
isl_bool is_dep_uniform(__isl_keep isl_basic_map *bmap, void *user);
isl_bool is_dep_uniform_wrap(__isl_keep isl_map *map, void *user);
isl_bool uniform_dep_check(__isl_keep isl_schedule *schedule, struct ppcg_scop *scop);
__isl_give isl_vec *get_dep_dis_at_schedule(__isl_keep isl_basic_map *dep,
                                            __isl_keep isl_schedule *schedule);
__isl_give isl_vec *get_dep_dis_at_node(__isl_keep isl_basic_map *dep,
                                        __isl_keep isl_schedule_node *band);
/* Earlier variant of loop_interchange_at_node() that returned a schedule. */
//__isl_give isl_schedule *loop_interchange_at_node(
//    __isl_take isl_schedule_node *node, isl_size level1, isl_size level2);
__isl_give isl_schedule_node *loop_interchange_at_node(
    __isl_take isl_schedule_node *node, isl_size level1, isl_size level2);
__isl_give isl_schedule_node *get_outermost_permutable_node(
    __isl_keep isl_schedule *schedule);
__isl_give isl_schedule_node *get_innermost_permutable_node(
    __isl_keep isl_schedule *schedule);
__isl_give isl_schedule_node *tile_band(
    __isl_take isl_schedule_node *node, __isl_take isl_multi_val *sizes);
/* NOTE(review): "sizes" is a plain int array; the __isl_keep annotation
 * presumably only documents that it is not freed -- confirm.
 */
__isl_give isl_schedule_node *autosa_tile_band(
    __isl_take isl_schedule_node *node, __isl_keep int *sizes);
__isl_give isl_schedule_node *autosa_node_band_tile_loop(
    __isl_take isl_schedule_node *node, int tile_size, int pos);
__isl_give isl_schedule_node *clear_pe_opt_prop(
    __isl_take isl_schedule_node *node, void *user);
__isl_give isl_schedule_node *restore_node_band_prop(
    __isl_take isl_schedule_node *node,
    __isl_take struct autosa_node_band_prop *prop);
__isl_give isl_schedule_node *autosa_node_interchange(
    __isl_take isl_schedule_node *node);
__isl_give isl_schedule_node *autosa_node_interchange_up(
    __isl_take isl_schedule_node *node);
isl_bool no_permutable_node(__isl_keep isl_schedule_node *node, void *user);
isl_bool all_parallel_node(__isl_keep isl_schedule_node *node, void *user);
//isl_bool isl_schedule_node_is_io_mark(__isl_keep isl_schedule_node *node, int io_level);
int is_node_under_simd(__isl_keep isl_schedule_node *node);
int is_node_under_latency(__isl_keep isl_schedule_node *node);
/* NOTE(review): returns an int array; length and ownership are not visible
 * here -- check the definition before freeing.
 */
int *extract_band_upper_bounds(__isl_keep isl_schedule_node *node);
__isl_give isl_union_set *set_schedule_eq(
    __isl_keep isl_schedule_node *node, __isl_keep isl_id_list *names);
__isl_give isl_union_set *set_schedule_neq(
    __isl_keep isl_schedule_node *node, __isl_keep isl_id_list *names);    
isl_bool is_flow_dep_carried_by_array_part_loops(__isl_keep isl_schedule *schedule,
                                                 struct autosa_array_ref_group *group, struct autosa_kernel *kernel);
__isl_give isl_schedule_node *reorder_band_by_dep_dis(__isl_take isl_schedule_node *node);
__isl_give isl_schedule_node *sched_pos_setup(__isl_take isl_schedule_node *node);
int get_band_single_schedule_val(__isl_keep isl_schedule_node *node);
int get_last_sched_dim_val(__isl_keep isl_schedule_node *node);
__isl_give isl_schedule_node *autosa_atomic_ancestors(__isl_take isl_schedule_node *node);
int is_dep_carried_by_node(__isl_keep isl_basic_map *dep, __isl_keep isl_schedule_node *node);
__isl_give isl_schedule_node *autosa_node_sink_to_depth(__isl_take isl_schedule_node *node, int depth);
__isl_give isl_schedule_node *autosa_node_sink_to_mark(__isl_take isl_schedule_node *node, const char *name);
int is_marked(__isl_keep isl_schedule_node *node, const char *name);

/* Schedule */
__isl_give isl_schedule *compute_schedule(struct autosa_gen *gen);
__isl_give isl_schedule *get_schedule(struct autosa_gen *gen);
/* NOTE(review): "schedule" is annotated __isl_give, which isl uses for
 * returned objects; an argument would normally be __isl_take or __isl_keep.
 * Confirm the intended ownership against the definition.
 */
__isl_give isl_schedule *merge_outer_bands(__isl_give isl_schedule *schedule, struct autosa_gen *gen);

/* AutoSA kernel
 *
 * NOTE(review): the *_free() functions declared as "void *" return a
 * pointer; presumably always NULL so that callers can reset their
 * variable -- confirm.
 */
void *autosa_kernel_free(struct autosa_kernel *kernel);
struct autosa_kernel *autosa_kernel_copy(struct autosa_kernel *kernel);
struct autosa_kernel *autosa_kernel_from_schedule(__isl_take isl_schedule *schedule);
struct autosa_kernel *autosa_kernel_alloc(isl_ctx *ctx, struct ppcg_scop *scop);

/* AutoSA access */
isl_bool access_is_stride_zero(__isl_keep isl_map *access, int pos);
isl_bool access_is_stride_one(__isl_keep isl_map *access, int pos);
void *autosa_acc_free(struct autosa_acc *acc);
/* Takes no arguments; declared with an empty parameter list. */
struct autosa_io_buffer *autosa_io_buffer_alloc();

/* AutoSA dep */
void *autosa_dep_free(__isl_take struct autosa_dep *dep);

/* AutoSA iterator */
struct autosa_iter *autosa_iter_free(struct autosa_iter *iter);

/* AutoSA array
 *
 * Queries on arrays and management of array reference groups and tiles.
 */
isl_stat collect_array_info(struct autosa_prog *prog);
int autosa_array_is_read_only_scalar(struct autosa_array_info *array);
int autosa_array_is_scalar(struct autosa_array_info *array);
/* "i" presumably indexes the arrays of "kernel" -- TODO confirm. */
int autosa_kernel_requires_array_argument(struct autosa_kernel *kernel, int i);
struct autosa_array_ref_group *autosa_array_ref_group_free(
    struct autosa_array_ref_group *group);
struct autosa_array_ref_group *autosa_array_ref_group_init(
    struct autosa_array_ref_group *group);
struct autosa_array_tile *autosa_array_tile_free(struct autosa_array_tile *tile);
/* "n_index" is presumably the array dimension stored in tile->n -- confirm. */
struct autosa_array_tile *autosa_array_tile_create(isl_ctx *ctx, int n_index);
__isl_give isl_val *autosa_array_tile_size(struct autosa_array_tile *tile);

/* AutoSA statement */
struct autosa_stmt *extract_stmts(isl_ctx *ctx, struct ppcg_scop *scop,
                                  __isl_keep isl_union_map *any_to_outer);
/* Takes an untyped pointer; the signature suggests use as a free callback
 * for an isl_id user pointer -- TODO confirm.
 */
void autosa_kernel_stmt_free(void *user);
struct autosa_stmt *find_stmt(struct autosa_prog *prog, __isl_keep isl_id *id);

/* AutoSA prog */
struct autosa_prog *autosa_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop);
void *autosa_prog_free(struct autosa_prog *prog);

/* AutoSA hw module
 *
 * Paired allocation and release functions for the hardware module
 * descriptors. The functions declared with an empty parameter list take
 * no arguments.
 * NOTE(review): the *_free() functions return "void *"; presumably NULL --
 * confirm against the definitions.
 */
struct autosa_hw_module *autosa_hw_module_alloc(struct autosa_gen *gen);
void *autosa_hw_module_free(struct autosa_hw_module *module);
struct autosa_hw_top_module *autosa_hw_top_module_alloc();
void *autosa_hw_top_module_free(struct autosa_hw_top_module *module);
struct autosa_pe_dummy_module *autosa_pe_dummy_module_alloc();
void *autosa_pe_dummy_module_free(struct autosa_pe_dummy_module *module);
struct autosa_drain_merge_func *autosa_drain_merge_func_alloc(struct autosa_gen *gen);
void *autosa_drain_merge_func_free(struct autosa_drain_merge_func *func);

/* AutoSA AST node */
struct autosa_ast_node_userinfo *alloc_ast_node_userinfo();
/* Takes an untyped pointer; presumably an autosa_ast_node_userinfo
 * attached to an AST node annotation -- TODO confirm.
 */
void free_ast_node_userinfo(void *ptr);

/* AutoSA PE opt
 *
 * Readers for user-specified or default tiling/packing sizes.
 * NOTE(review): the functions returning "int *" presumably return an
 * array of "tile_len" entries; ownership and the value returned when no
 * size is specified are not visible here -- check the definitions.
 */
__isl_give isl_set *extract_sa_sizes(__isl_keep isl_union_map *sizes,
                                     const char *type);
int *read_hbm_tile_sizes(struct autosa_kernel *kernel, int tile_len, char *name);
int *read_default_hbm_tile_sizes(struct autosa_kernel *sa, int tile_len);
int *read_array_part_tile_sizes(struct autosa_kernel *kernel, int tile_len);
int *read_default_array_part_tile_sizes(struct autosa_kernel *kernel, int tile_len);
int *read_latency_tile_sizes(struct autosa_kernel *kernel, int tile_len);
int *read_default_latency_tile_sizes(struct autosa_kernel *kernel, int tile_len);
int *read_simd_tile_sizes(struct autosa_kernel *kernel, int tile_len);
int *read_default_simd_tile_sizes(struct autosa_kernel *kernel, int tile_len);
int read_space_time_kernel_id(__isl_keep isl_union_map *sizes);
int *read_array_part_L2_tile_sizes(struct autosa_kernel *kernel, int tile_len);
int *read_default_array_part_L2_tile_sizes(struct autosa_kernel *kernel, int tile_len);
int *read_data_pack_sizes(__isl_keep isl_union_map *sizes, int tile_len);
int *read_data_pack_sizes_array(__isl_keep isl_union_map *sizes, char *name);
int read_mem_port_map(__isl_keep isl_union_map *port_map, char *name);

/* AutoSA latency and resource estimation
 *
 * The isl_stat-returning functions report success or failure via
 * isl_stat_ok / isl_stat_error.
 */
isl_stat sa_extract_loop_info(struct autosa_gen *gen, struct autosa_hw_module *module);
isl_stat sa_extract_array_info(struct autosa_kernel *kernel);
/* NOTE(review): the meaning of the returned int and of "uram" (presumably
 * a flag that enables URAM) is not visible here -- confirm.
 */
int extract_memory_type(struct autosa_hw_module *module,
                        struct autosa_kernel_var *var, int uram);
isl_stat sa_extract_design_info(struct autosa_gen *gen);

/* Tuning program */
isl_stat TP_extract_loop_info(struct autosa_gen *gen, struct autosa_hw_module *module);
isl_stat TP_extract_resource_info(struct autosa_gen *gen, struct autosa_hw_module *module);
isl_stat TP_extract_module_attr(struct autosa_gen *gen, struct autosa_hw_module *module);
isl_stat TP_extract_array_info(struct autosa_gen *gen, struct autosa_kernel *kernel);
/* "read" and "write" presumably select the access direction(s) of "group"
 * to consider -- TODO confirm. TPArrayTile is declared elsewhere.
 */
TPArrayTile *TP_infer_tiled_array(
  struct autosa_gen *gen, struct autosa_kernel *kernel, struct isl_schedule_node *node,
  struct autosa_array_ref_group *group, int read, int write);

/* AutoSA block sparsity */
isl_stat autosa_kernel_extract_sparse_info(struct autosa_kernel *kernel, 
  struct autosa_gen *gen);

#if defined(__cplusplus)
}
#endif  

#endif


================================================
FILE: src/autosa_cpu.cpp
================================================


================================================
FILE: src/autosa_cpu.h
================================================
#ifndef _AUTOSA_CPU_H
#define _AUTOSA_CPU_H

#include <isl/ctx.h>

#include "ppcg.h"

struct ppcg_options;

/* Entry point of the CPU code generator.
 * "ctx" is the isl context, "options" the PPCG options and "input"
 * the name of the input file.
 * NOTE(review): the meaning of the return value is not visible from this
 * header -- confirm against the definition.
 */
int generate_autosa_cpu(isl_ctx *ctx, struct ppcg_options *options,
												const char *input);

#endif

================================================
FILE: src/autosa_intel_opencl.cpp
================================================
#include <vector>
#include <algorithm>

#include <isl/ctx.h>

#include "autosa_intel_opencl.h"
#include "autosa_common.h"
#include "autosa_print.h"
#include "autosa_trans.h"
#include "autosa_codegen.h"
#include "autosa_utils.h"
#include "autosa_comm.h"

/* User data bundle for the host code printing callbacks. */
struct print_host_user_data
{
  struct hls_info *hls;             /* Output files and settings. */
  struct autosa_prog *prog;         /* Program being generated. */
  struct autosa_hw_top_module *top; /* Top module of the design. */
};

/* User data bundle for the hardware module printing callbacks. */
struct print_hw_module_data
{
  struct hls_info *hls;            /* Output files and settings. */
  struct autosa_prog *prog;        /* Program being generated. */
  struct autosa_hw_module *module; /* Module currently being printed. */
  /* Used for Intel codegen. Modify the printed iterator prefix. */
  const char *iterator_prefix;
};

/* Write the fixed preamble of the generated Intel OpenCL host header to "fp".
 *
 * The preamble consists of, in order: the system/OpenCL includes,
 * the aocl_utils namespace import, the aligned allocation helpers,
 * the CHECK/CHECK_NO_EXIT status macros, an aligned_allocator template
 * and a placeholder cleanup() function.
 * Adjacent string literals are concatenated by the compiler, so each
 * fprintf call below emits one logical section of the header.
 */
static void print_intel_host_header(FILE *fp)
{
  /* System and OpenCL headers.
   * The <CL/cl_ext_intelfpga.h> include is intentionally not emitted.
   */
  fprintf(fp,
          "#include <stdio.h>\n"
          "#include <stdlib.h>\n"
          "#include <math.h>\n"
          "#include <cassert>\n"
          "#include <cstdio>\n"
          "#include <cstdlib>\n"
          "#include <cstring>\n"
          "#include <fstream>\n"
          "#include <iomanip>\n"
          "#include <iostream>\n"
          "#include <sstream>\n"
          "#include <string>\n"
          "#ifdef _WIN32\n"
          "#include <time.h>\n"
          "#include <windows.h>\n"
          "#else\n"
          "#include <sys/time.h>\n"
          "#endif\n"
          "#include <CL/opencl.h>\n"
          "#include <chrono>\n"
          "#include \"AOCLUtils/aocl_utils.h\"\n\n");

  fprintf(fp, "using namespace aocl_utils;\n\n");

  /* Aligned allocation helpers for Windows and POSIX hosts. */
  fprintf(fp,
          "#define HOST\n"
          "#define ACL_ALIGNMENT 64\n"
          "#ifdef _WIN32\n"
          "void *acl_aligned_malloc(size_t size) {\n"
          "    return _aligned_malloc(size, ACL_ALIGNMENT);\n"
          "}\n"
          "void acl_aligned_free(void *ptr) {\n"
          "    _aligned_free(ptr);\n"
          "}\n"
          "#else\n"
          "void *acl_aligned_malloc(size_t size) {\n"
          "    void *result = NULL;\n"
          "    if (posix_memalign(&result, ACL_ALIGNMENT, size) != 0)\n"
          "        printf(\"acl_aligned_malloc() failed.\\n\");\n"
          "    return result;\n"
          "}\n"
          "void acl_aligned_free(void *ptr) {\n"
          "    free(ptr);\n"
          "}\n"
          "#endif\n\n");

  /* Status checking macros; "%%" yields a literal '%' in the output. */
  fprintf(fp,
          "// Check the status returned by the OpenCL API functions\n"
          "#define CHECK(status) \\\n"
          "if (status != CL_SUCCESS) { \\\n"
          "    fprintf(stderr, \"error %%d in line %%d.\\n\", status, __LINE__); \\\n"
          "    exit(1); \\\n"
          "}\n\n");
  fprintf(fp,
          "// Check the status returned by the OpenCL API functions, don't exit on error\n"
          "#define CHECK_NO_EXIT(status) \\\n"
          "if (status != CL_SUCCESS) { \\\n"
          "    fprintf(stderr, \"error %%d in line %%d.\\n\", status, __LINE__); \\\n"
          "}\n\n");

  /* Allocator that hands out ACL_ALIGNMENT-aligned storage. */
  fprintf(fp,
          "template <typename T>\n"
          "struct aligned_allocator\n"
          "{\n"
          "  using value_type = T;\n"
          "  T* allocate(std::size_t num)\n"
          "  {\n"
          "    void* ptr = nullptr;\n"
          "    if (posix_memalign(&ptr, ACL_ALIGNMENT, num*sizeof(T)))\n"
          "      throw std::bad_alloc();\n"
          "    return reinterpret_cast<T*>(ptr);\n"
          "  }\n"
          "  void deallocate(T* p, std::size_t num)\n"
          "  {\n"
          "    free(p);\n"
          "  }\n"
          "};\n\n");

  /* Placeholder cleanup() definition. */
  fprintf(fp,
          "void cleanup()\n"
          "{\n"
          "  // Place holder. Prohibit the function from elimination.\n"
          "  printf(\"Cleanup...\\n\");\n"
          "}\n\n");
}

/* Replace the suffix of "name" (a PATH_MAX sized buffer whose first "len"
 * characters hold the base name) by "suffix" and open the file
 * "<src_dir><name>" for writing.
 * The program is terminated with an error message if the name or the path
 * does not fit in PATH_MAX characters or if the file cannot be opened,
 * so the returned stream is never NULL.
 */
static FILE *intel_opencl_open_output(const char *src_dir, char *name, int len,
                                      const char *suffix)
{
  char path[PATH_MAX];
  FILE *fp;
  int n;

  if (len < 0 || len >= PATH_MAX)
  {
    printf("[AutoSA] Error: File name too long\n");
    exit(1);
  }
  n = snprintf(name + len, PATH_MAX - len, "%s", suffix);
  if (n < 0 || n >= PATH_MAX - len)
  {
    printf("[AutoSA] Error: File name too long: %s\n", name);
    exit(1);
  }

  n = snprintf(path, sizeof(path), "%s%s", src_dir, name);
  if (n < 0 || (size_t)n >= sizeof(path))
  {
    printf("[AutoSA] Error: File path too long: %s%s\n", src_dir, name);
    exit(1);
  }

  fp = fopen(path, "w");
  if (!fp)
  {
    printf("[AutoSA] Error: Can't open the file: %s\n", path);
    exit(1);
  }

  return fp;
}

/* Open the output files in "<output_dir>/src/" for writing and add the
 * necessary includes. The file names are derived from the base name of
 * "input":
 * - <base>_host.cpp and <base>_host.h (OpenCL host code and header)
 * - <base>_kernel_modules.cl and <base>_kernel.h (hardware modules)
 * - <base>_top_gen.cpp and <base>_top_gen.h (top module generator)
 *
 * Every stream is checked after opening; the program exits with an error
 * message if any of the files cannot be created.
 */
static void opencl_open_files(struct hls_info *info, const char *input)
{
  char name[PATH_MAX];
  int len;
  isl_printer *p_str;
  char *src_dir;

  p_str = isl_printer_to_str(info->ctx);
  p_str = isl_printer_print_str(p_str, info->output_dir);
  p_str = isl_printer_print_str(p_str, "/src/");
  src_dir = isl_printer_get_str(p_str);
  isl_printer_free(p_str);
  if (!src_dir)
  {
    printf("[AutoSA] Error: Can't construct the output directory path\n");
    exit(1);
  }
  len = ppcg_extract_base_name(name, input);

  /* OpenCL host. The host source includes the host header, whose name is
   * still stored in "name" when the include line is printed.
   */
  info->host_c = intel_opencl_open_output(src_dir, name, len, "_host.cpp");
  info->host_h = intel_opencl_open_output(src_dir, name, len, "_host.h");
  print_intel_host_header(info->host_h);
  fprintf(info->host_c, "#include \"%s\"\n", name);

  /* Hardware modules. */
  info->kernel_c = intel_opencl_open_output(src_dir, name, len,
                                            "_kernel_modules.cl");
  info->kernel_h = intel_opencl_open_output(src_dir, name, len, "_kernel.h");
  fprintf(info->kernel_c, "#include \"%s\"\n", name);
  fprintf(info->kernel_c, "#include \"ihc_apint.h\"\n");

  /* Top module generator. */
  info->top_gen_c = intel_opencl_open_output(src_dir, name, len,
                                             "_top_gen.cpp");
  info->top_gen_h = intel_opencl_open_output(src_dir, name, len, "_top_gen.h");
  fprintf(info->top_gen_c, "#include <isl/printer.h>\n");
  fprintf(info->top_gen_c, "#include \"%s\"\n", name);

  free(src_dir);
}

/* Close all output files and create the empty marker file
 * "<output_dir>/src/completed".
 *
 * A failure to create the marker file is reported but is not fatal.
 * NOTE(review): host_h is only closed when info->hls is not set, although
 * opencl_open_files() opens it unconditionally; confirm whether info->hls
 * can be set for this target.
 */
static void opencl_close_files(struct hls_info *info)
{
  isl_printer *p_str;
  char *complete;
  FILE *f;

  fclose(info->kernel_c);
  fclose(info->kernel_h);
  fclose(info->host_c);
  if (!info->hls)
  {
    fclose(info->host_h);
  }
  fclose(info->top_gen_c);
  fclose(info->top_gen_h);

  p_str = isl_printer_to_str(info->ctx);
  p_str = isl_printer_print_str(p_str, info->output_dir);
  p_str = isl_printer_print_str(p_str, "/src/completed");
  complete = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  /* Neither fopen(NULL, ...) nor fclose(NULL) is allowed, so guard both. */
  f = complete ? fopen(complete, "w") : NULL;
  if (f)
    fclose(f);
  else
    printf("[AutoSA] Error: Can't open the file: %s\n",
           complete ? complete : "(null)");
  free(complete);
}

/* Extract the data pack factors for each I/O buffer allocated for the current
 * I/O group "group" and merge them into the list "data_pack_factors" of
 * "*n_factor" elements.
 * Only the data pack factors that are not yet in the list are inserted.
 * The list is expected to be in ascending order on entry and is kept in
 * ascending order, also when a factor smaller than every current element
 * is inserted.
 *
 * The list may be reallocated; the (possibly moved) list is returned and
 * "*n_factor" is updated to the new number of elements.
 * The program is terminated if the list cannot be extended.
 */
static int *extract_data_pack_factors(int *data_pack_factors,
                                      int *n_factor, struct autosa_array_ref_group *group)
{
  for (int i = 0; i < group->n_io_buffer; i++)
  {
    struct autosa_io_buffer *buf = group->io_buffers[i];
    int n_lane = buf->n_lane;
    int pos = 0;
    int *grown;

    /* Locate the first element that is not smaller than n_lane. */
    while (pos < *n_factor && data_pack_factors[pos] < n_lane)
      pos++;
    /* Already present, nothing to insert. */
    if (pos < *n_factor && data_pack_factors[pos] == n_lane)
      continue;

    /* Keep the old pointer until the reallocation is known to succeed. */
    grown = (int *)realloc(data_pack_factors,
                           sizeof(int) * (*n_factor + 1));
    if (!grown)
    {
      printf("[AutoSA] Error: Can't allocate the data pack factor list\n");
      exit(1);
    }
    data_pack_factors = grown;

    /* Shift the larger elements up by one and insert at "pos". */
    for (int j = *n_factor; j > pos; j--)
    {
      data_pack_factors[j] = data_pack_factors[j - 1];
    }
    data_pack_factors[pos] = n_lane;
    *n_factor = *n_factor + 1;
  }

  return data_pack_factors;
}

/* Print the data types required by the program to the kernel header.
 *
 * First, a scalar alias "typedef <type> <array>_t1;" is printed for
 * every array of the kernel.
 * Then, for every array, the data pack factors of its I/O groups and
 * of its drain group (if any) are collected, starting from the default
 * factor of the array. For each collected factor N other than 1,
 * the vectorized Intel type is wrapped as
 *
 *	struct <array>_tN_t { <type>N data; };
 *	typedef struct <array>_tN_t <array>_tN;
 */
static isl_stat print_data_types_intel(
    struct autosa_hw_top_module *top, struct hls_info *hls)
{
  struct autosa_kernel *kernel = top->kernel;
  isl_printer *p = isl_printer_to_file(kernel->ctx, hls->kernel_h);

  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  p = print_str_new_line(p, "/* Data Type */");

  /* Scalar aliases. */
  for (int a = 0; a < kernel->n_array; a++)
  {
    struct autosa_local_array_info *arr = &kernel->array[a];

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "typedef ");
    p = isl_printer_print_str(p, arr->array->type);
    p = isl_printer_print_str(p, " ");
    p = isl_printer_print_str(p, arr->array->name);
    p = isl_printer_print_str(p, "_t1;");
    p = isl_printer_end_line(p);
  }

  /* Packed (vectorized) types. */
  for (int a = 0; a < kernel->n_array; a++)
  {
    struct autosa_local_array_info *arr = &kernel->array[a];
    int n_factor = 1;
    int *factors = (int *)malloc(sizeof(int));

    /* The default data pack factor of the array always comes first. */
    factors[0] = arr->n_lane;

    for (int g = 0; g < arr->n_io_group; g++)
      factors = extract_data_pack_factors(factors, &n_factor,
                                          arr->io_groups[g]);
    if (arr->drain_group)
      factors = extract_data_pack_factors(factors, &n_factor,
                                          arr->drain_group);

    for (int f = 0; f < n_factor; f++)
    {
      const int width = factors[f];

      /* The scalar case is covered by the alias printed above. */
      if (width == 1)
        continue;

      /* struct <array>_tN_t { */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "struct ");
      p = isl_printer_print_str(p, arr->array->name);
      p = isl_printer_print_str(p, "_t");
      p = isl_printer_print_int(p, width);
      p = isl_printer_print_str(p, "_t {");
      p = isl_printer_end_line(p);

      /*   <type>N data; */
      p = isl_printer_indent(p, 2);
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, arr->array->type);
      p = isl_printer_print_int(p, width);
      p = isl_printer_print_str(p, " data;");
      p = isl_printer_end_line(p);

      /* }; */
      p = isl_printer_indent(p, -2);
      p = print_str_new_line(p, "};");

      /* typedef struct <array>_tN_t <array>_tN; */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "typedef struct ");
      p = isl_printer_print_str(p, arr->array->name);
      p = isl_printer_print_str(p, "_t");
      p = isl_printer_print_int(p, width);
      p = isl_printer_print_str(p, "_t ");
      p = isl_printer_print_str(p, arr->array->name);
      p = isl_printer_print_str(p, "_t");
      p = isl_printer_print_int(p, width);
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);
    }

    free(factors);
  }

  p = print_str_new_line(p, "/* Data Type */");
  p = isl_printer_end_line(p);
  isl_printer_free(p);

  return isl_stat_ok;
}

/* Print the arguments to a drain merge function declaration or call.
 * If "types" is set, then print a declaration (including the types of the arguments).
 * 
 * The arguments are printed in the following order:
 * - the module identifiers
 * - the parameters
 * - the host loop iterators
 * - the arrays accessed by the module
 */
//static __isl_give isl_printer *print_drain_merge_arguments_intel(
//    __isl_take isl_printer *p,
//    struct autosa_kernel *kernel,
//    struct autosa_array_ref_group *group,
//    struct autosa_drain_merge_func *func,
//    int types,
//    int hls)
//{
//  int first = 1;
//  int nparam;
//  int n;
//  isl_space *space;
//  const char *type;
//  struct autosa_local_array_info *local_array;
//
//  type = isl_options_get_ast_iterator_type(kernel->ctx);
//  /* module identifiers */
//  const char *dims[] = {"idx", "idy", "idz"};
//  n = isl_id_list_n_id(func->inst_ids);
//  for (int i = 0; i < n; ++i)
//  {
//    if (!first)
//      p = isl_printer_print_str(p, ", ");
//    if (types)
//    {
//      p = isl_printer_print_str(p, type);
//      p = isl_printer_print_str(p, " ");
//    }
//    p = isl_printer_print_str(p, dims[i]);
//
//    first = 0;
//  }
//
//  /* params */
//  space = isl_union_set_get_space(kernel->arrays);
//  nparam = isl_space_dim(space, isl_dim_param);
//  for (int i = 0; i < nparam; ++i)
//  {
//    const char *name;
//
//    name = isl_space_get_dim_name(space, isl_dim_param, i);
//
//    if (!first)
//      p = isl_printer_print_str(p, ", ");
//    if (types)
//      p = isl_printer_print_str(p, "int ");
//    p = isl_printer_print_str(p, name);
//
//    first = 0;
//  }
//  isl_space_free(space);
//
//  /* Host iters */
//  n = isl_space_dim(kernel->space, isl_dim_set);
//  for (int i = 0; i < n; ++i)
//  {
//    const char *name;
//
//    if (!first)
//      p = isl_printer_print_str(p, ", ");
//    name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
//    if (types)
//    {
//      p = isl_printer_print_str(p, type);
//      p = isl_printer_print_str(p, " ");
//    }
//    p = isl_printer_print_str(p, name);
//
//    first = 0;
//  }
//
//  /* Arrays */
//  local_array = group->local_array;
//  if (!first)
//    p = isl_printer_print_str(p, ", ");
//  if (types)
//  {
//    if (hls)
//    {
//      p = isl_printer_print_str(p, local_array->array->type);
//      p = isl_printer_print_str(p, " *");
//    }
//    else
//    {
//      p = isl_printer_print_str(p, "std::vector<");
//      p = isl_printer_print_str(p, local_array->array->type);
//      p = isl_printer_print_str(p, ", aligned_allocator<");
//      p = isl_printer_print_str(p, local_array->array->type);
//      p = isl_printer_print_str(p, ">> &");
//    }
//    p = isl_printer_print_str(p, local_array->array->name);
//    p = isl_printer_print_str(p, "_to");
//  }
//  else
//  {
//    p = isl_printer_print_str(p, "dev_");
//    p = isl_printer_print_str(p, local_array->array->name);
//    p = isl_printer_print_str(p, "[0]");
//  }
//  first = 0;
//
//  if (!first)
//    p = isl_printer_print_str(p, ", ");
//  if (types)
//  {
//    if (hls)
//    {
//      p = isl_printer_print_str(p, local_array->array->type);
//      p = isl_printer_print_str(p, " *");
//    }
//    else
//    {
//      p = isl_printer_print_str(p, "std::vector<");
//      p = isl_printer_print_str(p, local_array->array->type);
//      p = isl_printer_print_str(p, ", aligned_allocator<");
//      p = isl_printer_print_str(p, local_array->array->type);
//      p = isl_printer_print_str(p, ">> &");
//    }
//    p = isl_printer_print_str(p, local_array->array->name);
//    p = isl_printer_print_str(p, "_from");
//  }
//  else
//  {
//    p = isl_printer_print_str(p, "dev_");
//    p = isl_printer_print_str(p, local_array->array->name);
//    p = isl_printer_print_str(p, "[idx]");
//  }
//  first = 0;
//
//  return p;
//}

/* Print the for loop "node" preceded by a "#pragma loop_coalesce" line.
 * If "n_coalesce_loop" is positive, it is appended to the pragma
 * as its argument.
 */
static __isl_give isl_printer *print_for_with_coalesce(__isl_keep isl_ast_node *node,
                                                       __isl_take isl_printer *p,
                                                       __isl_take isl_ast_print_options *print_options,
                                                       int n_coalesce_loop)
{
  const int has_level = n_coalesce_loop > 0;

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "#pragma loop_coalesce");
  if (has_level)
  {
    p = isl_printer_print_str(p, " ");
    p = isl_printer_print_int(p, n_coalesce_loop);
  }
  p = isl_printer_end_line(p);

  /* The loop itself is printed in the default way. */
  return isl_ast_node_for_print(node, p, print_options);
}

/* Print only the body of the for loop "node", dropping the loop header.
 * If "is_first" is set, the body is wrapped in an indented
 * "while (1) { ... }" block; otherwise the body is printed as is.
 */
static __isl_give isl_printer *print_for_infinitize(__isl_keep isl_ast_node *node,
                                                    __isl_take isl_printer *p,
                                                    __isl_take isl_ast_print_options *print_options,
                                                    int is_first)
{
  const int wrap = is_first != 0;
  isl_ast_node *loop_body;

  if (wrap)
  {
    /* Open the infinite loop. */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "while (1) {");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, 2);
  }

  loop_body = isl_ast_node_for_get_body(node);
  p = isl_ast_node_print(loop_body, p, print_options);
  isl_ast_node_free(loop_body);

  if (wrap)
  {
    /* Close the infinite loop. */
    p = isl_printer_indent(p, -2);
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "}");
    p = isl_printer_end_line(p);
  }

  return p;
}

/* Print the for loop "node" of a hardware module, taking into account
 * the autosa_ast_node_userinfo attached to the node as its annotation.
 *
 * - If the loop may legally be turned into an infinite loop, only its body
 *   is printed (wrapped in "while (1)" for the first such loop).
 * - Otherwise, if the loop is the outermost for loop or more than one loop
 *   is to be coalesced, a "#pragma loop_coalesce" line is printed in front
 *   of the loop, preceded by "#pragma ivdep" if the loop is marked
 *   as dependence free.
 * - Otherwise, the loop is printed in the default way.
 *
 * A node without annotation, or an annotation without user info, is
 * treated as a plain loop: all properties default to 0 instead of being
 * read from uninitialized variables or through a NULL pointer.
 * "user" is not used.
 */
static __isl_give isl_printer *print_module_for(__isl_take isl_printer *p,
                                                __isl_take isl_ast_print_options *print_options,
                                                __isl_keep isl_ast_node *node, void *user)
{
  isl_id *id;
  struct autosa_ast_node_userinfo *info = NULL;
  int outermost_for = 0;
  int infinitize = 0;
  int is_first_infinitize = 0;
  int n_coalesce_loop = 0;
  int is_dep_free = 0;

  (void)user;

  id = isl_ast_node_get_annotation(node);
  if (id)
    info = (struct autosa_ast_node_userinfo *)isl_id_get_user(id);
  if (info)
  {
    if (info->is_outermost_for)
      outermost_for = 1;
    if (info->is_infinitize_legal)
    {
      infinitize = 1;
      is_first_infinitize = info->is_first_infinitizable_loop;
    }
    n_coalesce_loop = info->n_coalesce_loop;
    is_dep_free = info->is_dep_free;
  }

  if (infinitize)
  {
    p = print_for_infinitize(node, p, print_options, is_first_infinitize);
  }
  else if (outermost_for || n_coalesce_loop > 1)
  {
    if (is_dep_free == 1)
    {
      p = print_str_new_line(p, "#pragma ivdep");
    }
    p = print_for_with_coalesce(node, p, print_options, n_coalesce_loop);
  }
  else
  {
    p = isl_ast_node_for_print(node, p, print_options);
  }

  /* isl_id_free() accepts NULL, so no guard is needed. */
  isl_id_free(id);

  return p;
}

//static __isl_give isl_printer *print_module_stmt(__isl_take isl_printer *p,
//                                                 __isl_take isl_ast_print_options *print_options,
//                                                 __isl_keep isl_ast_node *node, void *user)
//{
//  isl_id *id;
//  struct autosa_kernel_stmt *stmt;
//  struct print_hw_module_data *hw_data = (struct print_hw_module_data *)(user);
//  struct autosa_hw_module *module = hw_data->module;
//
//  id = isl_ast_node_get_annotation(node);
//  stmt = (struct autosa_kernel_stmt *)isl_id_get_user(id);
//  isl_id_free(id);
//
//  isl_ast_print_options_free(print_options);
//
//  switch (stmt->type)
//  {
//    //    case POLYSA_KERNEL_STMT_COPY:
//    //      return autosa_kernel_print_copy(p, stmt);
//    //    case POLYSA_KERNEL_STMT_SYNC:
//    //      return print_sync(p, stmt);
//  case AUTOSA_KERNEL_STMT_DOMAIN:
//    return autosa_kernel_print_domain(p, stmt);
//  case AUTOSA_KERNEL_STMT_IO:
//    return autosa_kernel_print_io(p, stmt, hw_data->hls);
//  case AUTOSA_KERNEL_STMT_IO_TRANSFER:
//    return autosa_kernel_print_io_transfer(p, stmt, hw_data->hls, 
//              module->options->autosa->double_buffer_style == 0?
//                hw_data->iterator_prefix : NULL);
//  case AUTOSA_KERNEL_STMT_IO_DRAM:
//    return autosa_kernel_print_io_dram(p, stmt, hw_data->hls);
//  case AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTER_TRANS:
//    return autosa_kernel_print_inter_trans(p, stmt, hw_data->hls);
//  case AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTRA_TRANS:
//    return autosa_kernel_print_intra_trans(p, stmt, hw_data->hls);
//  case AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTER_INTRA:
//    return autosa_kernel_print_inter_intra(p, stmt, hw_data->hls);
//  case AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTRA_INTER:
//    return autosa_kernel_print_intra_inter(p, stmt, hw_data->hls);
//  case AUTOSA_KERNEL_STMT_IO_MODULE_CALL_STATE_HANDLE:
//    return autosa_kernel_print_state_handle(p, stmt, hw_data->hls);
//  case AUTOSA_KERNEL_STMT_DRAIN_MERGE:
//    return autosa_kernel_print_drain_merge(p, stmt, hw_data->hls);
//  case AUTOSA_KERNEL_STMT_HOST_SERIALIZE:
//    return autosa_kernel_print_host_serialize(p, stmt, hw_data->hls);
//  }
//
//  return p;
//}

/* Print the host serialization functions.
 */
//static isl_stat print_host_serialize_funcs(
//    struct autosa_kernel *kernel,
//    struct autosa_hw_module **modules,
//    int n_modules, struct hls_info *hls)
//{
//  isl_printer *p;
//  isl_ctx *ctx;
//
//  ctx = kernel->ctx;
//  if (!hls->hls)
//    p = isl_printer_to_file(ctx, hls->host_h);
//  else
//    p = isl_printer_to_file(ctx, hls->kernel_h);
//  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
//  for (int i = 0; i < n_modules; i++) {
//    struct autosa_hw_module *module = modules[i];
//    isl_ast_print_options *print_options;
//    struct print_hw_module_data hw_data = {hls, NULL, NULL, NULL};
//
//    if (module->serialize_tree) {
//      p = print_str_new_line(p, "/* Helper Function */");
//      p = isl_printer_start_line(p);
//      if (hls->hls)
//        p = isl_printer_print_str(p, "inline ");
//      p = isl_printer_print_str(p, "void ");
//      if (module->in) {
//        p = isl_printer_print_str(p, "host_serialize_");
//      } else {
//        p = isl_printer_print_str(p, "host_deserialize_");
//      }      
//      p = isl_printer_print_str(p, module->io_groups[0]->array->name);
//      p = isl_printer_print_str(p, "(");      
//      p = print_host_serialize_arguments(p, kernel, module->io_groups[0], module, 1, hls->hls);
//      p = isl_printer_print_str(p, "){");
//      p = isl_printer_end_line(p);
//      p = isl_printer_indent(p, 2);
//
//      p = print_str_new_line(p, "/* Variable Declaration */");
//      p = print_str_new_line(p, "unsigned int cnt = 0;");      
//      p = print_str_new_line(p, "/* Variable Declaration */");
//      p = isl_printer_end_line(p);
//
//      print_options = isl_ast_print_options_alloc(ctx);
//      print_options = isl_ast_print_options_set_print_user(print_options,
//                                                           &print_module_stmt, &hw_data);
//      p = isl_ast_node_print(module->serialize_tree, p, print_options);
//
//      p = isl_printer_indent(p, -2);
//      p = print_str_new_line(p, "}");
//      p = print_str_new_line(p, "/* Helper Function */");
//      p = isl_printer_end_line(p);
//    }    
//  }
//  isl_printer_free(p);
//
//  return isl_stat_ok;
//}

/* Print one device query of the generated host code, i.e.,
 *
 *   clGetDeviceInfo(devices[i],
 *                   <param_line>
 *                   <size_line>
 *                   <value_line>
 *                   NULL);
 *   <report_line>
 *
 * followed by an empty line.
 * Each "*_line" argument is printed verbatim on a line of its own, so the
 * caller is responsible for the trailing commas.
 */
static __isl_give isl_printer *print_intel_device_info_query(
    __isl_take isl_printer *p, const char *param_line, const char *size_line,
    const char *value_line, const char *report_line)
{
  int indent = strlen("clGetDeviceInfo(");

  p = print_str_new_line(p, "clGetDeviceInfo(devices[i],");
  p = isl_printer_indent(p, indent);
  p = print_str_new_line(p, param_line);
  p = print_str_new_line(p, size_line);
  p = print_str_new_line(p, value_line);
  p = print_str_new_line(p, "NULL);");
  p = isl_printer_indent(p, -indent);
  p = print_str_new_line(p, report_line);
  p = isl_printer_end_line(p);

  return p;
}

/* Print the part of the Intel OpenCL host code that sets up the device.
 *
 * For each io_module connected to the external memory, we will need to create
 * one separate queue associated with separate OpenCL kernels.
 * In particular, every non-PE module with "to_mem" set contributes
 * "n_mem_ports" (of its first I/O group) queues/kernels and gets an
 * "ID_<module name>_base" variable holding the index of its first queue.
 *
 * The printed host code then, in this order,
 * - checks the command line (a single argument, the path to the bitstream),
 * - looks up the platform and the devices and prints some device properties,
 * - creates the context and the command queues,
 * - reads the bitstream file and creates and builds the program,
 * - creates one kernel per memory port of each of the modules above.
 *
 * All failure paths of the printed code that leave through "return" use
 * "return -1".
 * NOTE(review): this assumes the code is printed into a function returning
 * an int (the use of argc/argv suggests "main") - confirm against the caller.
 */
static __isl_give isl_printer *find_device_intel(__isl_take isl_printer *p,
                                                 struct autosa_hw_top_module *top)
{
  int n_cmd_q;
  int n_kernel;
  int indent;
  int q_id = 0;

  p = print_str_new_line(p, "// OpenCL host code starts from here");
  p = print_str_new_line(p, "if (argc != 2) {");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "std::cout << \"Usage: \" << argv[0] << \" <path/to/bitstream.aocx>\" << std::endl;");
  p = print_str_new_line(p, "return -1;");
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  p = print_str_new_line(p, "cl_int status;");
  p = print_str_new_line(p, "cl_platform_id platform = NULL;");
  p = print_str_new_line(p, "cl_device_id *devices = NULL;");
  p = print_str_new_line(p, "cl_context context = NULL;");
  p = print_str_new_line(p, "cl_program program = NULL;");
  p = print_str_new_line(p, "std::string binary_file = argv[1];");

  /* Assign a contiguous range of queue/kernel indices to each module that
   * accesses the external memory and remember where each range starts.
   */
  for (int i = 0; i < top->n_hw_modules; i++)
  {
    struct autosa_hw_module *module = top->hw_modules[i];
    if (module->type == PE_MODULE || module->to_mem == 0)
      continue;
    struct autosa_array_ref_group *group = module->io_groups[0];

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "int ID_");
    p = isl_printer_print_str(p, module->name);
    p = isl_printer_print_str(p, "_base = ");
    p = isl_printer_print_int(p, q_id);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
    q_id += group->n_mem_ports;
  }

  /* One kernel per command queue. */
  n_cmd_q = q_id;
  n_kernel = n_cmd_q;

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "int NUM_QUEUES_TO_CREATE = ");
  p = isl_printer_print_int(p, n_cmd_q);
  p = isl_printer_print_str(p, ";");
  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "int NUM_KERNELS_TO_CREATE = ");
  p = isl_printer_print_int(p, n_kernel);
  p = isl_printer_print_str(p, ";");
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "cl_kernel kernel[NUM_KERNELS_TO_CREATE];");
  p = print_str_new_line(p, "cl_command_queue cmdQueue[NUM_QUEUES_TO_CREATE];");

  p = isl_printer_end_line(p);
  p = print_str_new_line(p, "if (!setCwdToExeDir()) {");
  p = isl_printer_indent(p, 2);
  /* Report the failure through a non-zero exit status, like every other
   * error path printed below ("return false" would exit with status 0).
   */
  p = print_str_new_line(p, "return -1;");
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = isl_printer_end_line(p);

  /* Platform */
  p = print_str_new_line(p, "// Get the OpenCL platform");
  p = print_str_new_line(p, "platform = findPlatform(\"Intel\");");
  p = print_str_new_line(p, "if (platform == NULL) {");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "printf(\"ERROR: Unable to find Intel(R) FPGA OpenCL platform\\n\");");
  p = print_str_new_line(p, "return -1;");
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = isl_printer_end_line(p);

  /* Devices */
  p = print_str_new_line(p, "// Discover and initialize the devices");
  p = print_str_new_line(p, "cl_uint numDevices = 0;");
  p = print_str_new_line(p, "char buffer[4096];");
  p = print_str_new_line(p, "unsigned int buf_uint;");
  p = print_str_new_line(p, "int device_found = 0;");
  p = print_str_new_line(p, "status = clGetDeviceIDs(platform,");
  indent = strlen("status = clGetDeviceIDs(");
  p = isl_printer_indent(p, indent);
  p = print_str_new_line(p, "CL_DEVICE_TYPE_ALL,");
  p = print_str_new_line(p, "0,");
  p = print_str_new_line(p, "NULL,");
  p = print_str_new_line(p, "&numDevices);");
  p = isl_printer_indent(p, -indent);
  p = print_str_new_line(p, "if (status == CL_SUCCESS) {");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "clGetPlatformInfo(platform,");
  indent = strlen("clGetPlatformInfo(");
  p = isl_printer_indent(p, indent);
  p = print_str_new_line(p, "CL_PLATFORM_VENDOR,");
  p = print_str_new_line(p, "4096,");
  p = print_str_new_line(p, "buffer,");
  p = print_str_new_line(p, "NULL);");
  p = isl_printer_indent(p, -indent);
  p = print_str_new_line(p, "if (strstr(buffer, \"Intel(R)\") != NULL) {");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "device_found = 1;");
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = print_str_new_line(p, "if (device_found) {");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "devices = (cl_device_id*) acl_aligned_malloc(numDevices * sizeof(cl_device_id));");
  p = print_str_new_line(p, "status = clGetDeviceIDs(platform,");
  indent = strlen("status = clGetDeviceIDs(");
  p = isl_printer_indent(p, indent);
  p = print_str_new_line(p, "CL_DEVICE_TYPE_ALL,");
  p = print_str_new_line(p, "numDevices,");
  p = print_str_new_line(p, "devices,");
  p = print_str_new_line(p, "NULL);");
  p = isl_printer_indent(p, -indent);
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = print_str_new_line(p, "if (!device_found) {");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "printf(\"failed to find a OpenCL device\\n\");");
  p = print_str_new_line(p, "exit(1);");
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  /* Report some properties of each device that was found. */
  p = print_str_new_line(p, "for (int i = 0; i < numDevices; i++) {");
  p = isl_printer_indent(p, 2);
  p = print_intel_device_info_query(p,
        "CL_DEVICE_NAME,",
        "4096,",
        "buffer,",
        "fprintf(stdout, \"\\nDevice Name: %s\\n\", buffer);");
  p = print_intel_device_info_query(p,
        "CL_DEVICE_VENDOR,",
        "4096,",
        "buffer,",
        "fprintf(stdout, \"Device Vendor: %s\\n\", buffer);");
  p = print_intel_device_info_query(p,
        "CL_DEVICE_MAX_COMPUTE_UNITS,",
        "sizeof(buf_uint),",
        "&buf_uint,",
        "fprintf(stdout, \"Device Computing Units: %u\\n\", buf_uint);");
  p = print_intel_device_info_query(p,
        "CL_DEVICE_GLOBAL_MEM_SIZE,",
        "sizeof(unsigned long),",
        "&buffer,",
        "fprintf(stdout, \"Global Memory Size: %lu\\n\", *((unsigned long*)buffer));");
  p = print_intel_device_info_query(p,
        "CL_DEVICE_MAX_MEM_ALLOC_SIZE,",
        "sizeof(unsigned long),",
        "&buffer,",
        "fprintf(stdout, \"Global Memory Allocation Size: %lu\\n\\n\", *((unsigned long*)buffer));");
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = isl_printer_end_line(p);

  /* Context */
  p = print_str_new_line(p, "// Create a context");
  p = print_str_new_line(p, "context = clCreateContext(NULL,");
  indent = strlen("context = clCreateContext(");
  p = isl_printer_indent(p, indent);
  p = print_str_new_line(p, "1,");
  p = print_str_new_line(p, "devices,");
  p = print_str_new_line(p, "NULL,");
  p = print_str_new_line(p, "NULL,");
  p = print_str_new_line(p, "&status); CHECK(status);");
  p = isl_printer_indent(p, -indent);
  p = isl_printer_end_line(p);

  /* Command Queue */
  p = print_str_new_line(p, "// Create command queues");
  p = print_str_new_line(p, "for (int i = 0; i < NUM_QUEUES_TO_CREATE; i++) {");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "cmdQueue[i] = clCreateCommandQueue(context,");
  indent = strlen("cmdQueue[i] = clCreateCommandQueue(");
  p = isl_printer_indent(p, indent);
  p = print_str_new_line(p, "devices[0],");
  p = print_str_new_line(p, "CL_QUEUE_PROFILING_ENABLE,");
  p = print_str_new_line(p, "&status); CHECK(status);");
  p = isl_printer_indent(p, -indent);
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = isl_printer_end_line(p);

  /* Create the program from binaries */
  p = print_str_new_line(p, "// Create the program from binaries");
  p = print_str_new_line(p, "size_t binary_length;");
  p = print_str_new_line(p, "const unsigned char *binary;");
  p = print_str_new_line(p, "printf(\"\\nAOCX file: %s\\n\\n\", binary_file.c_str());");
  p = print_str_new_line(p, "FILE *fp = fopen(binary_file.c_str(), \"rb\");");
  p = print_str_new_line(p, "if (fp == NULL) {");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "printf(\"Failed to open the AOCX file (fopen).\\n\");");
  p = print_str_new_line(p, "return -1;");
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = print_str_new_line(p, "fseek(fp, 0, SEEK_END);");
  p = print_str_new_line(p, "long ftell_sz = ftell(fp);");
  p = print_str_new_line(p, "if (ftell_sz < 0) {");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "printf(\"ftell returns a negative value.\\n\");");
  p = print_str_new_line(p, "fclose(fp);");
  p = print_str_new_line(p, "return -1;");
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "} else {");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "binary_length = ftell_sz;");
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = print_str_new_line(p, "binary = (unsigned char *)malloc(sizeof(unsigned char) * binary_length);");
  p = print_str_new_line(p, "rewind(fp);");
  p = print_str_new_line(p, "size_t fread_sz = fread((void *)binary, binary_length, 1, fp);");
  p = print_str_new_line(p, "if (fread_sz == 0) {");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "printf(\"Failed to read from the AOCX file (fread).\\n\");");
  p = print_str_new_line(p, "fclose(fp);");
  p = print_str_new_line(p, "free(const_cast<unsigned char *>(binary));");
  p = print_str_new_line(p, "return -1;");
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = print_str_new_line(p, "fclose(fp);");
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "program = clCreateProgramWithBinary(context,");
  indent = strlen("program = clCreateProgramWithBinary(");
  p = isl_printer_indent(p, indent);
  p = print_str_new_line(p, "1,");
  p = print_str_new_line(p, "devices,");
  p = print_str_new_line(p, "&binary_length,");
  p = print_str_new_line(p, "(const unsigned char **)&binary,");
  p = print_str_new_line(p, "&status,");
  p = print_str_new_line(p, "NULL); CHECK(status);");
  p = isl_printer_indent(p, -indent);
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);");
  p = print_str_new_line(p, "if (status != CL_SUCCESS) {");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "char log[10000] = {0};");
  p = print_str_new_line(p, "clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 10000, log, NULL);");
  p = print_str_new_line(p, "printf(\"%s\\n\", log);");
  p = print_str_new_line(p, "CHECK(status);");
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = isl_printer_end_line(p);

  /* Create the kernel */
  p = print_str_new_line(p, "// Create the kernel");
  for (int i = 0; i < top->n_hw_modules; i++)
  {
    struct autosa_hw_module *module = top->hw_modules[i];
    if (module->type == PE_MODULE || module->to_mem == 0)
      continue;
    struct autosa_array_ref_group *group = module->io_groups[0];

    for (int j = 0; j < group->n_mem_ports; j++)
    {
      /* kernel[ID_<module>_base (+ j)] = clCreateKernel(program, "<kernel name>", &status);
       * The kernel name is the module name followed by the optional
       * "_boundary" and "_serialize" suffixes and, if the module has
       * more than one memory port, the port number.
       */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "kernel[");
      p = isl_printer_print_str(p, "ID_");
      p = isl_printer_print_str(p, module->name);
      p = isl_printer_print_str(p, "_base");
      if (group->n_mem_ports > 1)
      {
        p = isl_printer_print_str(p, " + ");
        p = isl_printer_print_int(p, j);
      }
      p = isl_printer_print_str(p, "] = clCreateKernel(program, \"");
      p = isl_printer_print_str(p, module->name);
      if (module->boundary && !module->device_tree)
        p = isl_printer_print_str(p, "_boundary");
      if (module->is_serialized)
        p = isl_printer_print_str(p, "_serialize");
      if (group->n_mem_ports > 1)
      {
        p = isl_printer_print_str(p, "_");
        p = isl_printer_print_int(p, j);
      }
      p = isl_printer_print_str(p, "\", &status);");
      p = isl_printer_end_line(p);
      p = print_str_new_line(p, "CHECK(status);");
    }
  }

  return p;
}

/* Print the type of a host buffer holding elements of type "type", i.e.,
 * "std::vector<type, aligned_allocator<type>> " or, if "nested" is set,
 * a std::vector of such vectors. The trailing space is included.
 */
static __isl_give isl_printer *print_intel_host_vector_type(
    __isl_take isl_printer *p, const char *type, int nested)
{
  p = isl_printer_print_str(p, nested ? "std::vector<std::vector<" : "std::vector<");
  p = isl_printer_print_str(p, type);
  p = isl_printer_print_str(p, ", aligned_allocator<");
  p = isl_printer_print_str(p, type);
  p = isl_printer_print_str(p, nested ? ">>> " : ">> ");

  return p;
}

/* Print "for (int i = 0; i < n_ports; i++) {" and increase the indentation.
 */
static __isl_give isl_printer *open_intel_mem_port_loop(
    __isl_take isl_printer *p, int n_ports)
{
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int i = 0; i < ");
  p = isl_printer_print_int(p, n_ports);
  p = isl_printer_print_str(p, "; i++) {");
  p = isl_printer_end_line(p);

  return isl_printer_indent(p, 2);
}

/* Close a loop opened by open_intel_mem_port_loop.
 */
static __isl_give isl_printer *close_intel_mem_port_loop(
    __isl_take isl_printer *p)
{
  p = isl_printer_indent(p, -2);

  return print_str_new_line(p, "}");
}

/* Print the declaration of a vector of "n_ports" host buffers called
 * "dev_<array name><suffix>", filled by pushing back a temporary buffer
 * "dev_<array name>_tmp" in a loop.
 * If "serialized" is set, the size of each buffer is taken from
 * local_array->serialize_bound, otherwise it is the data size of the array.
 */
static __isl_give isl_printer *print_intel_host_buffers_per_port(
    __isl_take isl_printer *p, struct autosa_local_array_info *local_array,
    const char *suffix, int serialized)
{
  p = isl_printer_start_line(p);
  p = print_intel_host_vector_type(p, local_array->array->type, 1);
  p = isl_printer_print_str(p, "dev_");
  p = isl_printer_print_str(p, local_array->array->name);
  p = isl_printer_print_str(p, suffix);
  p = isl_printer_print_str(p, ";");
  p = isl_printer_end_line(p);

  p = open_intel_mem_port_loop(p, local_array->n_mem_ports);

  p = isl_printer_start_line(p);
  p = print_intel_host_vector_type(p, local_array->array->type, 0);
  p = isl_printer_print_str(p, "dev_");
  p = isl_printer_print_str(p, local_array->array->name);
  p = isl_printer_print_str(p, "_tmp(");
  if (serialized)
    p = isl_printer_print_pw_qpolynomial(p, local_array->serialize_bound);
  else
    p = autosa_array_info_print_data_size(p, local_array->array);
  p = isl_printer_print_str(p, ");");
  p = isl_printer_end_line(p);

  /* The temporary has to be pushed into the vector declared above,
   * so the same suffix is needed here.
   */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "dev_");
  p = isl_printer_print_str(p, local_array->array->name);
  p = isl_printer_print_str(p, suffix);
  p = isl_printer_print_str(p, ".push_back(dev_");
  p = isl_printer_print_str(p, local_array->array->name);
  p = isl_printer_print_str(p, "_tmp);");
  p = isl_printer_end_line(p);

  return close_intel_mem_port_loop(p);
}

/* Print the host code that declares and allocates the buffers of all arrays
 * of "kernel" that require device allocation.
 *
 * The printed code
 * - declares the host buffers "dev_<array>". An array with more than one
 *   memory port that is copied out gets one buffer per port. If the array
 *   is serialized on the host, the buffer holding the original layout is
 *   called "dev_<array>_unserialized" and "dev_<array>" is an additional
 *   buffer sized by the serialize bound,
 * - copies the user arrays into the (unserialized) host buffers,
 * - calls "host_serialize_<array>" for each input module of "top" with
 *   a serialize tree,
 * - creates "n_mem_ports" device buffers per array, collected in
 *   "buffer_<array>",
 * - declares the time stamps used for profiling.
 *
 * "prog" is not used.
 */
static __isl_give isl_printer *declare_and_allocate_device_arrays_intel(
    __isl_take isl_printer *p, struct autosa_prog *prog,
    struct autosa_kernel *kernel, struct autosa_hw_top_module *top)
{
  p = print_str_new_line(p, "// Allocate memory in host memory");
  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *local_array = &kernel->array[i];
    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    if (local_array->n_mem_ports > 1 && local_array->array->copy_out)
    {
      /* Create multiple host buffers. */
      p = print_intel_host_buffers_per_port(p, local_array,
            local_array->host_serialize ? "_unserialized" : "", 0);

      /* Allocate additional serialize buffers. */
      if (local_array->host_serialize)
        p = print_intel_host_buffers_per_port(p, local_array, "", 1);
    }
    else
    {
      /* Create a single host buffer. */
      p = isl_printer_start_line(p);
      p = print_intel_host_vector_type(p, local_array->array->type, 0);
      p = isl_printer_print_str(p, "dev_");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->host_serialize)
        p = isl_printer_print_str(p, "_unserialized");
      p = isl_printer_print_str(p, "(");
      p = autosa_array_info_print_data_size(p, local_array->array);
      p = isl_printer_print_str(p, ");");
      p = isl_printer_end_line(p);

      if (local_array->host_serialize)
      {
        /* Allocate an additional serialize buffer. */
        p = isl_printer_start_line(p);
        p = print_intel_host_vector_type(p, local_array->array->type, 0);
        p = isl_printer_print_str(p, "dev_");
        p = isl_printer_print_str(p, local_array->array->name);
        p = isl_printer_print_str(p, "(");
        p = isl_printer_print_pw_qpolynomial(p, local_array->serialize_bound);
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);
      }
    }
  }
  p = isl_printer_end_line(p);

  /* Initialize buffer. */
  p = print_str_new_line(p, "// Initialize host buffers");
  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *local_array = &kernel->array[i];
    int per_port;
    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    /* With one buffer per port, every buffer receives a full copy. */
    per_port = local_array->n_mem_ports > 1 && local_array->array->copy_out;
    if (per_port)
      p = open_intel_mem_port_loop(p, local_array->n_mem_ports);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "std::copy(reinterpret_cast<");
    p = isl_printer_print_str(p, local_array->array->type);
    p = isl_printer_print_str(p, " *>(");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, "), reinterpret_cast<");
    p = isl_printer_print_str(p, local_array->array->type);
    p = isl_printer_print_str(p, " *>(");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, ") + ");
    p = autosa_array_info_print_data_size(p, local_array->array);
    p = isl_printer_print_str(p, ", dev_");
    p = isl_printer_print_str(p, local_array->array->name);
    if (local_array->host_serialize)
      p = isl_printer_print_str(p, "_unserialized");
    if (per_port)
      p = isl_printer_print_str(p, "[i]");
    p = isl_printer_print_str(p, ".begin());");
    p = isl_printer_end_line(p);

    if (per_port)
      p = close_intel_mem_port_loop(p);
  }

  /* Perform data serialization if needed. */
  for (int i = 0; i < top->n_hw_modules; i++)
  {
    struct autosa_hw_module *module = top->hw_modules[i];
    struct autosa_array_ref_group *group;
    struct autosa_local_array_info *local_array;
    int per_port;

    /* Only input modules are serialized before the kernel launch. */
    if (!module->serialize_tree || !module->in)
      continue;

    group = module->io_groups[0];
    local_array = group->local_array;
    per_port = local_array->n_mem_ports > 1 && local_array->array->copy_out;
    if (per_port)
      p = open_intel_mem_port_loop(p, local_array->n_mem_ports);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "host_serialize_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, "(");
    p = print_host_serialize_arguments(p, kernel, group, module, 0, 0); // TODO: add hbm support later.
    p = isl_printer_print_str(p, ");");
    p = isl_printer_end_line(p);

    if (per_port)
      p = close_intel_mem_port_loop(p);
  }
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "// Allocate buffers in device memory");
  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *local_array = &kernel->array[i];
    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "std::vector<cl_mem> buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
  }

  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *local_array = &kernel->array[i];
    const char *flags;
    int indent;
    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    /* Align the remaining arguments with the first one. */
    indent = strlen("cl_mem buffer_") + strlen(local_array->array->name) +
             strlen("_tmp") + strlen(" = clCreateBuffer(");

    if (local_array->array->copy_in && local_array->array->copy_out)
      flags = "CL_MEM_READ_WRITE";
    else if (local_array->array->copy_in)
      flags = "CL_MEM_READ_ONLY";
    else if (local_array->array->copy_out)
      flags = "CL_MEM_WRITE_ONLY";
    else
      /* Neither copied in nor out: never print an empty flags argument. */
      flags = "CL_MEM_READ_WRITE";

    p = open_intel_mem_port_loop(p, local_array->n_mem_ports);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "cl_mem buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, "_tmp = clCreateBuffer(context,");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, indent);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, flags);
    p = isl_printer_print_str(p, ",");
    p = isl_printer_end_line(p);

    p = isl_printer_start_line(p);
    if (local_array->host_serialize)
      p = autosa_array_info_print_serialize_size(p, local_array->array);
    else
      p = autosa_array_info_print_size(p, local_array->array);
    p = isl_printer_print_str(p, ",");
    p = isl_printer_end_line(p);

    /* No host pointer is passed to clCreateBuffer. */
    p = print_str_new_line(p, "NULL,");
    p = print_str_new_line(p, "&status); CHECK(status);");
    p = isl_printer_indent(p, -indent);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, ".push_back(std::move(buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, "_tmp));");
    p = isl_printer_end_line(p);

    p = close_intel_mem_port_loop(p);
  }
  p = isl_printer_end_line(p);

  /* Insert profiling information. */
  p = print_str_new_line(p, "auto host_begin = std::chrono::high_resolution_clock::now();");
  p = print_str_new_line(p, "auto fpga_begin = std::chrono::high_resolution_clock::now();");
  p = print_str_new_line(p, "auto fpga_end = std::chrono::high_resolution_clock::now();");
  p = isl_printer_end_line(p);

  return p;
}

/* Print the host code that prepares the device for running the transformed
 * code. The output consists of three consecutive parts:
 * the local declarations of "prog" (autosa_print_local_declarations),
 * the device, queue, program and kernel setup (find_device_intel) and
 * the declaration and allocation of the host and device copies of the
 * arrays (declare_and_allocate_device_arrays_intel).
 *
 * "hls" is accepted but not used by this function.
 */
static __isl_give isl_printer *init_device_intel(__isl_take isl_printer *p,
                                                 struct autosa_prog *prog,
                                                 struct autosa_kernel *kernel,
                                                 int hls,
                                                 struct autosa_hw_top_module *top)
{
  isl_printer *out = autosa_print_local_declarations(p, prog);

  out = find_device_intel(out, top);

  return declare_and_allocate_device_arrays_intel(out, prog, kernel, top);
}

/* Print a single-statement "for" loop of the generated host code:
 * the loop header "head", the statement "body" indented by two columns,
 * and the closing brace.
 */
static __isl_give isl_printer *print_one_stmt_loop_intel(
    __isl_take isl_printer *p, const char *head, const char *body)
{
  p = print_str_new_line(p, head);
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, body);
  p = isl_printer_indent(p, -2);
  return print_str_new_line(p, "}");
}

/* Print the host code that runs after the transformed code has executed.
 *
 * The generated text, in order:
 *  - waits for every command queue and reports the FPGA and host times,
 *  - calls host_deserialize_<array> for each module that has a serialize
 *    tree and is not an input module,
 *  - copies each copy-out array from its dev_<array> container back into
 *    the user array with std::copy,
 *  - releases the kernels and command queues and, inside "#ifndef EMULATE",
 *    the program, the context and the device list.
 * "hls" is not used by this function.
 */
static __isl_give isl_printer *clear_device_intel(__isl_take isl_printer *p,
                                                  struct autosa_prog *prog,
                                                  int hls,
                                                  struct autosa_hw_top_module *top)
{
  static const char *const timing_report[] = {
      "// Calculate time",
      "std::chrono::duration<double> fpga_duration = fpga_end - fpga_begin;",
      "std::cout << \"FPGA Time: \" << fpga_duration.count() << \" s\" << std::endl;",
      "std::chrono::duration<double> host_duration = host_end - host_begin;",
      "std::cout << \"Host Time: \" << host_duration.count() << \" s\" << std::endl;",
  };
  static const char *const release_rest[] = {
      "clReleaseProgram(program);",
      "clReleaseContext(context);",
      "acl_aligned_free(devices);",
  };
  /* Text that precedes each of the two occurrences of the container name
   * inside the std::copy call.
   */
  static const char *const copy_lead[] = {"std::copy(dev_", ".begin(), dev_"};
  const int n_timing = sizeof(timing_report) / sizeof(timing_report[0]);
  const int n_release = sizeof(release_rest) / sizeof(release_rest[0]);

  /* Profiling results */
  p = print_one_stmt_loop_intel(p,
                                "for (int i = 0; i < NUM_QUEUES_TO_CREATE; i++) {",
                                "status = clFinish(cmdQueue[i]); CHECK(status);");
  p = print_str_new_line(p, "auto host_end = std::chrono::high_resolution_clock::now();");
  p = isl_printer_end_line(p);
  for (int k = 0; k < n_timing; k++)
    p = print_str_new_line(p, timing_report[k]);
  p = isl_printer_end_line(p);

  /* Deserialize the buffer data if necessary. */
  for (int m = 0; m < top->n_hw_modules; m++) {
    struct autosa_hw_module *mod = top->hw_modules[m];
    struct autosa_array_ref_group *grp;

    if (!mod->serialize_tree || mod->in)
      continue;

    grp = mod->io_groups[0];
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "host_deserialize_");
    p = isl_printer_print_str(p, grp->local_array->array->name);
    p = isl_printer_print_str(p, "(");
    /* HBM is not supported on this path yet. */
    p = print_host_serialize_arguments(p, top->kernel, grp, mod, 0, 0);
    p = isl_printer_print_str(p, ");");
    p = isl_printer_end_line(p);
  }

  /* Restore buffer */
  p = print_str_new_line(p, "// Restore data from host buffers");
  for (int a = 0; a < prog->n_array; a++) {
    struct autosa_array_info *arr = &prog->array[a];

    if (!autosa_array_requires_device_allocation(arr))
      continue;
    if (!arr->copy_out)
      continue;

    p = isl_printer_start_line(p);
    for (int k = 0; k < 2; k++) {
      p = isl_printer_print_str(p, copy_lead[k]);
      p = isl_printer_print_str(p, arr->name);
      if (arr->local_array->host_serialize)
        p = isl_printer_print_str(p, "_unserialized");
      if (arr->local_array->n_mem_ports > 1)
        p = isl_printer_print_str(p, "[0]");
    }
    p = isl_printer_print_str(p, ".end(), reinterpret_cast<");
    p = isl_printer_print_str(p, arr->type);
    p = isl_printer_print_str(p, " *>(");
    p = isl_printer_print_str(p, arr->name);
    p = isl_printer_print_str(p, "));");
    p = isl_printer_end_line(p);
  }
  p = isl_printer_end_line(p);

  /* Clean up OpenCL resources */
  p = print_str_new_line(p, "// Clean up OpenCL resources");
  p = print_one_stmt_loop_intel(p,
                                "for (int i = 0; i < NUM_KERNELS_TO_CREATE; i++) {",
                                "clReleaseKernel(kernel[i]);");
  p = isl_printer_end_line(p);
  p = print_one_stmt_loop_intel(p,
                                "for (int i = 0; i < NUM_QUEUES_TO_CREATE; i++) {",
                                "clReleaseCommandQueue(cmdQueue[i]);");
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "#ifndef EMULATE");
  p = isl_printer_indent(p, 2);
  for (int k = 0; k < n_release; k++)
    p = print_str_new_line(p, release_rest[k]);
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "#endif");

  return p;
}

/* Print the host loop that merges the drained results of one array
 * reference group.
 *
 * The generated loop runs "idx" over the memory ports owned by the group,
 * from mem_port_id up to (but excluding) mem_port_id + n_mem_ports, and
 * calls <group prefix>_drain_merge(...) in each iteration.
 * "prog" is not used by this function.
 */
static __isl_give isl_printer *drain_merge_intel(
    __isl_take isl_printer *p, struct autosa_prog *prog,
    struct autosa_drain_merge_func *func,
    int hls)
{
  struct autosa_array_ref_group *grp = func->group;

  p = print_str_new_line(p, "// Merge results");

  /* Loop header over the group's memory ports. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int idx = ");
  p = isl_printer_print_int(p, grp->mem_port_id);
  p = isl_printer_print_str(p, "; idx < ");
  p = isl_printer_print_int(p, grp->mem_port_id + grp->n_mem_ports);
  p = isl_printer_print_str(p, "; idx++) {");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, 2);

  /* Loop body: a single call to the merge function. */
  p = isl_printer_start_line(p);
  p = autosa_array_ref_group_print_prefix(grp, p);
  p = isl_printer_print_str(p, "_drain_merge(");
  p = print_drain_merge_arguments(p, func->kernel, grp, func, 0, hls);
  p = isl_printer_print_str(p, ");");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, -2);

  p = print_str_new_line(p, "}");
  return isl_printer_end_line(p);
}

/* Print the host code that writes "array" to the device in its entirety.
 *
 * One blocking clEnqueueWriteBuffer call on cmdQueue[0] is generated per
 * memory port of the array, wrapped in a loop over "i".  The transfer size
 * is the serialized size when the array is serialized on the host and the
 * plain array size otherwise.  The host source is dev_<array>, indexed by
 * "i" only when the array has several memory ports and is copied out.
 */
static __isl_give isl_printer *copy_array_to_device_intel(__isl_take isl_printer *p,
                                                          struct autosa_array_info *array)
{
  static const char call_open[] = "status = clEnqueueWriteBuffer(";
  static const char *const call_tail[] = {"0,", "NULL,", "NULL); CHECK(status);"};
  struct autosa_local_array_info *local = array->local_array;
  /* The call arguments are aligned under the opening parenthesis. */
  const int arg_col = strlen(call_open);

  p = print_str_new_line(p, "// Write host data to device buffers");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int i = 0; i < ");
  p = isl_printer_print_int(p, local->n_mem_ports);
  p = isl_printer_print_str(p, "; i++) {");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, 2);

  p = print_str_new_line(p, call_open);
  p = isl_printer_indent(p, arg_col);

  /* Command queue. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "cmdQueue[0],");
  p = isl_printer_end_line(p);

  /* Destination device buffer. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "buffer_");
  p = isl_printer_print_str(p, array->name);
  p = isl_printer_print_str(p, "[i],");
  p = isl_printer_end_line(p);

  /* Blocking flag and offset. */
  p = print_str_new_line(p, "CL_TRUE,");
  p = print_str_new_line(p, "0,");

  /* Number of bytes to transfer. */
  p = isl_printer_start_line(p);
  p = local->host_serialize ? autosa_array_info_print_serialize_size(p, array)
                            : autosa_array_info_print_size(p, array);
  p = isl_printer_print_str(p, ",");
  p = isl_printer_end_line(p);

  /* Host source pointer. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "dev_");
  p = isl_printer_print_str(p, array->name);
  if (array->copy_out && local->n_mem_ports > 1)
    p = isl_printer_print_str(p, "[i]");
  p = isl_printer_print_str(p, ".data(),");
  p = isl_printer_end_line(p);

  /* Event wait list, event, and the status check. */
  for (int k = 0; k < 3; k++)
    p = print_str_new_line(p, call_tail[k]);
  p = isl_printer_indent(p, -arg_col);

  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  return isl_printer_end_line(p);
}

/* Print code to "p" for copying "array" back from the device to the host
 * in its entirety.  The transfer size is printed by
 * autosa_array_info_print_serialize_size when the array is serialized on
 * the host and by autosa_array_info_print_size otherwise.
 *
 * One blocking clEnqueueReadBuffer call on cmdQueue[0] is generated per
 * io group reference of the array, wrapped in a loop over "i".
 *
 * The return value of clEnqueueReadBuffer is assigned to "status" in the
 * generated code, so that the CHECK(status) printed after the call tests
 * the result of this read rather than a stale value from an earlier call.
 * This matches what copy_array_to_device_intel generates for writes.
 */
static __isl_give isl_printer *copy_array_from_device_intel(
    __isl_take isl_printer *p, struct autosa_array_info *array)
{
  static const char call_open[] = "status = clEnqueueReadBuffer(";
  struct autosa_local_array_info *local_array;
  int indent;

  local_array = array->local_array;
  p = print_str_new_line(p, "// Read the results back from the device");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int i = 0; i < ");
  p = isl_printer_print_int(p, local_array->n_io_group_refs);
  p = isl_printer_print_str(p, "; i++) {");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, 2);

  /* Align the call arguments under the opening parenthesis. */
  p = print_str_new_line(p, call_open);
  indent = strlen(call_open);
  p = isl_printer_indent(p, indent);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "cmdQueue[0],");
  p = isl_printer_end_line(p);

  /* Source device buffer. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "buffer_");
  p = isl_printer_print_str(p, array->name);
  p = isl_printer_print_str(p, "[i],");
  p = isl_printer_end_line(p);

  /* Blocking flag, offset and number of bytes. */
  p = print_str_new_line(p, "CL_TRUE,");
  p = print_str_new_line(p, "0,");
  p = isl_printer_start_line(p);
  if (local_array->host_serialize) {
    p = autosa_array_info_print_serialize_size(p, array);
  } else {
    p = autosa_array_info_print_size(p, array);
  }
  p = isl_printer_print_str(p, ",");
  p = isl_printer_end_line(p);

  /* Host destination: indexed by "i" only for multi-port copy-out arrays. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "dev_");
  p = isl_printer_print_str(p, array->name);
  if (local_array->n_mem_ports > 1 && array->copy_out)
  {
    p = isl_printer_print_str(p, "[i]");
  }
  p = isl_printer_print_str(p, ".data(),");
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "0,");
  p = print_str_new_line(p, "NULL,");
  p = print_str_new_line(p, "NULL); CHECK(status);");

  p = isl_printer_indent(p, -indent);
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  return p;
}

/* Print a statement for copying an array to or from the device,
 * for merging drained results, or for initializing or clearing the device.
 *
 * The statement identifier of a copying node is called
 * "to_device_<array name>" or "from_device_<array name>" and
 * its user pointer points to the autosa_array_info of the array
 * that needs to be copied.
 * The node for initializing the device is called "init_device" and its
 * user pointer is treated as the autosa_kernel.
 * The node for clearing the device is called "clear_device".
 * The node for merging results is called "drain_merge" and its user
 * pointer is treated as an autosa_drain_merge_func.
 *
 * The identifier name is checked for NULL before it is compared, and the
 * identifier itself is only released after dispatching, so that "name"
 * stays valid for as long as it is used.
 * On a missing name, or a copy node without an array, the printer is
 * freed and NULL is returned.
 */
static __isl_give isl_printer *print_device_node_intel(__isl_take isl_printer *p,
                                                       __isl_keep isl_ast_node *node, 
                                                       struct autosa_prog *prog, 
                                                       int hls,
                                                       struct autosa_hw_top_module *top)
{
  isl_ast_expr *expr, *arg;
  isl_id *id;
  const char *name;
  void *payload;

  expr = isl_ast_node_user_get_expr(node);
  arg = isl_ast_expr_get_op_arg(expr, 0);
  id = isl_ast_expr_get_id(arg);
  name = isl_id_get_name(id);
  /* The meaning of the user pointer depends on the node name below. */
  payload = isl_id_get_user(id);

  if (!name)
    p = isl_printer_free(p);
  else if (!strcmp(name, "init_device"))
    p = init_device_intel(p, prog, (struct autosa_kernel *)payload, hls, top);
  else if (!strcmp(name, "clear_device"))
    p = clear_device_intel(p, prog, hls, top);
  else if (!strcmp(name, "drain_merge"))
    p = drain_merge_intel(p, prog, (struct autosa_drain_merge_func *)payload, hls);
  else if (!payload)
    p = isl_printer_free(p);
  else if (!prefixcmp(name, "to_device"))
    p = copy_array_to_device_intel(p, (struct autosa_array_info *)payload);
  else
    p = copy_array_from_device_intel(p, (struct autosa_array_info *)payload);

  isl_id_free(id);
  isl_ast_expr_free(arg);
  isl_ast_expr_free(expr);

  return p;
}

/* Print the leading lines of one clSetKernelArg call for an external
 * module: the call opener, the kernel handle
 * "kernel[ID_<module_name>_base( + c0)]", the argument index "arg_idx"
 * and the argument size line "size_line".
 * " + c0" is appended to the kernel index when the first io group of
 * "module" has more than one memory port.
 * The indentation is left increased by 2; it is restored by
 * print_ext_module_arg_close_intel.
 */
static __isl_give isl_printer *print_ext_module_arg_open_intel(
    __isl_take isl_printer *p, struct autosa_hw_module *module,
    const char *module_name, int arg_idx, const char *size_line)
{
  p = print_str_new_line(p, "status = clSetKernelArg(");
  p = isl_printer_indent(p, 2);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "kernel[ID_");
  p = isl_printer_print_str(p, module_name);
  p = isl_printer_print_str(p, "_base");
  if (module->io_groups[0]->n_mem_ports > 1)
  {
    p = isl_printer_print_str(p, " + c0");
  }
  p = isl_printer_print_str(p, "],");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_int(p, arg_idx);
  p = isl_printer_print_str(p, ",");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, size_line);
  p = isl_printer_end_line(p);

  return p;
}

/* Finish a clSetKernelArg call started by print_ext_module_arg_open_intel:
 * restore the indentation and print the status check.
 */
static __isl_give isl_printer *print_ext_module_arg_close_intel(
    __isl_take isl_printer *p)
{
  p = isl_printer_indent(p, -2);
  return print_str_new_line(p, "CHECK(status);");
}

/* Print out the statements for setting the OpenCL arguments for the io
 * modules connected to the external memory. 
 * - set_ext_module_args_upper
 * - set_ext_module_args_lower
 * 
 * This function only works for Intel OpenCL.
 * Originally, for each module, we have the following arguments:
 * - the module identifiers
 * - the parameters
 * - the host loop iterators
 * - the array accessed by the modules
 * - the fifos
 * - the enable signal
 * 
 * We will ignore the fifos since for Intel OpenCL designs will replace these 
 * fifos later with channels.
 *
 * Nothing is printed for a "lower" statement (upper == 0 and lower != 0).
 * The arguments are numbered consecutively from 0 in the order listed
 * above.  "prog" is not used by this function.
 */
static __isl_give isl_printer *autosa_kernel_print_set_ext_module_args(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct autosa_prog *prog)
{
  int upper = stmt->u.m.upper;
  int lower = stmt->u.m.lower;
  int complete = (upper == 0 && lower == 0);
  int dummy = stmt->u.m.dummy;
  char *module_name = stmt->u.m.module_name;
  struct autosa_hw_module *module = stmt->u.m.module;
  int n_arg = 0;
  isl_space *space;
  int n;

  if (!(complete || upper))
    return p;

  /* Module identifiers.
   * Dummy modules will never be instantiated at the host code.
   */
  if (!dummy)
  {
    int n_id = isl_id_list_n_id(module->inst_ids);

    for (int i = 0; i < n_id; i++)
    {
      p = print_ext_module_arg_open_intel(p, module, module_name, n_arg++,
                                          "sizeof(unsigned int),");

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "(void *)&c");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, ");");
      p = isl_printer_end_line(p);

      p = print_ext_module_arg_close_intel(p);
    }
  }

  /* Params.
   * The space must stay alive while the parameter names are read from it,
   * and the names live in the parameter dimensions of that space.
   */
  space = isl_union_set_get_space(module->kernel->arrays);
  n = isl_space_dim(space, isl_dim_param);
  for (int i = 0; i < n; i++)
  {
    const char *name = isl_space_get_dim_name(space, isl_dim_param, i);

    p = print_ext_module_arg_open_intel(p, module, module_name, n_arg++,
                                        "sizeof(unsigned int),");

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "(void *)&");
    p = isl_printer_print_str(p, name);
    p = isl_printer_print_str(p, ");");
    p = isl_printer_end_line(p);

    p = print_ext_module_arg_close_intel(p);
  }
  isl_space_free(space);

  /* Host iters */
  n = isl_space_dim(module->kernel->space, isl_dim_set);
  for (int i = 0; i < n; i++)
  {
    const char *name = isl_space_get_dim_name(module->kernel->space, isl_dim_set, i);

    p = print_ext_module_arg_open_intel(p, module, module_name, n_arg++,
                                        "sizeof(unsigned int),");

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "(void *)&");
    p = isl_printer_print_str(p, name);
    p = isl_printer_print_str(p, ");");
    p = isl_printer_end_line(p);

    p = print_ext_module_arg_close_intel(p);
  }

  /* Scalars and arrays.
   * IO modules will not contain any scalar inputs, so only the device
   * buffer of the accessed array is passed.
   */
  if (module->type != PE_MODULE && module->to_mem)
  {
    struct autosa_array_ref_group *group = module->io_groups[0];
    struct autosa_local_array_info *local_array = group->local_array;

    p = print_ext_module_arg_open_intel(p, module, module_name, n_arg++,
                                        "sizeof(cl_mem),");

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "(void *)&buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, "[");
    /* With several memory ports the buffer index is offset by c0. */
    if (group->n_mem_ports != 1)
    {
      p = isl_printer_print_str(p, "c0 + ");
    }
    p = isl_printer_print_int(p, module->n_array_ref);
    p = isl_printer_print_str(p, "]);");
    p = isl_printer_end_line(p);

    p = print_ext_module_arg_close_intel(p);
  }

  return p;
}

/* isl print_user callback used when emitting the kernel argument setup.
 *
 * The statement is taken from the user pointer of the node annotation.
 * For an external module statement the clSetKernelArg calls are printed;
 * for any other statement type "p" is returned untouched.
 * "user" points to a print_hw_module_data; only its "prog" field is used.
 */
static __isl_give isl_printer *print_set_ext_module_args_stmt(
    __isl_take isl_printer *p,
    __isl_take isl_ast_print_options *print_options,
    __isl_keep isl_ast_node *node, void *user)
{
  struct print_hw_module_data *hw = (struct print_hw_module_data *)user;
  isl_id *annotation = isl_ast_node_get_annotation(node);
  struct autosa_kernel_stmt *stmt =
      (struct autosa_kernel_stmt *)isl_id_get_user(annotation);

  isl_id_free(annotation);
  isl_ast_print_options_free(print_options);

  if (stmt->type == AUTOSA_KERNEL_STMT_EXT_MODULE)
    return autosa_kernel_print_set_ext_module_args(p, stmt, hw->prog);

  return p;
}

/* Print the clEnqueueNDRangeKernel call that launches the kernel of one
 * external module on the command queue with the same index.
 *
 * Both the queue and the kernel are indexed by "ID_<module_name>_base",
 * with " + c0" appended when the first io group of the module has more
 * than one memory port.
 * Nothing is printed for a "lower" statement (upper == 0 and lower != 0).
 * "prog" is not used by this function.
 */
static __isl_give isl_printer *autosa_kernel_print_launch_ext_module_kernels(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct autosa_prog *prog)
{
  static const char *const handle_open[] = {"cmdQueue[ID_", "kernel[ID_"};
  static const char *const fixed_args[] = {
      "1,", "NULL,", "globalWorkSize,", "localWorkSize,", "0,", "NULL,", "NULL);",
  };
  const int n_fixed = sizeof(fixed_args) / sizeof(fixed_args[0]);
  struct autosa_hw_module *module = stmt->u.m.module;
  const char *module_name = stmt->u.m.module_name;

  /* Only complete and upper statements produce a launch. */
  if (!stmt->u.m.upper && stmt->u.m.lower)
    return p;

  p = print_str_new_line(p, "status = clEnqueueNDRangeKernel(");
  p = isl_printer_indent(p, 2);

  /* First the command queue, then the kernel, indexed identically. */
  for (int h = 0; h < 2; h++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, handle_open[h]);
    p = isl_printer_print_str(p, module_name);
    p = isl_printer_print_str(p, "_base");
    if (module->io_groups[0]->n_mem_ports > 1)
      p = isl_printer_print_str(p, " + c0");
    p = isl_printer_print_str(p, "],");
    p = isl_printer_end_line(p);
  }

  for (int k = 0; k < n_fixed; k++)
    p = print_str_new_line(p, fixed_args[k]);

  p = isl_printer_indent(p, -2);
  return print_str_new_line(p, "CHECK(status);");
}

/* isl print_user callback used when emitting the kernel launches.
 *
 * The statement is taken from the user pointer of the node annotation.
 * For an external module statement the launch call is printed;
 * for any other statement type "p" is returned untouched.
 * "user" points to a print_hw_module_data; only its "prog" field is used.
 */
static __isl_give isl_printer *print_launch_ext_module_kernels_stmt(
    __isl_take isl_printer *p,
    __isl_take isl_ast_print_options *print_options,
    __isl_keep isl_ast_node *node, void *user)
{
  struct print_hw_module_data *hw = (struct print_hw_module_data *)user;
  isl_id *annotation = isl_ast_node_get_annotation(node);
  struct autosa_kernel_stmt *stmt =
      (struct autosa_kernel_stmt *)isl_id_get_user(annotation);

  isl_id_free(annotation);
  isl_ast_print_options_free(print_options);

  if (stmt->type == AUTOSA_KERNEL_STMT_EXT_MODULE)
    return autosa_kernel_print_launch_ext_module_kernels(p, stmt, hw->prog);

  return p;
}

/* Print the host code that sets the kernel arguments.
 *
 * A fixed preamble declares globalWorkSize and localWorkSize and sets
 * both to 1.  Then the wrapped tree of every external module is printed
 * with print_set_ext_module_args_stmt as the user-statement callback,
 * each followed by an empty line.
 * "kernel" is not used by this function.
 * TODO: We need to filter out the module declaration trees and
 * print them for Intel devices.
 */
static __isl_give isl_printer *print_set_kernel_arguments_intel(
    __isl_take isl_printer *p,
    struct autosa_prog *prog,
    struct autosa_kernel *kernel,
    struct autosa_hw_top_module *top)
{
  static const char *const preamble[] = {
      "// Set the arguments",
      /* Default settings */
      "size_t globalWorkSize[1];",
      "size_t localWorkSize[1];",
      "globalWorkSize[0] = 1;",
      "localWorkSize[0] = 1;",
  };
  const int n_preamble = sizeof(preamble) / sizeof(preamble[0]);
  struct print_hw_module_data hw_data = {NULL, prog, NULL, NULL};
  isl_ctx *ctx = prog->ctx;

  for (int k = 0; k < n_preamble; k++)
    p = print_str_new_line(p, preamble[k]);
  p = isl_printer_end_line(p);

  /* Print the AST of each external module. */
  for (int m = 0; m < top->n_ext_module; m++) {
    isl_ast_print_options *opts = isl_ast_print_options_alloc(ctx);

    opts = isl_ast_print_options_set_print_user(
        opts, &print_set_ext_module_args_stmt, &hw_data);
    p = isl_ast_node_print(top->ext_module_wrapped_trees[m], p, opts);
    p = isl_printer_end_line(p);
  }

  return p;
}

/* Print the host code that launches the kernels.
 *
 * The wrapped tree of every external module is printed with
 * print_launch_ext_module_kernels_stmt as the user-statement callback,
 * which emits one launch per io module connected to the external memory,
 * each on its own command queue.  Every tree is followed by an empty line.
 * "kernel" is not used by this function.
 */
static __isl_give isl_printer *print_launch_kernel_intel(
    __isl_take isl_printer *p,
    struct autosa_prog *prog,
    struct autosa_kernel *kernel,
    struct autosa_hw_top_module *top)
{
  struct print_hw_module_data hw_data = {NULL, prog, NULL, NULL};
  isl_ctx *ctx = prog->ctx;

  p = print_str_new_line(p, "// Launch the kernels");

  /* Print the AST of each external module. */
  for (int m = 0; m < top->n_ext_module; m++) {
    isl_ast_print_options *opts = isl_ast_print_options_alloc(ctx);

    opts = isl_ast_print_options_set_print_user(
        opts, &print_launch_ext_module_kernels_stmt, &hw_data);
    p = isl_ast_node_print(top->ext_module_wrapped_trees[m], p, opts);
    p = isl_printer_end_line(p);
  }

  return p;
}

/* Print the generated host loop that waits for every command queue
 * to finish.
 */
static __isl_give isl_printer *print_finish_all_queues_intel(
    __isl_take isl_printer *p)
{
  p = print_str_new_line(p, "for (int i = 0; i < NUM_QUEUES_TO_CREATE; i++) {");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "status = clFinish(cmdQueue[i]); CHECK(status);");
  p = isl_printer_indent(p, -2);
  return print_str_new_line(p, "}");
}

/* Print the user statement of the host code to "p".
 *
 * Three kinds of nodes are handled:
 *  - nodes without an annotation (device setup, teardown, copies and
 *    merges) are forwarded to print_device_node_intel;
 *  - nodes annotated "user" are original user statements and are printed
 *    by autosa_kernel_print_domain;
 *  - every other annotated node is a kernel launch.
 *
 * For a kernel launch a block is printed that sets the kernel arguments,
 * waits for all queues, records fpga_begin, launches the kernels, waits
 * for all queues again and records fpga_end.
 * "user" points to a print_host_user_data.
 */
static __isl_give isl_printer *print_host_user_intel(__isl_take isl_printer *p,
                                                     __isl_take isl_ast_print_options *print_options,
                                                     __isl_keep isl_ast_node *node, void *user)
{
  struct print_host_user_data *data = (struct print_host_user_data *)user;
  struct autosa_hw_top_module *top = data->top;
  struct autosa_kernel *kernel;
  isl_id *annotation;
  void *payload;
  int is_user;

  isl_ast_print_options_free(print_options);

  annotation = isl_ast_node_get_annotation(node);
  if (!annotation)
    return print_device_node_intel(p, node, data->prog, data->hls->hls, top);

  /* The payload is a statement for "user" nodes and a kernel otherwise. */
  is_user = !strcmp(isl_id_get_name(annotation), "user");
  payload = isl_id_get_user(annotation);
  isl_id_free(annotation);

  if (is_user)
    return autosa_kernel_print_domain(p, (struct autosa_kernel_stmt *)payload);
  kernel = (struct autosa_kernel *)payload;

  /* Print OpenCL host. */
  p = ppcg_start_block(p);

  p = print_set_kernel_arguments_intel(p, data->prog, kernel, top);

  p = print_finish_all_queues_intel(p);
  p = print_str_new_line(p, "fpga_begin = std::chrono::high_resolution_clock::now();");

  p = print_launch_kernel_intel(p, data->prog, kernel, top);

  p = print_finish_all_queues_intel(p);
  p = print_str_new_line(p, "fpga_end = std::chrono::high_resolution_clock::now();");

  p = ppcg_end_block(p);
  return isl_printer_end_line(p);
}

/* Print the signature of "module" on the current line, without ending it.
 *
 * With "inter" equal to -1 the function is declared "__kernel void";
 * otherwise plain "void" is used and the name is extended with
 * "_intra_trans" (inter == 0) or "_inter_trans" (inter == 1).
 * "_boundary" and "_serialize" are appended when the respective flag is
 * set, followed by the parenthesized argument list.
 */
static __isl_give isl_printer *print_module_header_intel(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_hw_module *module,
    int inter, int boundary, int serialize)
{
  const char *stage_suffix = NULL;

  switch (inter) {
  case 0:
    stage_suffix = "_intra_trans";
    break;
  case 1:
    stage_suffix = "_inter_trans";
    break;
  default:
    break;
  }

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, inter == -1 ? "__kernel void " : "void ");
  p = isl_printer_print_str(p, module->name);
  if (stage_suffix)
    p = isl_printer_print_str(p, stage_suffix);
  if (boundary)
    p = isl_printer_print_str(p, "_boundary");
  if (serialize)
    p = isl_printer_print_str(p, "_serialize");

  p = isl_printer_print_str(p, "(");
  p = print_module_arguments(p, prog, module->kernel, module, 1, INTEL_HW,
                             inter, -1, boundary, serialize);
  return isl_printer_print_str(p, ")");
}

/* Write the header of the given module to hls->kernel_c through a
 * temporary printer.
 *
 * If "inter" is -1, this is a normal module: the signature is preceded by
 * "__attribute__((max_global_work_dim(0)))" and, when the module is either
 * serialized but not printed in its serialize form, or has to_mem different
 * from 1, also by "__attribute__((autorun))".
 * If "inter" is 0, this is an intra_trans module.
 * If "inter" is 1, this is an inter_trans module.
 *
 * The signature line is not terminated; the caller prints what follows.
 * Always returns isl_stat_ok.
 */
static isl_stat print_module_headers_intel(
    struct autosa_prog *prog, struct autosa_hw_module *module,
    struct hls_info *hls, int inter, int boundary, int serialize)
{
  isl_printer *out = isl_printer_to_file(prog->ctx, hls->kernel_c);

  out = isl_printer_set_output_format(out, ISL_FORMAT_C);
  if (inter == -1) {
    const int autorun =
        (module->is_serialized && !serialize) || (module->to_mem != 1);

    out = print_str_new_line(out, "__attribute__((max_global_work_dim(0)))");
    if (autorun)
      out = print_str_new_line(out, "__attribute__((autorun))");
  }
  out = print_module_header_intel(out, prog, module, inter, boundary, serialize);
  isl_printer_free(out);

  return isl_stat_ok;
}

/* Print one array declaration for "var" on its own line:
 * "<type> <name><suffix>[d0][d1]...;".
 * The element type is the array type for a single lane and
 * "<array name>_t<n_lane>" otherwise.  "suffix" may be NULL, in which case
 * nothing is appended to the variable name.
 */
static __isl_give isl_printer *print_var_buffer_decl_intel(
    __isl_take isl_printer *p, struct autosa_kernel_var *var,
    const char *suffix)
{
  const int n_dim = isl_vec_size(var->size);

  p = isl_printer_start_line(p);
  if (var->n_lane == 1) {
    p = isl_printer_print_str(p, var->array->type);
  } else {
    p = isl_printer_print_str(p, var->array->name);
    p = isl_printer_print_str(p, "_t");
    p = isl_printer_print_int(p, var->n_lane);
  }
  p = isl_printer_print_str(p, " ");
  p = isl_printer_print_str(p, var->name);
  if (suffix)
    p = isl_printer_print_str(p, suffix);

  for (int d = 0; d < n_dim; ++d) {
    isl_val *extent = isl_vec_get_element_val(var->size, d);

    p = isl_printer_print_str(p, "[");
    p = isl_printer_print_val(p, extent);
    isl_val_free(extent);
    p = isl_printer_print_str(p, "]");
  }
  p = isl_printer_print_str(p, ";");
  return isl_printer_end_line(p);
}

/* Print out variable declarations on Intel platforms.
 *
 * Without double buffering a single declaration of "var" is printed.
 * With double buffering two declarations are printed, named
 * "<var>_ping" and "<var>_pong".
 * The memory type of the variable is queried but does not influence the
 * printed declaration.
 */
static __isl_give isl_printer *print_module_var_intel(
    __isl_take isl_printer *p,
    struct autosa_kernel_var *var, int double_buffer,
    struct autosa_hw_module *module)
{
  /* 0: FF 1: LUTRAM 2: BRAM 3: URAM */
  int mem_kind = extract_memory_type(module, var, module->options->autosa->uram);

  (void)mem_kind;

  if (!double_buffer)
    return print_var_buffer_decl_intel(p, var, NULL);

  p = print_var_buffer_decl_intel(p, var, "_ping");
  /* Print pong buffer */
  return print_var_buffer_decl_intel(p, var, "_pong");
}

/* Print the local variable declarations of "module".
 *
 * Declarations are only printed for a normal module ("inter" == -1);
 * for any other value of "inter" nothing is printed.
 * For a double-buffered module the buffer arbitration flags "arb",
 * "inter_trans_en" and "intra_trans_en" are declared as well, followed by
 * one "<name>, <name>_prev" declaration per iterator of the intra space
 * (input modules) or the inter space (other modules).
 */
static __isl_give isl_printer *print_module_vars_intel(__isl_take isl_printer *p,
                                                       struct autosa_hw_module *module, int inter)
{
  const char *iter_type;
  isl_space *iter_space;
  int n_iter;

  if (inter != -1)
    return p;

  for (int v = 0; v < module->n_var; ++v)
    p = print_module_var_intel(p, &module->var[v], module->double_buffer, module);

  if (!module->double_buffer)
    return p;

  iter_type = isl_options_get_ast_iterator_type(module->kernel->ctx);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "bool arb = 0;");
  p = isl_printer_end_line(p);

  /* The initial enable flags depend on the module direction. */
  p = isl_printer_start_line(p);
  if (module->in)
    p = isl_printer_print_str(p, "bool inter_trans_en = 1;");
  else
    p = isl_printer_print_str(p, "bool inter_trans_en = 0;");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  if (module->in)
    p = isl_printer_print_str(p, "bool intra_trans_en = 0;");
  else
    p = isl_printer_print_str(p, "bool intra_trans_en = 1;");
  p = isl_printer_end_line(p);

  /* iterators */
  iter_space = module->in ? module->intra_space : module->inter_space;
  n_iter = isl_space_dim(iter_space, isl_dim_set);
  for (int d = 0; d < n_iter; d++) {
    const char *iter_name = isl_space_get_dim_name(iter_space, isl_dim_set, d);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, iter_type);
    p = isl_printer_print_str(p, " ");
    p = isl_printer_print_str(p, iter_name);
    p = isl_printer_print_str(p, ", ");
    p = isl_printer_print_str(p, iter_name);
    p = isl_printer_print_str(p, "_prev");
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
  }

  return p;
}

/* Print the intra_trans module.
 *
 * The definition is bracketed by two "Module Definition" marker comments,
 * and its variable declarations by two "Variable Declaration" marker
 * comments.  The body is the printed form of module->intra_tree, using
 * print_module_stmt and print_module_for as callbacks.
 *
 * Output is produced through three channels: the printer "p", a temporary
 * printer created by print_module_headers_intel on hls->kernel_c, and
 * direct fprintf calls on hls->kernel_c.  The statement order below
 * therefore determines the order of the generated text.
 * NOTE(review): this presumably requires "p" to print to hls->kernel_c
 * as well; confirm against the caller.
 */
static __isl_give isl_printer *autosa_print_intra_trans_module(
    __isl_take isl_printer *p,
    struct autosa_hw_module *module, struct autosa_prog *prog,
    struct hls_info *hls, int boundary)
{
  struct print_hw_module_data hw_data = {hls, prog, module, NULL};
  isl_ast_print_options *print_options;
  isl_ctx *ctx = isl_printer_get_ctx(p);

  /* Opening marker of the module definition. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  /* Signature (inter == 0 selects the "_intra_trans" name), then the
   * opening brace written directly to the file.
   */
  print_module_headers_intel(prog, module, hls, 0, boundary, 0);
  fprintf(hls->kernel_c, " {\n");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = print_module_iterators(p, hls->kernel_c, module);
  /* With inter == 0, print_module_vars_intel prints no declarations. */
  p = print_module_vars_intel(p, module, 0);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  /* A double-buffered module returns early when intra transfer is off. */
  if (module->double_buffer)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "if (!intra_trans_en) return;");
    p = isl_printer_end_line(p);
    p = isl_printer_end_line(p);
  }

  /* Print the module body from its AST. */
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_user(print_options,
                                                       &print_module_stmt, &hw_data);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &print_module_for, &hw_data);

  //p = print_str_new_line(p, "#pragma loop_coalesce");
  p = isl_ast_node_print(module->intra_tree, p, print_options);
  p = isl_printer_indent(p, -2);

  /* Closing brace written directly to the file, then the closing marker. */
  fprintf(hls->kernel_c, "}\n");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = isl_printer_end_line(p);

  return p;
}

/* Print the inter_trans module: a "Module Definition" marker, the module
 * header, a variable declaration section and the code generated from
 * module->inter_tree, or from module->boundary_inter_tree when "boundary"
 * is set. "p" is returned untouched when the selected tree is absent.
 * The enclosing braces are written directly to hls->kernel_c.
 * For a double-buffered module the body starts with an early return that
 * is taken when inter_trans_en is not set.
 */
static __isl_give isl_printer *autosa_print_inter_trans_module(
    __isl_take isl_printer *p,
    struct autosa_hw_module *module, struct autosa_prog *prog,
    struct hls_info *hls, int boundary)
{
  isl_ast_node *tree = boundary ? module->boundary_inter_tree : module->inter_tree;

  if (!tree)
    return p;

  isl_ctx *ctx = isl_printer_get_ctx(p);
  struct print_hw_module_data stmt_data = {hls, prog, module, NULL};
  isl_ast_print_options *opts;

  p = print_str_new_line(p, "/* Module Definition */");

  /* Header (inter == 1 selects the inter_trans variant) and opening brace. */
  print_module_headers_intel(prog, module, hls, 1, boundary, 0);
  fputs(" {\n", hls->kernel_c);
  p = isl_printer_indent(p, 2);

  p = print_str_new_line(p, "/* Variable Declaration */");
  p = print_module_iterators(p, hls->kernel_c, module);
  p = print_module_vars_intel(p, module, 1);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  if (module->double_buffer)
  {
    p = print_str_new_line(p, "if (!inter_trans_en) return;");
    p = isl_printer_end_line(p);
  }

  /* User statements and for loops are printed through the module callbacks. */
  opts = isl_ast_print_options_alloc(ctx);
  opts = isl_ast_print_options_set_print_user(opts, &print_module_stmt, &stmt_data);
  opts = isl_ast_print_options_set_print_for(opts, &print_module_for, &stmt_data);

  p = isl_ast_node_print(tree, p, opts);
  p = isl_printer_indent(p, -2);

  fputs("}\n", hls->kernel_c);
  p = print_str_new_line(p, "/* Module Definition */");

  return isl_printer_end_line(p);
}

/* Print the serialization module that connects the external memory to the
 * top-level I/O module.
 *
 * The output consists of a "Module Definition" marker, the module header,
 * a variable declaration section (the module iterators are only declared
 * when the use_cplusplus_template option is off), the body produced by
 * print_module_serialize_body(), a closing marker and an empty line.
 * The enclosing braces are written directly to hls->kernel_c.
 */
static __isl_give isl_printer *autosa_print_serialize_module(
  __isl_take isl_printer *p,
  struct autosa_hw_module *module, struct autosa_prog *prog,
  struct hls_info *hls, int boundary)
{
  /* Print core. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  print_module_headers_intel(prog, module, hls, -1, boundary, 1);
  fprintf(hls->kernel_c, " {\n");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  if (!prog->scop->options->autosa->use_cplusplus_template) {
    p = print_module_iterators(p, hls->kernel_c, module);
  }
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  p = print_module_serialize_body(p, module, hls);
  p = isl_printer_indent(p, -2);
  fprintf(hls->kernel_c, "}\n");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = isl_printer_end_line(p);
  return p;
}

/* Print the default module, i.e. the code generated from
 * module->device_tree, or from module->boundary_tree when "boundary" is
 * set. "p" is returned untouched when the selected tree is absent.
 *
 * The output consists of a "Module Definition" marker, the module header,
 * a variable declaration section (the module iterators are only declared
 * when the use_cplusplus_template option is off), the generated body, a
 * closing marker and an empty line. The enclosing braces are written
 * directly to hls->kernel_c.
 *
 * If module->to_mem is set and the host_serialize option is enabled, the
 * serialization module is printed right after the module itself.
 */
static __isl_give isl_printer *autosa_print_default_module(
    __isl_take isl_printer *p,
    struct autosa_hw_module *module, struct autosa_prog *prog,
    struct hls_info *hls, int boundary)
{
  isl_ast_node *tree = boundary ? module->boundary_tree : module->device_tree;

  if (!tree)
    return p;

  struct print_hw_module_data hw_data = {hls, prog, module, NULL};
  isl_ast_print_options *print_options;
  isl_ctx *ctx = isl_printer_get_ctx(p);

  /* Print core. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  print_module_headers_intel(prog, module, hls, -1, boundary, 0);
  fprintf(hls->kernel_c, " {\n");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  if (!prog->scop->options->autosa->use_cplusplus_template) {
    p = print_module_iterators(p, hls->kernel_c, module);
  }
  p = print_module_vars_intel(p, module, -1);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  /* User statements and for loops are printed through the module callbacks. */
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_user(print_options,
                                                       &print_module_stmt, &hw_data);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &print_module_for, &hw_data);

  p = isl_ast_node_print(tree, p, print_options);
  p = isl_printer_indent(p, -2);

  fprintf(hls->kernel_c, "}\n");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = isl_printer_end_line(p);

  /* If the module serialization is enabled, we will print out an extra module
   * for serializing the data. */
  if (module->to_mem && module->options->autosa->host_serialize) {
    p = autosa_print_serialize_module(p, module, prog, hls, boundary);
  }

  return p;
}

/* Print the core header of a PE dummy module on a new line:
 *
 *   [void ]<array>[_<nr> | _drain]_PE_dummy{_in|_out}(<arguments>)
 *
 * "void " is only printed when "types" is set. The "_<nr>" suffix is added
 * for an I/O group whose local array has more than one I/O group, the
 * "_drain" suffix for a drain group. The line is not terminated here.
 */
static __isl_give isl_printer *print_pe_dummy_module_core_header_intel(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_pe_dummy_module *module, int types)
{
  struct autosa_array_ref_group *io_group = module->io_group;
  const char *direction = module->in ? "_in" : "_out";

  p = isl_printer_start_line(p);
  if (types)
    p = isl_printer_print_str(p, "void ");

  /* Group name. */
  p = isl_printer_print_str(p, io_group->array->name);
  switch (io_group->group_type)
  {
  case AUTOSA_IO_GROUP:
    if (io_group->local_array->n_io_group > 1)
    {
      p = isl_printer_print_str(p, "_");
      p = isl_printer_print_int(p, io_group->nr);
    }
    break;
  case AUTOSA_DRAIN_GROUP:
    p = isl_printer_print_str(p, "_drain");
    break;
  default:
    break;
  }

  p = isl_printer_print_str(p, "_PE_dummy");
  p = isl_printer_print_str(p, direction);
  p = isl_printer_print_str(p, "(");
  p = print_pe_dummy_module_arguments(p, prog, module->module->kernel,
                                      module, types, INTEL_HW);
  return isl_printer_print_str(p, ")");
}

/* Print the core header of the PE dummy module "module".
 * This simply forwards to print_pe_dummy_module_core_header_intel();
 * "hls" is not used.
 */
static __isl_give isl_printer *print_pe_dummy_module_core_headers_intel(
    __isl_take isl_printer *p, struct autosa_prog *prog,
    struct autosa_pe_dummy_module *module, struct hls_info *hls, int types)
{
  (void)hls;

  return print_pe_dummy_module_core_header_intel(p, prog, module, types);
}

/* Print the kernel header of the given PE dummy module on a new line:
 *
 *   __kernel void <array>[_<nr> | _drain]_PE_dummy{_in|_out}(<arguments>)
 *
 * The "_<nr>" suffix is added for an I/O group whose local array has more
 * than one I/O group, the "_drain" suffix for a drain group.
 * "inter" and "boundary" are not used. The line is not terminated here.
 */
static __isl_give isl_printer *print_pe_dummy_module_header_intel(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_pe_dummy_module *module,
    int inter, int boundary)
{
  struct autosa_array_ref_group *io_group = module->io_group;
  const char *direction = module->in ? "_in" : "_out";

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "__kernel void ");

  /* Group name. */
  p = isl_printer_print_str(p, io_group->array->name);
  switch (io_group->group_type)
  {
  case AUTOSA_IO_GROUP:
    if (io_group->local_array->n_io_group > 1)
    {
      p = isl_printer_print_str(p, "_");
      p = isl_printer_print_int(p, io_group->nr);
    }
    break;
  case AUTOSA_DRAIN_GROUP:
    p = isl_printer_print_str(p, "_drain");
    break;
  default:
    break;
  }

  p = isl_printer_print_str(p, "_PE_dummy");
  p = isl_printer_print_str(p, direction);
  p = isl_printer_print_str(p, "(");
  p = print_pe_dummy_module_arguments(p, prog, module->module->kernel,
                                      module, 1, INTEL_HW);
  return isl_printer_print_str(p, ")");
}

/* Print the header of the given PE dummy module to hls->kernel_c.
 * The header is preceded by the max_global_work_dim(0) and autorun
 * attributes, each on its own line, and is followed by a line end.
 * "inter" and "boundary" are forwarded to
 * print_pe_dummy_module_header_intel().
 * Always returns isl_stat_ok.
 */
static isl_stat print_pe_dummy_module_headers_intel(
    struct autosa_prog *prog, struct autosa_pe_dummy_module *module,
    struct hls_info *hls, int inter, int boundary)
{
  /* A temporary C-format printer that writes straight into the kernel file. */
  isl_printer *kernel_p = isl_printer_set_output_format(
      isl_printer_to_file(prog->ctx, hls->kernel_c), ISL_FORMAT_C);

  kernel_p = print_str_new_line(kernel_p, "__attribute__((max_global_work_dim(0)))");
  kernel_p = print_str_new_line(kernel_p, "__attribute__((autorun))");
  kernel_p = print_pe_dummy_module_header_intel(kernel_p, prog, module, inter, boundary);
  kernel_p = isl_printer_end_line(kernel_p);
  isl_printer_free(kernel_p);

  return isl_stat_ok;
}

/* Print a PE dummy module for the Intel target.
 *
 * The generated kernel has the form
 *
 *   <header> {
 *     while (1) {
 *       <type> fifo_data;
 *       fifo_data = read_channel_intel(<fifo name>_in);
 *     }
 *   }
 *
 * where <type> is the array element type when the group has a single data
 * lane and "<array>_t<n_lane>" otherwise. The output is wrapped in
 * "Module Definition" markers and followed by an empty line; the braces of
 * the kernel are written directly to hls->kernel_c.
 */
static __isl_give isl_printer *autosa_print_default_pe_dummy_module(
    __isl_take isl_printer *p,
    struct autosa_pe_dummy_module *pe_dummy_module,
    struct autosa_prog *prog, struct hls_info *hls, int boundary)
{
  struct autosa_array_ref_group *group = pe_dummy_module->io_group;
  int n_lane;

  /* Print core. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  print_pe_dummy_module_headers_intel(prog, pe_dummy_module, hls, -1, boundary);

  fprintf(hls->kernel_c, " {\n");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "while (1) {");
  p = isl_printer_indent(p, 2);

  /* [type] fifo_data; */
  n_lane = get_io_group_n_lane(NULL, pe_dummy_module, group);
  p = isl_printer_start_line(p);
  if (n_lane == 1) {
    p = isl_printer_print_str(p, group->array->type);
  } else {
    p = isl_printer_print_str(p, group->array->name);
    p = isl_printer_print_str(p, "_t");
    p = isl_printer_print_int(p, n_lane);
  }
  p = isl_printer_print_str(p, " fifo_data;");
  p = isl_printer_end_line(p);

  /* fifo_data = fifo.read(); */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "fifo_data = read_channel_intel(");
  p = autosa_array_ref_group_print_fifo_name(group, p);
  p = isl_printer_print_str(p, "_in);");
  p = isl_printer_end_line(p);

  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  p = isl_printer_indent(p, -2);
  fprintf(hls->kernel_c, "}\n");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = isl_printer_end_line(p);

  return p;
}

/* Scratch data collected by extract_double_buffer_module_intel_data() while
 * walking the outer, intra_trans and inter_trans trees of a double-buffered
 * module. For each of the three trees it holds the generated loop-counter
 * code and the name, lower bound and upper bound of every loop iterator,
 * all as heap-allocated strings.
 */
struct print_db_module_intel_data {
  /* Tree currently being processed. */
  int inter; // -1: outer 0: intra 1: inter  
  /* Set to 1 by count_module_for_alt() once an if node has been visited. */
  int under_if; 
  /* Set to 1 by count_module_for_alt() once a user node has been visited. */
  int reach_user;

  /* Scratch string printer used by extract_module_for() to format the
   * loop-counter lines; its indentation tracks the loop depth. */
  isl_printer *p_for;
  /* Scratch string printer allocated and freed together with p_for. */
  isl_printer *p_user;
  /* Outer */
  /* Loop-counter lines; the lines of a loop are inserted in front of those
   * of the loops handled before it. */
  std::vector<char *> outer_for_logic;  
  /* Printed iterator, lower bound (init) and upper bound expressions;
   * entries with the same index describe the same loop. */
  std::vector<char *> outer_iterator_name;
  std::vector<char *> outer_iterator_lb;
  std::vector<char *> outer_iterator_ub;
  /* Number of for nodes counted by count_module_for(). */
  int outer_for_level;
  /* Inter */
  /* Same layout as the "outer" fields, for the inter_trans tree. */
  std::vector<char *> inter_for_logic;  
  std::vector<char *> inter_iterator_name;
  std::vector<char *> inter_iterator_lb;
  std::vector<char *> inter_iterator_ub;
  /* Number of for nodes counted by count_module_for_alt(). */
  int inter_for_level;
  /* Intra */
  /* Same layout as the "outer" fields, for the intra_trans tree. */
  std::vector<char *> intra_for_logic;  
  std::vector<char *> intra_iterator_name;
  std::vector<char *> intra_iterator_lb;
  std::vector<char *> intra_iterator_ub;
  /* Number of for nodes counted by count_module_for(). */
  int intra_for_level;
};

/* Print one declaration of the form
 *
 *   int <name> = <lb>; / * UB: <ub> * /
 *
 * for every entry of "names". The three vectors are expected to have the
 * same length, with entries of equal index describing the same iterator.
 * Each string is freed after it has been printed and its slot is reset to
 * NULL so that the vectors do not keep dangling pointers.
 */
static __isl_give isl_printer *print_db_iterator_decls_intel(
  __isl_take isl_printer *p, std::vector<char *> &names,
  std::vector<char *> &lbs, std::vector<char *> &ubs)
{
  for (size_t i = 0; i < names.size(); i++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "int ");
    p = isl_printer_print_str(p, names[i]);
    free(names[i]);
    names[i] = NULL;
    p = isl_printer_print_str(p, " = ");
    p = isl_printer_print_str(p, lbs[i]);
    free(lbs[i]);
    lbs[i] = NULL;
    p = isl_printer_print_str(p, "; ");
    p = isl_printer_print_str(p, "/* UB: ");
    p = isl_printer_print_str(p, ubs[i]);
    free(ubs[i]);
    ubs[i] = NULL;
    p = isl_printer_print_str(p, " */");
    p = isl_printer_end_line(p);
  }

  return p;
}

/* Print the variable declarations of a double-buffered module that is
 * generated as a single while loop:
 * - the module iterators (print_module_iterators()),
 * - every local buffer of module->var with an extra leading dimension of
 *   size 2,
 * - the state variables arb, inter_trans_en, intra_trans_en, inter_done and
 *   intra_done, whose initial values depend on module->in,
 * - one "int" counter per loop iterator recorded in "data" for the outer,
 *   inter and intra trees, in that order.
 *
 * The iterator name and bound strings stored in "data" are consumed: they
 * are freed here and their slots are set to NULL.
 */
static __isl_give isl_printer *print_double_buffer_module_vars_intel(
  __isl_take isl_printer *p, struct autosa_hw_module *module, struct hls_info *hls,
  struct print_db_module_intel_data *data)
{
  /* Inst ids */
  p = print_module_iterators(p, hls->kernel_c, module);
  /* Local buffer */
  for (int i = 0; i < module->n_var; i++) {
    struct autosa_kernel_var *var = &module->var[i];
    p = isl_printer_start_line(p);
    if (var->n_lane == 1)
      p = isl_printer_print_str(p, var->array->type);
    else
    {
      p = isl_printer_print_str(p, var->array->name);
      p = isl_printer_print_str(p, "_t");
      p = isl_printer_print_int(p, var->n_lane);
    }
    p = isl_printer_print_str(p, " ");
    p = isl_printer_print_str(p, var->name);
    /* Leading dimension that selects one of the two buffers. */
    p = isl_printer_print_str(p, "[2]");
    for (int j = 0; j < isl_vec_size(var->size); j++) {
      isl_val *v;

      p = isl_printer_print_str(p, "[");
      v = isl_vec_get_element_val(var->size, j);
      p = isl_printer_print_val(p, v);
      isl_val_free(v);
      p = isl_printer_print_str(p, "]");
    }
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
  }

  /* State handle variables */
  p = print_str_new_line(p, "bool arb = 0;");
  p = print_str_new_line(p, module->in? "bool inter_trans_en = 1;" : "bool inter_trans_en = 0;");
  p = print_str_new_line(p, module->in? "bool intra_trans_en = 0;" : "bool intra_trans_en = 1;");
  p = print_str_new_line(p, module->in? "bool inter_done = 0;" : "bool inter_done = 1;");
  p = print_str_new_line(p, module->in? "bool intra_done = 1;" : "bool intra_done = 0;");

  /* Iterators */
  p = print_db_iterator_decls_intel(p, data->outer_iterator_name,
                                    data->outer_iterator_lb, data->outer_iterator_ub);
  p = print_db_iterator_decls_intel(p, data->inter_iterator_name,
                                    data->inter_iterator_lb, data->inter_iterator_ub);
  p = print_db_iterator_decls_intel(p, data->intra_iterator_name,
                                    data->intra_iterator_lb, data->intra_iterator_ub);

  return p;
}

/* for-node print callback that only counts loops.
 * The counter selected by data->inter (-1: outer, 0: intra, 1: inter) is
 * incremented and the loop body is then printed with the same options, so
 * that nested loops are counted as well. Nothing is printed for the loop
 * itself.
 */
static __isl_give isl_printer *count_module_for(__isl_take isl_printer *p,
                                                __isl_take isl_ast_print_options *print_options,
                                                __isl_keep isl_ast_node *node, void *user)
{
  struct print_db_module_intel_data *counts = (struct print_db_module_intel_data *)user;

  switch (counts->inter)
  {
  case -1:
    counts->outer_for_level++;
    break;
  case 0:
    counts->intra_for_level++;
    break;
  case 1:
    counts->inter_for_level++;
    break;
  default:
    break;
  }

  isl_ast_node *loop_body = isl_ast_node_for_get_body(node);
  p = isl_ast_node_print(loop_body, p, print_options);
  isl_ast_node_free(loop_body);

  return p;
}

/* Count the for level of the inter_trans tree; meant to be used as the
 * callback of a top-down traversal over all descendants.
 * Since the tree may contain if branches, only one branch is counted:
 * a for node increments data->inter_for_level if no if node has been seen
 * yet, or if an if node has been seen but no user node has been reached.
 * The two branches are assumed to have equal depth.
 * Always returns isl_bool_true.
 */
static isl_bool count_module_for_alt(__isl_keep isl_ast_node *node, void *user) {
  struct print_db_module_intel_data *state = (struct print_db_module_intel_data *)user;

  switch (isl_ast_node_get_type(node))
  {
  case isl_ast_node_if:
    state->under_if = 1;
    break;
  case isl_ast_node_for:
  {
    const int first_branch = (state->under_if == 1 && state->reach_user == 0);

    if (state->under_if == 0 || first_branch)
      state->inter_for_level++;
    break;
  }
  case isl_ast_node_user:
    state->reach_user = 1;
    break;
  default:
    break;
  }

  return isl_bool_true;
}

/* for-node print callback that extracts the loop information.
 *
 * For the loop "node" the printed iterator, upper bound (second argument of
 * the loop condition) and lower bound (init expression) are appended to the
 * name/ub/lb vectors selected by data->inter (-1: outer, 0: intra,
 * 1: inter). In addition, three lines of counter logic are formatted with
 * data->p_for,
 *
 *   <iter>++;
 *   if (<iter> == <ub> + 1) {
 *     <iter> = <init>;
 *
 * and inserted in front of the lines already stored in the matching
 * *_for_logic vector. The closing brace is not generated here.
 * The indentation of data->p_for is reduced by 2 for every handled loop.
 *
 * For the inter tree, a loop whose iterator name has already been recorded
 * is skipped (presumably because the same loop can appear in several if
 * branches - TODO confirm). In all cases the loop body is printed with the
 * same options so that nested loops are visited; nothing is printed to "p"
 * for the loop itself.
 *
 * All strings stored in "data" are heap-allocated and owned by "data".
 */
static __isl_give isl_printer *extract_module_for(__isl_take isl_printer *p,
                                                  __isl_take isl_ast_print_options *print_options,
                                                  __isl_keep isl_ast_node *node, void *user)
{
  struct print_db_module_intel_data *data = (struct print_db_module_intel_data *)user;
  std::vector<char *> *names = NULL, *lbs = NULL, *ubs = NULL, *logic = NULL;
  std::vector<char *> text_lines;
  isl_ast_expr *iterator, *init, *cond, *ub;
  isl_printer *p_local, *p_str;
  isl_ast_node *body;
  int iter_exist = 0;

  /* Select the vectors that belong to the tree being walked. */
  if (data->inter == -1) {
    names = &data->outer_iterator_name;
    lbs = &data->outer_iterator_lb;
    ubs = &data->outer_iterator_ub;
    logic = &data->outer_for_logic;
  } else if (data->inter == 0) {
    names = &data->intra_iterator_name;
    lbs = &data->intra_iterator_lb;
    ubs = &data->intra_iterator_ub;
    logic = &data->intra_for_logic;
  } else if (data->inter == 1) {
    names = &data->inter_iterator_name;
    lbs = &data->inter_iterator_lb;
    ubs = &data->inter_iterator_ub;
    logic = &data->inter_for_logic;
  }

  /* Extract the lower bound and upper bound. */
  iterator = isl_ast_node_for_get_iterator(node);
  init = isl_ast_node_for_get_init(node);
  cond = isl_ast_node_for_get_cond(node);
  ub = isl_ast_expr_op_get_arg(cond, 1);

  p_str = isl_printer_to_str(isl_ast_node_get_ctx(node));
  p_str = isl_printer_set_output_format(p_str, ISL_FORMAT_C);
  p_str = isl_printer_print_ast_expr(p_str, iterator);

  /* Only the inter tree is checked for an already recorded iterator. */
  if (data->inter == 1) {
    char *iter_str = isl_printer_get_str(p_str);

    for (size_t i = 0; iter_str && i < data->inter_iterator_name.size(); i++) {
      if (!strcmp(data->inter_iterator_name[i], iter_str)) {
        iter_exist = 1;
        break;
      }
    }
    free(iter_str);
  }

  if (iter_exist) {
    isl_printer_free(p_str);

    isl_ast_expr_free(iterator);
    isl_ast_expr_free(init);
    isl_ast_expr_free(cond);
    isl_ast_expr_free(ub);

    body = isl_ast_node_for_get_body(node);
    p = isl_ast_node_print(body, p, print_options);
    isl_ast_node_free(body);

    return p;
  }

  /* Record name, upper bound and lower bound. The string printer is
   * flushed between the expressions so that each string only holds one
   * of them. */
  if (names)
    names->push_back(isl_printer_get_str(p_str));
  p_str = isl_printer_flush(p_str);

  p_str = isl_printer_print_ast_expr(p_str, ub);
  if (ubs)
    ubs->push_back(isl_printer_get_str(p_str));
  p_str = isl_printer_flush(p_str);

  p_str = isl_printer_print_ast_expr(p_str, init);
  if (lbs)
    lbs->push_back(isl_printer_get_str(p_str));
  isl_printer_free(p_str);

  /* Format the counter logic one indentation level further out than the
   * previously handled (enclosing) loop. */
  p_local = data->p_for;
  p_local = isl_printer_indent(p_local, -2);

  p_local = isl_printer_start_line(p_local);
  p_local = isl_printer_print_ast_expr(p_local, iterator);
  p_local = isl_printer_print_str(p_local, "++;");
  p_local = isl_printer_end_line(p_local);
  text_lines.push_back(isl_printer_get_str(p_local));
  p_local = isl_printer_flush(p_local);

  p_local = isl_printer_start_line(p_local);
  p_local = isl_printer_print_str(p_local, "if (");
  p_local = isl_printer_print_ast_expr(p_local, iterator);
  p_local = isl_printer_print_str(p_local, " == ");
  p_local = isl_printer_print_ast_expr(p_local, ub);
  p_local = isl_printer_print_str(p_local, " + 1) {");
  p_local = isl_printer_end_line(p_local);
  text_lines.push_back(isl_printer_get_str(p_local));
  p_local = isl_printer_flush(p_local);

  p_local = isl_printer_indent(p_local, 2);
  p_local = isl_printer_start_line(p_local);
  p_local = isl_printer_print_ast_expr(p_local, iterator);
  p_local = isl_printer_print_str(p_local, " = ");
  p_local = isl_printer_print_ast_expr(p_local, init);
  p_local = isl_printer_print_str(p_local, ";");
  p_local = isl_printer_end_line(p_local);
  text_lines.push_back(isl_printer_get_str(p_local));
  p_local = isl_printer_flush(p_local);

  if (logic) {
    logic->insert(logic->begin(), text_lines.begin(), text_lines.end());
  } else {
    /* No destination for an unexpected data->inter value: do not leak. */
    for (size_t i = 0; i < text_lines.size(); i++)
      free(text_lines[i]);
  }

  isl_ast_expr_free(iterator);
  isl_ast_expr_free(init);
  isl_ast_expr_free(cond);
  isl_ast_expr_free(ub);

  p_local = isl_printer_indent(p_local, -2);
  /* Hand the printer back before recursing: nested loops read data->p_for
   * and the owner of "data" frees it. */
  data->p_for = p_local;

  body = isl_ast_node_for_get_body(node);
  p = isl_ast_node_print(body, p, print_options);
  isl_ast_node_free(body);

  return p;
}

static void extract_double_buffer_module_intel_data(
  struct autosa_hw_module *module, int boundary, 
  struct print_db_module_intel_data *data)
{
  isl_ast_print_options *print_options;
  isl_ctx *ctx = module->kernel->ctx;
  isl_printer *p_for, *p_user, *p;
  const char *for_logic, *user_logic;

  /* Outer module */
  data->inter = -1;  
  p = isl_printer_to_str(ctx);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  p_for = isl_printer_to_str(ctx);
  p_for = isl_printer_set_output_format(p_for, ISL_FORMAT_C);
  p_user = isl_printer_to_str(ctx);
  p_user = isl_printer_set_output_format(p_user, ISL_FORMAT_C);
  data->p_for = p_for;
  data->p_user = p_user;
  data->outer_for_level = 0;

  /* Count the for level first. */
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &count_module_for, data);
  if (!boundary)
    p = isl_ast_node_print(module->device_tree, p, print_options);
  else
    p = isl_ast_node_print(module->boundary_tree, p, print_options);

  /* Extract the for and user logic. */
  data->p_for = isl_printer_indent(data->p_for, 2 * data->outer_for_level);
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &extract_module_for, data);
  if (!boundary)
    p = isl_ast_node_print(module->device_tree, p, print_options);
  else
    p = isl_ast_node_print(module->boundary_tree, p, print_options);
  isl_printer_free(p);  
  isl_printer_free(data->p_for);
  isl_printer_free(data->p_user);

  /* Intra module */
  data->inter = 0;
  p = isl_printer_to_str(ctx);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  p_for = isl_printer_to_str(ctx);
  p_for = isl_printer_set_output_format(p_for, ISL_FORMAT_C);
  p_user = isl_printer_to_str(ctx);
  p_user = isl_printer_set_output_format(p_user, ISL_FORMAT_C);
  data->p_for = p_for;
  data->p_user = p_user;
  data->intra_for_level = 0;

  /* Count the for level first. */
  print_options = isl_ast_print_options_alloc(ctx);  
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &count_module_for, data);
  p = isl_ast_node_print(module->intra_tree, p, print_options);  

  /* Extract the for logic. */
  data->p_for = isl_printer_indent(data->p_for, 2 * data->intra_for_level);
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &extract_module_for, data);  
  p = isl_ast_node_print(module->intra_tree, p, print_options);  
  isl_printer_free(p);  
  isl_printer_free(data->p_for);
  isl_printer_free(data->p_user);

  /* Inter module */
  data->inter = 1;
  data->under_if = 0;
  data->reach_user = 0;
  p = isl_printer_to_str(ctx);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  p_for = isl_printer_to_str(ctx);
  p_for = isl_printer_set_output_format(p_for, ISL_FORMAT_C);
  p_user = isl_printer_to_str(ctx);
  p_user = isl_printer_set_output_format(p_user, ISL_FORMAT_C);
  data->p_for = p_for;
  data->p_user = p_user;  
  data->inter_for_level = 0;

  /* Count the for level first. */  
  if (!boundary) {
    isl_ast_node_foreach_descendant_top_down(module->inter_tree, &count_module_for_alt, data);
  } else {        
    isl_ast_node_foreach_descendant_top_down(module->boundary_inter_tree, &count_module_for_alt, data);    
  }

  /* Extract the for logic. */
  data->p_for = isl_printer_indent(data->p_for, 2 * data->inter_for_level);
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &extract_module_for, data);
  if (!boundary)
    p = isl_ast_node_print(module->inter_tree, p, print_options);
  else {    
    p = isl_ast_node_print(module->boundary_inter_tree, p, print_options);    
  }
  isl_printer_free(p);  
  isl_printer_free(data->p_for);
  isl_printer_free(data->p_user);
}

/* for-node print callback that drops the loop itself: only the loop body
 * is printed, using the same print options. "user" is not used.
 */
static __isl_give isl_printer *print_null_for(__isl_take isl_printer *p,
                                              __isl_take isl_ast_print_options *print_options,
                                              __isl_keep isl_ast_node *node, void *user)
{
  isl_ast_node *loop_body = isl_ast_node_for_get_body(node);

  (void)user;
  p = isl_ast_node_print(loop_body, p, print_options);
  isl_ast_node_free(loop_body);

  return p;
}

/* Print the inter_trans logic in double buffer mode.
 * The statements of module->inter_tree (module->boundary_inter_tree when
 * "boundary" is non-zero) are printed through print_module_stmt(), while
 * the for loops themselves are dropped by print_null_for(). The output is
 * followed by a line end.
 */
static __isl_give isl_printer *autosa_print_inter_trans_module_double_buffer(
  __isl_take isl_printer *p,
  struct autosa_hw_module *module, struct autosa_prog *prog,
  struct hls_info *hls, int boundary)
{
  isl_ctx *ctx = isl_printer_get_ctx(p);
  struct print_hw_module_data stmt_data = {hls, prog, module, "inter_c"};
  isl_ast_node *tree = (boundary == 0) ? module->inter_tree : module->boundary_inter_tree;
  isl_ast_print_options *opts = isl_ast_print_options_alloc(ctx);

  opts = isl_ast_print_options_set_print_user(opts, &print_module_stmt, &stmt_data);
  opts = isl_ast_print_options_set_print_for(opts, &print_null_for, &stmt_data);

  p = isl_ast_node_print(tree, p, opts);

  return isl_printer_end_line(p);
}

/* Print the intra_trans logic in double buffer mode.
 * The statements of module->intra_tree are printed through
 * print_module_stmt(), while the for loops themselves are dropped by
 * print_null_for(). The output is followed by a line end.
 * "boundary" is not used.
 */
static __isl_give isl_printer *autosa_print_intra_trans_module_double_buffer(
  __isl_take isl_printer *p,
  struct autosa_hw_module *module, struct autosa_prog *prog,
  struct hls_info *hls, int boundary)
{
  isl_ctx *ctx = isl_printer_get_ctx(p);
  struct print_hw_module_data stmt_data = {hls, prog, module, "intra_c"};
  isl_ast_print_options *opts = isl_ast_print_options_alloc(ctx);

  (void)boundary;
  opts = isl_ast_print_options_set_print_user(opts, &print_module_stmt, &stmt_data);
  opts = isl_ast_print_options_set_print_for(opts, &print_null_for, &stmt_data);

  p = isl_ast_node_print(module->intra_tree, p, opts);

  return isl_printer_end_line(p);
}

/* Double buffer module on Intel devices needs to be handled specially.
 * First, we will change the buffer to 
 * local_buffer[2][...][...].
 * Intel OpenCL compiler can't handle local_buffer_ping/local_buffer_pong properly.
 * Specifically, when handling a code structure:
 * [outer for loops]
 * for ...
 *   for ...
 * [outer for loops]
 * { 
 *   if (arb) {
 *     ld(local_buffer_ping, ld_en);
 *     st(local_buffer_pong, st_en);
 *   } else {
 *     ld(local_buffer_pong, ld_en);
 *     st(local_buffer_ping, st_en);
 *   }
 *   [state handle logic]
 *   arb = !arb;
 *   [state handle logic]
 * }
 * [last batch]
 * if (arb) {
 *   st(local_buffer_pong, st_en);
 * } else {
 *   st(local_buffer_ping, st_en);
 * }
 * [last batch]
 * We will convert it to a new code structure:
 * while (1) {
 *   if (ld_en) {
 *     [inlined logic]
 *     ld(local_buffer[arb][...]);
 *     [inlined logic]
 *   } 
 *   if (st_en) {
 *     [inlined logic]
 *     st(local_buffer[!arb][...]);
 *     [inlined logic]
 *   }
 *   [state handle logic]
 *   arb = !arb;
 *   ld_en = 1;
 *   st_en = 1;
 *   [state handle logic]
 *   [outer for loops]
 *   outer_iter0++;
 *   if (outer_iter0 == ...) {
 *     outer_iter0 = 0;
 *     [last batch]
 *     ld_en = 0;
 *     [last batch]
 *   }
 *   [outer for loops]
 * }
 * 
 * Note that this only works if each for loop structure is a perfectly 
 * nested loop so that we could convert to a while loop.
 */
static __isl_give isl_printer *print_double_buffer_module_while(
  __isl_take isl_printer *p, struct autosa_hw_module *module,
  struct autosa_prog *prog, struct hls_info *hls, int boundary)
{
  if (!boundary) {
    if (!module->device_tree)
      return p;    
  } else {
    if (!module->boundary_tree)
      return p;
  }

  struct print_db_module_intel_data print_data;

  /* Extract the code snippets. */
  extract_double_buffer_module_intel_data(module, boundary, &print_data);

  /* Print header */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  print_module_headers_intel(prog, module, hls, -1, boundary, 0);
  p = print_str_new_line(p, " {");
  p = isl_printer_indent(p, 2);

  /* Print variables */
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = print_double_buffer_module_vars_intel(p, module, hls, &print_data);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  /* Print content */
  p = print_str_new_line(p, "while (1) {");
  p = isl_printer_indent(p, 2);
  
  /* Print inter_trans */
  p = print_str_new_line(p, "if (inter_trans_en) {");
  p = isl_printer_indent(p, 2);
  /* Print the module logic */
  p = autosa_print_inter_trans_module_double_buffer(p, module, prog, hls, boundary);
  /* Print the loop counter */  
  for (int i = 0; i < print_data.inter_for_logic.size(); i++) {    
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, print_data.inter_for_logic[i]);
    free(print_data.inter_for_logic[i]);
  }
  p = isl_printer_indent(p, 2 * print_data.inter_for_level);
  p = print_str_new_line(p, "inter_done = 1;");
  p = print_str_new_line(p, "inter_trans_en = 0;");
  for (int i = 0; i < print_data.inter_for_level; i++) {
    p = isl_printer_indent(p, -2);
    p = print_str_new_line(p, "}");
  }
  
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  /* Print intra_trans */
  p = print_str_new_line(p, "if (intra_trans_en) {");
  p = isl_printer_indent(p, 2);
  /* Print the module logic */
  p = autosa_print_intra_trans_module_double_buffer(p, module, prog, hls, boundary);
  /* Print the loop counter */
  for (int i = 0; i < print_data.intra_for_logic.size(); i++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, print_data.intra_for_logic[i]);
    free(print_data.intra_for_logic[i]);
  }
  p = isl_printer_indent(p, 2 * print_data.intra_for_level);
  p = print_str_new_line(p, "intra_done = 1;");
  p = print_str_new_line(p, "intra_trans_en = 0;");
  for (int i = 0; i < print_data.intra_for_level; i++) {
    p = isl_printer_indent(p, -2);
    p = print_str_new_line(p, "}");
  }

  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  /* Print state_handle */
  p = print_str_new_line(p, "if (inter_done && intra_done) {");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "intra_trans_en = 1;");
  p = print_str_new_line(p, "inter_trans_en = 1;");
  p = print_str_new_line(p, "intra_done = 0;");
  p = print_str_new_line(p, "inter_done = 0;");
  p = print_str_new_line(p, "arb = !arb;");
  /* Print the loop counter */
  for (int i = 0; i < print_data.outer_for_logic.size(); i++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, print_data.outer_for_logic[i]);
    free(print_data.outer_for_logic[i]);
  }
  p = isl_printer_indent(p, 2 * print_data.outer_for_level);
  p = print_str_new_line(p, module->in? "inter_trans_en = 0;" : "intra_trans_en = 0;");
  for (int i = 0; i < print_data.outer_for_level; i++) {
    p = isl_printer_indent(p, -2);
    p = print_str_new_line(p, "}");
  }

  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  /* If the module serialization is enabled, we will print out an extra module
   * for serializing the data. */
  if (module->to_mem && module->options->autosa->host_serialize) {
    p = autosa_print_serialize_module(p, module, prog, hls, boundary);
  }

  return p;
}

/* Print the host code to "p" and the hardware module definitions to
 * the kernel file.
 *
 * The data pack types, the drain merge helper functions and the host
 * data serialization functions are emitted first through "hls".
 * Next, the macro definitions and the host AST "tree" are printed to "p",
 * with user statements handled by print_host_user_intel.
 * Finally, each of the "n_modules" hardware modules in "modules" is printed
 * through a separate printer that writes to hls->kernel_c.
 * Double buffer modules with double_buffer_style equal to 0 are printed by
 * print_double_buffer_module_while; all other modules use the default
 * module printers.
 *
 * Return the (updated) host printer "p".
 */
static __isl_give isl_printer *autosa_print_host_code(__isl_take isl_printer *p,
                                                      struct autosa_prog *prog, __isl_keep isl_ast_node *tree,
                                                      struct autosa_hw_module **modules, int n_modules,
                                                      struct autosa_hw_top_module *top,
                                                      struct autosa_drain_merge_func **drain_merge_funcs, int n_drain_merge_funcs,
                                                      struct hls_info *hls)
{
  isl_ast_print_options *print_options;
  isl_ctx *ctx = isl_ast_node_get_ctx(tree);
  struct print_host_user_data data = {hls, prog, top};
  isl_printer *p_module;

  /* Print the data pack types in the program. */
  print_data_types_intel(top, hls);

  /* Print the helper functions in the program. */
  print_drain_merge_funcs(top->kernel, drain_merge_funcs, n_drain_merge_funcs, hls);

  /* Print the host data serialization function. */
  print_host_serialize_funcs(top->kernel, modules, n_modules, hls);

  /* Print the default AST. */
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_user(print_options,
                                                       &print_host_user_intel, &data);

  /* Print the macros definitions in the program. */
  p = autosa_print_macros(p, tree);
  p = isl_ast_node_print(tree, p, print_options);

  /* Print the hw module ASTs. */
  p_module = isl_printer_to_file(ctx, hls->kernel_c);
  p_module = isl_printer_set_output_format(p_module, ISL_FORMAT_C);

  for (int i = 0; i < n_modules; i++)
  {
    struct autosa_hw_module *module = modules[i];

    if (module->double_buffer && module->options->autosa->double_buffer_style == 0) {
      /* We implement a different codegen for double buffer on Intel devices. */
      p_module = print_double_buffer_module_while(p_module, module, prog, hls, 0);
      if (module->boundary) {
        p_module = print_double_buffer_module_while(p_module, module, prog, hls, 1);
      }
      continue;
    }

    if (module->is_filter && module->is_buffer)
    {
      /* Print out the definitions for inter_trans and intra_trans function calls. */
      /* Intra transfer function */
      p_module = autosa_print_intra_trans_module(p_module, module, prog, hls, 0);

      /* Inter transfer function */
      p_module = autosa_print_inter_trans_module(p_module, module, prog, hls, 0);
      if (module->boundary)
        p_module = autosa_print_inter_trans_module(p_module, module, prog, hls, 1);
    }

    p_module = autosa_print_default_module(p_module, module, prog, hls, 0);

    if (module->boundary)
    {
      /* Print out the definitions for boundary trans function calls. */
      p_module = autosa_print_default_module(p_module, module, prog, hls, 1);
    }

    /* Print out the definitions for pe dummy function calls. */
    for (int j = 0; j < module->n_pe_dummy_modules; j++)
    {
      p_module = autosa_print_default_pe_dummy_module(
          p_module, module->pe_dummy_modules[j], prog, hls, 0);
    }
  }
  isl_printer_free(p_module);

  return p;
}

/* Print the statements that, once compiled and run, print the header of
 * the top kernel function, i.e., "void kernel<id>(<arguments>)" followed
 * by an opening brace on its own line.
 */
static __isl_give isl_printer *print_top_module_headers_intel(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_hw_top_module *top, struct hls_info *hls)
{
  /* Statements emitted after the kernel signature. */
  static const char *const trailer[] = {
      "p = isl_printer_end_line(p);",
      "p = isl_printer_start_line(p);",
      "p = isl_printer_print_str(p, \"{\");",
      "p = isl_printer_end_line(p);",
  };
  struct autosa_kernel *kernel = top->kernel;

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_start_line(p);");
  p = isl_printer_end_line(p);

  /* The kernel signature is wrapped in a single print_str statement. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"void kernel");
  p = isl_printer_print_int(p, kernel->id);
  p = isl_printer_print_str(p, "(");
  p = print_kernel_arguments(p, prog, kernel, 1, hls);
  p = isl_printer_print_str(p, ")\");");
  p = isl_printer_end_line(p);

  for (size_t i = 0; i < sizeof(trailer) / sizeof(trailer[0]); i++)
    p = print_str_new_line(p, trailer[i]);

  return p;
}

/* Return a newly allocated copy of the part of "fifo_decl_name" that
 * precedes the first '.' (or of the entire string if it contains no '.').
 * The caller is responsible for freeing the result.
 */
static char *extract_fifo_name_from_fifo_decl_name(isl_ctx *ctx, char *fifo_decl_name)
{
  isl_printer *p_str = isl_printer_to_str(ctx);
  char one_char[2] = {'\0', '\0'};

  /* Copy one character at a time until the separator or the end. */
  for (const char *cur = fifo_decl_name; *cur != '\0' && *cur != '.'; cur++)
  {
    one_char[0] = *cur;
    p_str = isl_printer_print_str(p_str, one_char);
  }

  char *name = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  return name;
}

/* Return a newly allocated copy of the part of "fifo_decl_name" that
 * follows the first '.'.
 * If "fifo_decl_name" contains no '.', an empty string is returned.
 * The caller is responsible for freeing the result.
 */
static char *extract_fifo_width_from_fifo_decl_name(isl_ctx *ctx, char *fifo_decl_name)
{
  isl_printer *p_str = isl_printer_to_str(ctx);
  const char *width = fifo_decl_name;
  char *name = NULL;

  /* Locate the separator. */
  while (*width != '\0' && *width != '.')
    width++;

  /* Only step over the separator if there is one; stepping unconditionally
   * would move past the terminating NUL of a name without '.' and read
   * beyond the end of the string. */
  if (*width == '.')
    width++;

  p_str = isl_printer_print_str(p_str, width);

  name = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  return name;
}

/* Print callback for user nodes in the fifo declaration trees.
 * "user" points to a print_hw_module_data.
 * The autosa_kernel_stmt attached to "node" as annotation is printed
 * if it is a fifo declaration; any other statement type prints nothing.
 * "print_options" is freed.
 */
static __isl_give isl_printer *print_top_module_fifo_stmt(__isl_take isl_printer *p,
                                                          __isl_take isl_ast_print_options *print_options,
                                                          __isl_keep isl_ast_node *node, void *user)
{
  struct print_hw_module_data *data = (struct print_hw_module_data *)user;
  isl_id *annotation = isl_ast_node_get_annotation(node);
  struct autosa_kernel_stmt *stmt =
      (struct autosa_kernel_stmt *)isl_id_get_user(annotation);

  isl_id_free(annotation);
  isl_ast_print_options_free(print_options);

  if (stmt->type != AUTOSA_KERNEL_STMT_FIFO_DECL)
    return p;

  return autosa_kernel_print_fifo_decl(p, stmt, data->prog, data->hls);
}

/* Print callback for user nodes in the module call trees.
 * "user" points to a print_hw_module_data.
 * The autosa_kernel_stmt attached to "node" as annotation is printed
 * if it is a module call; any other statement type prints nothing.
 * "print_options" is freed.
 */
static __isl_give isl_printer *print_top_module_call_stmt(
    __isl_take isl_printer *p,
    __isl_take isl_ast_print_options *print_options,
    __isl_keep isl_ast_node *node, void *user)
{
  struct print_hw_module_data *data = (struct print_hw_module_data *)user;
  isl_id *annotation = isl_ast_node_get_annotation(node);
  struct autosa_kernel_stmt *stmt =
      (struct autosa_kernel_stmt *)isl_id_get_user(annotation);

  isl_id_free(annotation);
  isl_ast_print_options_free(print_options);

  if (stmt->type != AUTOSA_KERNEL_STMT_MODULE_CALL)
    return p;

  return autosa_kernel_print_module_call(p, stmt, data->prog, data->hls->target);
}

/* This function prints the code that prints out the top function that 
 * calls the hardware modules and declares the fifos.
 */
static void print_top_gen_host_code(
    struct autosa_prog *prog, __isl_keep isl_ast_node *node,
    struct autosa_hw_top_module *top, struct hls_info *hls)
{
  isl_ast_print_options *print_options;
  isl_ctx *ctx = isl_ast_node_get_ctx(node);
  isl_printer *p;
  int fifo_depth = prog->scop->options->autosa->fifo_depth;
  struct print_hw_module_data hw_data = {hls, prog, NULL, NULL};

  /* Print the top module ASTs. */
  p = isl_printer_to_file(ctx, hls->top_gen_c);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);

  print_top_gen_headers(prog, top, hls);
  fprintf(hls->top_gen_c, " {\n");
  p = isl_printer_indent(p, 2);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "FILE *fd = fopen(\"");
  p = isl_printer_print_str(p, hls->output_dir);
  p = isl_printer_print_str(p, "/resource_est/design_info.dat\", \"w\");");
  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "int fifo_cnt;");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "isl_ctx *ctx = isl_ctx_alloc();");
  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "isl_printer *p = isl_printer_to_file(ctx, f);");
  p = isl_printer_end_line(p);
  p = isl_printer_end_line(p);

  p = print_top_module_headers_intel(p, prog, top, hls); // TODO
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_indent(p, 2);");
  p = isl_printer_end_line(p);

  /* Print FIFO declarations */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_start_line(p);");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* FIFO Declaration */\");");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_end_line(p);");
  p = isl_printer_end_line(p);
  p = isl_printer_end_line(p);

  /* Print the serialize fifos if existing. */
  for (int i = 0; i < top->n_hw_modules; i++) {
    struct autosa_hw_module *module = top->hw_modules[i];
    struct autosa_array_ref_group *group = module->io_groups[0];
    if (module->is_serialized) {
      /* Generate fifo decl counter. */
      char *fifo_name;
      int fifo_w;  // bytes
      fifo_w = module->data_pack_inter * group->array->size;
      isl_printer *p_str;
      p_str = isl_printer_to_str(ctx);
      p_str = autosa_array_ref_group_print_fifo_name(group, p_str);
      p_str = isl_printer_print_str(p_str, "_");
      p_str = isl_printer_print_str(p_str, module->name);
      p_str = isl_printer_print_str(p_str, "_serialize");
      fifo_name = isl_printer_get_str(p_str);
      isl_printer_free(p_str);

      p = print_str_new_line(p, "fifo_cnt = 1;");
      p = print_str_new_line(p, "p = isl_printer_start_line(p);");
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* ");
      p = isl_printer_print_str(p, module->name);
      p = isl_printer_print_str(p, "_serialize fifo */ ");      
      p = print_fifo_type_intel(p, group, module->data_pack_inter);
      p = isl_printer_print_str(p, " ");
      p = isl_printer_print_str(p, fifo_name);            
      p = isl_printer_print_str(p, "\");");      
      p = isl_printer_end_line(p);      

      /* Resource pragma */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \" __attribute__((depth(");
      p = isl_printer_print_int(p, fifo_depth);
      p = isl_printer_print_str(p, ")));\");");
      p = isl_printer_end_line(p);

      p = print_str_new_line(p, "p = isl_printer_end_line(p);");

      /* fifo:fifo_name:fifo_cnt:fifo_width */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "fprintf(fd, \"fifo:");
      p = isl_printer_print_str(p, fifo_name);
      p = isl_printer_print_str(p, ":\%d:");
      p = isl_printer_print_int(p, fifo_w);
      p = isl_printer_print_str(p, "\\n\", fifo_cnt);");
      p = isl_printer_end_line(p);

      p = isl_printer_end_line(p);      
      free(fifo_name);
    }
  }

  for (int i = 0; i < top->n_fifo_decls; i++)
  {
    /* Generate fifo decl counter. */
    char *fifo_decl_name = top->fifo_decl_names[i];
    char *fifo_name = extract_fifo_name_from_fifo_decl_name(ctx, fifo_decl_name);
    char *fifo_w = extract_fifo_width_from_fifo_decl_name(ctx, fifo_decl_name);
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "fifo_cnt = 0;");
    p = isl_printer_end_line(p);

    /* Print AST */
    print_options = isl_ast_print_options_alloc(ctx);
    print_options = isl_ast_print_options_set_print_user(print_options,
                                                         &print_top_module_fifo_stmt, &hw_data);

    p = isl_ast_node_print(top->fifo_decl_wrapped_trees[i],
                           p, print_options);

    /* fifo:fifo_name:fifo_cnt:fifo_width */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "fprintf(fd, \"fifo:");
    p = isl_printer_print_str(p, fifo_name);
    p = isl_printer_print_str(p, ":\%d:");
    p = isl_printer_print_str(p, fifo_w);
    p = isl_printer_print_str(p, "\\n\", fifo_cnt);");
    p = isl_printer_end_line(p);

    p = isl_printer_end_line(p);

    free(fifo_name);
    free(fifo_w);
  }

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_start_line(p);");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* FIFO Declaration */\");");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_end_line(p);");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_end_line(p);");
  p = isl_printer_end_line(p);

  int n_module_names = 0;
  char **module_names = NULL;
  for (int i = 0; i < top->n_hw_modules; i++)
  {
    /* Generate module call counter. */
    struct autosa_hw_module *module = top->hw_modules[i];
    char *module_name;

    if (module->is_filter && module->is_buffer)
    {
      module_name = concat(ctx, module->name, "intra_trans");

      n_module_names++;
      module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
      module_names[n_module_names - 1] = module_name;

      module_name = concat(ctx, module->name, "inter_trans");

      n_module_names++;
      module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
      module_names[n_module_names - 1] = module_name;

      if (module->boundary)
      {
        module_name = concat(ctx, module->name, "inter_trans_boundary");

        n_module_names++;
        module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
        module_names[n_module_names - 1] = module_name;
      }
    }

    module_name = strdup(module->name);

    n_module_names++;
    module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
    module_names[n_module_names - 1] = module_name;

    if (module->boundary)
    {
      module_name = concat(ctx, module->name, "boundary");

      n_module_names++;
      module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
      module_names[n_module_names - 1] = module_name;
    }

    if (module->n_pe_dummy_modules > 0)
    {
      for (int j = 0; j < module->n_pe_dummy_modules; j++)
      {
        struct autosa_pe_dummy_module *dummy_module = module->pe_dummy_modules[j];
        struct autosa_array_ref_group *group = dummy_module->io_group;
        isl_printer *p_str = isl_printer_to_str(ctx);
        p_str = autosa_array_ref_group_print_prefix(group, p_str);
        p_str = isl_printer_print_str(p_str, "_PE_dummy");
        p_str = isl_printer_print_str(p_str, dummy_module->in? "_in" : "_out");
        module_name = isl_printer_get_str(p_str);
        isl_printer_free(p_str);

        n_module_names++;
        module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
        module_names[n_module_names - 1] = module_name;
      }
    }

    if (module->is_serialized) { 
      if (module->boundary)      
        module_name = concat(ctx, module->name, "boundary_serialize");
      else
        module_name = concat(ctx, module->name, "serialize");
      
      n_module_names++;
      module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
      module_names[n_module_names - 1] = module_name;
    }
  }
  for (int i = 0; i < n_module_names; i++)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "int ");
    p = isl_printer_print_str(p, module_names[i]);
    p = isl_printer_print_str(p, "_cnt = 0;");
    p = isl_printer_end_line(p);
  }

  /* Print module calls. */
  for (int i = 0; i < top->n_module_calls; i++)
  {
    /* Print AST */
    print_options = isl_ast_print_options_alloc(ctx);
    print_options = isl_ast_print_options_set_print_user(print_options,
                                                         &print_top_module_call_stmt, &hw_data);

    p = isl_ast_node_print(top->module_call_wrapped_trees[i],
                           p, print_options);
  }

  /* module:module_name:module_cnt. */
  for (int i = 0; i < n_module_names; i++)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "fprintf(fd, \"module:");
    p = isl_printer_print_str(p, module_names[i]);
    p = isl_printer_print_str(p, ":\%d\\n\", ");
    p = isl_printer_print_str(p, module_names[i]);
    p = isl_printer_print_str(p, "_cnt);");
    p = isl_printer_end_line(p);
  }
  p = isl_printer_end_line(p);

  for (int i = 0; i < n_module_names; i++)
  {
    free(module_names[i]);
  }
  free(module_names);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_indent(p, -2);");
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = print_str_new_line(p, "p = isl_printer_print_str(p, \"}\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");
  if (hls->target == XILINX_HW)
  {
    if (!hls->hls)
    {
      p = print_str_new_line(p, "p = isl_printer_start_line(p);");
      p = print_str_new_line(p, "p = isl_printer_print_str(p, \"}\");");
      p = print_str_new_line(p, "p = isl_printer_end_line(p);");
    }
  }

  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "fclose(fd);");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "isl_printer_free(p);");
  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "isl_ctx_free(ctx);");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, -2);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "}");
  p = isl_printer_end_line(p);
  p = isl_printer_end_line(p);

  /* For internal testing only. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "int main()");
  p = isl_printer_end_line(p);

  p = ppcg_start_block(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "FILE *f = fopen(\"");
  p = isl_printer_print_str(p, hls->output_dir);
  p = isl_printer_print_str(p, "/src/top.cpp\", \"w\");");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "top_generate(f);");
  p = isl_printer_end_line(p);

  p = ppcg_end_block(p);
  p = isl_printer_free(p);

  return;
}

/* Examine if all autorun modules are legal to be used as autorun.
 * Specifically, for Intel OpenCL, we examine for each non external module 
 * (modules that are not connected to the external memory), if there is only
 * index and fifos in the arguments.
 */
static int is_autorun_legal(struct autosa_prog *prog,
                            struct autosa_hw_module **modules, int n_modules)
{
  for (int i = 0; i < n_modules; i++)
  {
    struct autosa_hw_module *module = modules[i];
    if (module->to_mem)
      continue;

    isl_space *space;
    int nparam, n;

    /* param */
    space = isl_union_set_get_space(module->kernel->arrays);
    nparam = isl_space_dim(space, isl_dim_param);
    isl_space_free(space);
    if (nparam > 0)
      return 0;
    /* host iter */
    n = isl_space_dim(module->space, isl_dim_set);
    if (n > 0)
      return 0;
    /* scalar */
    if (module->type == PE_MODULE)
    {
      for (int i = 0; i < prog->n_array; i++)
      {
        int required;
        required = autosa_kernel_requires_array_argument(module->kernel, i);
        if (required)
        {
          if (autosa_array_is_read_only_scalar(&prog->array[i]))
            return 0;
        }
      }
    }
  }

  return 1;
}

/* Given an autosa_prog "prog" and the corresponding transformed AST
 * "tree", print the entire OpenCL/HLS code to "p".
 * "types" collects the types for which a definition has already been
 * printed.
 * "user" points to the hls_info describing the output files.
 *
 * The type definitions are printed to hls->kernel_c first.  If that fails,
 * "p" is freed and NULL is returned.
 * If some autorun module is not legal, an error message is printed and
 * "p" is returned without any code having been printed to it.
 */
static __isl_give isl_printer *print_hw(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, __isl_keep isl_ast_node *tree,
    struct autosa_hw_module **modules, int n_modules,
    struct autosa_hw_top_module *top_module,
    struct autosa_drain_merge_func **drain_merge_funcs, int n_drain_merge_funcs,
    struct autosa_types *types, void *user)
{
  struct hls_info *hls = (struct hls_info *)user;
  isl_printer *kernel;
  int kernel_ok;
  int legal;

  kernel = isl_printer_to_file(isl_printer_get_ctx(p), hls->kernel_c);
  kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
  kernel = autosa_print_types(kernel, types, prog);
  /* Record the outcome before freeing the printer; the pointer must not
   * be inspected once it has been freed. */
  kernel_ok = kernel != NULL;
  isl_printer_free(kernel);

  if (!kernel_ok)
    return isl_printer_free(p);

  /* Examine if autorun kernels are legal. */
  legal = is_autorun_legal(prog, modules, n_modules);
  if (!legal)
  {
    printf("[AutoSA] Error: Autorun kernels not legal! Abort the code generation.\n");
    return p;
  }

  /* Print OpenCL host and kernel function. */
  p = autosa_print_host_code(p, prog, tree, modules, n_modules, top_module,
                             drain_merge_funcs, n_drain_merge_funcs, hls);
  /* Print separate top module code generation function. */
  print_top_gen_host_code(prog, tree, top_module, hls);

  return p;
}

/* Generate systolic array on Intel FPGAs.
 *
 * The output files are opened based on "input", the code is generated by
 * generate_sa with print_hw as the printing callback and the files are
 * closed again.  The result of generate_sa is returned.
 */
int generate_autosa_intel_opencl(isl_ctx *ctx, struct ppcg_options *options,
                                 const char *input)
{
  struct hls_info hls;

  /* Describe the target before any file is opened. */
  hls.ctx = ctx;
  hls.target = INTEL_HW;
  hls.hls = 0;
  hls.hcl = options->autosa->hcl;
  hls.output_dir = options->autosa->output_dir;

  opencl_open_files(&hls, input);
  const int status = generate_sa(ctx, input, hls.host_c, options, &print_hw, &hls);
  opencl_close_files(&hls);

  return status;
}


================================================
FILE: src/autosa_intel_opencl.h
================================================
#ifndef _AUTOSA_INTEL_OPENCL_H
#define _AUTOSA_INTEL_OPENCL_H

#include <pet.h>
#include "ppcg_options.h"
#include "ppcg.h"

#ifdef __cplusplus
extern "C" {
#endif

/* Generate systolic array on Intel FPGAs (Intel OpenCL target).
 * "ctx" is the isl context, "options" holds the ppcg/AutoSA options
 * (the output directory is taken from options->autosa) and "input" is
 * the input passed on to the code generator.
 * Returns the result of the underlying code generation call.
 */
int generate_autosa_intel_opencl(isl_ctx *ctx, struct ppcg_options *options,
	const char *input);

#ifdef __cplusplus
}
#endif

#endif

================================================
FILE: src/autosa_print.cpp
================================================
/* Helper functions in codegen */
#include <assert.h>
#include <cmath>

#include "autosa_print.h"
#include "autosa_utils.h"
#include "autosa_comm.h"
#include "print.h"

/* The sixteen single-character hexadecimal digit strings "0" through "f",
 * indexed by their value.
 * NOTE(review): presumably used to name vector components; confirm
 * against the users of this table.
 */
const char *vector_index[] = {"0", "1", "2", "3", "4", "5", "6", "7",
                              "8", "9", "a", "b", "c", "d", "e", "f"};

/* Kinds of endpoints of an I/O transfer.
 * NOTE(review): meaning inferred from the enumerator names only (global
 * buffer, local buffer, fifo); confirm against the users.
 */
enum IO_TRANS_DIR {GLOBAL_BUF, LOCAL_BUF, FIFO};

/* Print the call argument for "array".
 * A read-only scalar is printed by name only.
 * Otherwise, the name is preceded by "<prefix>_" if "prefix" is non-empty
 * and, if "n_ref" is non-negative, followed by "[<port>]", where the port
 * is the entry at position 2 * n_ref + 1 of the group_ref_mem_port_map
 * of the local array.
 */
__isl_give isl_printer *autosa_array_info_print_call_argument(
  __isl_take isl_printer *p, struct autosa_array_info *array, int n_ref, const char *prefix)
{
  if (autosa_array_is_read_only_scalar(array))
    return isl_printer_print_str(p, array->name);

  const bool has_prefix = prefix[0] != '\0';
  if (has_prefix) {
    p = isl_printer_print_str(p, prefix);
    p = isl_printer_print_str(p, "_");
  }
  p = isl_printer_print_str(p, array->name);

  /* No memory port index requested. */
  if (n_ref < 0)
    return p;

  p = isl_printer_print_str(p, "[");
  p = isl_printer_print_int(p, array->local_array->group_ref_mem_port_map.at(n_ref * 2 + 1));
  p = isl_printer_print_str(p, "]");

  return p;
}

/* Print the array group name prefix.
 * [array_name]_[group_id](optional)_[drain](optional)
 *
 * A drain group gets the "_drain" suffix.  An I/O or PE group gets its
 * group number as suffix only if the local array has more than one group
 * of that kind.
 */
__isl_give isl_printer *autosa_array_ref_group_print_prefix(
    struct autosa_array_ref_group *group, __isl_take isl_printer *p)
{
  p = isl_printer_print_str(p, group->array->name);

  if (group->group_type == AUTOSA_DRAIN_GROUP)
    return isl_printer_print_str(p, "_drain");

  const bool numbered =
      (group->group_type == AUTOSA_IO_GROUP && group->local_array->n_io_group > 1) ||
      (group->group_type == AUTOSA_PE_GROUP && group->local_array->n_pe_group > 1);
  if (numbered)
  {
    p = isl_printer_print_str(p, "_");
    p = isl_printer_print_int(p, group->nr);
  }

  return p;
}

/* Print the name of the fifo associated to a given group of array
 * references.
 * fifo_[array_name]_[group_id](optional)  for an I/O group
 * fifo_[array_name]_drain                 for a drain group
 *
 * The group number is only printed if the local array has more than
 * one I/O group.  Nothing is printed for a PE group.
 */
__isl_give isl_printer *autosa_array_ref_group_print_fifo_name(
    struct autosa_array_ref_group *group, __isl_take isl_printer *p)
{
  if (group->group_type == AUTOSA_PE_GROUP)
    return p;

  p = isl_printer_print_str(p, "fifo_");
  p = isl_printer_print_str(p, group->array->name);
  if (group->group_type == AUTOSA_IO_GROUP) {
    if (group->local_array->n_io_group > 1)
    {
      p = isl_printer_print_str(p, "_");
      p = isl_printer_print_int(p, group->nr);
    }
  } else if (group->group_type == AUTOSA_DRAIN_GROUP)
  {
    p = isl_printer_print_str(p, "_drain");
  }

  return p;
}

/* Has the definition of "type" been printed already?
 * That is, is its name among the names recorded in "types"?
 * Return 1 if so and 0 otherwise.
 */
static int already_printed(struct autosa_types *types,
                           struct pet_type *type)
{
  int idx = 0;

  while (idx < types->n)
  {
    if (strcmp(types->name[idx], type->name) == 0)
      return 1;
    ++idx;
  }

  return 0;
}

/* Print the definitions of all types of prog->scop->pet that have not been
 * printed before (according to "types") on "p", one per line and each
 * terminated by ";".
 * Extend the list of printed types "types" with the newly printed types.
 *
 * The printer is freed and NULL is returned if the name list cannot be
 * grown or if a type name cannot be duplicated.
 */
__isl_give isl_printer *autosa_print_types(__isl_take isl_printer *p,
                                           struct autosa_types *types, struct autosa_prog *prog)
{
  int i, n;
  isl_ctx *ctx;
  char **name;

  n = prog->scop->pet->n_type;

  if (n == 0)
    return p;

  ctx = isl_printer_get_ctx(p);
  /* Reserve room for the worst case in which none of the "n" types
   * has been printed before.
   */
  name = isl_realloc_array(ctx, types->name, char *, types->n + n);
  if (!name)
    return isl_printer_free(p);
  types->name = name;

  for (i = 0; i < n; ++i)
  {
    struct pet_type *type = prog->scop->pet->types[i];
    char *copy;

    if (already_printed(types, type))
      continue;

    /* Never record a NULL name: already_printed() hands every recorded
     * entry to strcmp().
     */
    copy = strdup(type->name);
    if (!copy)
      return isl_printer_free(p);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, type->definition);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);

    types->name[types->n++] = copy;
  }

  return p;
}

/* Print a declaration on "p" for every array of "prog" that has
 * its declare_local flag set, using the array's type and declared size.
 * If "prog" is NULL, the printer is freed and NULL is returned.
 */
__isl_give isl_printer *autosa_print_local_declarations(
    __isl_take isl_printer *p, struct autosa_prog *prog)
{
  if (!prog)
    return isl_printer_free(p);

  for (int idx = 0; idx < prog->n_array; ++idx) {
    struct autosa_array_info *info = prog->array + idx;

    if (info->declare_local)
      p = ppcg_print_declaration_with_size(p, info->type, info->declared_size);
  }

  return p;
}

/* Print "str" on a line of its own: start a line, print the string
 * and end the line.
 */
__isl_give isl_printer *print_str_new_line(__isl_take isl_printer *p, const char *str)
{
  return isl_printer_end_line(
      isl_printer_print_str(isl_printer_start_line(p), str));
}

/* Print an expression for the size of "array" in data items:
 * the product "(bound_0) * (bound_1) * ..." of the bounds in all
 * array->n_index dimensions.
 * For a sparse array, the product is followed by
 * " / [eff_compress_ratio]".
 */
__isl_give isl_printer *autosa_array_info_print_data_size(
    __isl_take isl_printer *p, struct autosa_array_info *array)
{
  for (int dim = 0; dim < array->n_index; ++dim) {
    isl_ast_expr *extent;

    /* Factors after the first one are joined by a multiplication sign. */
    if (dim > 0)
      p = isl_printer_print_str(p, " * ");

    p = isl_printer_print_str(p, "(");
    /* Argument 0 of bound_expr is skipped; the bounds start at argument 1. */
    extent = isl_ast_expr_get_op_arg(array->bound_expr, 1 + dim);
    p = isl_printer_print_ast_expr(p, extent);
    isl_ast_expr_free(extent);
    p = isl_printer_print_str(p, ")");
  }

  if (!array->local_array->is_sparse)
    return p;

  p = isl_printer_print_str(p, " / ");
  return isl_printer_print_double(p, (double)array->local_array->eff_compress_ratio);
}

/* Print an expression for the size of "array" in bytes:
 * "(bound_0) * (bound_1) * ... * sizeof([type])".
 */
__isl_give isl_printer *autosa_array_info_print_size(
    __isl_take isl_printer *p, struct autosa_array_info *array)
{
  int dim = 0;

  while (dim < array->n_index) {
    isl_ast_expr *extent;

    p = isl_printer_print_str(p, "(");
    /* Argument 0 of bound_expr is skipped; the bounds start at argument 1. */
    extent = isl_ast_expr_get_op_arg(array->bound_expr, 1 + dim);
    p = isl_printer_print_ast_expr(p, extent);
    isl_ast_expr_free(extent);
    p = isl_printer_print_str(p, ") * ");
    ++dim;
  }

  /* The element size is the last factor of the product. */
  p = isl_printer_print_str(p, "sizeof(");
  p = isl_printer_print_str(p, array->type);
  return isl_printer_print_str(p, ")");
}

/* Print the serialize bound of "array" (local_array->serialize_bound),
 * followed by " / [eff_compress_ratio]" for a sparse array.
 * No element size factor is printed, i.e., the expression counts data items.
 */
__isl_give isl_printer *autosa_array_info_print_serialize_data_size(
    __isl_take isl_printer *p, struct autosa_array_info *array)
{
  struct autosa_local_array_info *local = array->local_array;

  p = isl_printer_print_pw_qpolynomial(p, local->serialize_bound);
  if (!local->is_sparse)
    return p;

  p = isl_printer_print_str(p, " / ");
  return isl_printer_print_double(p, (double)local->eff_compress_ratio);
}

/* Print an expression for the serialized size of "array" in bytes:
 * "([serialize_bound]) * sizeof([type])", where the bound is divided
 * by the effective compression ratio for a sparse array.
 */
__isl_give isl_printer *autosa_array_info_print_serialize_size(
    __isl_take isl_printer *p, struct autosa_array_info *array)
{
  struct autosa_local_array_info *local = array->local_array;

  /* Number of data items, parenthesized. */
  p = isl_printer_print_str(p, "(");
  p = isl_printer_print_pw_qpolynomial(p, local->serialize_bound);
  if (local->is_sparse) {
    p = isl_printer_print_str(p, " / ");
    p = isl_printer_print_double(p, (double)local->eff_compress_ratio);
  }
  p = isl_printer_print_str(p, ") * ");

  /* Size of a single data item. */
  p = isl_printer_print_str(p, "sizeof(");
  p = isl_printer_print_str(p, array->type);
  return isl_printer_print_str(p, ")");
}

/* Print the data type used for "array":
 * the element type if array->n_lane is one, and
 * "[array_name]_t[n_lane]" otherwise.
 */
__isl_give isl_printer *autosa_print_array_type(__isl_take isl_printer *p,
                                                struct autosa_array_info *array)
{
  const int lanes = array->n_lane;

  if (lanes == 1)
    return isl_printer_print_str(p, array->type);

  p = isl_printer_print_str(p, array->name);
  p = isl_printer_print_str(p, "_t");
  return isl_printer_print_int(p, lanes);
}

/* Print the type name "[array_name]_t[n_lane]".
 * This form is used for every lane count, including a single lane.
 */
__isl_give isl_printer *autosa_print_array_type_with_lane(
  __isl_take isl_printer *p,
  struct autosa_array_info *array, int n_lane)
{
  p = isl_printer_print_str(p, array->name);
  p = isl_printer_print_str(p, "_t");

  return isl_printer_print_int(p, n_lane);
}

/* Print the type name "[array_name]_s_t[n_lane]".
 */
__isl_give isl_printer *autosa_print_array_type_with_lane_sparse(
  __isl_take isl_printer *p,
  struct autosa_array_info *array, int n_lane)
{
  const char *suffix = "_s_t";

  p = isl_printer_print_str(p, array->name);
  p = isl_printer_print_str(p, suffix);
  return isl_printer_print_int(p, n_lane);
}

/* Print the body of the domain statement "stmt" on "p".
 * The work is delegated to pet_stmt_print_body, which receives
 * the statement stmt->u.d.stmt->stmt along with stmt->u.d.ref2expr.
 */
__isl_give isl_printer *autosa_kernel_print_domain(__isl_take isl_printer *p,
                                                   struct autosa_kernel_stmt *stmt)
{
  return pet_stmt_print_body(stmt->u.d.stmt->stmt, p, stmt->u.d.ref2expr);
}

/* Print the declaration of a non-linearized array argument:
 * the element type, followed by a space and the bound expression of "array".
 * The element type is array->type for a single lane and
 * "[array_name]_t[n_lane]" otherwise.
 */
static __isl_give isl_printer *print_non_linearized_declaration_argument(
    __isl_take isl_printer *p, struct autosa_array_info *array, int n_lane)
{
  if (n_lane == 1) {
    p = isl_printer_print_str(p, array->type);
  } else {
    p = isl_printer_print_str(p, array->name);
    p = isl_printer_print_str(p, "_t");
    p = isl_printer_print_int(p, n_lane);
  }

  /* Both variants share the separator and the bound expression. */
  p = isl_printer_print_str(p, " ");
  return isl_printer_print_ast_expr(p, array->bound_expr);
}

/* Print the declaration of an array argument.
 *
 * "n_lane" is the lane count used in the element type "[array_name]_t[n_lane]".
 * "memory_space", if not NULL, is printed as a prefix.
 * "n_ref", if non-negative, is appended to the argument name as "_[n_ref]".
 * "mem_port_map", if not NULL, is parsed to look up a memory port for
 * the array; a port different from -1 results in a
 * buffer_location("HBM[port]") attribute.
 * "target" selects the platform specific parts of the declaration.
 *
 * A read-only scalar is declared as "[type] [name]".
 * A non-linearized array with at least one index is handled by
 * print_non_linearized_declaration_argument.
 *
 * For the Catapult target, an exception is thrown if the array
 * is not serialized on the host.
 */
__isl_give isl_printer *autosa_array_info_print_declaration_argument(
    __isl_take isl_printer *p, struct autosa_array_info *array, int n_lane,
    const char *memory_space, int n_ref, char *mem_port_map, enum platform target)
{
  const int is_tapa = (target == TAPA_HW);
  const int is_catapult = (target == CATAPULT_HW);
  int hbm_port = -1;

  if (mem_port_map) {
    /* This is only for Intel HBM: different arrays are assigned to
     * different HBM channels.
     */
    isl_union_map *port_map =
        extract_sizes_from_str(isl_printer_get_ctx(p), mem_port_map);

    hbm_port = read_mem_port_map(port_map, array->name);
    isl_union_map_free(port_map);
  }

  if (autosa_array_is_read_only_scalar(array)) {
    p = isl_printer_print_str(p, array->type);
    p = isl_printer_print_str(p, " ");
    return isl_printer_print_str(p, array->name);
  }

  if (memory_space) {
    p = isl_printer_print_str(p, memory_space);
    p = isl_printer_print_str(p, " ");
  }
  if (hbm_port != -1) {
    p = isl_printer_print_str(p, "__attribute__((buffer_location(\"HBM");
    p = isl_printer_print_int(p, hbm_port);
    p = isl_printer_print_str(p, "\"))) ");
  }

  if (array->n_index != 0 && !array->linearize)
    return print_non_linearized_declaration_argument(p, array, n_lane);

  if (is_tapa) {
    /* The mmap flavour depends on the copy directions of the array. */
    const char *mmap_open;

    if (array->copy_in)
      mmap_open = array->copy_out ? "tapa::read_write_mmap<" : "tapa::read_only_mmap<";
    else
      mmap_open = array->copy_out ? "tapa::write_only_mmap<" : "tapa::placeholder_mmap<";
    p = isl_printer_print_str(p, mmap_open);
  }

  /* Element type: "[array_name]_t[n_lane]", also for a single lane. */
  p = isl_printer_print_str(p, array->name);
  p = isl_printer_print_str(p, "_t");
  p = isl_printer_print_int(p, n_lane);
  if (is_tapa)
    p = isl_printer_print_str(p, ">");
  p = isl_printer_print_str(p, " ");

  /* Catapult and TAPA arguments are not declared as pointers. */
  if (!is_catapult && !is_tapa)
    p = isl_printer_print_str(p, "*");
  if (target == INTEL_HW)
    p = isl_printer_print_str(p, "restrict ");

  p = isl_printer_print_str(p, array->name);
  if (n_ref >= 0) {
    p = isl_printer_print_str(p, "_");
    p = isl_printer_print_int(p, n_ref);
  }

  if (is_catapult) {
    if (!array->local_array->host_serialize)
      throw std::runtime_error("[AutoSA] Error: Non-serialized array not supported for Catapult HLS yet.");

    /* Array extent: "[serialize_bound / n_lane]". */
    p = isl_printer_print_str(p, "[");
    p = isl_printer_print_pw_qpolynomial(p, array->local_array->serialize_bound);
    p = isl_printer_print_str(p, " / ");
    p = isl_printer_print_int(p, n_lane);
    p = isl_printer_print_str(p, "]");
  }

  return p;
}

/* Print the arguments to a kernel declaration or call.  If "types" is set,
 * then print a declaration (including the types of the arguments).
 *
 * The arguments are printed in the following order
 * - the arrays accessed by the kernel
 * - the parameters
 * - the host loop iterators
 *
 * An array contributes a single argument on the Intel and Catapult targets,
 * and on the TAPA and Xilinx targets when it has exactly one I/O group
 * reference.  Otherwise, one argument is printed per I/O group reference.
 * With the axi_stream option set, the single argument is a fifo
 * instead of an array.
 * On the TAPA target, array call arguments are followed by
 * ".vectorized<[n_lane]>()".
 *
 * The printer is freed and NULL is returned if it cannot be determined
 * whether an array argument is required.
 */
__isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
                                               struct autosa_prog *prog,
                                               struct autosa_kernel *kernel,
                                               int types, struct hls_info *hls)
{
  int i, n;
  int first = 1;
  /* Signed, like the loop index and like the other argument printers
   * in this file, such that a negative dimension count results in
   * an empty loop.
   */
  int nparam;
  isl_space *space;
  const char *type;
  int fifo_depth = prog->scop->options->autosa->fifo_depth;

  /* Arrays */
  for (i = 0; i < kernel->n_array; ++i)
  {
    int required;
    int n_lane;

    required = autosa_kernel_requires_array_argument(kernel, i);
    if (required < 0)
      return isl_printer_free(p);
    if (!required)
      continue;

    struct autosa_local_array_info *local_array = &kernel->array[i];
    n_lane = local_array->n_lane;
    if (hls->target == INTEL_HW ||
        hls->target == CATAPULT_HW ||
        (hls->target == TAPA_HW && local_array->n_io_group_refs == 1) ||
        (hls->target == XILINX_HW && local_array->n_io_group_refs == 1))
    {
      /* A single argument for the entire array. */
      if (!first)
        p = isl_printer_print_str(p, ", ");

      if (types)
      {
        if (prog->scop->options->autosa->axi_stream)
        {
          p = autosa_fifo_print_declaration_arguments(p, local_array->io_groups[0], n_lane, NULL, hls->target, fifo_depth, NULL);
        }
        else
        {
          p = autosa_array_info_print_declaration_argument(
                p, local_array->array, n_lane, NULL, -1, NULL, hls->target);
        }
      }
      else
      {
        if (prog->scop->options->autosa->axi_stream)
        {
          p = autosa_array_info_print_call_argument(p,
                                                    local_array->array, -1, "fifo");
        }
        else
        {
          p = autosa_array_info_print_call_argument(p,
                                                    local_array->array, 0, "buffer");
          if (hls->target == TAPA_HW)
          {
            p = isl_printer_print_str(p, ".vectorized<");
            p = isl_printer_print_int(p, n_lane);
            p = isl_printer_print_str(p, ">()");
          }
        }
      }

      first = 0;
    }
    else
    {
      /* One argument per I/O group reference. */
      for (int j = 0; j < local_array->n_io_group_refs; j++)
      {
        if (!first)
          p = isl_printer_print_str(p, ", ");

        if (types)
          p = autosa_array_info_print_declaration_argument(
                p, local_array->array, n_lane, NULL, j, NULL, hls->target);
        else
        {
          p = autosa_array_info_print_call_argument(p,
                                                    local_array->array, j, "buffer");
          if (hls->target == TAPA_HW)
          {
            p = isl_printer_print_str(p, ".vectorized<");
            p = isl_printer_print_int(p, n_lane);
            p = isl_printer_print_str(p, ">()");
          }
        }

        first = 0;
      }
    }
  }

  /* Parameters */
  space = isl_union_set_get_space(kernel->arrays);
  nparam = isl_space_dim(space, isl_dim_param);
  for (i = 0; i < nparam; ++i)
  {
    const char *name;

    name = isl_space_get_dim_name(space, isl_dim_param, i);

    if (!first)
      p = isl_printer_print_str(p, ", ");
    if (types)
      p = isl_printer_print_str(p, "int ");
    p = isl_printer_print_str(p, name);

    first = 0;
  }
  isl_space_free(space);

  /* Host loop iterators */
  n = isl_space_dim(kernel->space, isl_dim_set);
  type = isl_options_get_ast_iterator_type(prog->ctx);
  for (i = 0; i < n; ++i)
  {
    const char *name;

    if (!first)
      p = isl_printer_print_str(p, ", ");
    name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
    if (types)
    {
      p = isl_printer_print_str(p, type);
      p = isl_printer_print_str(p, " ");
    }
    p = isl_printer_print_str(p, name);

    first = 0;
  }

  return p;
}

/* Print the header of the given kernel:
 * "[name]([arguments])", preceded by "void " if "types" is set.
 * The name is "autosa_func" if hls->hcl is set and "kernel0" otherwise.
 * No line end is printed.
 */
__isl_give isl_printer *print_kernel_header(
  __isl_take isl_printer *p, struct autosa_prog *prog,
  struct autosa_kernel *kernel, struct hls_info *hls, int types)
{
  const char *func_name = hls->hcl ? "autosa_func" : "kernel0";

  p = isl_printer_start_line(p);
  if (types)
    p = isl_printer_print_str(p, "void ");
  p = isl_printer_print_str(p, func_name);

  p = isl_printer_print_str(p, "(");
  p = print_kernel_arguments(p, prog, kernel, types, hls);
  return isl_printer_print_str(p, ")");
}

/* This function is called for each node in a AutoSA AST.
 * In case of a user node, print the macro definitions required
 * for printing the AST expressions in the annotation, if any.
 * For other nodes, return true such that descendants are also
 * visited.
 *
 * In particular, for a kernel launch, print the macro definitions
 * needed for the grid size.
 * For a copy statement, print the macro definitions needed
 * for the two index expressions.
 * For an original user statement, print the macro definitions
 * needed for the substitutions.
 *
 * "user" points to the isl_printer pointer that is updated in place.
 * isl_bool_error is returned if the annotation has no name or no user
 * pointer, or if printing fails.
 */
static isl_bool at_node(__isl_keep isl_ast_node *node, void *user)
{
  const char *name;
  isl_id *id;
  int is_kernel;
  struct autosa_kernel *kernel;
  struct autosa_kernel_stmt *stmt;
  isl_printer **p = (isl_printer **)user;

  if (isl_ast_node_get_type(node) != isl_ast_node_user)
    return isl_bool_true;

  id = isl_ast_node_get_annotation(node);
  if (!id)
    return isl_bool_false;

  name = isl_id_get_name(id);
  if (!name)
  {
    /* "id" is an owned reference; release it on the error path as well. */
    isl_id_free(id);
    return isl_bool_error;
  }
  is_kernel = !strcmp(name, "kernel");
  /* The user pointer of the annotation is a kernel for "kernel" nodes
   * and a kernel statement for all other user nodes.
   */
  kernel = is_kernel ? (struct autosa_kernel *)isl_id_get_user(id) : NULL;
  stmt = is_kernel ? NULL : (struct autosa_kernel_stmt *)isl_id_get_user(id);
  isl_id_free(id);

  if ((is_kernel && !kernel) || (!is_kernel && !stmt))
    return isl_bool_error;

  if (is_kernel)
  {
    *p = ppcg_ast_expr_print_macros(kernel->grid_size_expr, *p);
  }
  else if (stmt->type == AUTOSA_KERNEL_STMT_COPY)
  {
    *p = ppcg_ast_expr_print_macros(stmt->u.c.index, *p);
    *p = ppcg_ast_expr_print_macros(stmt->u.c.local_index, *p);
  }
  else if (stmt->type == AUTOSA_KERNEL_STMT_DOMAIN)
  {
    *p = ppcg_print_body_macros(*p, stmt->u.d.ref2expr);
  }
  if (!*p)
    return isl_bool_error;

  /* User nodes have been fully handled; do not descend any further. */
  return isl_bool_false;
}

/* Write "indent" spaces to "dst".
 */
static void print_indent(FILE *dst, int indent)
{
  static const char nothing[] = "";

  /* Padding an empty string to a field width of "indent" produces
   * exactly the requested blanks.
   */
  (void)fprintf(dst, "%*s", indent, nothing);
}

/* Print a single line declaring the iterators named by "ids" with type "type":
 * "[type] [id_0] = [dims_0], [id_1] = [dims_1], ...; // module id".
 * Each iterator is initialized with the instance identifier at the same
 * position in "dims".
 * Nothing is printed if "ids" holds no identifiers.
 *
 * "out" is not used; all output goes through the printer "p".
 */
static __isl_give isl_printer *print_iterators(
  __isl_take isl_printer *p,
  FILE *out, const char *type,
  __isl_keep isl_id_list *ids, const char *dims[])
{
  const int n_ids = isl_id_list_n_id(ids);

  if (n_ids <= 0)
    return p;

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, type);
  p = isl_printer_print_str(p, " ");

  for (int pos = 0; pos < n_ids; ++pos) {
    isl_id *id;

    if (pos > 0)
      p = isl_printer_print_str(p, ", ");

    id = isl_id_list_get_id(ids, pos);
    p = isl_printer_print_str(p, isl_id_get_name(id));
    p = isl_printer_print_str(p, " = ");
    p = isl_printer_print_str(p, dims[pos]);
    isl_id_free(id);
  }

  p = isl_printer_print_str(p, "; // module id");
  return isl_printer_end_line(p);
}

/* Print the macros required by the AutoSA AST "node" to "p".
 * The user statements inside the AST are handled first by visiting
 * every descendant with at_node; the macros of the AST itself follow.
 * The printer is freed and NULL is returned if the traversal fails.
 */
__isl_give isl_printer *autosa_print_macros(__isl_take isl_printer *p,
                                            __isl_keep isl_ast_node *node)
{
  if (isl_ast_node_foreach_descendant_top_down(node, &at_node, &p) >= 0)
    return ppcg_print_macros(p, node);

  return isl_printer_free(p);
}

/* Print the declaration of the instance identifiers of "module"
 * (module->inst_ids), initialized from "idx", "idy" and "idz" in that order.
 * The iterator type is taken from the AST options of the context
 * of module->tree.
 * "out" is only forwarded to print_iterators.
 */
__isl_give isl_printer *print_module_iterators(
  __isl_take isl_printer *p, FILE *out, struct autosa_hw_module *module)
{
  const char *dims[] = {"idx", "idy", "idz"};
  const char *iter_type =
      isl_options_get_ast_iterator_type(isl_ast_node_get_ctx(module->tree));

  return print_iterators(p, out, iter_type, module->inst_ids, dims);
}

/* Print the declaration of the instance identifiers of the drain merge
 * function "func" (func->inst_ids), initialized from "idx", "idy" and "idz"
 * in that order.
 * The iterator type is taken from the AST options of the context
 * of func->tree.
 * "out" is only forwarded to print_iterators.
 */
__isl_give isl_printer *print_func_iterators(
  __isl_take isl_printer *p,
  FILE *out, struct autosa_drain_merge_func *func)
{
  const char *dims[] = {"idx", "idy", "idz"};
  const char *iter_type =
      isl_options_get_ast_iterator_type(isl_ast_node_get_ctx(func->tree));

  return print_iterators(p, out, iter_type, func->inst_ids, dims);
}

/* Print the line "unsigned int [array_name]_cnt = 0;", where the array
 * is the one of the first I/O group of "module".
 */
__isl_give isl_printer *print_serialize_counter(
  __isl_take isl_printer *p, struct autosa_hw_module *module)
{
  const char *array_name = module->io_groups[0]->array->name;

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "unsigned int ");
  p = isl_printer_print_str(p, array_name);
  p = isl_printer_print_str(p, "_cnt = 0;");
  return isl_printer_end_line(p);
}

/* Print the arguments to a host serialization function declaration or call.
 * If "types" is set, then print a declaration (including the types of
 * the arguments).
 *
 * The arguments are printed in the following order:
 * - the module identifiers
 * - the parameters
 * - the host loop iterators
 * - the destination array
 * - the source array
 *
 * In a declaration, the two arrays are named "[array]_to" and "[array]_from".
 * Their type is a plain pointer to the element type if "hls" is set and
 * a reference to a std::vector with an aligned_allocator otherwise.
 * In a call, both arrays are passed as "dev_[array]", where the
 * "_unserialized" suffix is added to the source array for an input module
 * and to the destination array otherwise.
 */
__isl_give isl_printer *print_host_serialize_arguments(
  __isl_take isl_printer *p,
  struct autosa_kernel *kernel,
  struct autosa_array_ref_group *group,
  struct autosa_hw_module *module,
  int types,
  int hls)
{
  static const char *const module_ids[] = {"idx", "idy", "idz"};
  const char *iter_type = isl_options_get_ast_iterator_type(kernel->ctx);
  struct autosa_array_info *array;
  isl_space *param_space;
  int n_printed = 0;
  int count;

  /* module identifiers */
  count = isl_id_list_n_id(module->inst_ids);
  for (int k = 0; k < count; ++k) {
    if (n_printed > 0)
      p = isl_printer_print_str(p, ", ");
    if (types) {
      p = isl_printer_print_str(p, iter_type);
      p = isl_printer_print_str(p, " ");
    }
    p = isl_printer_print_str(p, module_ids[k]);
    ++n_printed;
  }

  /* params */
  param_space = isl_union_set_get_space(kernel->arrays);
  count = isl_space_dim(param_space, isl_dim_param);
  for (int k = 0; k < count; ++k) {
    const char *param = isl_space_get_dim_name(param_space, isl_dim_param, k);

    if (n_printed > 0)
      p = isl_printer_print_str(p, ", ");
    if (types)
      p = isl_printer_print_str(p, "int ");
    p = isl_printer_print_str(p, param);
    ++n_printed;
  }
  isl_space_free(param_space);

  /* Host iters */
  count = isl_space_dim(kernel->space, isl_dim_set);
  for (int k = 0; k < count; ++k) {
    const char *iter;

    if (n_printed > 0)
      p = isl_printer_print_str(p, ", ");
    iter = isl_space_get_dim_name(kernel->space, isl_dim_set, k);
    if (types) {
      p = isl_printer_print_str(p, iter_type);
      p = isl_printer_print_str(p, " ");
    }
    p = isl_printer_print_str(p, iter);
    ++n_printed;
  }

  /* Arrays: the destination ("_to") first, then the source ("_from"). */
  array = group->local_array->array;
  for (int pos = 0; pos < 2; ++pos) {
    const int is_dst = (pos == 0);

    if (n_printed > 0)
      p = isl_printer_print_str(p, ", ");

    if (types) {
      if (hls) {
        p = isl_printer_print_str(p, array->type);
        p = isl_printer_print_str(p, " *");
      } else {
        p = isl_printer_print_str(p, "std::vector<");
        p = isl_printer_print_str(p, array->type);
        p = isl_printer_print_str(p, ", aligned_allocator<");
        p = isl_printer_print_str(p, array->type);
        p = isl_printer_print_str(p, ">> &");
      }
      p = isl_printer_print_str(p, array->name);
      p = isl_printer_print_str(p, is_dst ? "_to" : "_from");
    } else {
      /* The unserialized buffer is the source of an input module
       * and the destination of any other module.
       */
      const int unserialized = is_dst ? !module->in : (module->in != 0);

      p = isl_printer_print_str(p, "dev_");
      p = isl_printer_print_str(p, array->name);
      if (unserialized)
        p = isl_printer_print_str(p, "_unserialized");
    }
    ++n_printed;
  }

  return p;
}

/* Print out
 * "hls::stream<[type]>"
 *
 * [type] is "[array_name]_s_t[n_lane]" for a sparse array.
 * Otherwise, it is the element type for a single lane and
 * "[array_name]_t[n_lane]" for several lanes.
 */
__isl_give isl_printer *print_fifo_type_xilinx(__isl_take isl_printer *p,
                                               struct autosa_array_ref_group *group, int n_lane)
{
  const int sparse = group->local_array->is_sparse;

  p = isl_printer_print_str(p, "hls::stream<");
  if (!sparse && n_lane == 1) {
    p = isl_printer_print_str(p, group->array->type);
  } else {
    p = isl_printer_print_str(p, group->array->name);
    p = isl_printer_print_str(p, sparse ? "_s_t" : "_t");
    p = isl_printer_print_int(p, n_lane);
  }

  return isl_printer_print_str(p, ">");
}

/* Print out
 * "ac_channel<[type]>"
 *
 * [type] is "[array_name]_s_t[n_lane]" for a sparse array and
 * "[array_name]_t[n_lane]" otherwise, also for a single lane.
 */
__isl_give isl_printer *print_fifo_type_catapult(__isl_take isl_printer *p,
                                                 struct autosa_array_ref_group *group, int n_lane)
{
  const char *type_tag = group->local_array->is_sparse ? "_s_t" : "_t";

  p = isl_printer_print_str(p, "ac_channel<");
  p = isl_printer_print_str(p, group->array->name);
  p = isl_printer_print_str(p, type_tag);
  p = isl_printer_print_int(p, n_lane);

  return isl_printer_print_str(p, ">");
}

/* Print out
 * "channel [type]"
 *
 * [type] is the element type for a single lane and
 * "[array_name]_t[n_lane]" for several lanes.
 */
__isl_give isl_printer *print_fifo_type_intel(__isl_take isl_printer *p,
                                              struct autosa_array_ref_group *group, int n_lane)
{
  struct autosa_array_info *array = group->array;

  p = isl_printer_print_str(p, "channel ");
  if (n_lane == 1)
    return isl_printer_print_str(p, array->type);

  p = isl_printer_print_str(p, array->name);
  p = isl_printer_print_str(p, "_t");
  return isl_printer_print_int(p, n_lane);
}

/* Print out
 * "tapa::[direction]stream<[type]>" if "direction" is given, and
 * "tapa::stream<[type], [fifo_depth]>" otherwise.
 *
 * [type] is "[array_name]_s_t[n_lane]" for a sparse array.
 * Otherwise, it is the element type for a single lane and
 * "[array_name]_t[n_lane]" for several lanes.
 */
__isl_give isl_printer *print_fifo_type_tapa(__isl_take isl_printer *p,
                                             struct autosa_array_ref_group *group,
                                             int n_lane, int fifo_depth, const char *direction)
{
  const int sparse = group->local_array->is_sparse;

  p = isl_printer_print_str(p, "tapa::");
  if (direction)
    p = isl_printer_print_str(p, direction);
  p = isl_printer_print_str(p, "stream<");

  if (!sparse && n_lane == 1) {
    p = isl_printer_print_str(p, group->array->type);
  } else {
    p = isl_printer_print_str(p, group->array->name);
    p = isl_printer_print_str(p, sparse ? "_s_t" : "_t");
    p = isl_printer_print_int(p, n_lane);
  }

  /* The depth is only part of the type when no direction is given. */
  if (!direction) {
    p = isl_printer_print_str(p, ", ");
    p = isl_printer_print_int(p, fifo_depth);
  }

  return isl_printer_print_str(p, ">");
}


/* Print the declaration of a fifo argument for "group":
 * the target specific fifo type, the fifo name of the group and,
 * if "suffix" is not NULL, "_[suffix]".
 *
 * The fifo is declared as a reference ("&") on the Xilinx, TAPA and
 * Catapult targets and by value on the Intel target.
 * No type is printed for any other target.
 * "fifo_depth" and "direction" are only used on the TAPA target.
 */
__isl_give isl_printer *autosa_fifo_print_declaration_arguments(
    __isl_take isl_printer *p, struct autosa_array_ref_group *group, int n_lane,
    const char *suffix, enum platform target, int fifo_depth, const char *direction)
{
  switch (target) {
  case XILINX_HW:
    p = print_fifo_type_xilinx(p, group, n_lane);
    p = isl_printer_print_str(p, " &");
    break;
  case TAPA_HW:
    p = print_fifo_type_tapa(p, group, n_lane, fifo_depth, direction);
    p = isl_printer_print_str(p, " &");
    break;
  case INTEL_HW:
    p = print_fifo_type_intel(p, group, n_lane);
    p = isl_printer_print_str(p, " ");
    break;
  case CATAPULT_HW:
    p = print_fifo_type_catapult(p, group, n_lane);
    p = isl_printer_print_str(p, " &");
    break;
  default:
    break;
  }

  p = autosa_array_ref_group_print_fifo_name(group, p);
  if (!suffix)
    return p;

  p = isl_printer_print_str(p, "_");
  return isl_printer_print_str(p, suffix);
}

/* Print the call argument of the fifo of "group":
 * the fifo name of the group, followed by "_[suffix]"
 * if "suffix" is not NULL.
 * "target" is not used.
 */
__isl_give isl_printer *autosa_fifo_print_call_argument(
    __isl_take isl_printer *p, struct autosa_array_ref_group *group,
    const char *suffix, enum platform target)
{
  p = autosa_array_ref_group_print_fifo_name(group, p);
  if (!suffix)
    return p;

  p = isl_printer_print_str(p, "_");
  return isl_printer_print_str(p, suffix);
}

/* Print the call of an array argument in the module.
 * The bare name of "array" is printed, for read-only scalars
 * as well as for all other arrays.
 */
__isl_give isl_printer *autosa_module_array_info_print_call_argument(
  __isl_take isl_printer *p, struct autosa_array_info *array)
{
  return isl_printer_print_str(p, array->name);
}

/* Print code that sets every element of the variable "var" to zero.
 *
 * One for-loop "for (int c[i] = 0; c[i] < [extent_i]; c[i]++)" is opened
 * per entry of var->size, and the innermost statement is
 * "[name][c0][c1]... = 0;".
 *
 * A "// hls_pipeline" comment line is printed in front of every loop
 * on the Catapult target, and once in front of the assignment
 * on the Xilinx and TAPA targets.
 */
__isl_give isl_printer *autosa_print_var_initialization(
  __isl_take isl_printer *p, struct autosa_kernel_var *var,
  enum platform target)
{
  const int n_dim = isl_vec_size(var->size);
  int d;

  /* Open the loop nest, one level per dimension. */
  for (d = 0; d < n_dim; ++d) {
    isl_val *bound;

    if (target == CATAPULT_HW)
      p = print_str_new_line(p, "// hls_pipeline");

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "for (int c");
    p = isl_printer_print_int(p, d);
    p = isl_printer_print_str(p, " = 0; c");
    p = isl_printer_print_int(p, d);
    p = isl_printer_print_str(p, " < ");
    bound = isl_vec_get_element_val(var->size, d);
    p = isl_printer_print_val(p, bound);
    isl_val_free(bound);
    p = isl_printer_print_str(p, "; c");
    p = isl_printer_print_int(p, d);
    p = isl_printer_print_str(p, "++) {");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, 2);
  }

  if (target == XILINX_HW || target == TAPA_HW)
    p = print_str_new_line(p, "// hls_pipeline");

  /* The assignment itself. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, var->name);
  for (d = 0; d < n_dim; ++d) {
    p = isl_printer_print_str(p, "[c");
    p = isl_printer_print_int(p, d);
    p = isl_printer_print_str(p, "]");
  }
  p = isl_printer_print_str(p, " = 0;");
  p = isl_printer_end_line(p);

  /* Close the loop nest again. */
  for (d = n_dim; d > 0; --d) {
    p = isl_printer_indent(p, -2);
    p = print_str_new_line(p, "}");
  }

  return p;
}

/* Print the arguments to a module declaration or call. If "types" is set,
 * then print a declaration (including the types of the arguments).
 *
 * The arguments are printed in the following order
 * - the module identifiers
 * - the parameters
 * - the host loop iterators
 * - the arrays accessed by the module
 * - the fifos
 * - the enable signal
 * 
 * If module is to_mem with serialize set as 0, we will replace the arrays 
 * by a serialize fifo.
 */
__isl_give isl_printer *print_module_arguments(
    __isl_take isl_printer *p,
    struct autosa_prog *prog,
    struct autosa_kernel *kernel,
    struct autosa_hw_module *module, int types,
    enum platform target,
    int inter, int arb, int boundary, int serialize)
{
  int first = 1;
  isl_space *space;
  int nparam;
  int n;
  const char *type;
  int fifo_depth = prog->scop->options->autosa->fifo_depth;

  type = isl_options_get_ast_iterator_type(prog->ctx);
  /* Module identifiers */
  const char *dims[] = {"idx", "idy", "idz"};
  n = isl_id_list_n_id(module->inst_ids);
  if (!prog->scop->options->autosa->use_cplusplus_template) {
    for (int i = 0; i < n; ++i)
    {
      if (!first)
      {
        p = isl_printer_print_str(p, ", ");
        if (!types)
        {
          p = isl_printer_end_line(p);
          p = isl_printer_start_line(p);
        }
      }
      if (types)
      {
        p = isl_printer_print_str(p, type);
        p = isl_printer_print_str(p, " ");
      }
      if (!types)
      {
        p = isl_printer_print_str(p, "/* module id */ ");
      }
      p = isl_printer_print_str(p, dims[i]);
      first = 0;
    }
  }

  /* params */
  space = isl_union_set_get_space(kernel->arrays);
  nparam = isl_space_dim(space, isl_dim_param);
  for (int i = 0; i < nparam; ++i)
  {
    const char *name;

    name = isl_space_get_dim_name(space, isl_dim_param, i);

    if (!first)
    {
      p = isl_printer_print_str(p, ", ");
      if (!types)
      {
        p = isl_printer_end_line(p);
        p = isl_printer_start_line(p);
      }
    }
    if (types)
      p = isl_printer_print_str(p, "int ");
    if (!types)
      p = isl_printer_print_str(p, "/* param */ ");
    p = isl_printer_print_str(p, name);

    first = 0;
  }
  isl_space_free(space);

  /* host iters */
  if (inter == -1)
    space = module->space;
  else if (inter == 0)
    space = module->intra_space;
  else if (inter == 1)
    space = module->inter_space;

  /* Skip printing the host iterators for inter/intra modules for Catapult HLS */
  if (!(inter >= 0 && target == CATAPULT_HW)) {
    n = isl_space_dim(space, isl_dim_set);
    for (int i = 0; i < n; ++i)
    {
      const char *name;

      if (!first)
      {
        p = isl_printer_print_str(p, ", ");
        if (!types)
        {
          p = isl_printer_end_line(p);
          p = isl_printer_start_line(p);
        }
      }
      name = isl_space_get_dim_name(space, isl_dim_set, i);
      if (types)
      {
        p = isl_printer_print_str(p, type);
        p = isl_printer_print_str(p, " ");
      }
      if (!types)
      {
        p = isl_printer_print_str(p, "/* host iter */ ");
      }
      p = isl_printer_print_str(p, name);
      if (module->double_buffer && inter != -1 && !types)
      {
        if (module->in && inter == 0)
        {
          /* intra trans */
          p = isl_printer_print_str(p, "_prev");
        }
        else if (!module->in && inter == 1)
        {
          /* inter trans */
          p = isl_printer_print_str(p, "_prev");
        }
      }

      first = 0;
    }
  }

  /* Arrays */
  if (module->type != PE_MODULE && module->to_mem)
  {
    if (!module->is_serialized || 
       (module->is_serialized && serialize && !prog->scop->options->autosa->axi_stream)) {
      /* If module satisfies any of the following constraints:
       * 1. not serialized 
       * 2. serialized and not using the AXI stream interface
       * the I/O module will access the external memory through array pointer. */
      struct autosa_io_buffer *io_buffer =
          module->io_groups[0]->io_buffers[module->io_groups[0]->io_level - 1];      
      int n_lane = (module->is_serialized)? module->data_pack_serialize : io_buffer->n_lane;
      if (!first)
      {
        p = isl_printer_print_str(p, ", ");
        if (!types)
        {
          p = isl_printer_end_line(p);
          p = isl_printer_start_line(p);
        }
      }
      if (types)
      {
        p = autosa_array_info_print_declaration_argument(
              p, module->io_groups[0]->array, n_lane,
              target == INTEL_HW ? "__global volatile" : NULL, -1, prog->scop->options->autosa->mem_port_map, target);
      }
      else
      {
        p = isl_printer_print_str(p, "/* array */ ");
        p = autosa_module_array_info_print_call_argument(p,
                                                         module->io_groups[0]->array);
      }
      first = 0;
    } else if (module->is_serialized && serialize && prog->scop->options->autosa->axi_stream && inter == -1) {
      /* The module is serialized and using the AXI stream interface,
       * the I/O module will access the external memory via a stream fifo. */
      struct autosa_io_buffer *io_buffer =
          module->io_groups[0]->io_buffers[module->io_groups[0]->io_level - 1];
      int n_lane = (module->is_serialized)? module->data_pack_serialize : io_buffer->n_lane;
      if (!first) {
        p = isl_printer_print_str(p, ", ");
        if (!types) {
          p = isl_printer_end_line(p);
          p = isl_printer_start_line(p);
        }
      }
      if (types) {
        p = autosa_fifo_print_declaration_arguments(p,
                                                    module->io_groups[0], n_lane, NULL, target, fifo_depth, NULL);
      } else {
        p = isl_printer_print_str(p, "/* fifo */ ");
        p = autosa_fifo_print_call_argument(p,  
                                            module->io_groups[0], NULL, target);
      }
      first = 0;
    } else if (inter == -1) {
      /* The module is serialized and connected to another stream header module,
       * print a normal FIFO interface here. */
      int n_lane = module->data_pack_inter;
      if (!first) {
        p = isl_printer_print_str(p, ", ");
        if (!types) {
          p = isl_printer_end_line(p);
          p = isl_printer_start_line(p);
        }
      }
      if (types) {
        //p = autosa_fifo_print_declaration_arguments(p,
        //                                            module->io_groups[0], n_lane, "serialize", target, fifo_depth);
        p = autosa_fifo_print_declaration_arguments(p,
                                                    module->io_groups[0], n_lane, (module->in)? "in" : "out", target, fifo_depth,
                                                    (module->in)? "i" : "o");
      } else {
        p = isl_printer_print_str(p, "/* fifo */ ");
        //p = autosa_fifo_print_call_argument(p,  
        //                                    module->io_groups[0], "serialize", target);
        p = autosa_fifo_print_call_argument(p,  
                                            module->io_groups[0], (module->in)? "in" : "out", target);
      }
      first = 0;
    }
  } else if (module->type == PE_MODULE) {
    /* Scalars */
    for (int i = 0; i < prog->n_array; i++)
    {
      int required;

      required = autosa_kernel_requires_array_argument(kernel, i);
      if (required < 0)
        return isl_printer_free(p);
      if (!required)
        continue;

      if (autosa_array_is_read_only_scalar(&prog->array[i]))
      {
        if (!first)
        {
          p = isl_printer_print_str(p, ", ");
          if (!types)
          {
            p = isl_printer_end_line(p);
            p = isl_printer_start_line(p);
          }
        }
        if (types)
          p = autosa_array_info_print_declaration_argument(
                p, &prog->array[i], 1, NULL, -1, NULL, target);
        else
        {
          p = isl_printer_print_str(p, "/* scalar */ ");
          p = autosa_array_info_print_call_argument(p,
                                                    &prog->array[i], -1, "buffer");
        }
        first = 0;
      }
    }
  }

  /* Local buffer */
  if (inter != -1)
  {
    for (int i = 0; i < module->n_var; i++)
    {
      struct autosa_kernel_var *var;

      var = (struct autosa_kernel_var *)&module->var[i];
      if (!first)
      {
        p = isl_printer_print_str(p, ", ");
        if (!types)
        {
          p = isl_printer_end_line(p);
          p = isl_printer_start_line(p);
        }
      }
      if (types)
      {
        if (target == CATAPULT_HW) {
          p = isl_printer_print_str(p, "ac_channel<");
          p = isl_printer_print_str(p, module->name);
          p = isl_printer_print_str(p, "_");
          p = isl_printer_print_str(p, var->name);
          p = isl_printer_print_str(p, "> &");
          p = isl_printer_print_str(p, var->name);          
        } else {
          if (module->data_pack_inter == 1 && module->io_groups[0]->local_array->is_sparse == 0) {
            p = isl_printer_print_str(p, var->array->type);
          }
          else {
            p = isl_printer_print_str(p, var->array->name);
            if (var->array->local_array->is_sparse)
              p = isl_printer_print_str(p, "_s");
            p = isl_printer_print_str(p, "_t");
            p = isl_printer_print_int(p, module->data_pack_inter);
          }
          p = isl_printer_print_str(p, " ");
          p = isl_printer_print_str(p, var->name);
          for (int j = 0; j < isl_vec_size(var->size); j++) {
            isl_val *v;
            p = isl_printer_print_str(p, "[");
            v = isl_vec_get_element_val(var->size, j);
            p = isl_printer_print_val(p, v);
            isl_val_free(v);
            p = isl_printer_print_str(p, "]");
          }
        }
      }
      else
      {
        p = isl_printer_print_str(p, "/* array */ ");
        if (target == CATAPULT_HW) {
          p = isl_printer_print_str(p, module->name);
          p = isl_printer_print_str(p, "_");
          p = isl_printer_print_str(p, var->name);
          p = isl_printer_print_str(p, "_inst");
        } else {
          if (!module->double_buffer)
          {
            p = isl_printer_print_str(p, var->name);
          }
          else
          {
            if (arb == 0)
            {
              p = isl_printer_print_str(p, var->name);
              p = isl_printer_print_str(p, inter == 0 ? "_ping" : "_pong");
            }
            else
            {
              p = isl_printer_print_str(p, var->name);
              p = isl_printer_print_str(p, inter == 0 ? "_pong" : "_ping");
            }
          }
        }
      }

      first = 0;
    }
  }

  /* fifos */
  if (module->type == PE_MODULE)
  {
    for (int i = 0; i < module->n_io_group; i++)
    {
      struct autosa_array_ref_group *group = module->io_groups[i];
      //if (!(group->copy_in || group->copy_out))
      //  continue;
      int n_lane = get_io_group_n_lane(module, NULL, group);
      if (module->io_groups[i]->pe_io_dir == IO_IN ||
          module->io_groups[i]->pe_io_dir == IO_INOUT)
      {
        if (!first)
        {
          p = isl_printer_print_str(p, ", ");
          if (!types)
          {
            p = isl_printer_end_line(p);
            p = isl_printer_start_line(p);
          }
        }
        if (types)
        {
          p = autosa_fifo_print_declaration_arguments(p,
                                                      module->io_groups[i], n_lane, "in", target, fifo_depth, "i");
        }
        else
        {
          p = isl_printer_print_str(p, "/* fifo */ ");
          p = autosa_fifo_print_call_argument(p,
                                              module->io_groups[i], "in", target);
        }
        first = 0;
      }
      if (module->io_groups[i]->pe_io_dir == IO_OUT ||
          module->io_groups[i]->pe_io_dir == IO_INOUT)
      {
        if (!first)
        {
          p = isl_printer_print_str(p, ", ");
          if (!types)
          {
            p = isl_printer_end_line(p);
            p = isl_printer_start_line(p);
          }
        }
        if (types)
          p = autosa_fifo_print_declaration_arguments(p,
                                                      module->io_groups[i], n_lane, "out", target, fifo_depth, "o");
        else
        {
          p = isl_printer_print_str(p, "/* fifo */ ");
          p = autosa_fifo_print_call_argument(p,
                                              module->io_groups[i], "out", target);
        }
        first = 0;
      }
    }
  }
  else {
    for (int i = 0; i < module->n_io_group; i++) {      
      if (inter == 1 || (inter == -1 && !module->to_mem)) {
      //if (!module->to_mem && (inter == 1 || inter == -1)) {
        /* inter trans or outer module or default module. */
        if (!(!module->in && boundary)) {
          /* Print in fifo. */
          if (!first) {
            p = isl_printer_print_str(p, ", ");
            if (!types) {
              p = isl_printer_end_line(p);
              p = isl_printer_start_line(p);
            }
          }
          /* in */
          if (types)
            p = autosa_fifo_print_declaration_arguments(p,
                                                        module->io_groups[i], module->data_pack_inter, "in", target, fifo_depth, "i");
          else {
            p = isl_printer_print_str(p, "/* fifo */ ");
            p = autosa_fifo_print_call_argument(p,
                                                module->io_groups[i], "in", target);
          }
          first = 0;
        }

        if (!(module->in && boundary)) {
          /* Print out fifo. */
          /* out */
          if (!first) {
            p = isl_printer_print_str(p, ", ");
            if (!types) {
              p = isl_printer_end_line(p);
              p = isl_printer_start_line(p);
            }
          }
          if (types)
            p = autosa_fifo_print_declaration_arguments(p,
                                                        module->io_groups[i], module->data_pack_inter, "out", target, fifo_depth, "o");
          else {
            p = isl_printer_print_str(p, "/* fifo */ ");
            p = autosa_fifo_print_call_argument(p,
                                                module->io_groups[i], "out", target);
          }
          first = 0;
        }
      }

      if (inter != 1) {
        if (!first) {
          p = isl_printer_print_str(p, ", ");
          if (!types) {
            p = isl_printer_end_line(p);
            p = isl_printer_start_line(p);
          }
        }
        /* local */
        if (types) {
          p = autosa_fifo_print_declaration_arguments(p,
                                                      module->io_groups[i], 
                                                      (module->is_serialized && serialize)? module->data_pack_inter : module->data_pack_intra,                                                      
                                                      module->in ? "local_out" : "local_in", target, fifo_depth,
                                                      module->in ? "o" : "i");
        } else {
          p = isl_printer_print_str(p, "/* fifo */ ");
          p = autosa_fifo_print_call_argument(p,
                                              module->io_groups[i], module->in ? "local_out" : "local_in", target);
        }
        first = 0;
      }
    }
  }

  /* credit fifo */
  if (module->credit)
  {
    if (!first)
    {
      p = isl_printer_print_str(p, ", ");
      if (!types)
      {
        p = isl_printer_end_line(p);
        p = isl_printer_start_line(p);
      }
    }
    if (types)
    {
      if (target == XILINX_HW)
      {
        p = isl_printer_print_str(p, "hls::stream<int> &credit");
      }
      else if (target == TAPA_HW)
      {
        p = isl_printer_print_str(p, "tapa::stream<int> &credit");
      }
      else
      {
        p = isl_printer_print_str(p, "channel int credit");
      }
    }
    else
    {
      p = isl_printer_print_str(p, "/* credit */ ");
      p = isl_printer_print_str(p, "credit");
    }

    first = 0;
  }

  /* enable signal */
  if (module->double_buffer && inter != -1 && target != CATAPULT_HW)
  {
    if (!first)
    {
      p = isl_printer_print_str(p, ", ");
      if (!types)
      {
        p = isl_printer_end_line(p);
        p = isl_printer_start_line(p);
      }
    }
    if (types)
    {
      p = isl_printer_print_str(p, inter == 0 ? "bool intra_trans_en" : "bool inter_trans_en");
    }
    else
    {
      p = isl_printer_print_str(p, "/* enable */ ");
      p = isl_printer_print_str(p, inter == 0 ? "intra_trans_en" : "inter_trans_en");
    }

    first = 0;
  }

  return p;
}

/* Print the argument list of a PE dummy module.
 *
 * When "types" is non-zero a declaration is printed (every argument is
 * preceded by its type); otherwise the arguments are printed as they
 * appear in a call.
 *
 * Arguments are emitted, separated by ", ", in this order:
 * - the module identifiers ("idx", "idy", "idz"), one per entry of
 *   module->inst_ids
 * - the parameters of the space of kernel->arrays (declared as "int")
 * - the host loop iterators, i.e. the set dimensions of module->space
 * - the read-only scalars that the kernel requires as arguments
 * - the single fifo of the dummy module ("in" or "out" depending on
 *   pe_dummy_module->in)
 *
 * The printer is freed and NULL is returned if
 * autosa_kernel_requires_array_argument reports an error.
 */
__isl_give isl_printer *print_pe_dummy_module_arguments(
    __isl_take isl_printer *p,
    struct autosa_prog *prog,
    struct autosa_kernel *kernel,
    struct autosa_pe_dummy_module *pe_dummy_module,
    int types,
    enum platform target)
{
  static const char *const id_names[] = {"idx", "idy", "idz"};
  struct autosa_hw_module *hw_module = pe_dummy_module->module;
  struct autosa_array_ref_group *io_group;
  const int depth = prog->scop->options->autosa->fifo_depth;
  const char *iter_type = isl_options_get_ast_iterator_type(prog->ctx);
  const char *fifo_dir;
  isl_space *dims;
  int need_sep = 0; /* set once at least one argument has been printed */
  int count;
  int lanes;
  int k;

  /* Module identifiers. */
  count = isl_id_list_n_id(hw_module->inst_ids);
  for (k = 0; k < count; k++) {
    if (need_sep)
      p = isl_printer_print_str(p, ", ");
    if (types) {
      p = isl_printer_print_str(p, iter_type);
      p = isl_printer_print_str(p, " ");
    }
    p = isl_printer_print_str(p, id_names[k]);
    need_sep = 1;
  }

  /* Parameters.  The space is a fresh reference and is released below. */
  dims = isl_union_set_get_space(kernel->arrays);
  count = isl_space_dim(dims, isl_dim_param);
  for (k = 0; k < count; k++) {
    const char *param_name = isl_space_get_dim_name(dims, isl_dim_param, k);

    if (need_sep)
      p = isl_printer_print_str(p, ", ");
    if (types)
      p = isl_printer_print_str(p, "int ");
    p = isl_printer_print_str(p, param_name);
    need_sep = 1;
  }
  isl_space_free(dims);

  /* Host iterators.  module->space is only borrowed here, so it is not freed. */
  dims = hw_module->space;
  count = isl_space_dim(dims, isl_dim_set);
  for (k = 0; k < count; k++) {
    const char *iter_name;

    if (need_sep)
      p = isl_printer_print_str(p, ", ");
    iter_name = isl_space_get_dim_name(dims, isl_dim_set, k);
    if (types) {
      p = isl_printer_print_str(p, iter_type);
      p = isl_printer_print_str(p, " ");
    }
    p = isl_printer_print_str(p, iter_name);
    need_sep = 1;
  }

  /* Arrays: only read-only scalars are passed to a dummy module. */
  for (k = 0; k < prog->n_array; k++) {
    const int needed = autosa_kernel_requires_array_argument(kernel, k);

    if (needed < 0)
      return isl_printer_free(p);
    if (!needed || !autosa_array_is_read_only_scalar(&prog->array[k]))
      continue;

    if (need_sep)
      p = isl_printer_print_str(p, ", ");
    if (types)
      p = autosa_array_info_print_declaration_argument(
            p, &prog->array[k], 1, NULL, -1, NULL, target);
    else
      p = autosa_module_array_info_print_call_argument(p, &prog->array[k]);
    need_sep = 1;
  }

  /* Fifo: a dummy module is attached to exactly one I/O group. */
  io_group = pe_dummy_module->io_group;
  lanes = get_io_group_n_lane(NULL, pe_dummy_module, io_group);
  fifo_dir = pe_dummy_module->in ? "in" : "out";

  if (need_sep)
    p = isl_printer_print_str(p, ", ");
  if (types)
    p = autosa_fifo_print_declaration_arguments(p, io_group, lanes, fifo_dir,
                                                target, depth,
                                                pe_dummy_module->in ? "i" : "o");
  else
    p = autosa_fifo_print_call_argument(p, io_group, fifo_dir, target);

  return p;
}

/* Print the arguments of the top_gen function, separated by ", ":
 * - the parameters of the space of kernel->arrays (declared as "int")
 * - the host loop iterators, i.e. the set dimensions of kernel->space
 * - the file descriptor "f" (declared as "FILE *")
 *
 * If "types" is non-zero, each argument is preceded by its type
 * (declaration form); otherwise only the names are printed (call form).
 */
__isl_give isl_printer *print_top_gen_arguments(__isl_take isl_printer *p,
                                                struct autosa_prog *prog, struct autosa_kernel *kernel, int types)
{
  int i, n;
  int first = 1;
  /* Kept signed: isl_space_dim reports failure with a negative value.
   * Storing that in an unsigned variable would turn it into a huge loop
   * bound and make "i < nparam" a signed/unsigned comparison.
   */
  int nparam;
  isl_space *space;
  const char *type;

  /* Parameters */
  space = isl_union_set_get_space(kernel->arrays);
  nparam = isl_space_dim(space, isl_dim_param);
  for (i = 0; i < nparam; ++i)
  {
    const char *name;

    name = isl_space_get_dim_name(space, isl_dim_param, i);

    if (!first)
      p = isl_printer_print_str(p, ", ");
    if (types)
      p = isl_printer_print_str(p, "int ");
    p = isl_printer_print_str(p, name);

    first = 0;
  }
  isl_space_free(space);

  /* Host iterators */
  n = isl_space_dim(kernel->space, isl_dim_set);
  type = isl_options_get_ast_iterator_type(prog->ctx);
  for (i = 0; i < n; ++i)
  {
    const char *name;

    if (!first)
      p = isl_printer_print_str(p, ", ");
    name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
    if (types)
    {
      p = isl_printer_print_str(p, type);
      p = isl_printer_print_str(p, " ");
    }
    p = isl_printer_print_str(p, name);

    first = 0;
  }

  /* File descriptor: always the last argument. */
  if (!first)
    p = isl_printer_print_str(p, ", ");
  if (types)
  {
    p = isl_printer_print_str(p, "FILE *");
  }
  p = isl_printer_print_str(p, "f");

  return p;
}

/* Print the signature of the top_generate function,
 * "void top_generate(<typed arguments>)", on a newly started line.
 * The line is left open so that the caller can append ";" or end it.
 */
static __isl_give isl_printer *print_top_gen_header(__isl_take isl_printer *p,
                                                    struct autosa_prog *prog, struct autosa_hw_top_module *top)
{
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "void top_generate(");
  p = print_top_gen_arguments(p, prog, top->kernel, 1);

  return isl_printer_print_str(p, ")");
}

/* Write the top_generate signature to both output files of "hls":
 * first as a prototype (terminated by ";") to hls->top_gen_h,
 * then as the opening line of the definition to hls->top_gen_c.
 */
void print_top_gen_headers(
    struct autosa_prog *prog, struct autosa_hw_top_module *top, struct hls_info *hls)
{
  int pass;

  /* Pass 0 handles the header file, pass 1 the source file. */
  for (pass = 0; pass < 2; pass++)
  {
    const int is_prototype = (pass == 0);
    isl_printer *out;

    out = isl_printer_to_file(prog->ctx,
                              is_prototype ? hls->top_gen_h : hls->top_gen_c);
    out = isl_printer_set_output_format(out, ISL_FORMAT_C);
    out = print_top_gen_header(out, prog, top);
    if (is_prototype)
      out = isl_printer_print_str(out, ";");
    out = isl_printer_end_line(out);
    isl_printer_free(out);
  }
}

/* Print a C block comment whose text is "<module name> fifo",
 * i.e. the comment opener, module->name, " fifo" and the comment closer.
 * No line is started or ended.
 */
static __isl_give isl_printer *print_fifo_comment(
    __isl_take isl_printer *p, struct autosa_hw_module *module)
{
  p = isl_printer_print_str(p, "/* ");
  p = isl_printer_print_str(p, module->name);

  return isl_printer_print_str(p, " fifo */");
}

/* Emit generator code that appends the instance-id suffix "_c0_c1..." for
 * "n" identifiers.  For each identifier two lines are emitted:
 *   p = isl_printer_print_str(p, "_");
 *   p = isl_printer_print_int(p, c<i>);
 * For the identifier at index "pos" the printed expression becomes
 * "c<pos> + val", unless "val" is zero.
 */
static __isl_give isl_printer *print_inst_ids_inc_suffix(
    __isl_take isl_printer *p, int n, int pos, int val)
{
  int idx = 0;

  while (idx < n)
  {
    const int shifted = (idx == pos) && (val != 0);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"_\");");
    p = isl_printer_end_line(p);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "p = isl_printer_print_int(p, c");
    p = isl_printer_print_int(p, idx);
    if (shifted)
    {
      p = isl_printer_print_str(p, " + ");
      p = isl_printer_print_int(p, val);
    }
    p = isl_printer_print_str(p, ");");
    p = isl_printer_end_line(p);

    idx++;
  }

  return p;
}

/* Emit generator code that appends the instance-id suffix "_c0_c1..." for
 * "n" identifiers.  For each identifier two lines are emitted:
 *   p = isl_printer_print_str(p, "_");
 *   p = isl_printer_print_int(p, c<i>);
 * If "offset" is not NULL, its i-th element is added to the printed
 * expression ("c<i> + <element>") whenever that element is not zero.
 */
static __isl_give isl_printer *print_inst_ids_suffix(
    __isl_take isl_printer *p, int n, __isl_keep isl_vec *offset)
{
  int idx;

  for (idx = 0; idx < n; ++idx)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"_\");");
    p = isl_printer_end_line(p);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "p = isl_printer_print_int(p, c");
    p = isl_printer_print_int(p, idx);
    if (offset)
    {
      /* The element is a fresh reference and must be released again. */
      isl_val *delta = isl_vec_get_element_val(offset, idx);

      if (!isl_val_is_zero(delta))
      {
        p = isl_printer_print_str(p, " + ");
        p = isl_printer_print_val(p, delta);
      }
      isl_val_free(delta);
    }
    p = isl_printer_print_str(p, ");");
    p = isl_printer_end_line(p);
  }

  return p;
}

/* Emit generator code that appends an instance-id suffix whose components
 * are taken from the operands of "expr".
 *
 * For each of the "n_id" identifiers two lines are emitted:
 *   p = isl_printer_print_str(p, "_");
 *   p = isl_printer_print_int(p, <operand i + 1 of expr, printed as C>);
 * Operand 0 of "expr" is skipped.
 * If "offset" is not NULL, its i-th element is added to the printed
 * expression whenever that element is not zero.
 *
 * The output format of "p" is switched to ISL_FORMAT_C only while the
 * operand is printed and is restored afterwards.
 */
static __isl_give isl_printer *print_pretrans_inst_ids_suffix(
    __isl_take isl_printer *p, int n_id,
    __isl_keep isl_ast_expr *expr, __isl_keep isl_vec *offset)
{
  for (int i = 0; i < n_id; i++)
  {
    /* Fresh reference to the operand; released at the end of the iteration. */
    isl_ast_expr *expr_i = isl_ast_expr_get_op_arg(expr, i + 1);
    int format;

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"_\");");
    p = isl_printer_end_line(p);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "p = isl_printer_print_int(p, ");
    format = isl_printer_get_output_format(p);
    p = isl_printer_set_output_format(p, ISL_FORMAT_C);
    p = isl_printer_print_ast_expr(p, expr_i);
    p = isl_printer_set_output_format(p, format);
    if (offset)
    {
      isl_val *val = isl_vec_get_element_val(offset, i);
      if (!isl_val_is_zero(val))
      {
        p = isl_printer_print_str(p, " + ");
        p = isl_printer_print_val(p, val);
      }
      isl_val_free(val);
    }
    p = isl_printer_print_str(p, ");");
    p = isl_printer_end_line(p);

    isl_ast_expr_free(expr_i);
  }

  return p;
}

/* Emit "text" as one complete line. */
static __isl_give isl_printer *fifo_decl_emit_line(
    __isl_take isl_printer *p, const char *text)
{
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, text);
  return isl_printer_end_line(p);
}

/* Print "<fifo name>_<module name>", followed by "suffix" if "pe_inout"
 * is set.  "suffix" is not accessed when "pe_inout" is zero.
 */
static __isl_give isl_printer *fifo_decl_emit_name(
    __isl_take isl_printer *p, struct autosa_hw_module *module,
    struct autosa_array_ref_group *group, int pe_inout, const char *suffix)
{
  p = autosa_array_ref_group_print_fifo_name(group, p);
  p = isl_printer_print_str(p, "_");
  p = isl_printer_print_str(p, module->name);
  if (pe_inout)
    p = isl_printer_print_str(p, suffix);

  return p;
}

/* Emit one line consisting of "lead", the fifo name (see
 * fifo_decl_emit_name) and the closing sequence "\");" of the generated
 * print statement.
 */
static __isl_give isl_printer *fifo_decl_emit_named_stmt(
    __isl_take isl_printer *p, const char *lead,
    struct autosa_hw_module *module, struct autosa_array_ref_group *group,
    int pe_inout, const char *suffix)
{
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, lead);
  p = fifo_decl_emit_name(p, module, group, pe_inout, suffix);
  p = isl_printer_print_str(p, "\");");
  return isl_printer_end_line(p);
}

/* Emit the code that prints the instance-id suffix of a fifo:
 * - I/O and drain modules: the plain ids, with the last id incremented
 *   by one if "boundary" is set;
 * - PE modules: the ids taken from group->io_L1_pe_expr, with group->dir
 *   added if "boundary" is set;
 * - any other module type: nothing.
 */
static __isl_give isl_printer *fifo_decl_emit_inst_ids(
    __isl_take isl_printer *p, struct autosa_hw_module *module,
    struct autosa_array_ref_group *group, int boundary, int n_id)
{
  if (module->type == IO_MODULE || module->type == DRAIN_MODULE)
    return boundary ? print_inst_ids_inc_suffix(p, n_id, n_id - 1, 1)
                    : print_inst_ids_suffix(p, n_id, NULL);

  if (module->type == PE_MODULE)
    return print_pretrans_inst_ids_suffix(p, n_id, group->io_L1_pe_expr,
                                          boundary ? group->dir : NULL);

  return p;
}

/* Emit the generator code that, when run, prints the declaration of a
 * single fifo of the module referenced by "stmt".
 *
 * The emitted code increments "fifo_cnt" and prints
 *   <comment> <fifo type> <fifo name>_<module name>[suffix]<inst ids>
 * followed by target-specific decoration:
 * - INTEL_HW:  a "__attribute__((depth(<fifo_depth>)))" attribute
 * - TAPA_HW:   the quoted fifo name in parentheses
 * - XILINX_HW: "#pragma HLS STREAM" and "#pragma HLS RESOURCE" lines after
 *              the declaration, plus "#pragma HLS DATA_PACK" if the local
 *              array of the group is sparse
 *
 * "suffix" is appended to the fifo name only if "pe_inout" is set.
 */
static __isl_give isl_printer *print_fifo_decl_single(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct autosa_prog *prog,
    struct hls_info *hls, int pe_inout, const char *suffix)
{
  struct autosa_hw_module *module = stmt->u.m.module;
  struct autosa_array_ref_group *group = stmt->u.m.group;
  const int boundary = stmt->u.m.boundary;
  const int fifo_depth = prog->scop->options->autosa->fifo_depth;
  const int target = hls->target;
  int n_id;
  int n_lane;

  p = fifo_decl_emit_line(p, "// Count channel number");
  p = fifo_decl_emit_line(p, "fifo_cnt++;");

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "// Print channel declarations of module: ");
  p = isl_printer_print_str(p, module->name);
  p = isl_printer_end_line(p);

  p = fifo_decl_emit_line(p, "p = isl_printer_start_line(p);");

  /* Comment, type and base name of the fifo. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"");
  p = print_fifo_comment(p, module);
  p = isl_printer_print_str(p, " ");
  n_lane = get_io_group_n_lane(module, NULL, group);
  switch (target)
  {
  case XILINX_HW:
    p = print_fifo_type_xilinx(p, group, n_lane);
    break;
  case TAPA_HW:
    p = print_fifo_type_tapa(p, group, n_lane, fifo_depth, NULL);
    break;
  case INTEL_HW:
    p = print_fifo_type_intel(p, group, n_lane);
    break;
  case CATAPULT_HW:
    p = print_fifo_type_catapult(p, group, n_lane);
    break;
  default:
    break;
  }
  p = isl_printer_print_str(p, " ");
  p = fifo_decl_emit_name(p, module, group, pe_inout, suffix);
  p = isl_printer_print_str(p, "\");");
  p = isl_printer_end_line(p);

  n_id = isl_id_list_n_id(module->inst_ids);
  p = fifo_decl_emit_inst_ids(p, module, group, boundary, n_id);

  if (target == INTEL_HW)
  {
    /* Print fifo attribute */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \" __attribute__((depth(");
    p = isl_printer_print_int(p, fifo_depth);
    p = isl_printer_print_str(p, ")))\");");
    p = isl_printer_end_line(p);
  }
  else if (target == TAPA_HW)
  {
    /* The fifo name is repeated as a quoted string in parentheses. */
    p = fifo_decl_emit_named_stmt(p, "p = isl_printer_print_str(p, \"(\\\"",
                                  module, group, pe_inout, suffix);
    p = fifo_decl_emit_inst_ids(p, module, group, boundary, n_id);
    /* This end_line has no matching start_line; it is kept so that the
     * emitted text stays unchanged. */
    p = isl_printer_end_line(p);
    p = fifo_decl_emit_line(p, "p = isl_printer_print_str(p, \"\\\")\");");
  }

  p = print_str_new_line(p, "p = isl_printer_print_str(p, \";\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  if (target != XILINX_HW)
    return p;

  /* Print fifo pragma */
  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = fifo_decl_emit_named_stmt(p, "p = isl_printer_print_str(p, \"#pragma HLS STREAM variable=",
                                module, group, pe_inout, suffix);
  p = fifo_decl_emit_inst_ids(p, module, group, boundary, n_id);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \" depth=");
  p = isl_printer_print_int(p, fifo_depth);
  p = isl_printer_print_str(p, "\");");
  p = isl_printer_end_line(p);
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  /* Print fifo resource pragma.
   * If depth * width > 512 bits, HLS will use BRAM to implement FIFOs.
   * The SRL implementation is requested instead, for every fifo.
   */
  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = fifo_decl_emit_named_stmt(p, "p = isl_printer_print_str(p, \"#pragma HLS RESOURCE variable=",
                                module, group, pe_inout, suffix);
  p = fifo_decl_emit_inst_ids(p, module, group, boundary, n_id);
  p = print_str_new_line(p, "p = isl_printer_print_str(p, \" core=FIFO_SRL\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  /* For sparse structure, we will need to perform data pack. */
  if (!group->local_array->is_sparse)
    return p;

  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = fifo_decl_emit_named_stmt(p, "p = isl_printer_print_str(p, \"#pragma HLS DATA_PACK variable=",
                                module, group, pe_inout, suffix);
  p = fifo_decl_emit_inst_ids(p, module, group, boundary, n_id);
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  return p;
}

/* Emit the code that declares the fifo(s) described by "stmt".
 *
 * Naming of the declared fifo:
 * - PE module, boundary == 0:
 *     [fifo_name]_[module_name]_[io_trans(inst_id)]
 * - PE module, boundary == 1:
 *     [fifo_name]_[module_name]_[io_trans(inst_id) + dep_dir]
 * - I/O module:
 *     [fifo_name]_[module_name]_[inst_id]
 *
 * A group with internal I/O (AUTOSA_INT_IO) on a PE module that is both
 * read and written (IO_INOUT) gets two declarations, with the suffixes
 * "_in" and "_out" appended to the module name.  Every other combination
 * gets a single declaration without suffix.
 */
static __isl_give isl_printer *print_fifo_decl(__isl_take isl_printer *p,
                                               struct autosa_kernel_stmt *stmt, struct autosa_prog *prog, struct hls_info *hls)
{
  struct autosa_hw_module *module = stmt->u.m.module;
  struct autosa_array_ref_group *group = stmt->u.m.group;
  const int split_inout = group->io_type == AUTOSA_INT_IO &&
                          module->type == PE_MODULE &&
                          group->pe_io_dir == IO_INOUT;

  if (!split_inout)
    return print_fifo_decl_single(p, stmt, prog, hls, 0, NULL);

  p = print_fifo_decl_single(p, stmt, prog, hls, 1, "_in");
  return print_fifo_decl_single(p, stmt, prog, hls, 1, "_out");
}

/* Print the fifo declaration for "stmt", enclosed in a block that is
 * opened with ppcg_start_block and closed with ppcg_end_block.
 */
__isl_give isl_printer *autosa_kernel_print_fifo_decl(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct autosa_prog *prog, struct hls_info *hls)
{
  isl_printer *printer = ppcg_start_block(p);

  /* The declaration itself lives inside the block. */
  printer = print_fifo_decl(printer, stmt, prog, hls);

  return ppcg_end_block(printer);
}

/* Print the generator statements that separate two consecutive items of
 * an argument list.
 *
 * Unless "*first" is set, the printed statements output a "," and end
 * the current line.  In all cases a statement that starts a new line is
 * printed afterwards.  "*first" is cleared before returning, so only the
 * very first item of a list is not preceded by a comma.
 */
static __isl_give isl_printer *print_delimiter(__isl_take isl_printer *p,
                                               int *first)
{
  const char *stmts[3];
  int n_stmt = 0;

  if (!*first)
  {
    stmts[n_stmt++] = "p = isl_printer_print_str(p, \",\");";
    stmts[n_stmt++] = "p = isl_printer_end_line(p);";
  }
  stmts[n_stmt++] = "p = isl_printer_start_line(p);";

  for (int k = 0; k < n_stmt; k++)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, stmts[k]);
    p = isl_printer_end_line(p);
  }

  *first = 0;

  return p;
}

/* Print, on a line of its own, the generator statement that outputs the
 * fifo annotation comment in front of a fifo argument.
 */
static __isl_give isl_printer *print_fifo_annotation(__isl_take isl_printer *p)
{
  const char *annotation_stmt = "p = isl_printer_print_str(p, \"/* fifo */ \");";

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, annotation_stmt);
  return isl_printer_end_line(p);
}

/* Print, on a line of its own, the generator statement that outputs
 *
 *   [fifo_name]_[module_name]
 *
 * where [fifo_name] is produced by autosa_array_ref_group_print_fifo_name
 * for "group" and [module_name] is the name of "module".
 */
static __isl_give isl_printer *print_fifo_prefix(__isl_take isl_printer *p,
                                                 struct autosa_hw_module *module, struct autosa_array_ref_group *group)
{
  isl_printer *out = isl_printer_start_line(p);

  /* Opening of the generated print statement. */
  out = isl_printer_print_str(out, "p = isl_printer_print_str(p, \"");
  /* Payload: fifo name, separator, module name. */
  out = autosa_array_ref_group_print_fifo_name(group, out);
  out = isl_printer_print_str(out, "_");
  out = isl_printer_print_str(out, module->name);
  /* Closing of the generated print statement. */
  out = isl_printer_print_str(out, "\");");

  return isl_printer_end_line(out);
}

/* Print the upper body of the module call, including:
 * - module identifier
 * - parameters
 * - host loop iterators
 * - arrays
 * - inter-module fifos
 *
 * Nearly everything printed here is generator code: C statements of the
 * form "p = isl_printer_...(p, ...);" that print the actual module call
 * when they are executed later.  Only the leading "// Print calls of
 * module" comment and the optional "int cN = ...;" definition for dummy
 * modules are plain statements of the generated program.
 *
 * "stmt" supplies the module, the (optional) PE dummy module and the
 * boundary/serialize/dummy flags.  "prog" supplies the options and the
 * array list.  "target" selects the call syntax:
 * - TAPA_HW: the call is wrapped as ".invoke(<name>," and arguments follow,
 * - CATAPULT_HW: the call is "<name>_inst[_<id>...].run(",
 * - otherwise: "<name>(".
 *
 * Arguments are separated through print_delimiter, which uses "first" to
 * suppress the comma in front of the first argument.
 *
 * Return the updated printer, or the result of isl_printer_free if
 * autosa_kernel_requires_array_argument reports an error.
 */
__isl_give isl_printer *print_module_call_upper(__isl_take isl_printer *p,
                                                struct autosa_kernel_stmt *stmt, struct autosa_prog *prog,
                                                enum platform target)
{
  struct autosa_hw_module *module = stmt->u.m.module;
  struct autosa_pe_dummy_module *pe_dummy_module = stmt->u.m.pe_dummy_module;
  int boundary = stmt->u.m.boundary;
  int serialize = stmt->u.m.serialize;
  int dummy = stmt->u.m.dummy;
  int first = 1;
  int n;
  char *module_name = stmt->u.m.module_name;
  isl_space *space;

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "// Print calls of module: ");
  p = isl_printer_print_str(p, module_name);
  if (boundary) {
    p = isl_printer_print_str(p, "_boundary");
  }
  if (serialize) {
    p = isl_printer_print_str(p, "_serialize");
  }
  p = isl_printer_end_line(p);

  /* For dummy modules with a fixed schedule value, pin the last
   * instance id variable to that value in the generator code.
   */
  if (dummy && stmt->u.m.lower_sched_val != -1) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "int c");
    p = isl_printer_print_int(p, isl_id_list_n_id(module->inst_ids) - 1);
    p = isl_printer_print_str(p, " = ");
    p = isl_printer_print_int(p, stmt->u.m.lower_sched_val);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
  }

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_start_line(p);");
  p = isl_printer_end_line(p);

  if (target == TAPA_HW)
    p = print_str_new_line(p, "p = isl_printer_print_str(p, \".invoke(\");");

  /* Callee name: module name plus boundary/serialize/wrapper suffixes. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"");
  p = isl_printer_print_str(p, module_name);
  if (boundary) {
    p = isl_printer_print_str(p, "_boundary");
  }
  if (serialize) {
    p = isl_printer_print_str(p, "_serialize");
  }

  if (target == XILINX_HW || target == TAPA_HW) {
    if (!dummy && module->type == PE_MODULE)
      p = isl_printer_print_str(p, "_wrapper");
    else if (module->type != PE_MODULE && module->level == 1)
      p = isl_printer_print_str(p, "_wrapper");
  }
  if (target == CATAPULT_HW) {
    p = isl_printer_print_str(p, "_inst\");");
    /* Terminate the statement's line before starting further lines, so
     * that each generated statement sits on a line of its own.
     */
    p = isl_printer_end_line(p);
    /* Print module ids if any */
    if (isl_id_list_n_id(module->inst_ids) > 0) {
      for (int i = 0; i < isl_id_list_n_id(module->inst_ids); i++)
      {
        p = print_str_new_line(p, "p = isl_printer_print_str(p, \"_\");");
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "p = isl_printer_print_int(p, c");
        p = isl_printer_print_int(p, i);
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);
      }
    }
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \".run");
  }
  p = isl_printer_print_str(p, "\");");
  p = isl_printer_end_line(p);

  /* With C++ templates, the module ids are passed as template arguments. */
  if (isl_id_list_n_id(module->inst_ids) > 0 && prog->scop->options->autosa->use_cplusplus_template) {
    p = print_str_new_line(p, "p = isl_printer_print_str(p, \"<\");");
    if (!dummy) {
      for (int i = 0; i < isl_id_list_n_id(module->inst_ids); i++) {
        if (i > 0) {
          p = print_str_new_line(p, "p = isl_printer_print_str(p, \", \");");
        }
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "p = isl_printer_print_int(p, c");
        p = isl_printer_print_int(p, i);
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);
      }
    } else {
      /* Dummy modules take their ids from the arguments of io_L1_pe_expr;
       * argument 0 is skipped.
       */
      isl_ast_expr *expr = pe_dummy_module->io_group->io_L1_pe_expr;
      for (int i = 0; i < isl_id_list_n_id(module->inst_ids); i++) {
        if (i > 0) {
          p = print_str_new_line(p, "p = isl_printer_print_str(p, \", \");");
        }
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "p = isl_printer_print_int(p, ");
        isl_ast_expr *expr_i = isl_ast_expr_get_op_arg(expr, i + 1);
        p = isl_printer_print_ast_expr(p, expr_i);
        isl_ast_expr_free(expr_i);
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);
      }
    }
    p = print_str_new_line(p, "p = isl_printer_print_str(p, \">\");");
  }

  /* Open the argument list.  For TAPA the callee is itself the first
   * argument of ".invoke(", so a comma follows instead of a parenthesis.
   */
  if (target != TAPA_HW)
    p = print_str_new_line(p, "p = isl_printer_print_str(p, \"(\");");
  else
    p = print_str_new_line(p, "p = isl_printer_print_str(p, \",\");");

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_end_line(p);");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_indent(p, 2);");
  p = isl_printer_end_line(p);

  /* module identifiers */
  if (!prog->scop->options->autosa->use_cplusplus_template) {
    if (!dummy)
    {
      for (int i = 0; i < isl_id_list_n_id(module->inst_ids); i++)
      {
        p = print_delimiter(p, &first);
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* module id */ \");");
        p = isl_printer_end_line(p);
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "p = isl_printer_print_int(p, c");
        p = isl_printer_print_int(p, i);
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);
      }
    }
    else
    {
      isl_ast_expr *expr = pe_dummy_module->io_group->io_L1_pe_expr;
      for (int i = 0; i < isl_id_list_n_id(module->inst_ids); i++)
      {
        p = print_delimiter(p, &first);
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* module id */ \");");
        p = isl_printer_end_line(p);
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "p = isl_printer_print_int(p, ");

        isl_ast_expr *expr_i = isl_ast_expr_get_op_arg(expr, i + 1);
        p = isl_printer_print_ast_expr(p, expr_i);
        isl_ast_expr_free(expr_i);
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);
      }
    }
  }

  /* params */
  space = isl_union_set_get_space(module->kernel->arrays);
  n = isl_space_dim(space, isl_dim_param);
  for (int i = 0; i < n; i++)
  {
    p = print_delimiter(p, &first);

    /* The loop runs over the parameter dimensions, so the name has to be
     * looked up among the parameters as well.
     */
    const char *name = isl_space_get_dim_name(space, isl_dim_param, i);
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* param */");
    p = isl_printer_print_str(p, name);
    p = isl_printer_print_str(p, "\");");
    p = isl_printer_end_line(p);
  }
  isl_space_free(space);

  /* host iterators */
  n = isl_space_dim(module->kernel->space, isl_dim_set);
  for (int i = 0; i < n; i++)
  {
    p = print_delimiter(p, &first);

    const char *name = isl_space_get_dim_name(module->kernel->space, isl_dim_set, i);
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* host iter */ ");
    p = isl_printer_print_str(p, name);
    p = isl_printer_print_str(p, "\");");
    p = isl_printer_end_line(p);
  }

  /* scalar and arrays */
  if (module->type != PE_MODULE && module->to_mem &&
      ((module->is_serialized && serialize) || !module->is_serialized))
  {
    p = print_delimiter(p, &first);

    p = isl_printer_start_line(p);
    if (prog->scop->options->autosa->axi_stream) {
      p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* fifo */ ");
      p = isl_printer_print_str(p, "fifo_");
      p = isl_printer_print_str(p, module->io_groups[0]->array->name);
    } else {
      p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* array */ ");
      p = isl_printer_print_str(p, module->io_groups[0]->array->name);
    }
    if (module->io_groups[0]->local_array->n_io_group_refs > 1)
    {
      if (module->io_groups[0]->n_mem_ports == 1)
      {
        /* Print A_[module_n_array_ref] */
        p = isl_printer_print_str(p, "_");
        p = isl_printer_print_int(p, module->n_array_ref);
        p = isl_printer_print_str(p, "\");");
        p = isl_printer_end_line(p);
      }
      else
      {
        /* Print A_[module_n_array_ref + c0] */
        p = isl_printer_print_str(p, "_\");");
        p = isl_printer_end_line(p);
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "p = isl_printer_print_int(p, c0 + ");
        p = isl_printer_print_int(p, module->n_array_ref);
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);
      }
    }
    else
    {
      p = isl_printer_print_str(p, "\");");
      p = isl_printer_end_line(p);
    }
  }
  else if (module->type == PE_MODULE)
  {
    for (int i = 0; i < prog->n_array; i++)
    {
      int required;

      required = autosa_kernel_requires_array_argument(module->kernel, i);
      if (required < 0)
        return isl_printer_free(p);
      if (!required)
        continue;

      if (autosa_array_is_read_only_scalar(&prog->array[i]))
      {
        p = print_delimiter(p, &first);

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* scalar */ ");
        /* Pass the scalar that was just tested, not the array of the
         * module's first I/O group.
         */
        p = isl_printer_print_str(p, prog->array[i].name);
        p = isl_printer_print_str(p, "\");");
        p = isl_printer_end_line(p);
      }
    }
  }

  /* FIFO */
  n = isl_id_list_n_id(module->inst_ids);
  if (module->type == PE_MODULE)
  {
    if (dummy)
    {
      struct autosa_array_ref_group *group = pe_dummy_module->io_group;
      p = print_delimiter(p, &first);
      p = print_fifo_annotation(p);
      p = print_fifo_prefix(p, module, group);
      if (isl_vec_is_zero(group->dir))
      {
        p = isl_printer_start_line(p);
        /* The generated statement needs its terminating semicolon. */
        p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"_in\");");
        p = isl_printer_end_line(p);
      }
      if (pe_dummy_module->in)
        p = print_pretrans_inst_ids_suffix(p, n, group->io_L1_pe_expr, group->dir);
      else
        p = print_pretrans_inst_ids_suffix(p, n, group->io_L1_pe_expr, NULL);
    }
    else
    {
      for (int i = 0; i < module->n_io_group; i++)
      {
        struct autosa_array_ref_group *group = module->io_groups[i];
        if (group->pe_io_dir == IO_NULL)
          continue;
        if (group->pe_io_dir == IO_INOUT)
        {
          /* Input side of the group. */
          p = print_delimiter(p, &first);
          p = print_fifo_annotation(p);
          p = print_fifo_prefix(p, module, group);
          if (group->io_type == AUTOSA_INT_IO)
          {
            p = isl_printer_start_line(p);
            p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"_in\");");
            p = isl_printer_end_line(p);
          }
          p = print_inst_ids_suffix(p, n, NULL);

          /* Output side of the group. */
          p = print_delimiter(p, &first);
          p = print_fifo_annotation(p);
          p = print_fifo_prefix(p, module, group);
          if (group->io_type == AUTOSA_INT_IO)
          {
            p = isl_printer_start_line(p);
            p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"_out\");");
            p = isl_printer_end_line(p);
          }
          if (group->io_type == AUTOSA_INT_IO)
          {
            p = print_inst_ids_suffix(p, n, NULL);
          }
          else
          {
            p = print_inst_ids_suffix(p, n, group->dir);
          }
        }
        else
        {
          p = print_delimiter(p, &first);
          p = print_fifo_annotation(p);
          p = print_fifo_prefix(p, module, group);
          p = print_inst_ids_suffix(p, n, NULL);
        }
      }
    }
  }
  else
  {
    if (!module->to_mem)
    {
      for (int i = 0; i < module->n_io_group; i++)
      {
        struct autosa_array_ref_group *group = module->io_groups[i];
        if (module->in)
        {
          p = print_delimiter(p, &first);
          p = print_fifo_annotation(p);
          p = print_fifo_prefix(p, module, group);
          p = print_inst_ids_suffix(p, n, NULL);

          /* Non-boundary modules also connect to the module whose last
           * instance id is one larger.
           */
          if (!boundary)
          {
            p = print_delimiter(p, &first);
            p = print_fifo_annotation(p);
            p = print_fifo_prefix(p, module, group);
            p = print_inst_ids_inc_suffix(p, n, n - 1, 1);
          }
        }
        else
        {
          if (!boundary)
          {
            p = print_delimiter(p, &first);
            p = print_fifo_annotation(p);
            p = print_fifo_prefix(p, module, group);
            p = print_inst_ids_inc_suffix(p, n, n - 1, 1);
          }

          p = print_delimiter(p, &first);
          p = print_fifo_annotation(p);
          p = print_fifo_prefix(p, module, group);
          p = print_inst_ids_suffix(p, n, NULL);
        }
      }
    } else {
      /* A serialized module that is not called as the serialize module
       * itself gets the "_serialize" fifo instead of the array argument.
       */
      if (module->is_serialized && !serialize) {
        struct autosa_array_ref_group *group = module->io_groups[0];
        p = print_delimiter(p, &first);
        p = print_fifo_annotation(p);
        p = print_fifo_prefix(p, module, group);
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"_serialize\");");
        p = isl_printer_end_line(p);
      }
    }
  }

  return p;
}

/* Construct the name of the module one level below "module".
 *
 * The name is assembled from the array name of the module's first I/O
 * group, an optional group tag ("_<nr>" for an I/O group when the local
 * array has more than one I/O group, "_drain" for a drain group),
 * "_IO_L<level - 1>" and the direction suffix "_in" or "_out".
 *
 * The returned string is obtained from isl_printer_get_str and has to be
 * released by the caller.
 */
static char *build_io_module_lower_name(struct autosa_hw_module *module)
{
  struct autosa_array_ref_group *io_group = module->io_groups[0];
  isl_printer *str_p = isl_printer_to_str(module->kernel->ctx);
  char *lower_name;

  str_p = isl_printer_print_str(str_p, io_group->array->name);

  switch (io_group->group_type)
  {
  case AUTOSA_IO_GROUP:
    /* The group number is only needed to disambiguate several groups. */
    if (io_group->local_array->n_io_group > 1)
    {
      str_p = isl_printer_print_str(str_p, "_");
      str_p = isl_printer_print_int(str_p, io_group->nr);
    }
    break;
  case AUTOSA_DRAIN_GROUP:
    str_p = isl_printer_print_str(str_p, "_drain");
    break;
  default:
    break;
  }

  str_p = isl_printer_print_str(str_p, "_IO_L");
  str_p = isl_printer_print_int(str_p, module->level - 1);
  str_p = isl_printer_print_str(str_p, module->in ? "_in" : "_out");

  lower_name = isl_printer_get_str(str_p);
  isl_printer_free(str_p);

  return lower_name;
}

/* Print, on a line of its own, the generator statement that outputs the
 * prefix of the fifo connecting "module" to the module below it:
 *
 *   [fifo_name]_[lower_module_name]
 *
 * [lower_module_name] is "PE" if the module connects to the PEs
 * (module->to_pe) and the name built by build_io_module_lower_name
 * otherwise.  "module" must not be a PE module.
 */
static __isl_give isl_printer *print_fifo_prefix_lower(
    __isl_take isl_printer *p,
    struct autosa_hw_module *module, struct autosa_array_ref_group *group)
{
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"");
  p = autosa_array_ref_group_print_fifo_name(group, p);
  p = isl_printer_print_str(p, "_");
  assert(module->type != PE_MODULE);

  if (module->to_pe)
  {
    p = isl_printer_print_str(p, "PE");
  }
  else
  {
    char *lower_name = build_io_module_lower_name(module);

    p = isl_printer_print_str(p, lower_name);
    free(lower_name);
  }

  p = isl_printer_print_str(p, "\");");

  return isl_printer_end_line(p);
}

/* Print the lower body of the module call: the fifo to the lower-level
 * module (only if stmt->u.m.lower is set) followed by the generator
 * statements that close the argument list.
 *
 * The fifo argument is always preceded by a comma ("first" starts at 0),
 * since it is appended to the arguments printed by the upper body.
 * Its name is one of
 * - [fifo_prefix]_serialize for a serialize call,
 * - [lower_prefix][_in|_out][pe ids] when the module connects to the PEs,
 * - [lower_prefix][inst ids]_[lower_sched_val] or
 *   [lower_prefix][inst ids (one more)] otherwise.
 *
 * For TAPA_HW the call is closed with ")" on the current line; for every
 * other target the line is ended first and ");" is put on a new line.
 */
static __isl_give isl_printer *print_module_call_lower(__isl_take isl_printer *p,
                                                       struct autosa_kernel_stmt *stmt, struct autosa_prog *prog, enum platform target)
{
  struct autosa_hw_module *module = stmt->u.m.module;
  const int n_ids = isl_id_list_n_id(module->inst_ids);
  const int is_boundary = stmt->u.m.boundary;
  const int is_tapa = (target == TAPA_HW);
  const char *closing[5];
  int n_closing = 0;
  int first = 0;

  if (stmt->u.m.lower)
  {
    struct autosa_array_ref_group *group = module->io_groups[0];

    p = print_delimiter(p, &first);
    p = print_fifo_annotation(p);

    if (stmt->u.m.serialize)
    {
      p = print_fifo_prefix(p, module, group);
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"_serialize\");");
      p = isl_printer_end_line(p);
    }
    else if (module->to_pe)
    {
      /* The lower level is the PE array. */
      p = print_fifo_prefix_lower(p, module, group);

      if (group->io_type == AUTOSA_INT_IO && group->pe_io_dir == IO_INOUT)
      {
        /* Add in/out suffix. */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"");
        p = isl_printer_print_str(p, module->in ? "_in" : "_out");
        p = isl_printer_print_str(p, "\");");
        p = isl_printer_end_line(p);
      }

      /* The direction is only applied for an output module of an
       * IO_INOUT group.
       */
      p = print_pretrans_inst_ids_suffix(p, module->kernel->n_sa_dim,
                                         is_boundary ? group->io_pe_expr_boundary : group->io_pe_expr,
                                         (module->in || group->pe_io_dir != IO_INOUT) ? NULL : group->dir);
    }
    else
    {
      /* The lower level is another I/O module. */
      p = print_fifo_prefix_lower(p, module, group);

      if (stmt->u.m.lower_sched_val == -1)
      {
        p = print_inst_ids_suffix(p, n_ids + 1, NULL);
      }
      else
      {
        p = print_inst_ids_suffix(p, n_ids, NULL);
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"_");
        p = isl_printer_print_int(p, stmt->u.m.lower_sched_val);
        p = isl_printer_print_str(p, "\");");
        p = isl_printer_end_line(p);
      }
    }
  }

  /* Collect the statements that close the call, then print them one per line. */
  if (!is_tapa)
    closing[n_closing++] = "p = isl_printer_end_line(p);";
  closing[n_closing++] = "p = isl_printer_indent(p, -2);";
  if (!is_tapa)
    closing[n_closing++] = "p = isl_printer_start_line(p);";
  closing[n_closing++] = is_tapa ? "p = isl_printer_print_str(p, \")\");"
                                 : "p = isl_printer_print_str(p, \");\");";
  closing[n_closing++] = "p = isl_printer_end_line(p);";

  for (int k = 0; k < n_closing; k++)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, closing[k]);
    p = isl_printer_end_line(p);
  }

  return p;
}

/* Print the module instantiation that goes into the private class fields
 * for Catapult HLS.
 *
 * Nothing is printed for dummy modules.  Otherwise a block is opened and,
 * unless the statement is a "lower"-only call, generator statements are
 * printed that output
 *
 *   [name][_boundary][_serialize] [name][_boundary][_serialize]_inst[_<id>...];
 *
 * where [name] is the name of the module and one "_<id>" is appended for
 * each of the module's instance ids (taken from the variables c0, c1, ...
 * of the generator code).
 */
__isl_give isl_printer *autosa_kernel_print_module_call_inst(
  __isl_take isl_printer *p,
  struct autosa_kernel_stmt *stmt, struct autosa_prog *prog,
  enum platform target)
{
  struct autosa_hw_module *module = stmt->u.m.module;
  const int is_upper = stmt->u.m.upper;
  const int is_lower = stmt->u.m.lower;
  const int is_boundary = stmt->u.m.boundary;
  const int is_serialize = stmt->u.m.serialize;

  if (stmt->u.m.dummy)
    return p;

  p = ppcg_start_block(p);

  /* Either a complete call (neither upper nor lower) or the upper part. */
  if (is_upper || !is_lower) {
    p = print_str_new_line(p, "p = isl_printer_start_line(p);");

    /* Pass 0 prints the type name, pass 1 the instance name. */
    for (int pass = 0; pass < 2; pass++) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"");
      p = isl_printer_print_str(p, module->name);
      if (is_boundary)
        p = isl_printer_print_str(p, "_boundary");
      if (is_serialize)
        p = isl_printer_print_str(p, "_serialize");
      if (pass == 1)
        p = isl_printer_print_str(p, "_inst");
      p = isl_printer_print_str(p, "\");");
      p = isl_printer_end_line(p);

      if (pass == 0)
        p = print_str_new_line(p, "p = isl_printer_print_str(p, \" \");");
    }

    /* Print the module ids if any */
    for (int id = 0; id < isl_id_list_n_id(module->inst_ids); id++) {
      p = print_str_new_line(p, "p = isl_printer_print_str(p, \"_\");");
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "p = isl_printer_print_int(p, c");
      p = isl_printer_print_int(p, id);
      p = isl_printer_print_str(p, ");");
      p = isl_printer_end_line(p);
    }

    p = print_str_new_line(p, "p = isl_printer_print_str(p, \";\");");
    p = print_str_new_line(p, "p = isl_printer_end_line(p);");
  }

  return ppcg_end_block(p);
}

/* Print out the module calls:
 * - module_call_upper
 * - module_call_lower
 *
 * A statement with neither "upper" nor "lower" set is a complete call:
 * both parts are printed.  A statement with "upper" set prints only the
 * upper part, any other statement prints only the lower part.
 *
 * Whenever the upper part is printed, it is preceded by
 * - the module counters of the generator code
 *   ("<name>[_boundary][_serialize]_cnt++;" and, for modules that are both
 *   filter and buffer, the intra/inter transfer counters), where
 *   "_serialize" is never appended for a complete call and the transfer
 *   counters are skipped for an upper-only serialize call;
 * - generator statements printing an opening Module Call marker line.
 * Whenever the lower part is printed, it is followed by generator
 * statements printing a closing Module Call marker line and an extra
 * line end.
 */
__isl_give isl_printer *autosa_kernel_print_module_call(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct autosa_prog *prog,
    enum platform target)
{
  /* Marker line statements; the opening marker uses the first three,
   * the closing marker all four.
   */
  static const char *const marker[] = {
    "p = isl_printer_start_line(p);",
    "p = isl_printer_print_str(p, \"/* Module Call */\");",
    "p = isl_printer_end_line(p);",
    "p = isl_printer_end_line(p);",
  };
  struct autosa_hw_module *module = stmt->u.m.module;
  const char *module_name = stmt->u.m.module_name;
  const int is_upper = stmt->u.m.upper;
  const int is_lower = stmt->u.m.lower;
  const int is_boundary = stmt->u.m.boundary;
  const int is_complete = !is_upper && !is_lower;
  const int do_upper = is_complete || is_upper;
  const int do_lower = is_complete || !is_upper;
  /* The serialize flag only affects the counters of an upper-only call. */
  const int serialize_cnt = !is_complete && stmt->u.m.serialize;

  p = ppcg_start_block(p);

  if (do_upper)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "// Count module number");
    p = isl_printer_end_line(p);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, module_name);
    if (is_boundary)
      p = isl_printer_print_str(p, "_boundary");
    if (serialize_cnt)
      p = isl_printer_print_str(p, "_serialize");
    p = isl_printer_print_str(p, "_cnt++;");
    p = isl_printer_end_line(p);

    if (module->is_filter && module->is_buffer && !serialize_cnt)
    {
      /* Print counter for inter_trans and intra_trans module. */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, module_name);
      p = isl_printer_print_str(p, "_intra_trans_cnt++;");
      p = isl_printer_end_line(p);

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, module_name);
      p = isl_printer_print_str(p, is_boundary ? "_inter_trans_boundary_cnt++;"
                                               : "_inter_trans_cnt++;");
      p = isl_printer_end_line(p);
    }

    /* Opening marker. */
    for (int k = 0; k < 3; k++)
    {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, marker[k]);
      p = isl_printer_end_line(p);
    }

    p = print_module_call_upper(p, stmt, prog, target);
  }

  if (do_lower)
  {
    p = print_module_call_lower(p, stmt, prog, target);

    /* Closing marker followed by an extra line end. */
    for (int k = 0; k < 4; k++)
    {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, marker[k]);
      p = isl_printer_end_line(p);
    }
  }

  return ppcg_end_block(p);
}

/* Print a fifo access for the Xilinx target.
 *
 * If "read" is set, print
 *   "[fifo_name].read()"
 * otherwise, print the opening of the write call
 *   "[fifo_name].write("
 * and leave it to the caller to print the value and the closing ")".
 */
__isl_give isl_printer *print_fifo_rw_xilinx(__isl_take isl_printer *p,
                                             const char *fifo_name, int read)
{
  const char *access = read ? ".read()" : ".write(";

  p = isl_printer_print_str(p, fifo_name);
  return isl_printer_print_str(p, access);
}

/* Print a fifo access for the Catapult target.
 *
 * If "read" is set, print
 *   "[fifo_name].read()"
 * otherwise, print the opening of the write call
 *   "[fifo_name].write("
 */
__isl_give isl_printer *print_fifo_rw_catapult(
  __isl_take isl_printer *p, const char *fifo_name, int read)
{
  const char *access = read ? ".read()" : ".write(";

  p = isl_printer_print_str(p, fifo_name);
  return isl_printer_print_str(p, access);
}

/* Print a fifo (channel) access for the Intel target.
 *
 * If "read" is set, print
 *   "read_channel_intel([fifo_name])"
 * otherwise, print the opening of the write call
 *   "write_channel_intel([fifo_name], "
 * and leave it to the caller to print the value and the closing ")".
 */
__isl_give isl_printer *print_fifo_rw_intel(__isl_take isl_printer *p,
                                            const char *fifo_name, int read)
{
  const char *call_open = read ? "read_channel_intel(" : "write_channel_intel(";
  const char *call_tail = read ? ")" : ", ";

  p = isl_printer_print_str(p, call_open);
  p = isl_printer_print_str(p, fifo_name);
  return isl_printer_print_str(p, call_tail);
}

/* Print a fifo access for the TAPA target.
 *
 * If "read" is set, print
 *   "[fifo_name].read()"
 * otherwise, print the opening of the write call
 *   "[fifo_name].write("
 */
__isl_give isl_printer *print_fifo_rw_tapa(
  __isl_take isl_printer *p, const char *fifo_name, int read)
{
  const char *access = read ? ".read()" : ".write(";

  p = isl_printer_print_str(p, fifo_name);
  return isl_printer_print_str(p, access);
}

/* Print an I/O statement.
 *
 * An in I/O statement is printed as
 *
 *  local[] = fifo.read(); 
 *
 * while an out I/O statement is printed as
 *
 *  fifo.write(local);
 */
__isl_give isl_printer *autosa_kernel_print_io(__isl_take isl_printer *p,
                                               struct autosa_kernel_stmt *stmt, struct hls_info *hls)
{
  struct autosa_hw_module *module = stmt->u.i.module;
  struct autosa_array_ref_group *group = stmt->u.i.group;
  struct autosa_kernel *kernel = module->kernel;
  char *fifo_name;
  isl_ctx *ctx = isl_printer_get_ctx(p);
  int is_dummy = stmt->u.i.dummy;
  fifo_name = concat(ctx, stmt->u.i.in_fifo_name, stmt->u.i.in == 1 ? "in" : "out");
  int data_pack = stmt->u.i.data_pack;  

  /* Extract the sparse data. */
  int is_sparse = group->local_array->is_sparse;
  int vec_len = stmt->u.i.local_array->vec_len;
  int n_nzero = stmt->u.i.local_array->n_nzero;
  float compress_ratio = stmt->u.i.local_array->compress_ratio;
  int n_meta_data = stmt->u.i.local_array->n_meta_data;
  float eff_compress_ratio = stmt->u.i.local_array->eff_compress_ratio;

  if (is_dummy)  
  {
    if (stmt->u.i.in) {
      /* [type] fifo_data; */
      p = isl_printer_start_line(p);
      if (is_sparse) {
        p = isl_printer_print_str(p, group->array->name);
        p = isl_printer_print_str(p, "_s_t");
        p = isl_printer_print_int(p, data_pack);
      } else {        
        p = isl_printer_print_str(p, group->array->name);
        p = isl_printer_print_str(p, "_t");
        p = isl_printer_print_int(p, data_pack);        
      }
      p = isl_printer_print_str(p, " fifo_data;");
      p = isl_printer_end_line(p);

      /* fifo_data = fifo.read(); */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "fifo_data = ");
      if (hls->target == XILINX_HW)
        p = print_fifo_rw_xilinx(p, fifo_name, 1);
      else if (hls->target == TAPA_HW)
        p = print_fifo_rw_tapa(p, fifo_name, 1);
      else if (hls->target == INTEL_HW)
        p = print_fifo_rw_intel(p, fifo_name, 1);
      else if (hls->target == CATAPULT_HW)  
        p = print_fifo_rw_catapult(p, fifo_name, 1);
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);

      free(fifo_name);
      return p;
    } else {
      /* Send zeros by default, might be buggy. */      
      /* [type] fifo_data = 0; */
      p = isl_printer_start_line(p);      
      p = isl_printer_print_str(p, group->array->name);
      p = isl_printer_print_str(p, "_t");
      p = isl_printer_print_int(p, data_pack);      
      p = isl_printer_print_str(p, " fifo_data = 0;");
      p = isl_printer_end_line(p);
      
      /* fifo.write(fifo_data); */
      p = isl_printer_start_line(p);
      if (hls->target == XILINX_HW)
        p = print_fifo_rw_xilinx(p, fifo_name, 0);
      else if (hls->target == TAPA_HW)
        p = print_fifo_rw_tapa(p, fifo_name, 0);
      else if (hls->target == INTEL_HW)
        p = print_fifo_rw_intel(p, fifo_name, 0);
      else if (hls->target == CATAPULT_HW)
        p = print_fifo_rw_catapult(p, fifo_name, 0);
      p = isl_printer_print_str(p, "fifo_data);");
      p = isl_printer_end_line(p);

      free(fifo_name);
      return p;      
    }
  }

  int nxt_data_pack = stmt->u.i.nxt_data_pack;
  isl_ast_expr *local_index_packed;
  isl_ast_expr *arg, *div;
  int n_arg;
  local_index_packed = isl_ast_expr_copy(stmt->u.i.local_index);
  /* Modify the local index. */
  if (data_pack > 1)
  {
    n_arg = isl_ast_expr_get_op_n_arg(local_index_packed);
    arg = isl_ast_expr_get_op_arg(local_index_packed, n_arg - 1);
    div = isl_ast_expr_from_val(isl_val_int_from_si(ctx, data_pack));
    arg = isl_ast_expr_div(arg, div);
    local_index_packed = isl_ast_expr_set_op_arg(local_index_packed, n_arg - 1, arg);
  }

  if (data_pack == nxt_data_pack && !group->local_array->is_sparse)
  {
    // TODO: modify the sparse

    /* local[] = fifo.read() */
    p = isl_printer_start_line(p);
    if (stmt->u.i.in)
    {
      p = isl_printer_print_ast_expr(p, local_index_packed);
      p = isl_printer_print_str(p, " = ");
      if (hls->target == XILINX_HW)
        p = print_fifo_rw_xilinx(p, fifo_name, 1);
      else if (hls->target == TAPA_HW)
        p = print_fifo_rw_tapa(p, fifo_name, 1);
      else if (hls->target == INTEL_HW)
        p = print_fifo_rw_intel(p, fifo_name, 1);
      else if (hls->target == CATAPULT_HW)
        p = print_fifo_rw_catapult(p, fifo_name, 1);
    }
    else
    {
      /* fifo.write(local[]) */
      if (hls->target == XILINX_HW)
        p = print_fifo_rw_xilinx(p, fifo_name, 0);
      else if (hls->target == TAPA_HW)
        p = print_fifo_rw_tapa(p, fifo_name, 0);
      else if (hls->target == INTEL_HW)
        p = print_fifo_rw_intel(p, fifo_name, 0);
      else if (hls->target == CATAPULT_HW)  
        p = print_fifo_rw_catapult(p, fifo_name, 0);
      p = isl_printer_print_ast_expr(p, local_index_packed);
      p = isl_printer_print_str(p, ")");
    }
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
  } 
  else
  {
    p = ppcg_start_block(p);
    if (!kernel->sparse) {
      /* [type] fifo_data; */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, group->array->name);    
      p = isl_printer_print_str(p, "_t");
      p = isl_printer_print_int(p, data_pack);
      p = isl_printer_print_str(p, " fifo_data;");
      p = isl_printer_end_line(p);
    }    

    if (kernel->sparse && is_sparse == 0 && stmt->u.i.in) {
      /* [type] X_tmp[1][n_lane]; */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, group->array->type);      
      p = isl_printer_print_str(p, " ");
      p = isl_printer_print_str(p, group->array->name);
      p = isl_printer_print_str(p, "_tmp[1][");
      p = isl_printer_print_int(p, group->n_lane);
      p = isl_printer_print_str(p, "];");
      p = isl_printer_end_line(p);

      if (hls->target == XILINX_HW || hls->target == TAPA_HW) {
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "#pragma HLS ARRAY_PARTITION variable=");
        p = isl_printer_print_str(p, group->array->name);
        p = isl_printer_print_str(p, "_tmp dim=0 complete");
        p = isl_printer_end_line(p);
      }
    }

    if (stmt->u.i.in)
    {
      /* fifo_data = fifo.read(); */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "fifo_data");
      if (kernel->sparse) {
        p = isl_printer_print_str(p, "_");
        p = isl_printer_print_str(p, group->array->name);    
      }
      p = isl_printer_print_str(p, " = ");
      if (hls->target == XILINX_HW)
        p = print_fifo_rw_xilinx(p, fifo_name, 1);
      else if (hls->target == TAPA_HW)
        p = print_fifo_rw_tapa(p, fifo_name, 1);
      else if (hls->target == INTEL_HW)
        p = print_fifo_rw_intel(p, fifo_name, 1);
      else if (hls->target == CATAPULT_HW)
        p = print_fifo_rw_catapult(p, fifo_name, 1);
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);
      if (kernel->sparse) {        
        /* [type] fifo_data = fifo_data_X; */
        if (is_sparse) {
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, group->array->name);
          p = isl_printer_print_str(p, "_s_t");
          p = isl_printer_print_int(p, group->n_lane);
          p = isl_printer_print_str(p, " fifo_data = fifo_data_");
          p = isl_printer_print_str(p, group->array->name);
          p = isl_printer_print_str(p, ";");
          p = isl_printer_end_line(p);          
        } else {
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, group->array->name);
          p = isl_printer_print_str(p, "_t");
          p = isl_printer_print_int(p, group->n_lane);
          p = isl_printer_print_str(p, " fifo_data = fifo_data_");
          p = isl_printer_print_str(p, group->array->name);
          p = isl_printer_print_str(p, ";");
          p = isl_printer_end_line(p);          
        }
      }

      if (hls->target == XILINX_HW)
      {
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int n = 0; n < ");
        if (is_sparse)
          p = isl_printer_print_int(p, group->n_lane * n_nzero);  
        else
          p = isl_printer_print_int(p, data_pack / nxt_data_pack);
        p = isl_printer_print_str(p, "; n++) {");
        p = isl_printer_end_line(p);
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "#pragma HLS UNROLL");
        p = isl_printer_end_line(p);
        p = isl_printer_indent(p, 2);
        isl_ast_expr *op;
        isl_ast_expr *expr = stmt->u.i.local_index;
        int n_arg = isl_ast_expr_op_get_n_arg(expr);
        /* Union */
        if (nxt_data_pack == 1)
        {
          /* union {unsigned int ui; float ut;} u; */
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "union {unsigned int ui; ");
          p = isl_printer_print_str(p, group->array->type);
          p = isl_printer_print_str(p, " ut;} u;");
          p = isl_printer_end_line(p);
          /* u.ui = (unsigned int)fifo_data(32*nxt_data_pack - 1, 0); */
          p = isl_printer_start_line(p);
          if (kernel->sparse) {
            if (is_sparse) 
              p = isl_printer_print_str(p, "u.ui = (unsigned int)fifo_data.d(");
            else
              p = isl_printer_print_str(p, "u.ui = (unsigned int)fifo_data(");
          } else
            p = isl_printer_print_str(p, "u.ui = (unsigned int)fifo_data(");
          p = isl_printer_print_int(p, group->array->size * 8 * nxt_data_pack - 1);
          p = isl_printer_print_str(p, ", 0);");
          p = isl_printer_end_line(p);
        }
        /* local[][n] = u.ut; or 
         * local[][n] = fifo_data(32*nxt_data_pack - 1, 0);
         */
        p = isl_printer_start_line(p);
        op = isl_ast_expr_op_get_arg(expr, 0);        
        if (kernel->sparse && group->local_array->is_sparse == 0 && group->local_array->array_type == AUTOSA_EXT_ARRAY) {
          p = isl_printer_print_str(p, group->array->name);
          p = isl_printer_print_str(p, "_tmp");
        } else {
          p = isl_printer_print_ast_expr(p, op); // array_name
        }

        isl_ast_expr_free(op);
        for (int i = 0; i < n_arg - 1; i++)
        {
          op = isl_ast_expr_op_get_arg(expr, 1 + i);
          p = isl_printer_print_str(p, "[");
          if (i == n_arg - 2)
          {
            if (stmt->u.i.simd_depth != -1) {
              //DBGASTEXPR(stdout, op, ctx);
              p = isl_printer_print_ast_expr(p, op);
              p = isl_printer_print_str(p, " + n");
            } else {
              p = isl_printer_print_str(p, "n");
            }
          }
          else
          {
            p = isl_printer_print_ast_expr(p, op);
          }
          p = isl_printer_print_str(p, "]");
          isl_ast_expr_free(op);
        }
        p = isl_printer_print_str(p, " = ");
        if (nxt_data_pack == 1)
        {
          p = isl_printer_print_str(p, "u.ut;");
          p = isl_printer_end_line(p);
        }
        else
        {
          p = isl_printer_print_str(p, "fifo_data(");
          p = isl_printer_print_int(p, group->array->size * 8 * nxt_data_pack - 1);
          p = isl_printer_print_str(p, ", 0)");
          p = isl_printer_print_str(p, ";");
          p = isl_printer_end_line(p);
        }
        /* fifo_data = fifo_data >> 32*nxt_data_pack; */
        p = isl_printer_start_line(p);
        if (is_sparse)
          p = isl_printer_print_str(p, "fifo_data.d = fifo_data.d >> ");
        else
          p = isl_printer_print_str(p, "fifo_data = fifo_data >> ");            
        p = isl_printer_print_int(p, group->array->size * 8 * nxt_data_pack);
        p = isl_printer_print_str(p, ";");
        p = isl_printer_end_line(p);
        p = isl_printer_indent(p, -2);
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "}");
        p = isl_printer_end_line(p);
      }
      else if (hls->target == INTEL_HW)
      {
        isl_ast_expr *op;
        isl_ast_expr *expr = stmt->u.i.local_index;
        int n_arg = isl_ast_expr_op_get_n_arg(expr);
        for (int i = 0; i < data_pack / nxt_data_pack; i++)
        {
          /* local[][n] = fifo_data.sxxxx; */
          p = isl_printer_start_line(p);
          op = isl_ast_expr_op_get_arg(expr, 0);
          p = isl_printer_print_ast_expr(p, op); // array_name
          isl_ast_expr_free(op);
          for (int j = 0; j < n_arg - 1; j++)
          {
            op = isl_ast_expr_op_get_arg(expr, 1 + j);
            p = isl_printer_print_str(p, "[");
            if (j == n_arg - 2)
            {
              p = isl_printer_print_int(p, i);
            }
            else
            {
              p = isl_printer_print_ast_expr(p, op);
            }
            p = isl_printer_print_str(p, "]");
            isl_ast_expr_free(op);
          }
          if (nxt_data_pack > 1)
            p = isl_printer_print_str(p, ".data");
          p = isl_printer_print_str(p, " = fifo_data.data.s");
          for (int j = 0; j < nxt_data_pack; j++)
          {
            p = isl_printer_print_str(p, vector_index[j + i * nxt_data_pack]);
          }
          p = isl_printer_print_str(p, ";");
          p = isl_printer_end_line(p);
        }
      } else if (hls->target == CATAPULT_HW) {
        p = print_str_new_line(p, "#pragma unroll yes");
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int n = 0; n < ");
        if (is_sparse)
          p = isl_printer_print_int(p, group->n_lane * n_nzero);  
        else
          p = isl_printer_print_int(p, data_pack / nxt_data_pack);
        p = isl_printer_print_str(p, "; n++) {");
        p = isl_printer_end_line(p);
        p = isl_printer_indent(p, 2);
        isl_ast_expr *op;
        isl_ast_expr *expr = stmt->u.i.local_index;
        int n_arg = isl_ast_expr_op_get_n_arg(expr);
        /* local[][n] = fifo_data.slc(); */
        p = isl_printer_start_line(p);
        op = isl_ast_expr_op_get_arg(expr, 0);        
        if (kernel->sparse && group->local_array->is_sparse == 0 && group->local_array->array_type == AUTOSA_EXT_ARRAY) {
          p = isl_printer_print_str(p, group->array->name);
          p = isl_printer_print_str(p, "_tmp");
        } else {
          p = isl_printer_print_ast_expr(p, op); // array_name
        }
        isl_ast_expr_free(op);
        for (int i = 0; i < n_arg - 1; i++) {
          op = isl_ast_expr_op_get_arg(expr, 1 + i);
          p = isl_printer_print_str(p, "[");
          if (i == n_arg - 2)
          {
            if (stmt->u.i.simd_depth != -1) {
              //DBGASTEXPR(stdout, op, ctx);
              p = isl_printer_print_ast_expr(p, op);
              p = isl_printer_print_str(p, " + n");
            } else {
              p = isl_printer_print_str(p, "n");
            }
          }
          else
          {
            p = isl_printer_print_ast_expr(p, op);
          }
          p = isl_printer_print_str(p, "]");
          isl_ast_expr_free(op);
        }
        p = isl_printer_print_str(p, " = (");
        p = isl_printer_print_str(p, group->array->name);
        p = isl_printer_print_str(p, "_t");
        p = isl_printer_print_int(p, nxt_data_pack);
        p = isl_printer_print_str(p, ")fifo_data.slc<");
        p = isl_printer_print_int(p, group->array->size * 8 * nxt_data_pack);
        p = isl_printer_print_str(p, ">(0);");        
        p = isl_printer_end_line(p);

        /* fifo_data = fifo_data >> xx * nxt_data_pack; */
        p = isl_printer_start_line(p);
        if (is_sparse)
          p = isl_printer_print_str(p, "fifo_data.d = fifo_data.d >> ");
        else
          p = isl_printer_print_str(p, "fifo_data = fifo_data >> ");      
        p = isl_printer_print_int(p, group->array->size * 8 * nxt_data_pack);
        p = isl_printer_print_str(p, ";");
        p = isl_printer_end_line(p);
        p = isl_printer_indent(p, -2);
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "}");
        p = isl_printer_end_line(p);  
      }
      else if (hls->target == TAPA_HW)
      {
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int n = 0; n < ");
        if (is_sparse)
          p = isl_printer_print_int(p, group->n_lane * n_nzero);
        else
          p = isl_printer_print_int(p, data_pack / nxt_data_pack);
        p = isl_printer_print_str(p, "; n++) {");
        p = isl_printer_end_line(p);
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "#pragma HLS UNROLL");
        p = isl_printer_end_line(p);
        p = isl_printer_indent(p, 2);
        isl_ast_expr *op;
        isl_ast_expr *expr = stmt->u.i.local_index;
        int n_arg = isl_ast_expr_op_get_n_arg(expr);
        /* local[][n] = fifo_data[n]; */
        p = isl_printer_start_line(p);
        op = isl_ast_expr_op_get_arg(expr, 0);
        if (kernel->sparse && group->local_array->is_sparse == 0 && group->local_array->array_type == AUTOSA_EXT_ARRAY) {
          p = isl_printer_print_str(p, group->array->name);
          p = isl_printer_print_str(p, "_tmp");
        } else {
          p = isl_printer_print_ast_expr(p, op); // array_name
        }

        isl_ast_expr_free(op);
        for (int i = 0; i < n_arg - 1; i++) {
          op = isl_ast_expr_op_get_arg(expr, 1 + i);
          p = isl_printer_print_str(p, "[");
          if (i == n_arg - 2) {
            if (stmt->u.i.simd_depth != -1) {
              //DBGASTEXPR(stdout, op, ctx);
              p = isl_printer_print_ast_expr(p, op);
              p = isl_printer_print_str(p, " + n");
            } else {
              p = isl_printer_print_str(p, "n");
            }
          } else {
            p = isl_printer_print_ast_expr(p, op);
          }
          p = isl_printer_print_str(p, "]");
          isl_ast_expr_free(op);
        }
        p = isl_printer_print_str(p, " = ");
        if (nxt_data_pack == 1)
          p = isl_printer_print_str(p, "fifo_data[n];");
        else {
          p = isl_printer_print_str(p, "tapa::truncated<");
          p = isl_printer_print_int(p, nxt_data_pack);
          p = isl_printer_print_str(p, ">(fifo_data, ");
          p = isl_printer_print_int(p, nxt_data_pack);
          p = isl_printer_print_str(p, "* n)");
        }
        p = isl_printer_end_line(p);
        p = isl_printer_indent(p, -2);
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "}");
        p = isl_printer_end_line(p);
      }

      if (kernel->sparse && group->local_array->is_sparse == 0) {
        /* Print the extra data selection code. */        
        int index_s, index_w;
        int pos_w;

        p = isl_printer_start_line(p);
        index_w = (int)log2f((float)group->n_lane);
        if (hls->target == XILINX_HW || hls->target == TAPA_HW) {
          p = isl_printer_print_str(p, "ap_uint<");
          p = isl_printer_print_int(p, index_w);
        } else if (hls->target == CATAPULT_HW) {
          p = isl_printer_print_str(p, "ac_int<");
          p = isl_printer_print_int(p, index_w);
          p = isl_printer_print_str(p, ", false");
        }
        p = isl_printer_print_str(p, "> index[");
        index_s = group->n_lane / kernel->vec_len * kernel->n_nzero;
        p = isl_printer_print_int(p, index_s);
        p = isl_printer_print_str(p, "];");
        p = isl_printer_end_line(p);

        if (hls->target == XILINX_HW || hls->target == TAPA_HW) {
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "#pragma HLS ARRAY_PARTITION variable=index dim=0 complete");
          p = isl_printer_end_line(p);
        }

        //p = print_str_new_line(p, "unsigned char index = 0;");
        
        p = isl_printer_start_line(p);
        struct autosa_local_array_info *sparse_array;
        for (int i = 0; i < kernel->n_array; i++) {
          sparse_array = &kernel->array[i];
          if (sparse_array->is_sparse)
            break;
        }
        p = isl_printer_print_str(p, sparse_array->array->name);
        p = isl_printer_print_str(p, "_s_t");
        p = isl_printer_print_int(p, group->n_lane / kernel->vec_len);
        p = isl_printer_print_str(p, " ");        
        p = isl_printer_print_str(p, "s_tmp = fifo_data_");
        p = isl_printer_print_str(p, sparse_array->array->name);
        p = isl_printer_print_str(p, ";");
        p = isl_printer_end_line(p);

        pos_w = (int)log2f((float)index_s);
        p = isl_printer_start_line(p);
        if (hls->target == XILINX_HW || hls->target == TAPA_HW) {
          p = isl_printer_print_str(p, "ap_uint<");
          p = isl_printer_print_int(p, pos_w);
        } else if (hls->target == CATAPULT_HW) {
          p = isl_printer_print_str(p, "ac_int<");
          p = isl_printer_print_int(p, pos_w);          
          p = isl_printer_print_str(p, ", false");
        }
        p = isl_printer_print_str(p, "> pos = 0;");
        p = isl_printer_end_line(p);

        if (hls->target == CATAPULT_HW) {
          p = print_str_new_line(p, "#pragma unroll yes");
        }

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int n = 0; n < ");
        p = isl_printer_print_int(p, group->n_lane / kernel->vec_len);
        p = isl_printer_print_str(p, "; n++) {");
        p = isl_printer_end_line(p);

        if (hls->target == XILINX_HW || hls->target == TAPA_HW) {
          p = print_str_new_line(p, "#pragma HLS UNROLL");
        }

        p = isl_printer_indent(p, 2);        
        p = print_str_new_line(p, "unsigned char offset = s_tmp.i(7, 0);");
        p = print_str_new_line(p, "s_tmp.i = s_tmp.i >> 8;");
        
        if (hls->target == CATAPULT_HW) {
          p = print_str_new_line(p, "#pragma unroll yes");
        }

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int m = 0; m < ");        
        p = isl_printer_print_int(p, kernel->vec_len);
        p = isl_printer_print_str(p, "; m++) {");
        p = isl_printer_end_line(p);
        
        if (hls->target == XILINX_HW || hls->target == TAPA_HW) {
          p = print_str_new_line(p, "#pragma HLS UNROLL");
        }
        
        p = isl_printer_indent(p, 2);
        if (hls->target == XILINX_HW || hls->target == TAPA_HW) {
          p = print_str_new_line(p, "if ((ap_uint<1>)(offset & 1) == (ap_uint<1>)1) {");
        } else if (hls->target == CATAPULT_HW) {
          p = print_str_new_line(p, "if ((ac_int<1, false>)(offset & 1) == (ac_int<1, false>)1) {");
        }
        p = isl_printer_indent(p, 2);
        
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "index[pos] = n * ");
        p = isl_printer_print_int(p, kernel->vec_len);
        p = isl_printer_print_str(p, " + m;");        
        p = isl_printer_end_line(p);

        p = print_str_new_line(p, "pos++;");

        p = isl_printer_indent(p, -2);
        p = print_str_new_line(p, "}");
        p = print_str_new_line(p, "offset = offset >> 1;");
        p = isl_printer_indent(p, -2);
        p = print_str_new_line(p, "}");
        p = isl_printer_indent(p, -2);
        p = print_str_new_line(p, "}");

        if (hls->target == CATAPULT_HW) {
          p = print_str_new_line(p, "#pragma unroll yes");
        }

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int n = 0; n < ");
        p = isl_printer_print_int(p, group->n_lane / kernel->vec_len * kernel->n_nzero);
        p = isl_printer_print_str(p, "; n++) {");
        p = isl_printer_end_line(p);

        if (hls->target == XILINX_HW || hls->target == TAPA_HW) {
          p = print_str_new_line(p, "#pragma HLS UNROLL");
        }

        p = isl_printer_indent(p, 2);
        p = isl_printer_start_line(p);
        isl_ast_expr *op;
        isl_ast_expr *expr = stmt->u.i.local_index;
        int n_arg = isl_ast_expr_op_get_n_arg(expr);
        op = isl_ast_expr_op_get_arg(expr, 0);
        p = isl_printer_print_ast_expr(p, op); // array_name;
        isl_ast_expr_free(op);
        for (int i = 0; i < n_arg - 1; i++) {
          op = isl_ast_expr_op_get_arg(expr, 1 + i);
          p = isl_printer_print_str(p, "[");
          if (i == n_arg - 2) {
            if (stmt->u.i.simd_depth != -1) {
              p = isl_printer_print_ast_expr(p, op);
              p = isl_printer_print_str(p, " + n");
            } else {
              p = isl_printer_print_str(p, "n");
            }
          } else {
            p = isl_printer_print_ast_expr(p, op);
          }
          p = isl_printer_print_str(p, "]");
          isl_ast_expr_free(op);
        }
        p = isl_printer_print_str(p, " = ");
        p = isl_printer_print_str(p, group->array->name);
        p = isl_printer_print_str(p, "_tmp[0][index[n]];");
        p = isl_printer_end_line(p);
        
        p = isl_printer_indent(p, -2);
        p = print_str_new_line(p, "}");
      }
    }
    else
    {
      if (hls->target == XILINX_HW)
      {
        if (kernel->sparse) {
          p = isl_printer_start_line(p);
          p = print_fifo_rw_xilinx(p, fifo_name, 0);
          p = isl_printer_print_str(p, "fifo_data_");
          p = isl_printer_print_str(p, group->array->name);
          p = isl_printer_print_str(p, ");");
          p = isl_printer_end_line(p);
        } else {
          if (nxt_data_pack == 1)
          {
            /* union {unsigned int ui; float ut;} u1, u0; */
            p = isl_printer_start_line(p);
            p = isl_printer_print_str(p, "union {unsigned int ui; ");
            p = isl_printer_print_str(p, group->array->type);
            p = isl_printer_print_str(p, " ut;} ");
            int first = 1;
            for (int i = data_pack / nxt_data_pack - 1; i >= 0; i--)
            {
              if (!first)
                p = isl_printer_print_str(p, ", ");
              p = isl_printer_print_str(p, "u");
              p = isl_printer_print_int(p, i);
              first = 0;
            }
            p = isl_printer_print_str(p, ";");
            p = isl_printer_end_line(p);
            /* u1 = local[][1];
             * u0 = local[][0];
             */
            for (int i = data_pack / nxt_data_pack - 1; i >= 0; i--)
            {
              isl_ast_expr *expr = stmt->u.i.local_index;
              isl_ast_expr *op;
              int n_arg = isl_ast_expr_op_get_n_arg(expr);
              p = isl_printer_start_line(p);
              p = isl_printer_print_str(p, "u");
              p = isl_printer_print_int(p, i);
              p = isl_printer_print_str(p, ".ut = ");
              op = isl_ast_expr_op_get_arg(expr, 0);
              p = isl_printer_print_ast_expr(p, op);
              isl_ast_expr_free(op);
              for (int j = 0; j < n_arg - 1; j++)
              {
                op = isl_ast_expr_op_get_arg(expr, 1 + j);
                p = isl_printer_print_str(p, "[");
                if (j == n_arg - 2)
                {
                  if (stmt->u.i.simd_depth != -1) {
                    p = isl_printer_print_ast_expr(p, op);
                    p = isl_printer_print_str(p, " + ");
                  }
                  p = isl_printer_print_int(p, i);
                }
                else
                {
                  p = isl_printer_print_ast_expr(p, op);
                }
                p = isl_printer_print_str(p, "]");
                isl_ast_expr_free(op);
              }
              p = isl_printer_print_str(p, ";");
              p = isl_printer_end_line(p);
            }
          }
          /* fifo_data = (ap_uint<32*nxt_data_pack>(u1.ui), 
           *              ap_uint<32*nxt_data_pack>(u0.ui)); */
          int first = 1;
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "fifo_data = (");
          for (int i = data_pack / nxt_data_pack - 1; i >= 0; i--)
          {
            isl_ast_expr *expr = stmt->u.i.local_index;
            isl_ast_expr *op;
            int n_arg = isl_ast_expr_op_get_n_arg(expr);
            if (!first)
              p = isl_printer_print_str(p, ", ");
            if (nxt_data_pack == 1)
            {
              p = isl_printer_print_str(p, "ap_uint<");
              p = isl_printer_print_int(p, group->array->size * 8 * nxt_data_pack);
              p = isl_printer_print_str(p, ">(u");
              p = isl_printer_print_int(p, i);
              p = isl_printer_print_str(p, ".ui)");
            }
            else
            {
              op = isl_ast_expr_op_get_arg(expr, 0);
              p = isl_printer_print_ast_expr(p, op);
              isl_ast_expr_free(op);
              for (int j = 0; j < n_arg - 1; j++)
              {
                op = isl_ast_expr_op_get_arg(expr, 1 + j);
                p = isl_printer_print_str(p, "[");
                if (j == n_arg - 2)
                {
                  p = isl_printer_print_int(p, i);
                }
                else
                {
                  p = isl_printer_print_ast_expr(p, op);
                }
                p = isl_printer_print_str(p, "]");
                isl_ast_expr_free(op);
              }
            }
            first = 0;
          }
          p = isl_printer_print_str(p, ");");
          p = isl_printer_end_line(p);
          p = isl_printer_start_line(p);
          p = print_fifo_rw_xilinx(p, fifo_name, 0);
          p = isl_printer_print_str(p, "fifo_data);");
          p = isl_printer_end_line(p);
        }
      }
      else if (hls->target == INTEL_HW)
      {
        /* fifo_data.data = (float4)((float2)local[][0], (float2)local[][1]); */
        int first = 1;
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "fifo_data.data = (");
        if (data_pack == 1)
        {
          p = isl_printer_print_str(p, group->array->type);
        }
        else
        {
          //p = isl_printer_print_str(p, group->array->name);
          //p = isl_printer_print_str(p, "_t");
          //p = isl_printer_print_int(p, data_pack);
          p = isl_printer_print_str(p, group->array->type);
          p = isl_printer_print_int(p, data_pack);
        }
        p = isl_printer_print_str(p, ")(");
        //for (int i = data_pack / nxt_data_pack - 1; i >= 0; i--)
        for (int i = 0; i < data_pack / nxt_data_pack; i++)
        {
          isl_ast_expr *expr = stmt->u.i.local_index;
          isl_ast_expr *op;
          int n_arg = isl_ast_expr_op_get_n_arg(expr);
          if (!first)
            p = isl_printer_print_str(p, ", ");
          p = isl_printer_print_str(p, "(");
          if (nxt_data_pack == 1)
          {
            p = isl_printer_print_str(p, group->array->type);
          }
          else
          {
            p = isl_printer_print_str(p, group->array->name);
            p = isl_printer_print_str(p, "_t");
            p = isl_printer_print_int(p, nxt_data_pack);
          }
          p = isl_printer_print_str(p, ")");
          op = isl_ast_expr_op_get_arg(expr, 0);
          p = isl_printer_print_ast_expr(p, op);
          isl_ast_expr_free(op);
          for (int j = 0; j < n_arg - 1; j++)
          {
            op = isl_ast_expr_op_get_arg(expr, 1 + j);
            p = isl_printer_print_str(p, "[");
            if (j == n_arg - 2)
            {
              p = isl_printer_print_int(p, i);
            }
            else
            {
              p = isl_printer_print_ast_expr(p, op);
            }
            p = isl_printer_print_str(p, "]");
            isl_ast_expr_free(op);
            if (nxt_data_pack > 1)
              p = isl_printer_print_str(p, ".data");
          }
          first = 0;
        }
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);
        /* write_channel_intel(fifo, fifo_data); */
        p = isl_printer_start_line(p);
        p = print_fifo_rw_intel(p, fifo_name, 0);
        p = isl_printer_print_str(p, "fifo_data);");
        p = isl_printer_end_line(p);
      } else if (hls->target == CATAPULT_HW) {
        if (kernel->sparse) {
          p = isl_printer_start_line(p);
          p = print_fifo_rw_catapult(p, fifo_name, 0);          
          p = isl_printer_print_str(p, "fifo_data_");
          p = isl_printer_print_str(p, group->array->name);
          p = isl_printer_print_str(p, ");");
          p = isl_printer_end_line(p);
        } else {          
          for (int i = data_pack / nxt_data_pack - 1; i >= 0; i--) {
            p = isl_printer_start_line(p);
            p = isl_printer_print_str(p, "fifo_data.set_slc(");
            p = isl_printer_print_int(p, group->array->size * 8 * nxt_data_pack * i);
            p = isl_printer_print_str(p, ", ");

            isl_ast_expr *expr = stmt->u.i.local_index;
            isl_ast_expr *op;
            int n_arg = isl_ast_expr_op_get_n_arg(expr);
            op = isl_ast_expr_op_get_arg(expr, 0);
            p = isl_printer_print_ast_expr(p, op);
            isl_ast_expr_free(op);
            for (int j = 0; j < n_arg - 1; j++)
            {
              op = isl_ast_expr_op_get_arg(expr, 1 + j);
              p = isl_printer_print_str(p, "[");
              if (j == n_arg - 2)
              {
                p = isl_printer_print_int(p, i);
              }
              else
              {
                p = isl_printer_print_ast_expr(p, op);
              }
              p = isl_printer_print_str(p, "]");
              isl_ast_expr_free(op);
            }
            p = isl_printer_print_str(p, ");");
            p = isl_printer_end_line(p);
          }

          p = isl_printer_start_line(p);
          p = print_fifo_rw_catapult(p, fifo_name, 0);
          p = isl_printer_print_str(p, "fifo_data);");
          p = isl_printer_end_line(p);
        }
      } else if (hls->target == TAPA_HW) {
        if (kernel->sparse) {
          p = isl_printer_start_line(p);
          p = print_fifo_rw_tapa(p, fifo_name, 0);
          p = isl_printer_print_str(p, "fifo_data_");
          p = isl_printer_print_str(p, group->array->name);
          p = isl_printer_print_str(p, ");");
          p = isl_printer_end_line(p);
        } else {
          if (nxt_data_pack == 1)
          {
            /* float f1, f0; */
            p = isl_printer_start_line(p);
            p = isl_printer_print_str(p, group->array->type);
            p = isl_printer_print_str(p, " ");
            int first = 1;
            for (int i = data_pack / nxt_data_pack - 1; i >= 0; i--) {
              if (!first)
                p = isl_printer_print_str(p, ", ");
              p = isl_printer_print_str(p, "f");
              p = isl_printer_print_int(p, i);
              first = 0;
            }
            p = isl_printer_print_str(p, ";");
            p = isl_printer_end_line(p);
            /* f1 = local[][1];
             * f0 = local[][0]; */
            for (int i = data_pack / nxt_data_pack - 1; i >= 0; i--)
            {
              isl_ast_expr *expr = stmt->u.i.local_index;
              isl_ast_expr *op;
              int n_arg = isl_ast_expr_op_get_n_arg(expr);
              p = isl_printer_start_line(p);
              p = isl_printer_print_str(p, "f");
              p = isl_printer_print_int(p, i);
              p = isl_printer_print_str(p, " = ");
              op = isl_ast_expr_op_get_arg(expr, 0);
              p = isl_printer_print_ast_expr(p, op);
              isl_ast_expr_free(op);
              for (int j = 0; j < n_arg - 1; j++) {
                op = isl_ast_expr_op_get_arg(expr, 1 + j);
                p = isl_printer_print_str(p, "[");
                if (j == n_arg - 2) {
                  if (stmt->u.i.simd_depth != -1) {
                    p = isl_printer_print_ast_expr(p, op);
                    p = isl_printer_print_str(p, " + ");
                  }
                  p = isl_printer_print_int(p, i);
                } else {
                  p = isl_printer_print_ast_expr(p, op);
                }
                p = isl_printer_print_str(p, "]");
                isl_ast_expr_free(op);
              }
              p = isl_printer_print_str(p, ";");
              p = isl_printer_end_line(p);
            }
          }
          /* fifo_data = [f1, f0]; */
          for (int i = data_pack / nxt_data_pack - 1; i >= 0; i--) {
            isl_ast_expr *expr = stmt->u.i.local_index;
            isl_ast_expr *op;
            int n_arg = isl_ast_expr_op_get_n_arg(expr);
            if (nxt_data_pack == 1) {
              p = isl_printer_start_line(p);
              p = isl_printer_print_str(p, "fifo_data.set(");
              p = isl_printer_print_int(p, i);
              p = isl_printer_print_str(p, ", f");
              p = isl_printer_print_int(p, i);
              p = isl_printer_print_str(p, ");");
              p = isl_printer_end_line(p);
            } else {
              for (int j = 0; j < nxt_data_pack; j++) {
                p = isl_printer_start_line(p);
                p = isl_printer_print_str(p, "fifo_data.set(");
                p = isl_printer_print_int(p, i * nxt_data_pack + j);
                p = isl_printer_print_str(p, ", ");

                op = isl_ast_expr_op_get_arg(expr, 0);
                p = isl_printer_print_ast_expr(p, op);
                isl_ast_expr_free(op);

                for (int k = 0; k < n_arg - 1; k++) {
                  op = isl_ast_expr_op_get_arg(expr, 1 + k);
                  p = isl_printer_print_str(p, "[");
                  if (k == n_arg - 2) {
                    p = isl_printer_print_int(p, i);
                  } else {
                    p = isl_printer_print_ast_expr(p, op);
                  }
                  p = isl_printer_print_str(p, "]");
                  isl_ast_expr_free(op);
                }

                p = isl_printer_print_str(p, "[");
                p = isl_printer_print_int(p, j);
                p = isl_printer_print_str(p, "]);");
                p = isl_printer_end_line(p);
              }
            }
          }
          p = isl_printer_start_line(p);
          p = print_fifo_rw_tapa(p, fifo_name, 0);
          p = isl_printer_print_str(p, "fifo_data);");
          p = isl_printer_end_line(p);
        }
      }
    }
    p = ppcg_end_block(p);
  }
  
  free(fifo_name);
  isl_ast_expr_free(local_index_packed);  
  return p;
}

/* Print the code that locally reduces "in_data" into
 * "data_split[split_idx]".
 *
 * "in_data" is handled as "data_pack_in" packed elements of the array
 * of "group".  Each element is combined with the matching element of
 * "data_split[split_idx]" using the operator stmt->u.i.reduce_op and
 * the results are written back to "data_split[split_idx]".
 * "data_pack_out" is currently not used.
 *
 * For XILINX_HW the printed code is of the form
 *
 *  union {unsigned int ui; [type] ut;} uin_0, ..., uout_0, ...;
 *  uin_0.ui = (unsigned int)in_data(hi, lo); // uin_0.ut = in_data; if data_pack_in == 1
 *  uout_0.ui = (unsigned int)data_split[split_idx](hi, lo);
 *  uout_0.ut [op]= uin_0.ut;
 *  data_split[split_idx] = ((ap_uint<w>)uout_n.ui, ..., (ap_uint<w>)uout_0.ui);
 *
 * For CATAPULT_HW the elements are extracted with slc<w>(lo) into
 * variables of type "[array name]_t1" and stored back with set_slc().
 * For TAPA_HW the elements are accessed with [] and stored back with set().
 * For any other target only the two "Local Reduction" marker comments
 * are printed.
 */
__isl_give isl_printer *autosa_print_reduce_data_pack(
  __isl_take isl_printer *p,
  struct autosa_kernel_stmt *stmt,
  int data_pack_in,
  int data_pack_out,
  struct autosa_array_ref_group *group,
  enum platform target
  )
{  
  p = print_str_new_line(p, "/* Local Reduction */");

  if (target == XILINX_HW) {
    /* union {unsigned int ui; data_t ut;} uin_0, uin_1, ... uout_0, uout_1, ...; */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "union {unsigned int ui; ");
    p = isl_printer_print_str(p, group->array->type);
    p = isl_printer_print_str(p, " ut;} ");
    for (int i = 0; i < data_pack_in; i++) {
      p = isl_printer_print_str(p, "uin_");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, ", ");
    }
    for (int i = 0; i < data_pack_in; i++) {
      p = isl_printer_print_str(p, "uout_");
      p = isl_printer_print_int(p, i);
      if (i == data_pack_in - 1) {
        p = isl_printer_print_str(p, ";");
      } else {
        p = isl_printer_print_str(p, ", ");
      }
    }
    p = isl_printer_end_line(p);

    /* Assign in_data and data_split[split_idx] to the union vars. */
    for (int i = 0; i < data_pack_in; i++) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "uin_");
      p = isl_printer_print_int(p, i);
      if (data_pack_in == 1) {
        /* A single element is not packed; copy it by value. */
        p = isl_printer_print_str(p, ".ut = in_data;");
      } else {
        p = isl_printer_print_str(p, ".ui = (unsigned int)in_data(");
        p = isl_printer_print_int(p, group->array->size * 8 * (i + 1) - 1);
        p = isl_printer_print_str(p, ", ");
        p = isl_printer_print_int(p, group->array->size * 8 * i);
        p = isl_printer_print_str(p, ");");
      }
      p = isl_printer_end_line(p);
    }
    for (int i = 0; i < data_pack_in; i++) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "uout_");
      p = isl_printer_print_int(p, i);    
      p = isl_printer_print_str(p, ".ui = (unsigned int)data_split[split_idx](");
      p = isl_printer_print_int(p, group->array->size * 8 * (i + 1) - 1);
      p = isl_printer_print_str(p, ", ");
      p = isl_printer_print_int(p, group->array->size * 8 * i);
      p = isl_printer_print_str(p, ");");    
      p = isl_printer_end_line(p);
    }

    /* Perform the reduction. */
    for (int i = 0; i < data_pack_in; i++) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "uout_");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, ".ut ");
      p = isl_printer_print_str(p, stmt->u.i.reduce_op);
      p = isl_printer_print_str(p, "= ");
      p = isl_printer_print_str(p, "uin_");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, ".ut;");
      p = isl_printer_end_line(p);
    }

    /* Re-assign the reduced values to data_split[split_idx],
     * highest element first.
     */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "data_split[split_idx] = ");
    p = isl_printer_print_str(p, "(");
    for (int i = data_pack_in - 1; i >= 0; i--) {    
      if (i != data_pack_in - 1)
        p = isl_printer_print_str(p, ", ");
      p = isl_printer_print_str(p, "(ap_uint<");
      p = isl_printer_print_int(p, group->array->size * 8);
      p = isl_printer_print_str(p, ">)");
      p = isl_printer_print_str(p, "uout_");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, ".ui");
    }
    p = isl_printer_print_str(p, ");");
    p = isl_printer_end_line(p);
  } else if (target == CATAPULT_HW) {
    /* [array name]_t1 uin_0, uin_1, ... uout_0, uout_1, ...; */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, group->array->name);
    p = isl_printer_print_str(p, "_t1 ");
    for (int i = 0; i < data_pack_in; i++) {
      p = isl_printer_print_str(p, "uin_");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, ", ");
    }
    for (int i = 0; i < data_pack_in; i++) {
      p = isl_printer_print_str(p, "uout_");
      p = isl_printer_print_int(p, i);
      if (i == data_pack_in - 1) {
        p = isl_printer_print_str(p, ";");
      } else {
        p = isl_printer_print_str(p, ", ");
      }
    }
    p = isl_printer_end_line(p);

    /* Assign in_data and data_split[split_idx] to the vars. */
    for (int i = 0; i < data_pack_in; i++) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "uin_");
      p = isl_printer_print_int(p, i);      
      if (data_pack_in == 1) {
        /* The statement must be terminated, as in the other targets. */
        p = isl_printer_print_str(p, " = in_data;");
      } else {
        p = isl_printer_print_str(p, " = in_data.slc<");
        p = isl_printer_print_int(p, group->array->size * 8);
        p = isl_printer_print_str(p, ">(");
        p = isl_printer_print_int(p, group->array->size * 8 * i);
        p = isl_printer_print_str(p, ");");
      }
      p = isl_printer_end_line(p);
    }
    for (int i = 0; i < data_pack_in; i++) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "uout_");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, " = data_split[split_idx].slc<");
      p = isl_printer_print_int(p, group->array->size * 8);
      p = isl_printer_print_str(p, ">(");
      p = isl_printer_print_int(p, group->array->size * 8 * i);
      p = isl_printer_print_str(p, ");");
      p = isl_printer_end_line(p);
    }

    /* Perform the reduction. */
    for (int i = 0; i < data_pack_in; i++) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "uout_");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, " ");
      p = isl_printer_print_str(p, stmt->u.i.reduce_op);
      p = isl_printer_print_str(p, "= ");
      p = isl_printer_print_str(p, "uin_");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);
    }

    /* Re-assign the reduced values to data_split[split_idx].
     * Each set_slc() call is a statement of its own, so it needs
     * a terminating ';' and its own output line.
     */
    for (int i = 0; i < data_pack_in; i++) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "data_split[split_idx].set_slc(");
      p = isl_printer_print_int(p, group->array->size * 8 * i);
      p = isl_printer_print_str(p, ", uout_");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, ");");
      p = isl_printer_end_line(p);
    }    
  } else if (target == TAPA_HW) {
    /* data_t din_0, din_1, ... dout_0, dout_1, ...; */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, group->array->type);
    p = isl_printer_print_str(p, " ");
    for (int i = 0; i < data_pack_in; i++) {
      p = isl_printer_print_str(p, "din_");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, ", ");
    }
    for (int i = 0; i < data_pack_in; i++) {
      p = isl_printer_print_str(p, "dout_");
      p = isl_printer_print_int(p, i);
      if (i == data_pack_in - 1) {
        p = isl_printer_print_str(p, ";");
      } else {
        p = isl_printer_print_str(p, ", ");
      }
    }
    p = isl_printer_end_line(p);

    /* Assign in_data and data_split[split_idx] to the vars. */
    for (int i = 0; i < data_pack_in; i++) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "din_");
      p = isl_printer_print_int(p, i);
      if (data_pack_in == 1) {
        p = isl_printer_print_str(p, " = in_data;");
      } else {
        p = isl_printer_print_str(p, " = in_data[");
        p = isl_printer_print_int(p, i);
        p = isl_printer_print_str(p, "];");
      }
      p = isl_printer_end_line(p);
    }
    for (int i = 0; i < data_pack_in; i++) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "dout_");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, " = data_split[split_idx]");
      if (data_pack_in > 1) {
        p = isl_printer_print_str(p, "[");
        p = isl_printer_print_int(p, i);
        p = isl_printer_print_str(p, "]");
      }
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);
    }

    /* Perform the reduction. */
    for (int i = 0; i < data_pack_in; i++) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "dout_");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, " ");
      p = isl_printer_print_str(p, stmt->u.i.reduce_op);
      p = isl_printer_print_str(p, "= ");
      p = isl_printer_print_str(p, "din_");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);
    }

    /* Re-assign the reduced values to data_split[split_idx]. */
    for (int i = data_pack_in - 1; i >= 0; i--) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "data_split[split_idx]");
      if (data_pack_in > 1) {
        p = isl_printer_print_str(p, ".set(");
        p = isl_printer_print_int(p, i);
        p = isl_printer_print_str(p, ", ");
      } else {
        p = isl_printer_print_str(p, " = ");
      }
      p = isl_printer_print_str(p, "dout_");
      p = isl_printer_print_int(p, i);
      if (data_pack_in > 1) {
        p = isl_printer_print_str(p, ")");
      }
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);
    }
  }

  p = print_str_new_line(p, "/* Local Reduction */");

  return p;
}

/* Print the code that reduces "in_data" with the local buffer element
 * "index" and stores the result in "out_data".
 *
 * Both operands are handled as "data_pack" packed elements of the array
 * of "group", which are combined one by one using stmt->u.i.reduce_op.
 * The printed code is of the form
 *
 *  union {unsigned int ui; [type] ut;} uin_0, ..., uout_0, ...;
 *  uin_0.ui = (unsigned int)in_data(hi, lo);
 *  uout_0.ui = (unsigned int)[index](hi, lo);
 *  uout_0.ut [op]= uin_0.ut;
 *  out_data = ((ap_uint<w>)uout_n.ui, ..., (ap_uint<w>)uout_0.ui);
 *
 * If "data_pack" is equal to one, the values are copied through the
 * "ut" member directly, without any bit range selection.
 * In that case a std::runtime_error is thrown if the module of "stmt"
 * is double buffered with double buffer style 0.
 */
__isl_give isl_printer *autosa_print_reduce_default(
  __isl_take isl_printer *p,
  struct autosa_kernel_stmt *stmt,
  int data_pack,
  isl_ast_expr *index,
  struct autosa_array_ref_group *group)
{
  int lane;

  p = print_str_new_line(p, "/* Local Reduction */");

  /* Declare one input and one output union per element. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "union {unsigned int ui; ");
  p = isl_printer_print_str(p, group->array->type);
  p = isl_printer_print_str(p, " ut;} ");
  for (lane = 0; lane < data_pack; lane++) {
    p = isl_printer_print_str(p, "uin_");
    p = isl_printer_print_int(p, lane);
    p = isl_printer_print_str(p, ", ");
  }
  for (lane = 0; lane < data_pack; lane++) {
    p = isl_printer_print_str(p, "uout_");
    p = isl_printer_print_int(p, lane);
    p = isl_printer_print_str(p, lane + 1 < data_pack ? ", " : ";");
  }
  p = isl_printer_end_line(p);

  /* Load the elements of in_data. */
  for (lane = 0; lane < data_pack; lane++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "uin_");
    p = isl_printer_print_int(p, lane);
    if (data_pack != 1) {
      p = isl_printer_print_str(p, ".ui = (unsigned int)in_data(");
      p = isl_printer_print_int(p, group->array->size * 8 * (lane + 1) - 1);
      p = isl_printer_print_str(p, ", ");
      p = isl_printer_print_int(p, group->array->size * 8 * lane);
      p = isl_printer_print_str(p, ");");
    } else {
      p = isl_printer_print_str(p, ".ut = in_data;");
    }
    p = isl_printer_end_line(p);
  }

  /* Load the elements of the local buffer. */
  for (lane = 0; lane < data_pack; lane++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "uout_");
    p = isl_printer_print_int(p, lane);
    if (data_pack != 1) {
      p = isl_printer_print_str(p, ".ui = (unsigned int)");
      p = isl_printer_print_ast_expr(p, index);
      p = isl_printer_print_str(p, "(");
      p = isl_printer_print_int(p, group->array->size * 8 * (lane + 1) - 1);
      p = isl_printer_print_str(p, ", ");
      p = isl_printer_print_int(p, group->array->size * 8 * lane);
      p = isl_printer_print_str(p, ");");
    } else {
      p = isl_printer_print_str(p, ".ut = ");
      if (stmt->u.i.module->double_buffer &&
          stmt->u.i.module->options->autosa->double_buffer_style == 0)
        throw std::runtime_error("[AutoSA] Error: Local reduce for double buffer style 0 is not supported!");
      p = isl_printer_print_ast_expr(p, index);
      p = isl_printer_print_str(p, ";");
    }
    p = isl_printer_end_line(p);
  }

  /* Combine the elements pairwise. */
  for (lane = 0; lane < data_pack; lane++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "uout_");
    p = isl_printer_print_int(p, lane);
    p = isl_printer_print_str(p, ".ut ");
    p = isl_printer_print_str(p, stmt->u.i.reduce_op);
    p = isl_printer_print_str(p, "= ");
    p = isl_printer_print_str(p, "uin_");
    p = isl_printer_print_int(p, lane);
    p = isl_printer_print_str(p, ".ut;");
    p = isl_printer_end_line(p);
  }

  /* Store the result in out_data, highest element first. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "out_data");
  p = isl_printer_print_str(p, " = ");
  if (data_pack != 1) {
    p = isl_printer_print_str(p, "(");
    for (lane = data_pack - 1; lane >= 0; lane--) {
      if (lane != data_pack - 1)
        p = isl_printer_print_str(p, ", ");
      p = isl_printer_print_str(p, "(ap_uint<");
      p = isl_printer_print_int(p, group->array->size * 8);
      p = isl_printer_print_str(p, ">)uout_");
      p = isl_printer_print_int(p, lane);
      p = isl_printer_print_str(p, ".ui");
    }
    p = isl_printer_print_str(p, ");");
  } else {
    p = isl_printer_print_str(p, "uout_0.ut;");
  }
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "/* Local Reduction */");

  return p;
}

/* Print the target-specific access to the FIFO "fifo_name".
 * "read" is passed on to the per-target printer (the callers use 1 for
 * a read access and 0 for a write access).
 * Nothing is printed if the target is not one of the four known targets.
 */
static __isl_give isl_printer *io_transfer_default_print_fifo_rw(
    __isl_take isl_printer *p, struct hls_info *hls, char *fifo_name, int read)
{
  if (hls->target == XILINX_HW)
    p = print_fifo_rw_xilinx(p, fifo_name, read);
  else if (hls->target == TAPA_HW)
    p = print_fifo_rw_tapa(p, fifo_name, read);
  else if (hls->target == INTEL_HW)
    p = print_fifo_rw_intel(p, fifo_name, read);
  else if (hls->target == CATAPULT_HW)
    p = print_fifo_rw_catapult(p, fifo_name, read);

  return p;
}

/* Print a line that assigns the data read from the input FIFO of "stmt"
 * to "fifo_data".
 */
static __isl_give isl_printer *io_transfer_default_print_fifo_read(
    __isl_take isl_printer *p, isl_ctx *ctx, struct autosa_kernel_stmt *stmt,
    struct hls_info *hls)
{
  char *fifo_name = concat(ctx, stmt->u.i.in_fifo_name, "in");

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "fifo_data = ");
  p = io_transfer_default_print_fifo_rw(p, hls, fifo_name, 1);
  p = isl_printer_print_str(p, ";");
  p = isl_printer_end_line(p);
  free(fifo_name);

  return p;
}

/* Print a line that writes "fifo_data" to the output FIFO of "stmt". */
static __isl_give isl_printer *io_transfer_default_print_fifo_write(
    __isl_take isl_printer *p, isl_ctx *ctx, struct autosa_kernel_stmt *stmt,
    struct hls_info *hls)
{
  char *fifo_name = concat(ctx, stmt->u.i.out_fifo_name, "out");

  p = isl_printer_start_line(p);
  p = io_transfer_default_print_fifo_rw(p, hls, fifo_name, 0);
  p = isl_printer_print_str(p, "fifo_data);");
  p = isl_printer_end_line(p);
  free(fifo_name);

  return p;
}

/* Print the access "index" to the local buffer of "stmt".
 *
 * If the module is double buffered with double buffer style 0,
 * "arb_sel" ("[arb]" or "[!arb]") is inserted right after the array name.
 * Otherwise, for a filter module on the Catapult target, "_tmp.data"
 * is inserted after the array name.
 * In all other cases "index" is printed as is.
 */
static __isl_give isl_printer *io_transfer_default_print_local_access(
    __isl_take isl_printer *p, struct autosa_kernel_stmt *stmt,
    struct hls_info *hls, __isl_keep isl_ast_expr *index, const char *arb_sel)
{
  const char *infix = NULL;
  isl_ast_expr *op;
  int n_arg;

  if (stmt->u.i.module->double_buffer &&
      stmt->u.i.module->options->autosa->double_buffer_style == 0)
    infix = arb_sel;
  else if (hls->target == CATAPULT_HW && stmt->u.i.module->is_filter)
    infix = "_tmp.data";

  if (!infix)
    return isl_printer_print_ast_expr(p, index);

  /* Argument 0 is the array name, the remaining ones are the subscripts. */
  op = isl_ast_expr_op_get_arg(index, 0);
  p = isl_printer_print_ast_expr(p, op);
  isl_ast_expr_free(op);
  p = isl_printer_print_str(p, infix);
  n_arg = isl_ast_expr_op_get_n_arg(index);
  for (int n = 1; n < n_arg; n++) {
    op = isl_ast_expr_op_get_arg(index, n);
    p = isl_printer_print_str(p, "[");
    p = isl_printer_print_ast_expr(p, op);
    p = isl_printer_print_str(p, "]");
    isl_ast_expr_free(op);
  }

  return p;
}

/* Print an I/O transfer statement that moves "n_lane" packed elements.
 *
 * The last subscript of the local index is first divided by "n_lane"
 * (by vec_len * n_lane for a sparse array) to address packed elements.
 *
 * An in I/O statement is printed as
 *
 *  [type] fifo_data;
 *  fifo_data = [read from fifo_in];
 *  local[][] = fifo_data;          // if buf == 1 and no reduction
 *  [local reduction code]          // if buf == 1 and reduction
 *  [write fifo_data to fifo_out];  // if buf == 0
 *
 * An out I/O statement is printed as
 *
 *  [type] fifo_data;
 *  fifo_data = local[][];          // if buf == 1
 *  fifo_data = [read from fifo_in]; // if buf == 0
 *  [write fifo_data to fifo_out];
 *
 * No filter condition is printed by this function.
 * If the statement is a boundary statement, stmt->u.i.filter_sched_depth
 * is reset to -1 as a side effect.
 * "iterator_prefix" is not used.
 */
static __isl_give isl_printer *autosa_kernel_print_io_transfer_default(
    __isl_take isl_printer *p, struct autosa_kernel_stmt *stmt,
    struct autosa_array_ref_group *group, int n_lane, struct hls_info *hls,
    const char *iterator_prefix)
{
  isl_ctx *ctx = isl_printer_get_ctx(p);
  isl_ast_expr *local_index_packed;
  int is_sparse = group->local_array->is_sparse;
  int vec_len = stmt->u.i.local_array->vec_len;

  /* If the statement is a boundary statement, 
   * then ignore the filter condition by setting filter_sched_depth as -1
   */
  if (stmt->u.i.boundary)
    stmt->u.i.filter_sched_depth = -1;

  /* Modify the local index to address packed elements. */
  local_index_packed = isl_ast_expr_copy(stmt->u.i.local_index);
  if (is_sparse || n_lane > 1) {
    int divisor = is_sparse ? vec_len * n_lane : n_lane;
    int n_arg = isl_ast_expr_get_op_n_arg(local_index_packed);
    isl_ast_expr *arg = isl_ast_expr_get_op_arg(local_index_packed, n_arg - 1);
    isl_ast_expr *div = isl_ast_expr_from_val(isl_val_int_from_si(ctx, divisor));

    arg = isl_ast_expr_div(arg, div);
    local_index_packed = isl_ast_expr_set_op_arg(local_index_packed, n_arg - 1, arg);
  }

  /* Declare the fifo data variable. */
  p = isl_printer_start_line(p);
  if (is_sparse) {
    p = autosa_print_array_type_with_lane_sparse(p, group->array, n_lane);
  } else {
    p = isl_printer_print_str(p, stmt->u.i.array->name);
    p = isl_printer_print_str(p, "_t");
    p = isl_printer_print_int(p, n_lane);
  }
  p = isl_printer_print_str(p, " fifo_data;");
  p = isl_printer_end_line(p);

  if (stmt->u.i.in) {
    /* fifo_data = fifo.read(); */
    p = io_transfer_default_print_fifo_read(p, ctx, stmt, hls);

    if (!stmt->u.i.buf) {
      /* fifo.write(fifo_data); */
      p = io_transfer_default_print_fifo_write(p, ctx, stmt, hls);
    } else if (stmt->u.i.reduce) {
      p = autosa_print_reduce_default(p, stmt, n_lane, local_index_packed, group);
    } else {
      /* local[][] = fifo_data; */
      p = isl_printer_start_line(p);
      p = io_transfer_default_print_local_access(p, stmt, hls,
                                                 local_index_packed, "[arb]");
      p = isl_printer_print_str(p, " ");
      p = isl_printer_print_str(p, "= fifo_data;");
      p = isl_printer_end_line(p);
    }
  } else {
    if (stmt->u.i.buf) {
      /* fifo_data = local[][]; */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "fifo_data = ");
      p = io_transfer_default_print_local_access(p, stmt, hls,
                                                 local_index_packed, "[!arb]");
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);
    } else {
      /* fifo_data = fifo.read(); */
      p = io_transfer_default_print_fifo_read(p, ctx, stmt, hls);
    }

    /* fifo.write(fifo_data); */
    p = io_transfer_default_print_fifo_write(p, ctx, stmt, hls);
  }

  isl_ast_expr_free(local_index_packed);

  return p;
}

/* Print the access to the global memory copy handled by "stmt".
 *
 * A scalar is printed by name, preceded by "*" unless it is
 * a read-only scalar.
 * Otherwise, the index expression stmt->u.i.index is printed as is,
 * unless "serialize" is set, in which case only the array name of
 * the index expression is printed, subscripted as "[i]".
 */
static __isl_give isl_printer *io_stmt_print_global_index(
  __isl_take isl_printer *p, struct autosa_kernel_stmt *stmt, int serialize)
{
  struct autosa_array_info *array = stmt->u.i.array;
  isl_ast_expr *global_index = stmt->u.i.index;
  isl_ast_expr *base;

  if (autosa_array_is_scalar(array)) {
    if (!autosa_array_is_read_only_scalar(array))
      p = isl_printer_print_str(p, "*");
    return isl_printer_print_str(p, array->name);
  }

  if (!serialize)
    return isl_printer_print_ast_expr(p, global_index);

  /* Argument 0 of the access expression is the array name. */
  base = isl_ast_expr_op_get_arg(global_index, 0);
  p = isl_printer_print_ast_expr(p, base);
  p = isl_printer_print_str(p, "[i]");
  isl_ast_expr_free(base);

  return p;
}

/* Print the (rescaled) last subscript of the access handled by "stmt".
 *
 * For a scalar, "0" is printed, unless it is a read-only scalar,
 * in which case nothing is printed.
 * If "serialize" is set, "i" is printed.
 * Otherwise, the last argument of the global index stmt->u.i.index
 * (if "global" is set) or of the local index stmt->u.i.local_index
 * is taken, divided by "nxt_n_lane" (by vec_len * nxt_n_lane if
 * "is_sparse" is set) and, for a global index, multiplied by "n_lane"
 * before being printed.
 */
static __isl_give isl_printer *io_stmt_print_index_last_dim(
  __isl_take isl_printer *p, struct autosa_kernel_stmt *stmt, 
  int serialize, int global, int n_lane, int nxt_n_lane, int is_sparse, int vec_len)
{
  struct autosa_array_info *array = stmt->u.i.array;
  isl_ast_expr *index;

  if (autosa_array_is_scalar(array))
  {
    if (!autosa_array_is_read_only_scalar(array))
      p = isl_printer_print_str(p, "0");    
    return p;
  }

  if (global)
    index = isl_ast_expr_copy(stmt->u.i.index);
  else 
    index = isl_ast_expr_copy(stmt->u.i.local_index);

  if (!serialize) {    
    isl_ast_expr *op;
    int n_arg;
    isl_val *val;
    isl_ctx *ctx = isl_printer_get_ctx(p);

    n_arg = isl_ast_expr_op_get_n_arg(index);
    op = isl_ast_expr_op_get_arg(index, n_arg - 1);
    /* The divisor is the number of array elements per packed element. */
    if (is_sparse) 
      val = isl_val_int_from_si(ctx, vec_len * nxt_n_lane);
    else
      val = isl_val_int_from_si(ctx, nxt_n_lane);        
    op = isl_ast_expr_div(op, isl_ast_expr_from_val(val));        
    if (global) {
      op = isl_ast_expr_mul(op, isl_ast_expr_from_val(isl_val_int_from_si(ctx, n_lane)));
    }
    p = isl_printer_print_ast_expr(p, op);

    isl_ast_expr_free(op);    
  } else {        
    p = isl_printer_print_str(p, "i");        
  }
  isl_ast_expr_free(index);

  return p;  
}

/* A list of helper functions for autosa_kernel_print_io_transfer */
/* Print the update of "data_split[split_idx]" with "in_data".
 *
 * Code is only printed for the Xilinx, Catapult and TAPA targets and,
 * if the next data pack factor is greater than one, for the Intel target.
 *
 * If the statement performs a reduction, the code is printed by
 * autosa_print_reduce_data_pack.
 * Otherwise, the printed code is
 *
 *  data_split[split_idx] = in_data;
 *
 * except on Xilinx with a next data pack factor of one, where "in_data"
 * is first moved through a union to obtain its bit representation:
 *
 *  union {unsigned int ui; [type] ut;} u;
 *  u.ut = in_data;
 *  data_split[split_idx] = ap_uint<w>(u.ui);
 *
 * "iterator_prefix" is not used.
 */
static __isl_give isl_printer *io_transfer_update_data_split(
  __isl_take isl_printer *p, struct autosa_kernel_stmt *stmt,  
  struct hls_info *hls, const char *iterator_prefix)
{
  struct autosa_array_ref_group *group = stmt->u.i.group;
  int n_lane = stmt->u.i.data_pack;
  int nxt_n_lane = stmt->u.i.nxt_data_pack;
  int via_union;

  if (!(hls->target == XILINX_HW ||
        hls->target == CATAPULT_HW ||
        hls->target == TAPA_HW ||
        (hls->target == INTEL_HW && nxt_n_lane > 1)))
    return p;

  if (stmt->u.i.reduce)
    return autosa_print_reduce_data_pack(p, stmt, nxt_n_lane, n_lane, group, hls->target); // TODO

  /* Only an unpacked element on Xilinx needs the conversion to ap_uint. */
  via_union = hls->target == XILINX_HW && nxt_n_lane == 1;

  if (via_union) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "union {unsigned int ui; ");
    p = isl_printer_print_str(p, group->array->type);
    p = isl_printer_print_str(p, " ut;} u;");
    p = isl_printer_end_line(p);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "u.ut = in_data;");
    p = isl_printer_end_line(p);
  }

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "data_split[split_idx] ");
  p = isl_printer_print_str(p, "= ");
  if (via_union) {
    p = isl_printer_print_str(p, "ap_uint<");
    p = isl_printer_print_int(p, group->array->size * 8);
    p = isl_printer_print_str(p, ">(u.ui);");
  } else {
    p = isl_printer_print_str(p, "in_data;");
  }
  p = isl_printer_end_line(p);

  return p;
}

/* pack_out_data: out_data = (data_split[], ...);
 *
 * Print the statement(s) that assemble "out_data" from the elements of the
 * "data_split" staging array. The array holds n_lane / nxt_n_lane elements.
 *
 * - Xilinx: a single concatenation, highest index first:
 *     out_data = (data_split[k-1], ..., data_split[0]);
 * - Intel: either "out_data.data[split_idx] = in_data;" (next pack factor 1)
 *   or a vector literal built from the data_split elements.
 * - Catapult: one "out_data.set_slc(bit_offset, data_split[i]);" per element.
 * - TAPA: one "out_data.set(idx, ...);" per scalar lane.
 * - Any other target: nothing is printed.
 *
 * Every emitted statement is terminated with isl_printer_end_line() so that
 * whatever the caller prints next starts on a fresh line.
 *
 * "iterator_prefix" is not used by this function.
 */
static __isl_give isl_printer *io_transfer_pack_out_data(
  __isl_take isl_printer *p, struct autosa_kernel_stmt *stmt,
  struct hls_info *hls, const char *iterator_prefix)
{
  struct autosa_array_ref_group *group = stmt->u.i.group;
  int n_lane = stmt->u.i.data_pack;
  int nxt_n_lane = stmt->u.i.nxt_data_pack;
  /* Number of data_split elements that make up one out_data word. */
  int n_split = n_lane / nxt_n_lane;

  if (hls->target == XILINX_HW) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "out_data = (");
    for (int i = n_split - 1; i >= 0; i--) {
      if (i != n_split - 1)
        p = isl_printer_print_str(p, ", ");
      p = isl_printer_print_str(p, "data_split[");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, "]");
    }
    p = isl_printer_print_str(p, ");");
    /* Previously missing: close the line that was started above. */
    p = isl_printer_end_line(p);
  } else if (hls->target == INTEL_HW) {
    p = isl_printer_start_line(p);
    if (nxt_n_lane == 1) {
      p = isl_printer_print_str(p, "out_data.data[split_idx] = in_data;");
    } else {
      int packed = (nxt_n_lane > 1);

      /* out_data.data = ([type][n_lane])( */
      p = isl_printer_print_str(p, "out_data.data = (");
      p = isl_printer_print_str(p, group->array->type);
      p = isl_printer_print_int(p, n_lane);
      p = isl_printer_print_str(p, ")(");
      for (int i = 0; i < n_split; i++) {
        if (i > 0)
          p = isl_printer_print_str(p, ", ");
        if (packed) {
          /* ([type][nxt_n_lane]) cast in front of the element. */
          p = isl_printer_print_str(p, "(");
          p = isl_printer_print_str(p, group->array->type);
          p = isl_printer_print_int(p, nxt_n_lane);
          p = isl_printer_print_str(p, ")");
        }
        p = isl_printer_print_str(p, "data_split[");
        p = isl_printer_print_int(p, i);
        p = isl_printer_print_str(p, "]");
        if (packed)
          p = isl_printer_print_str(p, ".data");
      }
      p = isl_printer_print_str(p, ");");
    }
    /* Previously missing: close the line that was started above. */
    p = isl_printer_end_line(p);
  } else if (hls->target == CATAPULT_HW) {
    for (int i = 0; i < n_split; i++) {
      /* out_data.set_slc([bit offset of element i], data_split[i]); */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "out_data.set_slc(");
      p = isl_printer_print_int(p, i * group->array->size * 8 * nxt_n_lane);
      p = isl_printer_print_str(p, ", data_split[");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, "]);");
      p = isl_printer_end_line(p);
    }
  } else if (hls->target == TAPA_HW) {
    for (int i = 0; i < n_split; i++) {
      if (nxt_n_lane == 1) {
        /* out_data.set(i, data_split[i]); */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "out_data.set(");
        p = isl_printer_print_int(p, i);
        p = isl_printer_print_str(p, ", data_split[");
        p = isl_printer_print_int(p, i);
        p = isl_printer_print_str(p, "]);");
        p = isl_printer_end_line(p);
        continue;
      }
      for (int j = 0; j < nxt_n_lane; j++) {
        /* out_data.set(i * nxt_n_lane + j, data_split[i][j]); */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "out_data.set(");
        p = isl_printer_print_int(p, i * nxt_n_lane + j);
        p = isl_printer_print_str(p, ", data_split[");
        p = isl_printer_print_int(p, i);
        p = isl_printer_print_str(p, "][");
        p = isl_printer_print_int(p, j);
        p = isl_printer_print_str(p, "]);");
        p = isl_printer_end_line(p);
      }
    }
  }

  return p;
}

/* read_local_buf: out_data = local_buf[...];
 *
 * Print an assignment of the local buffer element addressed by
 * "local_index_packed" to "out_data".
 *
 * The access is printed in one of three forms:
 * - module is double buffered with double_buffer_style == 0: the first
 *   argument of the access expression, then "[arb]" (in statements) or
 *   "[!arb]" (out statements), then the remaining arguments as "[idx]";
 * - Catapult target and the module is a filter: the first argument, then
 *   "_tmp.data", then the remaining arguments as "[idx]";
 * - otherwise: the access expression as is.
 *
 * "iterator_prefix" is not used by this function.
 */
static __isl_give isl_printer *io_transfer_read_local_buf(
  __isl_take isl_printer *p, struct autosa_kernel_stmt *stmt,
  struct hls_info *hls, const char *iterator_prefix, isl_ast_expr *local_index_packed)
{
  struct autosa_hw_module *hw_module = stmt->u.i.module;
  /* Text inserted right after the first argument, or NULL for a plain access. */
  const char *infix = NULL;

  if (hw_module->double_buffer &&
      hw_module->options->autosa->double_buffer_style == 0)
    infix = stmt->u.i.in ? "[arb]" : "[!arb]";
  else if (hls->target == CATAPULT_HW && hw_module->is_filter)
    infix = "_tmp.data";

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "out_data = ");

  if (infix == NULL) {
    p = isl_printer_print_ast_expr(p, local_index_packed);
  } else {
    isl_ast_expr *arg;
    int n_arg;

    arg = isl_ast_expr_op_get_arg(local_index_packed, 0);
    p = isl_printer_print_ast_expr(p, arg);
    isl_ast_expr_free(arg);

    p = isl_printer_print_str(p, infix);

    n_arg = isl_ast_expr_op_get_n_arg(local_index_packed);
    for (int k = 1; k < n_arg; k++) {
      arg = isl_ast_expr_op_get_arg(local_index_packed, k);
      p = isl_printer_print_str(p, "[");
      p = isl_printer_print_ast_expr(p, arg);
      p = isl_printer_print_str(p, "]");
      isl_ast_expr_free(arg);
    }
  }

  p = isl_printer_print_str(p, ";");
  p = isl_printer_end_line(p);

  return p;
}

/* parse_sparse_data: split a sparse "out_data" word into its two fields.
 *
 * Prints two declarations:
 *
 *   [array type with n_lane * n_nzero lanes] out_data_d = out_data.d;
 *   [integer type of 8 * n_lane bits] out_data_i = out_data.i;
 *
 * The integer type is ap_uint<...> for Xilinx and TAPA and
 * ac_int<..., false> for Catapult.
 * NOTE(review): for any other target (e.g. Intel) no type name is printed in
 * front of "> out_data_i = ..."; presumably sparse arrays are not expected
 * there -- confirm.
 *
 * "iterator_prefix" is not used by this function.
 */
static __isl_give isl_printer *io_transfer_parse_sparse_data(
  __isl_take isl_printer *p, struct autosa_kernel_stmt *stmt,
  struct hls_info *hls, const char *iterator_prefix)
{
  struct autosa_array_ref_group *ref_group = stmt->u.i.group;
  int pack = stmt->u.i.data_pack;
  int nzero_cnt = stmt->u.i.local_array->n_nzero;
  int index_bits = 8 * pack;

  /* [type] out_data_d = out_data.d; */
  p = isl_printer_start_line(p);
  p = autosa_print_array_type_with_lane(p, ref_group->array, pack * nzero_cnt);
  p = isl_printer_print_str(p, " out_data_d = out_data.d;");
  p = isl_printer_end_line(p);

  /* [type] out_data_i = out_data.i; */
  p = isl_printer_start_line(p);
  if (hls->target == CATAPULT_HW) {
    p = isl_printer_print_str(p, "ac_int<");
    p = isl_printer_print_int(p, index_bits);
    p = isl_printer_print_str(p, ", false");
  } else if (hls->target == XILINX_HW || hls->target == TAPA_HW) {
    p = isl_printer_print_str(p, "ap_uint<");
    p = isl_printer_print_int(p, index_bits);
  }
  p = isl_printer_print_str(p, "> out_data_i = out_data.i;");
  p = isl_printer_end_line(p);

  return p;
}

/* write_data_split: data_split[] = <data_str>...
 *
 * Print the code that breaks the wide word named "data_str" into
 * n_lane / nxt_n_lane pieces and stores them in "data_split".
 *
 * - Xilinx and TAPA print an unrolled "for (int n ...)" loop. Xilinx takes
 *   the low bits of the word and then shifts the word right; TAPA indexes the
 *   word ("[n]") or uses tapa::truncated<>.
 * - Intel (only when nxt_n_lane > 1) prints one assignment per piece using
 *   the ".s<components>" selector built from the "vector_index" table.
 * - Catapult prints one slc<>()-based statement (two for sparse arrays) per
 *   piece.
 * - Any other case prints nothing.
 *
 * For sparse arrays the word is accessed through its "<data_str>_d" and
 * "<data_str>_i" parts.
 *
 * "iterator_prefix" is not used by this function.
 */
static __isl_give isl_printer *io_transfer_write_data_split(
  __isl_take isl_printer *p, struct autosa_kernel_stmt *stmt,
  struct hls_info *hls, const char *iterator_prefix, const char *data_str)
{
  struct autosa_array_ref_group *ref_group = stmt->u.i.group;
  int pack = stmt->u.i.data_pack;
  int nxt_pack = stmt->u.i.nxt_data_pack;
  int sparse = ref_group->local_array->is_sparse;
  int nzero_cnt = stmt->u.i.local_array->n_nzero;
  /* Bit width of one dense data_split element and of its index part. */
  int elem_bits = ref_group->array->size * 8 * nxt_pack;
  int idx_bits = 8 * nxt_pack;

  if (hls->target == XILINX_HW || hls->target == TAPA_HW) {
    int tapa = (hls->target == TAPA_HW);

    /* for (int n = 0; n < [pieces]; n++) { */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "for (int n = 0; n < ");
    p = isl_printer_print_int(p, pack / nxt_pack);
    p = isl_printer_print_str(p, "; n++) {");
    p = isl_printer_end_line(p);

    /* The pragma is printed before the indentation is increased. */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "#pragma HLS UNROLL");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, 2);

    if (sparse) {
      int sparse_bits = elem_bits * nzero_cnt;

      /* data_split[n] = ([name]_s_t[nxt]){[data part], [index part]}; */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "data_split[n] = (");
      p = isl_printer_print_str(p, ref_group->array->name);
      p = isl_printer_print_str(p, "_s_t");
      p = isl_printer_print_int(p, nxt_pack);
      p = isl_printer_print_str(p, "){");
      if (!tapa) {
        p = isl_printer_print_str(p, data_str);
        p = isl_printer_print_str(p, "_d(");
        p = isl_printer_print_int(p, sparse_bits - 1);
        p = isl_printer_print_str(p, ", 0), ");
      } else if (nxt_pack == 1) {
        p = isl_printer_print_str(p, data_str);
        p = isl_printer_print_str(p, "_d[n], ");
      } else {
        p = isl_printer_print_str(p, "tapa::truncated<");
        p = isl_printer_print_int(p, nxt_pack);
        p = isl_printer_print_str(p, ">(");
        p = isl_printer_print_str(p, data_str);
        p = isl_printer_print_str(p, "_d, ");
        p = isl_printer_print_int(p, nxt_pack);
        p = isl_printer_print_str(p, "* n), ");
      }
      p = isl_printer_print_str(p, data_str);
      p = isl_printer_print_str(p, "_i(");
      p = isl_printer_print_int(p, idx_bits - 1);
      p = isl_printer_print_str(p, ", 0)};");
      p = isl_printer_end_line(p);

      if (!tapa) {
        /* [data]_d = [data]_d >> [bits]; (Xilinx only) */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, data_str);
        p = isl_printer_print_str(p, "_d = ");
        p = isl_printer_print_str(p, data_str);
        p = isl_printer_print_str(p, "_d >> ");
        p = isl_printer_print_int(p, sparse_bits);
        p = isl_printer_print_str(p, ";");
        p = isl_printer_end_line(p);
      }

      /* [data]_i = [data]_i >> [bits]; */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, data_str);
      p = isl_printer_print_str(p, "_i = ");
      p = isl_printer_print_str(p, data_str);
      p = isl_printer_print_str(p, "_i >> ");
      p = isl_printer_print_int(p, idx_bits);
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);
    } else {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "data_split[n] = ");
      if (!tapa) {
        /* data_split[n] = [data]([bits - 1], 0); */
        p = isl_printer_print_str(p, data_str);
        p = isl_printer_print_str(p, "(");
        p = isl_printer_print_int(p, elem_bits - 1);
        p = isl_printer_print_str(p, ", 0);");
      } else if (nxt_pack == 1) {
        /* data_split[n] = [data][n]; */
        p = isl_printer_print_str(p, data_str);
        p = isl_printer_print_str(p, "[n];");
      } else {
        /* data_split[n] = tapa::truncated<[nxt]>([data], [nxt] * n); */
        p = isl_printer_print_str(p, "tapa::truncated<");
        p = isl_printer_print_int(p, nxt_pack);
        p = isl_printer_print_str(p, ">(");
        p = isl_printer_print_str(p, data_str);
        p = isl_printer_print_str(p, ", ");
        p = isl_printer_print_int(p, nxt_pack);
        p = isl_printer_print_str(p, " * n);");
      }
      p = isl_printer_end_line(p);

      if (!tapa) {
        /* [data] = [data] >> [bits]; (Xilinx only) */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, data_str);
        p = isl_printer_print_str(p, " = ");
        p = isl_printer_print_str(p, data_str);
        p = isl_printer_print_str(p, " >> ");
        p = isl_printer_print_int(p, elem_bits);
        p = isl_printer_print_str(p, ";");
        p = isl_printer_end_line(p);
      }
    }

    /* } */
    p = isl_printer_indent(p, -2);
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "}");
    p = isl_printer_end_line(p);
  } else if (hls->target == INTEL_HW && nxt_pack > 1) {
    int pieces = pack / nxt_pack;

    for (int piece = 0; piece < pieces; piece++) {
      /* data_split[i].data = [data].data.s[components]; */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "data_split[");
      p = isl_printer_print_int(p, piece);
      /* nxt_pack > 1 holds in this branch, so ".data" is always needed. */
      p = isl_printer_print_str(p, "].data = ");
      p = isl_printer_print_str(p, data_str);
      p = isl_printer_print_str(p, ".data.s");
      for (int lane = 0; lane < nxt_pack; lane++)
        p = isl_printer_print_str(p, vector_index[lane + piece * nxt_pack]);
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);
    }
  } else if (hls->target == CATAPULT_HW) {
    int pieces = pack / nxt_pack;

    for (int piece = 0; piece < pieces; piece++) {
      if (!sparse) {
        /* data_split[i] = [data].slc<[bits]>([offset]); */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "data_split[");
        p = isl_printer_print_int(p, piece);
        p = isl_printer_print_str(p, "] = ");
        p = isl_printer_print_str(p, data_str);
        p = isl_printer_print_str(p, ".slc<");
        p = isl_printer_print_int(p, elem_bits);
        p = isl_printer_print_str(p, ">(");
        p = isl_printer_print_int(p, piece * elem_bits);
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);
        continue;
      }

      /* data_split[i].set_slc(0, [data]_i.slc<[bits]>([offset])); */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "data_split[");
      p = isl_printer_print_int(p, piece);
      p = isl_printer_print_str(p, "].set_slc(0, ");
      p = isl_printer_print_str(p, data_str);
      p = isl_printer_print_str(p, "_i.slc<");
      p = isl_printer_print_int(p, idx_bits);
      p = isl_printer_print_str(p, ">(");
      p = isl_printer_print_int(p, piece * idx_bits);
      p = isl_printer_print_str(p, "));");
      p = isl_printer_end_line(p);

      /* data_split[i].set_slc([idx bits], [data]_d.slc<[bits]>([offset])); */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "data_split[");
      p = isl_printer_print_int(p, piece);
      p = isl_printer_print_str(p, "].set_slc(");
      p = isl_printer_print_int(p, idx_bits);
      p = isl_printer_print_str(p, ", ");
      p = isl_printer_print_str(p, data_str);
      p = isl_printer_print_str(p, "_d.slc<");
      p = isl_printer_print_int(p, elem_bits * nzero_cnt);
      p = isl_printer_print_str(p, ">(");
      p = isl_printer_print_int(p, piece * elem_bits * nzero_cnt);
      p = isl_printer_print_str(p, "));");
      p = isl_printer_end_line(p);
    }
  }

  return p;
}

/* read_data_split: out_data = data_split[split_idx];
 *
 * Print the statement that loads "out_data" from the staging array.
 *
 * Sparse arrays always read "data_split[split_idx]" directly.
 * For dense arrays the source depends on the target:
 * - Xilinx with nxt_n_lane == 1: the element is first copied into the
 *   integer view of a local union {unsigned int ui; <type> ut;} and
 *   "out_data" is assigned from the typed view "u.ut";
 * - Intel with nxt_n_lane <= 1: "in_data.data[split_idx]";
 * - Xilinx/Intel otherwise, Catapult and TAPA: "data_split[split_idx]";
 * - any other target: the right-hand side is left empty.
 *
 * "iterator_prefix" is not used by this function.
 */
static __isl_give isl_printer *io_transfer_read_data_split(
  __isl_take isl_printer *p, struct autosa_kernel_stmt *stmt,
  struct hls_info *hls, const char *iterator_prefix)
{
  struct autosa_array_ref_group *ref_group = stmt->u.i.group;
  int nxt_pack = stmt->u.i.nxt_data_pack;
  int xilinx_scalar = (hls->target == XILINX_HW && nxt_pack == 1);
  const char *rhs = "";

  if (ref_group->local_array->is_sparse) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "out_data = data_split[split_idx];");
    p = isl_printer_end_line(p);
    return p;
  }

  /* Select the right-hand side of the assignment. */
  if (hls->target == XILINX_HW)
    rhs = xilinx_scalar ? "u.ut" : "data_split[split_idx]";
  else if (hls->target == INTEL_HW)
    rhs = (nxt_pack > 1) ? "data_split[split_idx]" : "in_data.data[split_idx]";
  else if (hls->target == CATAPULT_HW || hls->target == TAPA_HW)
    rhs = "data_split[split_idx]";

  if (xilinx_scalar) {
    /* union {unsigned int ui; [type] ut;} u; */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "union {unsigned int ui; ");
    p = isl_printer_print_str(p, ref_group->array->type);
    p = isl_printer_print_str(p, " ut;} u;");
    p = isl_printer_end_line(p);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "u.ui = (unsigned int)data_split[split_idx];");
    p = isl_printer_end_line(p);
  }

  /* out_data = [rhs]; */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "out_data = ");
  p = isl_printer_print_str(p, rhs);
  p = isl_printer_print_str(p, ";");
  p = isl_printer_end_line(p);

  return p;
}

/* Print the name of the coalesce loop iterator of "stmt":
 * "iterator_prefix" (or "c" if it is NULL) followed by the coalesce depth.
 */
static __isl_give isl_printer *io_transfer_print_coalesce_iterator(
  __isl_take isl_printer *p, struct autosa_kernel_stmt *stmt,
  const char *iterator_prefix)
{
  p = isl_printer_print_str(p, iterator_prefix != NULL ? iterator_prefix : "c");
  p = isl_printer_print_int(p, stmt->u.i.coalesce_depth);
  return p;
}

/* Print the local buffer access "local_index_packed" of "stmt".
 *
 * - Double-buffered module with double_buffer_style == 0: the first argument
 *   of the access, "[arb]" (in statement) or "[!arb]" (out statement), then
 *   the remaining arguments as "[idx]".
 * - Catapult target and filter module: the first argument, "_tmp.data", then
 *   the remaining arguments as "[idx]".
 * - Otherwise the access expression is printed unchanged.
 */
static __isl_give isl_printer *io_transfer_print_local_buf_access(
  __isl_take isl_printer *p, struct autosa_kernel_stmt *stmt,
  struct hls_info *hls, isl_ast_expr *local_index_packed)
{
  struct autosa_hw_module *module = stmt->u.i.module;
  const char *infix = NULL;
  isl_ast_expr *arg;
  int n_arg;

  if (module->double_buffer &&
      module->options->autosa->double_buffer_style == 0)
    infix = stmt->u.i.in ? "[arb]" : "[!arb]";
  else if (hls->target == CATAPULT_HW && module->is_filter)
    infix = "_tmp.data";

  if (infix == NULL)
    return isl_printer_print_ast_expr(p, local_index_packed);

  arg = isl_ast_expr_op_get_arg(local_index_packed, 0);
  p = isl_printer_print_ast_expr(p, arg);
  isl_ast_expr_free(arg);
  p = isl_printer_print_str(p, infix);

  n_arg = isl_ast_expr_op_get_n_arg(local_index_packed);
  for (int n = 1; n < n_arg; n++) {
    arg = isl_ast_expr_op_get_arg(local_index_packed, n);
    p = isl_printer_print_str(p, "[");
    p = isl_printer_print_ast_expr(p, arg);
    p = isl_printer_print_str(p, "]");
    isl_ast_expr_free(arg);
  }

  return p;
}

/* Print the target-specific FIFO access prefix for "fifo_name".
 * "read" is passed through to the per-target printer (1 for a read,
 * 0 for a write). Nothing is printed for an unknown target.
 */
static __isl_give isl_printer *io_transfer_print_fifo_rw(
  __isl_take isl_printer *p, struct hls_info *hls, char *fifo_name, int read)
{
  if (hls->target == XILINX_HW)
    return print_fifo_rw_xilinx(p, fifo_name, read);
  if (hls->target == TAPA_HW)
    return print_fifo_rw_tapa(p, fifo_name, read);
  if (hls->target == INTEL_HW)
    return print_fifo_rw_intel(p, fifo_name, read);
  if (hls->target == CATAPULT_HW)
    return print_fifo_rw_catapult(p, fifo_name, read);
  return p;
}

/* Print the body of an I/O transfer statement.
 *
 * "in" and "out" select where the data is read from and written to
 * (GLOBAL_BUF, LOCAL_BUF or FIFO). "in_fifo_suffix"/"out_fifo_suffix" are
 * appended to the statement's FIFO names and are only read when the
 * corresponding side is FIFO. "iterator_prefix" is the prefix of the loop
 * iterator names ("c" if NULL).
 *
 * The generated block has the following shape:
 *
 *   {
 *     [type_in] in_data;
 *     [type_out] out_data;
 *     [type_nxt] data_split[n_lane / nxt_n_lane];   (only if pack factors differ)
 *     [if (<iter> % k == 0) {]                      (buffer input, repacking)
 *       in_data = <global buf | local buf | fifo read>;
 *       <split in_data into data_split>             (FIFO output only)
 *     [}]
 *     int split_idx = (...) % k;
 *     <merge/split through data_split>
 *     [if (<iter> % k == k - 1 || <iter> == bound - 1) {]  (buffer output, repacking)
 *       <global buf | local buf | fifo write> out_data
 *     [}]
 *   }
 *
 * When the two pack factors are equal, the middle part is just
 * "out_data = in_data;" (or the default reduction).
 *
 * Side effect: for boundary statements, stmt->u.i.filter_sched_depth is set
 * to -1 so that the filter condition is ignored.
 */
static __isl_give isl_printer *autosa_kernel_print_io_transfer(
  __isl_take isl_printer *p, struct autosa_kernel_stmt *stmt,
  struct hls_info *hls, const char *iterator_prefix,
  char *in_fifo_suffix, char *out_fifo_suffix,
  enum IO_TRANS_DIR in, enum IO_TRANS_DIR out)
{
  struct autosa_array_ref_group *group = stmt->u.i.group;
  struct autosa_hw_module *module = stmt->u.i.module;
  int n_lane = stmt->u.i.data_pack;
  int nxt_n_lane = stmt->u.i.nxt_data_pack;
  isl_ctx *ctx = isl_printer_get_ctx(p);
  isl_ast_expr *local_index_packed = isl_ast_expr_copy(stmt->u.i.local_index);
  int is_sparse = group->local_array->is_sparse;
  int vec_len = stmt->u.i.local_array->vec_len;
  /* Data has to go through data_split when the pack factors differ. */
  int repack = (n_lane != nxt_n_lane);
  int n_split = repack ? n_lane / nxt_n_lane : 1;
  /* Guards are only printed when repacking inside a coalesced loop. */
  int coalesced = repack && stmt->u.i.coalesce_depth >= 0;
  int guard_in = coalesced && (in == GLOBAL_BUF || in == LOCAL_BUF);
  int guard_out = coalesced && (out == GLOBAL_BUF || out == LOCAL_BUF);
  int in_n_lane, out_n_lane;

  /* If the statement is a boundary statement,
   * then ignore the filter condition by setting filter_sched_depth as -1
   */
  if (stmt->u.i.boundary)
    stmt->u.i.filter_sched_depth = -1;

  /* Pre-process the local index: divide the last index by the number of
   * elements stored in one packed word.
   */
  if (is_sparse || n_lane > 1) {
    int last = isl_ast_expr_get_op_n_arg(local_index_packed) - 1;
    int factor = is_sparse ? vec_len * n_lane : n_lane;
    isl_ast_expr *arg, *div;

    arg = isl_ast_expr_get_op_arg(local_index_packed, last);
    div = isl_ast_expr_from_val(isl_val_int_from_si(ctx, factor));
    arg = isl_ast_expr_div(arg, div);
    local_index_packed = isl_ast_expr_set_op_arg(local_index_packed, last, arg);
  }

  p = ppcg_start_block(p);

  /* Input modules narrow the data (n_lane -> nxt_n_lane); output modules
   * widen it (nxt_n_lane -> n_lane).
   */
  if (module->in) {
    in_n_lane = n_lane;
    out_n_lane = nxt_n_lane;
  } else {
    in_n_lane = nxt_n_lane;
    out_n_lane = n_lane;
  }

  /* [type_in] in_data; */
  p = isl_printer_start_line(p);
  if (is_sparse) {
    p = autosa_print_array_type_with_lane_sparse(p, group->array, in_n_lane);
  } else {
    p = isl_printer_print_str(p, stmt->u.i.array->name);
    p = isl_printer_print_str(p, "_t");
    p = isl_printer_print_int(p, in_n_lane);
  }
  p = isl_printer_print_str(p, " in_data;");
  p = isl_printer_end_line(p);

  /* [type_out] out_data; */
  p = isl_printer_start_line(p);
  if (is_sparse) {
    p = autosa_print_array_type_with_lane_sparse(p, group->array, out_n_lane);
  } else {
    p = isl_printer_print_str(p, stmt->u.i.array->name);
    p = isl_printer_print_str(p, "_t");
    p = isl_printer_print_int(p, out_n_lane);
  }
  p = isl_printer_print_str(p, " out_data;");
  p = isl_printer_end_line(p);

  /* [type_nxt_n_lane] data_split[]; */
  if (repack &&
      (hls->target == XILINX_HW ||
       hls->target == CATAPULT_HW ||
       hls->target == TAPA_HW ||
       (hls->target == INTEL_HW && nxt_n_lane > 1))) {
    p = isl_printer_start_line(p);
    if (is_sparse) {
      p = autosa_print_array_type_with_lane_sparse(p, group->array, nxt_n_lane);
    } else if (nxt_n_lane == 1 && hls->target == XILINX_HW) {
      p = isl_printer_print_str(p, "ap_uint<");
      p = isl_printer_print_int(p, group->array->size * 8);
      p = isl_printer_print_str(p, ">");
    } else if (nxt_n_lane == 1 &&
               (hls->target == TAPA_HW || hls->target == INTEL_HW)) {
      p = isl_printer_print_str(p, group->array->type);
    } else {
      /* Packed element type; also used for scalar elements on Catapult. */
      p = isl_printer_print_str(p, group->array->name);
      p = isl_printer_print_str(p, "_t");
      p = isl_printer_print_int(p, nxt_n_lane);
    }
    p = isl_printer_print_str(p, " data_split[");
    p = isl_printer_print_int(p, n_split);
    p = isl_printer_print_str(p, "];");
    p = isl_printer_end_line(p);

    if (hls->target == XILINX_HW || hls->target == TAPA_HW) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "#pragma HLS ARRAY_PARTITION variable=data_split complete");
      p = isl_printer_end_line(p);
    }
  }

  /* Input guard: if (cx % xx == 0) { */
  if (guard_in) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "if (");
    p = io_transfer_print_coalesce_iterator(p, stmt, iterator_prefix);
    p = isl_printer_print_str(p, " % ");
    p = isl_printer_print_int(p, n_split);
    p = isl_printer_print_str(p, " == 0) {");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, 2);
  }

  /* Read in data */
  if (in == GLOBAL_BUF) {
    /* in_data = global_buf[]; */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "in_data = ");
    p = io_stmt_print_global_index(p, stmt, stmt->u.i.serialize);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
  } else if (in == LOCAL_BUF) {
    /* in_data = local_buf[]; */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "in_data = ");
    p = io_transfer_print_local_buf_access(p, stmt, hls, local_index_packed);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
  } else if (in == FIFO) {
    char *fifo_in_name = concat(ctx, stmt->u.i.in_fifo_name, in_fifo_suffix);

    /* in_data = fifo_in.read(); */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "in_data = ");
    p = io_transfer_print_fifo_rw(p, hls, fifo_in_name, 1);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);

    free(fifo_in_name);
  }

  /* Re-pack data in the middle. */
  if (!repack) {
    if (stmt->u.i.reduce) {
      p = autosa_print_reduce_default(p, stmt, n_lane, local_index_packed, group);
    } else {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "out_data = in_data;");
      p = isl_printer_end_line(p);
    }
  } else {
    if (out == FIFO) {
      /* write_data_split: data_split[] = in_data... */
      p = io_transfer_write_data_split(p, stmt, hls, iterator_prefix, "in_data");
    }

    /* Close the input guard. */
    if (guard_in) {
      p = isl_printer_indent(p, -2);
      p = print_str_new_line(p, "}");
    }

    /* calculate_split_idx: split_idx = ... */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "int split_idx = (");
    p = io_stmt_print_index_last_dim(
          p, stmt, stmt->u.i.serialize,
          (in == GLOBAL_BUF || out == GLOBAL_BUF) ? 1 : 0,
          n_lane, nxt_n_lane, is_sparse, vec_len);
    p = isl_printer_print_str(p, ") % ");
    p = isl_printer_print_int(p, n_split);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);

    if (out == GLOBAL_BUF) {
      /* update_data_split: data_split[split_i] = in_data; */
      p = io_transfer_update_data_split(p, stmt, hls, iterator_prefix);

      /* pack_out_data: out_data = (data_split[], ...); */
      p = io_transfer_pack_out_data(p, stmt, hls, iterator_prefix);
    } else if (out == LOCAL_BUF) {
      /* read_local_buf: out_data = local_buf[...]; */
      p = io_transfer_read_local_buf(p, stmt, hls, iterator_prefix, local_index_packed);

      /* parse_sparse_data */
      if (is_sparse)
        p = io_transfer_parse_sparse_data(p, stmt, hls, iterator_prefix);

      /* write_data_split: data_split[] = out_data... */
      p = io_transfer_write_data_split(p, stmt, hls, iterator_prefix, "out_data");

      /* update_data_split: data_split[split_i] = in_data; */
      p = io_transfer_update_data_split(p, stmt, hls, iterator_prefix);

      /* pack_out_data: out_data = (data_split[], ...) */
      p = io_transfer_pack_out_data(p, stmt, hls, iterator_prefix);
    } else if (out == FIFO) {
      /* read_data_split: out_data = data_split[split_i]; */
      p = io_transfer_read_data_split(p, stmt, hls, iterator_prefix);
    }
  }

  /* Output guard: if (cx % xx == xx - 1 || cx == bound - 1) {
   * Both operands use the same iterator name; the second one used to be
   * hard-coded to the "c" prefix, ignoring "iterator_prefix".
   */
  if (guard_out) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "if (");
    p = io_transfer_print_coalesce_iterator(p, stmt, iterator_prefix);
    p = isl_printer_print_str(p, " % ");
    p = isl_printer_print_int(p, n_split);
    p = isl_printer_print_str(p, " == ");
    p = isl_printer_print_int(p, n_split);
    p = isl_printer_print_str(p, " - 1 || ");
    p = io_transfer_print_coalesce_iterator(p, stmt, iterator_prefix);
    p = isl_printer_print_str(p, " == ");
    p = isl_printer_print_int(p, stmt->u.i.coalesce_bound - 1);
    p = isl_printer_print_str(p, ") {");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, 2);
  }

  /* Write out data. */
  if (out == GLOBAL_BUF) {
    /* global_buf[] = out_data; */
    p = isl_printer_start_line(p);
    p = io_stmt_print_global_index(p, stmt, stmt->u.i.serialize);
    p = isl_printer_print_str(p, " = out_data;");
    p = isl_printer_end_line(p);
  } else if (out == LOCAL_BUF) {
    /* local_buf[] = out_data; */
    p = isl_printer_start_line(p);
    p = io_transfer_print_local_buf_access(p, stmt, hls, local_index_packed);
    p = isl_printer_print_str(p, " = out_data;");
    p = isl_printer_end_line(p);
  } else if (out == FIFO) {
    char *fifo_out_name = concat(ctx, stmt->u.i.out_fifo_name, out_fifo_suffix);

    /* fifo_out.write(out_data); */
    p = isl_printer_start_line(p);
    p = io_transfer_print_fifo_rw(p, hls, fifo_out_name, 0);
    p = isl_printer_print_str(p, "out_data);");
    p = isl_printer_end_line(p);

    free(fifo_out_name);
  }

  /* Close the output guard. */
  if (guard_out) {
    p = isl_printer_indent(p, -2);
    p = print_str_new_line(p, "}");
  }

  p = ppcg_end_block(p);

  isl_ast_expr_free(local_index_packed);

  return p;
}

/* This function extracts the necessary information for generating I/O transfer statements and
 * calls the final function to generate the statements.
 *
 * The data source ("in") and sink ("out") are derived from the statement:
 *
 * - The side facing the local module is LOCAL_BUF if the statement uses a
 *   buffer (stmt->u.i.buf) and FIFO otherwise.
 * - The other side is GLOBAL_BUF for DRAM statements of modules that are not
 *   serialized, and FIFO in every other case.
 * - For "in" statements data flows from the other side to the local side;
 *   for "out" statements it flows the opposite way.
 *
 * Input FIFOs use the suffix "in" and output FIFOs the suffix "out".
 *
 * Statements that are neither AUTOSA_KERNEL_STMT_IO_DRAM nor
 * AUTOSA_KERNEL_STMT_IO_TRANSFER have no defined direction; "p" is returned
 * unchanged for them instead of printing with indeterminate directions.
 */
static __isl_give isl_printer *autosa_kernel_print_io_transfer_wrapper(
  __isl_take isl_printer *p, struct autosa_kernel_stmt *stmt,
  struct hls_info *hls, const char *iterator_prefix
) {
  /* The suffixes are only read by the callee when the matching side is a
   * FIFO, so they can be set up unconditionally.
   */
  char in_fifo_suffix[] = "in";
  char out_fifo_suffix[] = "out";
  struct autosa_hw_module *module = stmt->u.i.module;
  int is_dram = (stmt->type == AUTOSA_KERNEL_STMT_IO_DRAM);
  enum IO_TRANS_DIR outer, inner, in, out;

  if (!is_dram && stmt->type != AUTOSA_KERNEL_STMT_IO_TRANSFER)
    return p;

  /* Side facing global memory or the neighboring module. */
  outer = (is_dram && !module->is_serialized) ? GLOBAL_BUF : FIFO;
  /* Side facing this module's local buffer or downstream FIFO. */
  inner = stmt->u.i.buf ? LOCAL_BUF : FIFO;

  if (stmt->u.i.in) {
    in = outer;
    out = inner;
  } else {
    in = inner;
    out = outer;
  }

  p = autosa_kernel_print_io_transfer(
    p, stmt, hls, iterator_prefix, in_fifo_suffix, out_fifo_suffix, in, out);

  return p;
}

/* Print the reference to the local buffer element addressed by "index".
 *
 * If the module is double buffered with double_buffer_style 0, the buffer
 * selector "arb" (e.g. "[arb]" or "[!arb]") is inserted between the array
 * name (argument 0 of "index") and the bracketed subscripts.
 * Otherwise, for a Catapult filter module, "_tmp.data" is inserted at the
 * same position.
 * In all other cases "index" is printed as is.
 */
static __isl_give isl_printer *io_data_pack_print_local_buf(
  __isl_take isl_printer *p, struct autosa_kernel_stmt *stmt,
  struct hls_info *hls, __isl_keep isl_ast_expr *index, const char *arb)
{
  const char *infix;
  isl_ast_expr *op;

  if (stmt->u.i.module->double_buffer &&
      stmt->u.i.module->options->autosa->double_buffer_style == 0)
    infix = arb;
  else if (hls->target == CATAPULT_HW && stmt->u.i.module->is_filter)
    infix = "_tmp.data";
  else
    return isl_printer_print_ast_expr(p, index);

  op = isl_ast_expr_op_get_arg(index, 0);
  p = isl_printer_print_ast_expr(p, op);
  isl_ast_expr_free(op);
  p = isl_printer_print_str(p, infix);
  for (int n = 1; n < isl_ast_expr_op_get_n_arg(index); n++) {
    op = isl_ast_expr_op_get_arg(index, n);
    p = isl_printer_print_str(p, "[");
    p = isl_printer_print_ast_expr(p, op);
    p = isl_printer_print_str(p, "]");
    isl_ast_expr_free(op);
  }

  return p;
}

/* Dispatch to the FIFO access printer of the current target.
 * "read" is forwarded unchanged (1 for the read form, 0 for the write form).
 * Nothing is printed for a target without a FIFO printer.
 */
static __isl_give isl_printer *io_data_pack_print_fifo_rw(
  __isl_take isl_printer *p, struct hls_info *hls, char *fifo_name, int read)
{
  if (hls->target == XILINX_HW)
    return print_fifo_rw_xilinx(p, fifo_name, read);
  if (hls->target == TAPA_HW)
    return print_fifo_rw_tapa(p, fifo_name, read);
  if (hls->target == INTEL_HW)
    return print_fifo_rw_intel(p, fifo_name, read);
  if (hls->target == CATAPULT_HW)
    return print_fifo_rw_catapult(p, fifo_name, read);
  return p;
}

/* Print one complete line of the form "<before><value><after>". */
static __isl_give isl_printer *io_data_pack_print_int_line(
  __isl_take isl_printer *p, const char *before, int value, const char *after)
{
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, before);
  p = isl_printer_print_int(p, value);
  p = isl_printer_print_str(p, after);
  p = isl_printer_end_line(p);
  return p;
}

/* Print an I/O transfer statement that moves data between a local buffer
 * holding words of "n_lane" elements and a FIFO carrying words of
 * "nxt_n_lane" elements (is_filter = 0, is_buf = 1).
 *
 * An in I/O statement is printed as
 *
 *  [type] fifo_data;
 *  [type2] buf_data;
 *  [type] buf_data_split[n_lane / nxt_n_lane];
 *  if (c % (n_lane / nxt_n_lane) == 0) {     (only if coalesce_depth >= 0)
 *    buf_data = local_buf[...];
 *    buf_data_split[n] = <n-th slice of buf_data>;
 *  }
 *  int split_i = (...) % (n_lane / nxt_n_lane);
 *  fifo_data = fifo.read();
 *  buf_data_split[split_i] = Reinterpret<>(fifo_data);
 *  if (<last slice or last iteration>) {     (only if coalesce_depth >= 0)
 *    buf_data = (buf_data_split[...], ...);
 *    local_buf[...] = buf_data;
 *  }
 *
 * An out I/O statement is printed as
 *
 *  [type] fifo_data;
 *  [type2] buf_data;
 *  [type] buf_data_split[n_lane / nxt_n_lane];
 *  buf_data = local_buf[...];
 *  buf_data_split[n] = <n-th slice of buf_data>;
 *  int split_i = (...) % (n_lane / nxt_n_lane);
 *  fifo_data = Reinterpret<>(buf_data_split[split_i]);
 *  fifo.write(fifo_data);
 *
 * The exact spelling of each line depends on hls->target and on whether
 * the array is sparse. On Intel with nxt_n_lane == 1 no buf_data_split
 * array is used and buf_data.data[split_i] is accessed directly.
 *
 * "iterator_prefix" replaces the default iterator prefix "c" in the
 * coalescing guards when it is not NULL.
 * "global" and "buffer" are not used.
 */
static __isl_give isl_printer *autosa_kernel_print_io_transfer_data_pack(
  __isl_take isl_printer *p, struct autosa_kernel_stmt *stmt,
  struct autosa_array_ref_group *group, int n_lane, int nxt_n_lane,
  struct hls_info *hls, const char *iterator_prefix, int global, int buffer)
{
  isl_ctx *ctx = isl_printer_get_ctx(p);
  const char *iter = (iterator_prefix != NULL) ? iterator_prefix : "c";
  /* Number of FIFO words that make up one buffer word. */
  int n_split = n_lane / nxt_n_lane;
  /* Extract the sparse data */
  int is_sparse = group->local_array->is_sparse;
  int vec_len = stmt->u.i.local_array->vec_len;
  int n_nzero = stmt->u.i.local_array->n_nzero;
  /* Whether the generated code goes through a buf_data_split[] array. */
  int has_split = hls->target == XILINX_HW ||
                  hls->target == CATAPULT_HW ||
                  hls->target == TAPA_HW ||
                  (hls->target == INTEL_HW && nxt_n_lane > 1);
  /* Set once the "if (...) {" around the write-back has been printed. */
  int guard_open = 0;
  char *fifo_name;
  isl_ast_expr *local_index_packed, *op;
  isl_val *val;
  int n_arg;

  /* Modify the local index: the last subscript addresses packed words. */
  local_index_packed = isl_ast_expr_copy(stmt->u.i.local_index);
  if (is_sparse || n_lane > 1) {
    int pack = is_sparse ? vec_len * n_lane : n_lane;
    isl_ast_expr *arg, *divisor;

    n_arg = isl_ast_expr_op_get_n_arg(local_index_packed);
    arg = isl_ast_expr_op_get_arg(local_index_packed, n_arg - 1);
    divisor = isl_ast_expr_from_val(isl_val_int_from_si(ctx, pack));
    arg = isl_ast_expr_div(arg, divisor);
    local_index_packed = isl_ast_expr_set_op_arg(local_index_packed, n_arg - 1, arg);
  }

  /* [type] fifo_data; */
  p = isl_printer_start_line(p);
  if (is_sparse)
    p = autosa_print_array_type_with_lane_sparse(p, group->array, nxt_n_lane);
  else
    p = autosa_print_array_type_with_lane(p, group->array, nxt_n_lane);
  p = isl_printer_print_str(p, " fifo_data;");
  p = isl_printer_end_line(p);

  /* [type2] buf_data; */
  p = isl_printer_start_line(p);
  if (is_sparse) {
    p = autosa_print_array_type_with_lane_sparse(p, group->array, n_lane);
  } else {
    p = isl_printer_print_str(p, group->array->name);
    p = isl_printer_print_str(p, "_t");
    p = isl_printer_print_int(p, n_lane);
  }
  p = isl_printer_print_str(p, " buf_data;");
  p = isl_printer_end_line(p);

  /* [type] buf_data_split[]; */
  if (has_split) {
    p = isl_printer_start_line(p);
    if (is_sparse) {
      p = autosa_print_array_type_with_lane_sparse(p, group->array, nxt_n_lane);
    } else if (nxt_n_lane == 1 && hls->target == XILINX_HW) {
      /* Scalars are kept as raw bits so that they can be sliced. */
      p = isl_printer_print_str(p, "ap_uint<");
      p = isl_printer_print_int(p, group->array->size * 8);
      p = isl_printer_print_str(p, ">");
    } else if (nxt_n_lane == 1 &&
               (hls->target == TAPA_HW || hls->target == INTEL_HW)) {
      p = isl_printer_print_str(p, group->array->type);
    } else {
      p = isl_printer_print_str(p, group->array->name);
      p = isl_printer_print_str(p, "_t");
      p = isl_printer_print_int(p, nxt_n_lane);
    }
    p = isl_printer_print_str(p, " buf_data_split[");
    p = isl_printer_print_int(p, n_split);
    p = isl_printer_print_str(p, "];");
    p = isl_printer_end_line(p);
    if (hls->target == XILINX_HW || hls->target == TAPA_HW) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "#pragma HLS ARRAY_PARTITION variable=buf_data_split complete");
      p = isl_printer_end_line(p);
    }
  }

  /* An in statement only reloads the buffer word at the first slice. */
  if (stmt->u.i.in && stmt->u.i.coalesce_depth >= 0) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "if (");
    p = isl_printer_print_str(p, iter);
    p = isl_printer_print_int(p, stmt->u.i.coalesce_depth);
    p = isl_printer_print_str(p, " % ");
    p = isl_printer_print_int(p, n_split);
    p = isl_printer_print_str(p, " == 0) {");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, 2);
  }

  /* buf_data = local[]; */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "buf_data = ");
  p = io_data_pack_print_local_buf(p, stmt, hls, local_index_packed,
                                   stmt->u.i.in ? "[arb]" : "[!arb]");
  p = isl_printer_print_str(p, ";");
  p = isl_printer_end_line(p);

  if (is_sparse) {
    /* [type] buf_data_d = buf_data.d; */
    p = isl_printer_start_line(p);
    p = autosa_print_array_type_with_lane(p, group->array, n_lane * n_nzero);
    p = isl_printer_print_str(p, " buf_data_d = buf_data.d;");
    p = isl_printer_end_line(p);

    /* [type] buf_data_i = buf_data.i; */
    p = isl_printer_start_line(p);
    if (hls->target == XILINX_HW || hls->target == TAPA_HW) {
      p = isl_printer_print_str(p, "ap_uint<");
      p = isl_printer_print_int(p, 8 * n_lane);
    } else if (hls->target == CATAPULT_HW) {
      p = isl_printer_print_str(p, "ac_int<");
      p = isl_printer_print_int(p, 8 * n_lane);
      p = isl_printer_print_str(p, ", false");
    }
    p = isl_printer_print_str(p, "> buf_data_i = buf_data.i;");
    p = isl_printer_end_line(p);
  }

  /* Unpack buf_data into buf_data_split[]. */
  if (hls->target == XILINX_HW || hls->target == TAPA_HW) {
    p = io_data_pack_print_int_line(p, "for (int n = 0; n < ", n_split, "; n++) {");
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "#pragma HLS UNROLL");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, 2);

    if (hls->target == XILINX_HW) {
      if (is_sparse) {
        /* buf_data_split[n] = {buf_data_d(), ...} */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "buf_data_split[n] = (");
        p = isl_printer_print_str(p, group->array->name);
        p = isl_printer_print_str(p, "_s_t");
        p = isl_printer_print_int(p, nxt_n_lane);
        p = isl_printer_print_str(p, "){buf_data_d(");
        p = isl_printer_print_int(p, group->array->size * 8 * nxt_n_lane * n_nzero - 1);
        p = isl_printer_print_str(p, ", 0), buf_data_i(");
        p = isl_printer_print_int(p, 8 * nxt_n_lane - 1);
        p = isl_printer_print_str(p, ", 0)};");
        p = isl_printer_end_line(p);

        p = io_data_pack_print_int_line(p, "buf_data_d = buf_data_d >> ",
              group->array->size * 8 * nxt_n_lane * n_nzero, ";");
        p = io_data_pack_print_int_line(p, "buf_data_i = buf_data_i >> ",
              8 * nxt_n_lane, ";");
      } else {
        p = io_data_pack_print_int_line(p, "buf_data_split[n] = buf_data(",
              group->array->size * 8 * nxt_n_lane - 1, ", 0);");
        p = io_data_pack_print_int_line(p, "buf_data = buf_data >> ",
              group->array->size * 8 * nxt_n_lane, ";");
      }
    } else {
      /* TAPA */
      const char *src = is_sparse ? "buf_data_d" : "buf_data";

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "buf_data_split[n] = ");
      if (is_sparse) {
        /* buf_data_split[n] = {buf_data_d(), ...} */
        p = isl_printer_print_str(p, "(");
        p = isl_printer_print_str(p, group->array->name);
        p = isl_printer_print_str(p, "_s_t");
        p = isl_printer_print_int(p, nxt_n_lane);
        p = isl_printer_print_str(p, "){");
      }
      if (nxt_n_lane == 1) {
        p = isl_printer_print_str(p, src);
        p = isl_printer_print_str(p, "[n]");
      } else {
        p = isl_printer_print_str(p, "tapa::truncated<");
        p = isl_printer_print_int(p, nxt_n_lane);
        p = isl_printer_print_str(p, ">(");
        p = isl_printer_print_str(p, src);
        p = isl_printer_print_str(p, ", ");
        p = isl_printer_print_int(p, nxt_n_lane);
        p = isl_printer_print_str(p, "* n)");
      }
      if (is_sparse) {
        p = isl_printer_print_str(p, ", buf_data_i(");
        p = isl_printer_print_int(p, 8 * nxt_n_lane - 1);
        p = isl_printer_print_str(p, ", 0)};");
      } else {
        /* The statement has to be terminated in the dense case as well. */
        p = isl_printer_print_str(p, ";");
      }
      p = isl_printer_end_line(p);

      if (is_sparse)
        p = io_data_pack_print_int_line(p, "buf_data_i = buf_data_i >> ",
              8 * nxt_n_lane, ";");
    }

    p = isl_printer_indent(p, -2);
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "}");
    p = isl_printer_end_line(p);
  } else if (hls->target == INTEL_HW && nxt_n_lane > 1) {
    /* One swizzle per slice, e.g. buf_data.data.s0123. */
    for (int i = 0; i < n_split; i++) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "buf_data_split[");
      p = isl_printer_print_int(p, i);
      p = isl_printer_print_str(p, "].data = buf_data.data.s");
      for (int j = 0; j < nxt_n_lane; j++)
        p = isl_printer_print_str(p, vector_index[j + i * nxt_n_lane]);
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);
    }
  } else if (hls->target == CATAPULT_HW) {
    for (int i = 0; i < n_split; i++) {
      if (is_sparse) {
        /* buf_data_split[].set_slc(0, buf_data_i.slc<>()); */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "buf_data_split[");
        p = isl_printer_print_int(p, i);
        p = isl_printer_print_str(p, "].set_slc(0, buf_data_i.slc<");
        p = isl_printer_print_int(p, 8 * nxt_n_lane);
        p = isl_printer_print_str(p, ">(");
        p = isl_printer_print_int(p, i * 8 * nxt_n_lane);
        p = isl_printer_print_str(p, "));");
        p = isl_printer_end_line(p);

        /* buf_data_split[].set_slc(xx, buf_data_d.slc<>()); */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "buf_data_split[");
        p = isl_printer_print_int(p, i);
        p = isl_printer_print_str(p, "].set_slc(");
        p = isl_printer_print_int(p, 8 * nxt_n_lane);
        p = isl_printer_print_str(p, ", buf_data_d.slc<");
        p = isl_printer_print_int(p, group->array->size * 8 * nxt_n_lane * n_nzero);
        p = isl_printer_print_str(p, ">(");
        p = isl_printer_print_int(p, i * group->array->size * 8 * nxt_n_lane * n_nzero);
        p = isl_printer_print_str(p, "));");
        p = isl_printer_end_line(p);
      } else {
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "buf_data_split[");
        p = isl_printer_print_int(p, i);
        p = isl_printer_print_str(p, "] = buf_data.slc<");
        p = isl_printer_print_int(p, group->array->size * 8 * nxt_n_lane);
        p = isl_printer_print_str(p, ">(");
        p = isl_printer_print_int(p, i * group->array->size * 8 * nxt_n_lane);
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);
      }
    }
  }

  if (stmt->u.i.in && stmt->u.i.coalesce_depth >= 0) {
    p = isl_printer_indent(p, -2);
    p = print_str_new_line(p, "}");
  }

  /* split_i = ... : position of the current FIFO word in the buffer word. */
  n_arg = isl_ast_expr_op_get_n_arg(stmt->u.i.local_index);
  op = isl_ast_expr_op_get_arg(stmt->u.i.local_index, n_arg - 1);
  if (is_sparse)
    val = isl_val_int_from_si(ctx, vec_len * nxt_n_lane);
  else
    val = isl_val_int_from_si(ctx, nxt_n_lane);
  op = isl_ast_expr_div(op, isl_ast_expr_from_val(val));
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "int split_i = (");
  p = isl_printer_print_ast_expr(p, op);
  p = isl_printer_print_str(p, ") % ");
  p = isl_printer_print_int(p, n_split);
  p = isl_printer_print_str(p, ";");
  p = isl_printer_end_line(p);
  isl_ast_expr_free(op);

  if (stmt->u.i.in) {
    fifo_name = concat(ctx, stmt->u.i.in_fifo_name, "in");

    /* fifo_data = fifo.read(); */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "fifo_data = ");
    p = io_data_pack_print_fifo_rw(p, hls, fifo_name, 1);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);

    if (has_split) {
      /* buf_data_split[...] = Reinterpret<>(fifo_data); */
      if (stmt->u.i.reduce) {
        p = autosa_print_reduce_data_pack(p, stmt, nxt_n_lane, n_lane, group, hls->target); // TODO
      } else {
        int raw_bits = (hls->target == XILINX_HW && nxt_n_lane == 1);

        if (raw_bits) {
          /* Reinterpret the scalar as raw bits through a union. */
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "union {unsigned int ui; ");
          p = isl_printer_print_str(p, group->array->type);
          p = isl_printer_print_str(p, " ut;} u;");
          p = isl_printer_end_line(p);

          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "u.ut = fifo_data;");
          p = isl_printer_end_line(p);
        }

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "buf_data_split[split_i] = ");
        if (raw_bits) {
          p = isl_printer_print_str(p, "ap_uint<");
          p = isl_printer_print_int(p, group->array->size * 8);
          p = isl_printer_print_str(p, ">(u.ui);");
        } else {
          p = isl_printer_print_str(p, "fifo_data;");
        }
        p = isl_printer_end_line(p);
      }

      /* Write back only once the buffer word is complete, or at the very
       * last iteration of the coalesced loop.
       */
      if (stmt->u.i.coalesce_depth >= 0) {
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "if (");
        p = isl_printer_print_str(p, iter);
        p = isl_printer_print_int(p, stmt->u.i.coalesce_depth);
        p = isl_printer_print_str(p, " % ");
        p = isl_printer_print_int(p, n_split);
        p = isl_printer_print_str(p, " == ");
        p = isl_printer_print_int(p, n_split);
        p = isl_printer_print_str(p, " - 1 || ");
        /* Both operands test the same loop iterator. */
        p = isl_printer_print_str(p, iter);
        p = isl_printer_print_int(p, stmt->u.i.coalesce_depth);
        p = isl_printer_print_str(p, " == ");
        p = isl_printer_print_int(p, stmt->u.i.coalesce_bound - 1);
        p = isl_printer_print_str(p, ") {");
        p = isl_printer_end_line(p);
        p = isl_printer_indent(p, 2);
        guard_open = 1;
      }
    }

    /* buf_data = (buf_data_split[1], ...); */
    if (hls->target == XILINX_HW) {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "buf_data = (");
      /* Concatenation lists the most significant slice first. */
      for (int i = n_split - 1; i >= 0; i--) {
        if (i != n_split - 1)
          p = isl_printer_print_str(p, ", ");
        p = isl_printer_print_str(p, "buf_data_split[");
        p = isl_printer_print_int(p, i);
        p = isl_printer_print_str(p, "]");
      }
      p = isl_printer_print_str(p, ");");
      p = isl_printer_end_line(p);
    } else if (hls->target == INTEL_HW) {
      p = isl_printer_start_line(p);
      if (nxt_n_lane == 1) {
        p = isl_printer_print_str(p, "buf_data.data[split_i] = fifo_data;");
      } else {
        p = isl_printer_print_str(p, "buf_data.data = (");
        p = isl_printer_print_str(p, group->array->type);
        p = isl_printer_print_int(p, n_lane);
        p = isl_printer_print_str(p, ")(");
        for (int i = 0; i < n_split; i++) {
          if (i != 0)
            p = isl_printer_print_str(p, ", ");
          p = isl_printer_print_str(p, "(");
          p = isl_printer_print_str(p, group->array->type);
          p = isl_printer_print_int(p, nxt_n_lane);
          p = isl_printer_print_str(p, ")buf_data_split[");
          p = isl_printer_print_int(p, i);
          p = isl_printer_print_str(p, "].data");
        }
        p = isl_printer_print_str(p, ");");
      }
      p = isl_printer_end_line(p);
    } else if (hls->target == CATAPULT_HW) {
      for (int i = 0; i < n_split; i++) {
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "buf_data.set_slc(");
        p = isl_printer_print_int(p, i * group->array->size * 8 * nxt_n_lane);
        p = isl_printer_print_str(p, ", buf_data_split[");
        p = isl_printer_print_int(p, i);
        p = isl_printer_print_str(p, "]);");
        p = isl_printer_end_line(p);
      }
    } else if (hls->target == TAPA_HW) {
      for (int i = 0; i < n_split; i++) {
        if (nxt_n_lane == 1) {
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "buf_data.set(");
          p = isl_printer_print_int(p, i);
          p = isl_printer_print_str(p, ", buf_data_split[");
          p = isl_printer_print_int(p, i);
          p = isl_printer_print_str(p, "]);");
          p = isl_printer_end_line(p);
          continue;
        }
        for (int j = 0; j < nxt_n_lane; j++) {
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "buf_data.set(");
          p = isl_printer_print_int(p, i * nxt_n_lane + j);
          p = isl_printer_print_str(p, ", buf_data_split[");
          p = isl_printer_print_int(p, i);
          p = isl_printer_print_str(p, "][");
          p = isl_printer_print_int(p, j);
          p = isl_printer_print_str(p, "]);");
          p = isl_printer_end_line(p);
        }
      }
    }

    /* local_buf[...] = buf_data; */
    p = isl_printer_start_line(p);
    p = io_data_pack_print_local_buf(p, stmt, hls, local_index_packed, "[arb]");
    p = isl_printer_print_str(p, " = buf_data;");
    p = isl_printer_end_line(p);

    /* Close the write-back guard only if it has actually been opened. */
    if (guard_open) {
      p = isl_printer_indent(p, -2);
      p = print_str_new_line(p, "}");
    }
    free(fifo_name);
  } else {
    int raw_bits = (!is_sparse && hls->target == XILINX_HW && nxt_n_lane == 1);

    fifo_name = concat(ctx, stmt->u.i.out_fifo_name, "out");

    if (raw_bits) {
      /* Turn the raw bits back into a scalar through a union. */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "union {unsigned int ui; ");
      p = isl_printer_print_str(p, group->array->type);
      p = isl_printer_print_str(p, " ut;} u;");
      p = isl_printer_end_line(p);
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "u.ui = (unsigned int)buf_data_split[split_i];");
      p = isl_printer_end_line(p);
    }

    /* fifo_data = Reinterpret<>(buf_data_split[...]); */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "fifo_data = ");
    if (is_sparse) {
      p = isl_printer_print_str(p, "buf_data_split[split_i]");
    } else if (hls->target == XILINX_HW) {
      p = isl_printer_print_str(p, raw_bits ? "u.ut" : "buf_data_split[split_i]");
    } else if (hls->target == INTEL_HW) {
      p = isl_printer_print_str(p, nxt_n_lane > 1 ? "buf_data_split[split_i]"
                                                  : "buf_data.data[split_i]");
    } else if (hls->target == CATAPULT_HW || hls->target == TAPA_HW) {
      p = isl_printer_print_str(p, "buf_data_split[split_i]");
    }
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);

    /* fifo.write(fifo_data); */
    p = isl_printer_start_line(p);
    p = io_data_pack_print_fifo_rw(p, hls, fifo_name, 0);
    p = isl_printer_print_str(p, "fifo_data);");
    p = isl_printer_end_line(p);
    free(fifo_name);
  }

  isl_ast_expr_free(local_index_packed);

  return p;
}

///* Print an I/O transfer statement.
// */
//__isl_give isl_printer *autosa_kernel_print_io_transfer(
//    __isl_take isl_printer *p,
//    struct autosa_kernel_stmt *stmt, struct hls_info *hls, const char *iterator_prefix)
//{
//  struct autosa_hw_module *module = stmt->u.i.module;
//  struct autosa_array_ref_group *group = stmt->u.i.group;
//  int n_lane = stmt->u.i.data_pack;
//  int nxt_n_lane = stmt->u.i.nxt_data_pack;
//  //int is_filter = stmt->u.i.filter;
//  int is_buf = stmt->u.i.buf;
//  isl_ctx *ctx = isl_printer_get_ctx(p);
//
//  //  p = ppcg_start_block(p);
//  if (n_lane == nxt_n_lane) {    
//    p = autosa_kernel_print_io_transfer_default(p, stmt, group, n_lane, hls, iterator_prefix);
//  } else {    
//    p = autosa_kernel_print_io_transfer_data_pack(
//          p, stmt, group, n_lane, nxt_n_lane, hls, iterator_prefix, 0, 1);
//  }
//  //  p = ppcg_end_block(p);
//
//  return p;
//}

/* Print a host-side serialization/deserialization statement.
 *
 * Serialization (stmt->u.s.in is set):
 *   X_to[cnt++] = X_from[...];
 * For a sparse array the index is emitted as "(...) / EFF_COMPRESS_RATIO".
 *
 * Deserialization (stmt->u.s.in is clear):
 *   X_to[...] = X_from[cnt++];
 *
 * "X" is the name of the identifier stored in operand 0 of
 * stmt->u.s.index and "..." is operand 1 of the same expression.
 * "hls" is accepted for interface uniformity but is not read.
 */
__isl_give isl_printer *autosa_kernel_print_host_serialize(
  __isl_take isl_printer *p,
  struct autosa_kernel_stmt *stmt,
  struct hls_info *hls)
{
  isl_ast_expr *access = stmt->u.s.index;
  isl_ast_expr *operand;
  isl_id *array_id;
  const char *name;

  p = isl_printer_start_line(p);

  /* Operand 0: the identifier of the array being (de)serialized.
   * NOTE(review): "name" is used after our own references are released;
   * presumably stmt->u.s.index keeps the identifier alive - confirm.
   */
  operand = isl_ast_expr_get_op_arg(access, 0);
  array_id = isl_ast_expr_id_get_id(operand);
  name = isl_id_get_name(array_id);
  isl_id_free(array_id);
  isl_ast_expr_free(operand);

  /* Operand 1: the index expression into the array. */
  operand = isl_ast_expr_get_op_arg(access, 1);

  if (!stmt->u.s.in) {
    /* Deserialization: X_to[...] = X_from[cnt++]; */
    p = isl_printer_print_str(p, name);
    p = isl_printer_print_str(p, "_to[");
    p = isl_printer_print_ast_expr(p, operand);
    p = isl_printer_print_str(p, "] = ");
    p = isl_printer_print_str(p, name);
    p = isl_printer_print_str(p, "_from[cnt++];");
  } else {
    /* Serialization: X_to[cnt++] = X_from[...]; */
    const int sparse = stmt->u.s.group->local_array->is_sparse ? 1 : 0;

    p = isl_printer_print_str(p, name);
    p = isl_printer_print_str(p, "_to[cnt++] = ");
    p = isl_printer_print_str(p, name);
    p = isl_printer_print_str(p, "_from[");
    if (sparse)
      p = isl_printer_print_str(p, "(");
    p = isl_printer_print_ast_expr(p, operand);
    if (sparse)
      p = isl_printer_print_str(p, ") / EFF_COMPRESS_RATIO");
    p = isl_printer_print_str(p, "];");
  }

  p = isl_printer_end_line(p);
  isl_ast_expr_free(operand);

  return p;
}

/* Return a copy of the access expression "index" in which operand 0
 * (the array identifier) is replaced by a fresh identifier whose name is
 * the original name followed by "suffix".
 *
 * "index" is only read; the caller owns the returned expression and must
 * release it with isl_ast_expr_free().
 */
static __isl_give isl_ast_expr *drain_merge_index_with_suffix(
  isl_ctx *ctx, isl_ast_expr *index, const char *suffix)
{
  isl_ast_expr *array_expr, *renamed_expr;
  isl_id *array_id;
  isl_printer *p_str;
  char *renamed;

  array_expr = isl_ast_expr_get_op_arg(index, 0);
  array_id = isl_ast_expr_id_get_id(array_expr);

  /* Build "<array name><suffix>" while the identifier is still held, so
   * the borrowed name pointer is never read after its owner is released.
   */
  p_str = isl_printer_to_str(ctx);
  p_str = isl_printer_print_str(p_str, isl_id_get_name(array_id));
  p_str = isl_printer_print_str(p_str, suffix);
  renamed = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  isl_id_free(array_id);
  isl_ast_expr_free(array_expr);

  renamed_expr = isl_ast_expr_from_id(isl_id_alloc(ctx, renamed, NULL));
  free(renamed);

  return isl_ast_expr_set_op_arg(isl_ast_expr_copy(index), 0, renamed_expr);
}

/* Print a drain merge statement.
 *
 * [group_array_prefix]_to[...] = [group_array_prefix]_from[...];
 *
 * Both sides reuse the access expression stmt->u.dm.index; only the array
 * identifier (operand 0) is renamed with a "_to" / "_from" suffix.
 * The isl context is taken from hls->ctx.
 */
__isl_give isl_printer *autosa_kernel_print_drain_merge(__isl_take isl_printer *p,
                                                        struct autosa_kernel_stmt *stmt, struct hls_info *hls)
{
  isl_ctx *ctx = hls->ctx;
  isl_ast_expr *index = stmt->u.dm.index;
  isl_ast_expr *index_to, *index_from;

  index_to = drain_merge_index_with_suffix(ctx, index, "_to");
  index_from = drain_merge_index_with_suffix(ctx, index, "_from");

  p = isl_printer_start_line(p);
  p = isl_printer_print_ast_expr(p, index_to);
  p = isl_printer_print_str(p, " = ");
  p = isl_printer_print_ast_expr(p, index_from);
  p = isl_printer_print_str(p, ";");
  p = isl_printer_end_line(p);

  isl_ast_expr_free(index_to);
  isl_ast_expr_free(index_from);

  return p;
}

/* Print an I/O dram statement.
 *
 * An in I/O statement is printed as 
 *
 *  [type] fifo_data;
 *  fifo_data = global;
 *  or 
 *  fifo_data = fifo_[arr].read() // when serialize is enabled
 *  fifo.write(fifo_data);
 *
 * while an out I/O statement is printed as
 *
 *  [type] fifo_data;
 *  fifo_data = fifo.read();
 *  global = fifo_data;
 *  or 
 *  fifo_[arr].write(fifo_data); // when serialize is enabled
 */
__isl_give isl_printer *autosa_kernel_print_io_dram(
  __isl_take isl_printer *p,
  struct autosa_kernel_stmt *stmt, struct hls_info *hls,
  const char *iterator_prefix)
{
  // TODO: add when data packing factors are different.
  struct autosa_array_ref_group *group = stmt->u.i.group;
  struct autosa_hw_module *module = stmt->u.i.module;
  char *fifo_name;
  int n_lane = stmt->u.i.data_pack;
  int nxt_n_lane = stmt->u.i.nxt_data_pack;
  isl_ctx *ctx = isl_printer_get_ctx(p);
  int buf = stmt->u.i.buf;
  isl_ast_expr *local_index_packed;  
  int n_arg;  
  /* Extract the sparse data. */
  int is_sparse = group->local_array->is_sparse;
  int vec_len = stmt->u.i.local_array->vec_len;
  int n_nzero = stmt->u.i.local_array->n_nzero;
  float compress_ratio = stmt->u.i.local_array->compress_ratio;
  int n_meta_data = stmt->u.i.local_array->n_meta_data;
  float eff_compress_ratio = stmt->u.i.local_array->eff_compress_ratio;

  local_index_packed = isl_ast_expr_copy(stmt->u.i.local_index);
  /* Modify the local index; */
  if (group->local_array->is_sparse) {
    isl_ast_expr *arg, *div;
    n_arg = isl_ast_expr_get_op_n_arg(local_index_packed);
    arg = isl_ast_expr_get_op_arg(local_index_packed, n_arg - 1);
    div = isl_ast_expr_from_val(isl_val_int_from_si(ctx, vec_len * n_lane));
    arg = isl_ast_expr_div(arg, div);
    local_index_packed = isl_ast_expr_set_op_arg(local_index_packed, n_arg - 1, arg);
  } else {
    if (n_lane > 1)
    {
      isl_ast_expr *arg, *div;
      n_arg = isl_ast_expr_get_op_n_arg(local_index_packed);
      arg = isl_ast_expr_get_op_arg(local_index_packed, n_arg - 1);
      div = isl_ast_expr_from_val(isl_val_int_from_si(ctx, n_lane));
      arg = isl_ast_expr_div(arg, div);
      local_index_packed = isl_ast_expr_set_op_arg(local_index_packed, n_arg - 1, arg);
    }
  }

  p = isl_printer_indent(p, -2);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "{");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, 2);

  /* Declare the fifo data variable. */
  p = isl_printer_start_line(p);
  if (group->local_array->is_sparse) {
    p = autosa_print_array_type_with_lane_sparse(p, group->array, nxt_n_lane);
  } else {    
    p = isl_printer_print_str(p, stmt->u.i.array->name);
    p = isl_printer_print_str(p, "_t");
    p = isl_printer_print_int(p, nxt_n_lane);    
  }
  p = isl_printer_print_str(p, " fifo_data;");
  p = isl_printer_end_line(p);

  if (stmt->u.i.in)
  {
    /* Generate the serialize fifo name */
    isl_printer *p_str;
    char *serialize_fifo_name;
    p_str = isl_printer_to_str(ctx);
    p_str = autosa_array_ref_group_print_fifo_name(group, p_str);
    p_str = isl_printer_print_str(p_str, "_serialize");
    serialize_fifo_name = isl_printer_get_str(p_str);
    isl_printer_free(p_str);

    p = isl_printer_start_line(p);    
    p = isl_printer_print_str(p, "fifo_data = ");        
    if (module->is_serialized) {
      if (hls->target == XILINX_HW)
        p = print_fifo_rw_xilinx(p, serialize_fifo_name, 1);
      else if (hls->target == TAPA_HW)
        p = print_fifo_rw_tapa(p, serialize_fifo_name, 1);
      else if (hls->target == INTEL_HW)
        p = print_fifo_rw_intel(p, serialize_fifo_name, 1);      
      else if (hls->target == CATAPULT_HW)
        p = print_fifo_rw_catapult(p, serialize_fifo_name, 1);      
    } else {
      p = io_stmt_print_global_index(p, stmt, stmt->u.i.serialize);    
    }
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);

    free(serialize_fifo_name);

    if (!buf) {            
      fifo_name = concat(ctx, stmt->u.i.out_fifo_name, "out");      
      p = isl_printer_start_line(p);
      if (hls->target == XILINX_HW)
        p = print_fifo_rw_xilinx(p, fifo_name, 0);
      else if (hls->target == TAPA_HW)
        p = print_fifo_rw_tapa(p, fifo_name, 0);
      else if (hls->target == INTEL_HW)
        p = print_fifo_rw_intel(p, fifo_name, 0);
      else if (hls->target == CATAPULT_HW)
        p = print_fifo_rw_catapult(p, fifo_name, 0);
      p = isl_printer_print_str(p, "fifo_data);");
      p = isl_printer_end_line(p);
      free(fifo_name);      
    }
    else
    {
      p = isl_printer_start_line(p);
      p = isl_printer_print_ast_expr(p, local_index_packed);
      p = isl_printer_print_str(p, " = fifo_data;");
      p = isl_printer_end_line(p);
    }
  }
  else
  {
    if (!buf)
    {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "fifo_data = ");      
      fifo_name = concat(ctx, stmt->u.i.in_fifo_name, "in");      
      if (hls->target == XILINX_HW)
        p = print_fifo_rw_xilinx(p, fifo_name, 1);
      else if (hls->target == TAPA_HW)
        p = print_fifo_rw_tapa(p, fifo_name, 1);
      else if (hls->target == INTEL_HW)
        p = print_fifo_rw_intel(p, fifo_name, 1);
      else if (hls->target == CATAPULT_HW)
        p = print_fifo_rw_catapult(p, fifo_name, 1);
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);
      free(fifo_name);
    }
    else
    {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "fifo_data = ");
      p = isl_printer_print_ast_expr(p, local_index_packed);
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);
    }

    p = isl_printer_start_line(p);    
    if (module->is_serialized) {
      /* Generate serialize fifo name */
      isl_printer *p_str;
      char *serialize_fifo_name;
      p_str = isl_printer_to_str(ctx);
      p_str = autosa_array_ref_group_print_fifo_name(group, p_str);
      p_str = isl_printer_print_str(p_str, "_serialize");
      serialize_fifo_name = isl_printer_get_str(p_str);
      isl_printer_free(p_str);

      if (hls->target == XILINX_HW)
        p = print_fifo_rw_xilinx(p, serialize_fifo_name, 0);
      else if (hls->target == TAPA_HW)
        p = print_fifo_rw_tapa(p, serialize_fifo_name, 0);
      else if (hls->target == INTEL_HW)
        p = print_fifo_rw_intel(p, serialize_fifo_name, 0);
      else if (hls->target == CATAPULT_HW)
        p = print_fifo_rw_catapult(p, serialize_fifo_name, 0);
      p = isl_printer_print_str(p, "fifo_data);");

      free(serialize_fifo_name);
    } else {
      p = io_stmt_print_global_index(p, stmt, stmt->u.i.serialize);
      p = isl_printer_print_str(p, " = fifo_data;");
    }
    p = isl_printer_end_line(p);
  }

  p = isl_printer_indent(p, -2);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "}");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, 2);

  isl_ast_expr_free(local_index_packed);

  return p;
}

/* Print a call to the inter_trans function of "module":
 *
 *  [module_name]_inter_trans[_boundary][<p0, p1, ...>][_inst.run](
 *    [arguments]
 *  );
 *
 * "_boundary" is appended when "boundary" is set, the template argument
 * list (one "p<i>" per entry of module->inst_ids) when the
 * use_cplusplus_template option is on, and "_inst.run" for the Catapult
 * target.  The argument list itself is produced by print_module_arguments,
 * to which "arb" and "boundary" are forwarded.
 */
static __isl_give isl_printer *print_inter_trans_module_call(
    __isl_take isl_printer *p,
    struct autosa_hw_module *module, struct autosa_prog *prog,
    struct autosa_kernel *kernel, struct hls_info *hls, int arb, int boundary)
{
  const int n_inst = isl_id_list_n_id(module->inst_ids);
  int idx;

  /* Callee name with its optional decorations. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, module->name);
  p = isl_printer_print_str(p, "_inter_trans");
  if (boundary)
    p = isl_printer_print_str(p, "_boundary");

  if (prog->scop->options->autosa->use_cplusplus_template) {
    p = isl_printer_print_str(p, "<");
    idx = 0;
    while (idx < n_inst) {
      p = isl_printer_print_str(p, "p");
      p = isl_printer_print_int(p, idx);
      /* Separator between entries only, never after the last one. */
      if (++idx < n_inst)
        p = isl_printer_print_str(p, ", ");
    }
    p = isl_printer_print_str(p, ">");
  }

  if (hls->target == CATAPULT_HW)
    p = isl_printer_print_str(p, "_inst.run");

  p = isl_printer_print_str(p, "(");
  p = isl_printer_end_line(p);

  /* Arguments on their own line, one level deeper. */
  p = isl_printer_indent(p, 2);
  p = isl_printer_start_line(p);
  p = print_module_arguments(p, prog, kernel, module, 0,
                             hls->target, 1, arb, boundary, 0);
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, -2);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, ");");
  p = isl_printer_end_line(p);

  return p;
}

/* Print the function call for the inter_transfer module.
 *
 * For a double-buffered module on a non-Catapult target the call is
 * printed twice, selected at run time by "arb":
 *
 *  if (arb == 0) {
 *    [call with arb = 0]
 *  } else {
 *    [call with arb = 1]
 *  }
 *
 * In every other case a single call with arb = 0 is printed.
 */
__isl_give isl_printer *autosa_kernel_print_inter_trans(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct hls_info *hls)
{
  struct autosa_hw_module *module = stmt->u.f.module;
  struct autosa_kernel *kernel = module->kernel;
  struct autosa_prog *prog = kernel->prog;
  int boundary = stmt->u.f.boundary;
  int arb;

  /* No arbitration wrapper: Catapult, or a single-buffered module. */
  if (hls->target == CATAPULT_HW || !module->double_buffer)
    return print_inter_trans_module_call(p, module, prog, kernel, hls, 0, boundary);

  for (arb = 0; arb < 2; arb++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, arb == 0 ? "if (arb == 0) {" : "} else {");
    p = isl_printer_end_line(p);

    p = isl_printer_indent(p, 2);
    p = print_inter_trans_module_call(p, module, prog, kernel, hls, arb, boundary);
    p = isl_printer_indent(p, -2);
  }

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "}");
  p = isl_printer_end_line(p);

  return p;
}

/* Print a call to the intra_trans function of "module":
 *
 *  [module_name]_intra_trans[<p0, p1, ...>][_inst.run](
 *    [arguments]
 *  );
 *
 * The template argument list (one "p<i>" per entry of module->inst_ids)
 * is printed when the use_cplusplus_template option is on, and
 * "_inst.run" is appended for the Catapult target.  The argument list
 * itself is produced by print_module_arguments, to which "arb" is
 * forwarded.
 */
static __isl_give isl_printer *print_intra_trans_module_call(
    __isl_take isl_printer *p,
    struct autosa_hw_module *module, struct autosa_prog *prog,
    struct autosa_kernel *kernel,
    struct hls_info *hls, int arb)
{
  const int n_inst = isl_id_list_n_id(module->inst_ids);
  int idx;

  /* Callee name with its optional decorations. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, module->name);
  p = isl_printer_print_str(p, "_intra_trans");

  if (prog->scop->options->autosa->use_cplusplus_template) {
    p = isl_printer_print_str(p, "<");
    idx = 0;
    while (idx < n_inst) {
      p = isl_printer_print_str(p, "p");
      p = isl_printer_print_int(p, idx);
      /* Separator between entries only, never after the last one. */
      if (++idx < n_inst)
        p = isl_printer_print_str(p, ", ");
    }
    p = isl_printer_print_str(p, ">");
  }

  if (hls->target == CATAPULT_HW)
    p = isl_printer_print_str(p, "_inst.run");

  p = isl_printer_print_str(p, "(");
  p = isl_printer_end_line(p);

  /* Arguments on their own line, one level deeper. */
  p = isl_printer_indent(p, 2);
  p = isl_printer_start_line(p);
  p = print_module_arguments(p, prog, kernel, module, 0,
                             hls->target, 0, arb, 0, 0);
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, -2);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, ");");
  p = isl_printer_end_line(p);

  return p;
}

/* Print the function call for the intra_transfer module.
 *
 * For a double-buffered module on a non-Catapult target the call is
 * printed twice, selected at run time by "arb":
 *
 *  if (arb == 0) {
 *    [call with arb = 0]
 *  } else {
 *    [call with arb = 1]
 *  }
 *
 * A single-buffered module on a non-Catapult target gets one call with
 * arb = 0.  The Catapult target gets one call with arb = 1.
 * NOTE(review): autosa_kernel_print_inter_trans passes arb = 0 for
 * Catapult while this function passes 1 - confirm this is intended.
 */
__isl_give isl_printer *autosa_kernel_print_intra_trans(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct hls_info *hls)
{
  struct autosa_hw_module *module = stmt->u.f.module;
  struct autosa_kernel *kernel = module->kernel;
  struct autosa_prog *prog = kernel->prog;
  int arb;

  if (hls->target == CATAPULT_HW)
    return print_intra_trans_module_call(p, module, prog, kernel, hls, 1);

  /* Single-buffered: no arbitration wrapper is needed. */
  if (!module->double_buffer)
    return print_intra_trans_module_call(p, module, prog, kernel, hls, 0);

  for (arb = 0; arb < 2; arb++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, arb == 0 ? "if (arb == 0) {" : "} else {");
    p = isl_printer_end_line(p);

    p = isl_printer_indent(p, 2);
    p = print_intra_trans_module_call(p, module, prog, kernel, hls, arb);
    p = isl_printer_indent(p, -2);
  }

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "}");
  p = isl_printer_end_line(p);

  return p;
}

/* Print the function calls for the inter_transfer and intra_transfer
 * modules, in that order.
 *
 * For a double-buffered module on a non-Catapult target the pair of calls
 * is printed twice, selected at run time by "arb":
 *
 *  if (arb == 0) {
 *    [inter_trans call with arb = 0]
 *    [intra_trans call with arb = 0]
 *  } else {
 *    [inter_trans call with arb = 1]
 *    [intra_trans call with arb = 1]
 *  }
 *
 * Otherwise the pair is printed once with arb = 0.
 */
__isl_give isl_printer *autosa_kernel_print_inter_intra(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct hls_info *hls)
{
  struct autosa_hw_module *module = stmt->u.f.module;
  struct autosa_kernel *kernel = module->kernel;
  struct autosa_prog *prog = kernel->prog;
  int boundary = stmt->u.f.boundary;
  const int arbitrated = module->double_buffer && hls->target != CATAPULT_HW;
  int arb;

  if (!arbitrated) {
    /* Plain sequence, no arbitration wrapper. */
    p = print_inter_trans_module_call(p, module, prog, kernel, hls, 0, boundary);
    p = print_intra_trans_module_call(p, module, prog, kernel, hls, 0);
    return p;
  }

  for (arb = 0; arb < 2; arb++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, arb == 0 ? "if (arb == 0) {" : "} else {");
    p = isl_printer_end_line(p);

    p = isl_printer_indent(p, 2);
    /* inter_trans first, then intra_trans */
    p = print_inter_trans_module_call(p, module, prog, kernel, hls, arb, boundary);
    p = print_intra_trans_module_call(p, module, prog, kernel, hls, arb);
    p = isl_printer_indent(p, -2);
  }

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "}");
  p = isl_printer_end_line(p);

  return p;
}

/* Print the function calls for the intra_transfer and inter_transfer
 * modules, in that order.
 *
 * For a double-buffered module on a non-Catapult target the pair of calls
 * is printed twice, selected at run time by "arb":
 *
 *  if (arb == 0) {
 *    [intra_trans call with arb = 0]
 *    [inter_trans call with arb = 0]
 *  } else {
 *    [intra_trans call with arb = 1]
 *    [inter_trans call with arb = 1]
 *  }
 *
 * Otherwise the pair is printed once with arb = 0.
 */
__isl_give isl_printer *autosa_kernel_print_intra_inter(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct hls_info *hls)
{
  struct autosa_hw_module *module = stmt->u.f.module;
  struct autosa_kernel *kernel = module->kernel;
  struct autosa_prog *prog = kernel->prog;
  int boundary = stmt->u.f.boundary;
  const int arbitrated = module->double_buffer && hls->target != CATAPULT_HW;
  int arb;

  if (!arbitrated) {
    /* Plain sequence, no arbitration wrapper. */
    p = print_intra_trans_module_call(p, module, prog, kernel, hls, 0);
    p = print_inter_trans_module_call(p, module, prog, kernel, hls, 0, boundary);
    return p;
  }

  for (arb = 0; arb < 2; arb++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, arb == 0 ? "if (arb == 0) {" : "} else {");
    p = isl_printer_end_line(p);

    p = isl_printer_indent(p, 2);
    /* intra_trans first, then inter_trans */
    p = print_intra_trans_module_call(p, module, prog, kernel, hls, arb);
    p = print_inter_trans_module_call(p, module, prog, kernel, hls, arb, boundary);
    p = isl_printer_indent(p, -2);
  }

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "}");
  p = isl_printer_end_line(p);

  return p;
}

/* Print the state transfer for double buffers.
 *
 * Nothing is printed for the Catapult target.  Otherwise the output is
 *
 *  intra_trans_en = 1;      // or "inter_trans_en = 1;" when !module->in
 *  arb = !arb;
 *  [dim]_prev = [dim];      // one line per set dimension
 *
 * where the dimensions are those of module->intra_space when module->in
 * is set and of module->inter_space otherwise.
 */
__isl_give isl_printer *autosa_kernel_print_state_handle(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct hls_info *hls)
{
  struct autosa_hw_module *module = stmt->u.f.module;
  isl_space *space;
  int n_dim;
  int pos;

  if (hls->target == CATAPULT_HW)
    return p;

  /* Enable the transfer direction matching the module direction. */
  p = isl_printer_start_line(p);
  if (module->in)
    p = isl_printer_print_str(p, "intra_trans_en = 1;");
  else
    p = isl_printer_print_str(p, "inter_trans_en = 1;");
  p = isl_printer_end_line(p);

  /* Flip the buffer selector. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "arb = !arb;");
  p = isl_printer_end_line(p);

  /* Record the current iterator values in their "_prev" counterparts. */
  space = module->in ? module->intra_space : module->inter_space;
  n_dim = isl_space_dim(space, isl_dim_set);
  pos = 0;
  while (pos < n_dim) {
    const char *dim_name = isl_space_get_dim_name(space, isl_dim_set, pos);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, dim_name);
    p = isl_printer_print_str(p, "_prev = ");
    p = isl_printer_print_str(p, dim_name);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
    pos++;
  }

  return p;
}

/* Print the body for a module that connects to the DRAM with serialized data. 
 */
__isl_give isl_printer *print_module_serialize_body(
  __isl_take isl_printer *p, struct autosa_hw_module *module, struct hls_info *hls)
{
  isl_pw_qpolynomial *total_bound_pwq = module->io_groups[0]->array->local_array->serialize_bound;
  long int total_bound = -1;  
  int ele_size = module->io_groups[0]->array->size; // bytes
  total_bound = convert_pwqpoly_to_int(total_bound_pwq);
  int data_pack_in = module->data_pack_serialize;
  int data_pack_out = module->data_pack_inter;  
  char *fifo_name;
  isl_printer *p_str;
  isl_ctx *ctx = isl_printer_get_ctx(p);
  /* Extract the sparse information */
  int is_sparse = module->io_groups[0]->local_array->is_sparse;
  int vec_len = module->io_groups[0]->local_array->vec_len;
  int n_nzero = module->io_groups[0]->local_array->n_nzero;
  float compress_ratio = module->io_groups[0]->local_array->compress_ratio;
  int n_meta_data = module->io_groups[0]->local_array->n_meta_data;
  float eff_compress_ratio = module->io_groups[0]->local_array->eff_compress_ratio;

  int axi_stream = module->options->autosa->axi_stream;

  p_str = isl_printer_to_str(ctx);
  p_str = autosa_array_ref_group_print_fifo_name(module->io_groups[0], p_str);  
  fifo_name = isl_printer_get_str(p_str);
  isl_printer_free(p_str);
  
  if (data_pack_in == data_pack_out) {    
    if (module->in) { 
      char *new_fifo_name;

      if (hls->target == INTEL_HW)
        p = print_str_new_line(p, "#pragma loop_coalesce");
      else if (hls->target == CATAPULT_HW)
        p = print_str_new_line(p, "#pragma hls_pipeline_init_interval 1");

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "for (int i = 0; i < ");      
      if (is_sparse)
        p = isl_printer_print_int(p, total_bound / eff_compress_ratio / data_pack_in);
      else
        p = isl_printer_print_int(p, total_bound / data_pack_out);
      p = isl_printer_print_str(p, "; i++) {");
      p = isl_printer_end_line(p);
          
      if (hls->target == XILINX_HW || hls->target == TAPA_HW)
        p = print_str_new_line(p, "#pragma HLS PIPELINE II=1");

      p = isl_printer_indent(p, 2);
      p = isl_printer_start_line(p);
      p = autosa_print_array_type_with_lane(p, module->io_groups[0]->array, data_pack_out);
      p = isl_printer_print_str(p, " fifo_data;");
      p = isl_printer_end_line(p);

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "fifo_data = ");
      if (axi_stream) {
        //char *fifo_name;
        //isl_printer *p_str;
        //p_str = isl_printer_to_str(ctx);
        //p_str = isl_printer_print_str(p_str,"fifo_");
        //p_str = isl_printer_print_str(p_str, module->io_groups[0]->array->name);
        //fifo_name = isl_printer_get_str(p_str);
        //isl_printer_free(p_str);

        if (hls->target == XILINX_HW)
          p = print_fifo_rw_xilinx(p, fifo_name, 1);
        else if (hls->target == TAPA_HW)
          p = print_fifo_rw_tapa(p, fifo_name, 1);
        else if (hls->target == INTEL_HW)
          p = print_fifo_rw_intel(p, fifo_name, 1);
        else if (hls->target == CATAPULT_HW)
          p = print_fifo_rw_catapult(p, fifo_name, 1);
        p = isl_printer_print_str(p, ";");

        //free(fifo_name);
      } else {
        p = isl_printer_print_str(p, module->io_groups[0]->array->name);
        p = isl_printer_print_str(p, "[i];");
      }
      p = isl_printer_end_line(p);

      new_fifo_name = concat(ctx, fifo_name, "local_out");
      p = isl_printer_start_line(p);
      if (hls->target == XILINX_HW)
        p = print_fifo_rw_xilinx(p, new_fifo_name, 0);
      else if (hls->target == TAPA_HW)
        p = print_fifo_rw_tapa(p, new_fifo_name, 0);
      else if (hls->target == INTEL_HW)
        p = print_fifo_rw_intel(p, new_fifo_name, 0);          
      else if (hls->target == CATAPULT_HW)
        p = print_fifo_rw_catapult(p, new_fifo_name, 0);          

      p = isl_printer_print_str(p, "fifo_data);");      
      p = isl_printer_end_line(p);
      free(new_fifo_name);

      p = isl_printer_indent(p, -2);
      p = print_str_new_line(p, "}");            
    } else {
      char *new_fifo_name;

      if (hls->target == INTEL_HW)
        p = print_str_new_line(p, "#pragma loop_coalesce");
      else if (hls->target == CATAPULT_HW)
        p = print_str_new_line(p, "#pragma hls_pipeline_init_interval 1");

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "for (int i = 0; i < ");
      p = isl_printer_print_int(p, total_bound / data_pack_out);
      p = isl_printer_print_str(p, "; i++) {");
      p = isl_printer_end_line(p);

      if (hls->target == XILINX_HW || hls->target == TAPA_HW)
        p = print_str_new_line(p, "#pragma HLS PIPELINE II=1");

      p = isl_printer_indent(p, 2);
      p = isl_printer_start_line(p);
      p = autosa_print_array_type_with_lane(p, module->io_groups[0]->array, data_pack_in);      
      p = isl_printer_print_str(p, " fifo_data;");
      p = isl_printer_end_line(p);

      new_fifo_name = concat(ctx, fifo_name, "local_in");
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "fifo_data = ");
      if (hls->target == XILINX_HW)
        p = print_fifo_rw_xilinx(p, new_fifo_name, 1);
      else if (hls->target == TAPA_HW)
        p = print_fifo_rw_tapa(p, new_fifo_name, 1);
      else if (hls->target == INTEL_HW)
        p = print_fifo_rw_intel(p, new_fifo_name, 1);
      else if (hls->target == CATAPULT_HW)
        p = print_fifo_rw_catapult(p, new_fifo_name, 1);
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);

      p = isl_printer_start_line(p);
      if (axi_stream) {
        //char *fifo_name;
        //isl_printer *p_str;
        //p_str = isl_printer_to_str(ctx);
        //p_str = isl_printer_print_str(p_str,"fifo_");
        //p_str = isl_printer_print_str(p_str, module->io_groups[0]->array->name);
        //fifo_name = isl_printer_get_str(p_str);
        //isl_printer_free(p_str);
        
        if (hls->target == XILINX_HW)
          p = print_fifo_rw_xilinx(p, fifo_name, 0);
        else if (hls->target == TAPA_HW)
          p = print_fifo_rw_tapa(p, fifo_name, 0);
        else if (hls->target == INTEL_HW)
          p = print_fifo_rw_intel(p, fifo_name, 0);
        else if (hls->target == CATAPULT_HW)
          p = print_fifo_rw_catapult(p, fifo_name, 0);
        p = isl_printer_print_str(p, "fifo_data);");
        p = isl_printer_print_str(p, ";");

        //free(fifo_name);        
      } else {
        p = isl_printer_print_str(p, module->io_groups[0]->array->name);
        p = isl_printer_print_str(p, "[i] = fifo_data;");
      }
      p = isl_printer_end_line(p);

      p = isl_printer_indent(p, -2);
      p = print_str_new_line(p, "}");

      free(new_fifo_name);
    }
  } else {    
    if (module->in) {
      char *new_fifo_name;

      /* [type] fifo_data; */
      p = isl_printer_start_line(p);      
      if (is_sparse)
        p = autosa_print_array_type_with_lane_sparse(p, module->io_groups[0]->array, data_pack_out);
      else
        p = autosa_print_array_type_with_lane(p, module->io_groups[0]->array, data_pack_out);
      p = isl_printer_print_str(p, " fifo_data;");
      p = isl_printer_end_line(p);

      /* [type2] mem_data; */
      p = isl_printer_start_line(p);
      p = autosa_print_array_type_with_lane(p, module->io_groups[0]->array, data_pack_in);
      p = isl_printer_print_str(p, " mem_data;");
      p = isl_printer_end_line(p);
      
      if (hls->target == XILINX_HW) {
        if (data_pack_out == 1 && !is_sparse) {
          /* union {unsigned int ui; [type] ut;} u; */
          p = isl_printer_start_line(p);        
          p = isl_printer_print_str(p, "union {unsigned int ui; ");
          p = isl_printer_print_str(p, module->io_groups[0]->array->type);
          p = isl_printer_print_str(p, " ut;} u;");        
          p = isl_printer_end_line(p);
        }        
          
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int i = 0; i < ");
        if (is_sparse)
          p = isl_printer_print_int(p, total_bound / eff_compress_ratio / data_pack_in);
        else
          p = isl_printer_print_int(p, total_bound / data_pack_in);
        p = isl_printer_print_str(p, "; i++) {");
        p = isl_printer_end_line(p);
            
        p = print_str_new_line(p, "#pragma HLS PIPELINE II=1");            
        p = isl_printer_indent(p, 2);
  
        /* mem_data = array[]; */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "mem_data = ");
        if (axi_stream) {
          //char *fifo_name;
          //isl_printer *p_str;
          //p_str = isl_printer_to_str(ctx);
          //p_str = isl_printer_print_str(p_str,"fifo_");
          //p_str = isl_printer_print_str(p_str, module->io_groups[0]->array->name);
          //fifo_name = isl_printer_get_str(p_str);
          //isl_printer_free(p_str);

          if (hls->target == XILINX_HW)
            p = print_fifo_rw_xilinx(p, fifo_name, 1);
          else if (hls->target == TAPA_HW)
            p = print_fifo_rw_tapa(p, fifo_name, 1);
          else if (hls->target == INTEL_HW)
            p = print_fifo_rw_intel(p, fifo_name, 1);
          else if (hls->target == CATAPULT_HW)
            p = print_fifo_rw_catapult(p, fifo_name, 1);
          p = isl_printer_print_str(p, ";");

          //free(fifo_name);
        } else {
          p = isl_printer_print_str(p, module->io_groups[0]->array->name);
          p = isl_printer_print_str(p, "[i];");
        }
        p = isl_printer_end_line(p);
  
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int p = 0; p < ");
        if (is_sparse)
          p = isl_printer_print_int(p, data_pack_in / (n_nzero + n_meta_data) / data_pack_out);
        else
          p = isl_printer_print_int(p, data_pack_in / data_pack_out);
        p = isl_printer_print_str(p, "; p++) {");
        p = isl_printer_end_line(p);
        p = isl_printer_indent(p, 2);
  
        if (is_sparse) {
          /* ap_uint<...> mem_data_tmp = mem_data(...); */
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "ap_uint<");
          p = isl_printer_print_int(p, ele_size * (n_nzero + n_meta_data) * 8 * data_pack_out);
          p = isl_printer_print_str(p, "> mem_data_tmp = mem_data(");
          p = isl_printer_print_int(p, ele_size * (n_nzero + n_meta_data) * 8 * data_pack_out - 1);
          p = isl_printer_print_str(p, ", 0);");
          p = isl_printer_end_line(p);

          /* mem_data = mem_data >> ...; */
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "mem_data = mem_data >> ");
          p = isl_printer_print_int(p, ele_size * (n_nzero + n_meta_data) * 8 * data_pack_out);
          p = isl_printer_print_str(p, ";");
          p = isl_printer_end_line(p);

          /* fifo_data.d = ... */
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "fifo_data.d = (");
          for (int n = data_pack_out - 1; n >= 0; n--) {
            p = isl_printer_print_str(p, "(ap_uint<");
            p = isl_printer_print_int(p, ele_size * 8 * n_nzero);
            p = isl_printer_print_str(p, ">)");
            p = isl_printer_print_str(p, "mem_data_tmp(");
            p = isl_printer_print_int(p, n * ele_size * 8 * (n_nzero + n_meta_data) + ele_size * 8 * n_nzero - 1);
            p = isl_printer_print_str(p, ", ");
            p = isl_printer_print_int(p, n * ele_size * 8 * (n_nzero + n_meta_data));
            p = isl_printer_print_str(p, ")");
            if (n > 0) 
              p = isl_printer_print_str(p, ", ");
          }
          p = isl_printer_print_str(p, ");");
          p = isl_printer_end_line(p);

          /* fifo_data.i = ... */
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "fifo_data.i = (");
          for (int n = data_pack_out - 1; n >= 0; n--) {
            p = isl_printer_print_str(p, "(ap_uint<8>)mem_data_tmp(");
            p = isl_printer_print_int(p, n * ele_size * 8 * (n_nzero + n_meta_data) + ele_size * 8 * n_nzero + 8 - 1);
            p = isl_printer_print_str(p, ", ");
            p = isl_printer_print_int(p, n * ele_size * 8 * (n_nzero + n_meta_data) + ele_size * 8 * n_nzero);
            p = isl_printer_print_str(p, ")");
            if (n > 0) 
              p = isl_printer_print_str(p, ", ");
          }
          p = isl_printer_print_str(p, ");");
          p = isl_printer_end_line(p);
        } else {
          /* fifo_data = mem_data(..,..); */
          p = isl_printer_start_line(p);
          if (data_pack_out == 1) {
            p = isl_printer_print_str(p, "u.ui = (unsigned int)mem_data(");
            p = isl_printer_print_int(p, ele_size * data_pack_out * 8 - 1);
            p = isl_printer_print_str(p, ", 0);");
            p = isl_printer_end_line(p);

            p = print_str_new_line(p, "fifo_data = u.ut;");
          } else {
            p = isl_printer_print_str(p, "fifo_data = mem_data(");
            p = isl_printer_print_int(p, ele_size * data_pack_out * 8 - 1);
            p = isl_printer_print_str(p, ", 0);");
          }
          p = isl_printer_end_line(p);

          /* mem_data = mem_data >> .. */
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "mem_data = mem_data >> ");
          p = isl_printer_print_int(p, ele_size * data_pack_out * 8);
          p = isl_printer_print_str(p, ";");
          p = isl_printer_end_line(p);
        }
  
        new_fifo_name = concat(ctx, fifo_name, "local_out");
        p = isl_printer_start_line(p);        
        p = print_fifo_rw_xilinx(p, new_fifo_name, 0);
        p = isl_printer_print_str(p, "fifo_data);");
        p = isl_printer_end_line(p);
  
        p = isl_printer_indent(p, -2);
        p = print_str_new_line(p, "}");
  
        p = isl_printer_indent(p, -2);
        p = print_str_new_line(p, "}");

        free(new_fifo_name);
      } else if (hls->target == INTEL_HW) {                  
        p = print_str_new_line(p, "#pragma loop_coalesce");

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int i = 0; i < ");
        p = isl_printer_print_int(p, total_bound / data_pack_in);
        p = isl_printer_print_str(p, "; i++) {");
        p = isl_printer_end_line(p);
                  
        p = isl_printer_indent(p, 2);
  
        /* mem_data = array[]; */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "mem_data = __burst_coalesced_load(&");
        p = isl_printer_print_str(p, module->io_groups[0]->array->name);
        p = isl_printer_print_str(p, "[i]);");
        p = isl_printer_end_line(p);
          
        /* [type] mem_data_split[n] */
        p = isl_printer_start_line(p);
        p = autosa_print_array_type_with_lane(p, module->io_groups[0]->array, data_pack_out);
        p = isl_printer_print_str(p, " mem_data_split[");
        p = isl_printer_print_int(p, data_pack_in / data_pack_out);
        p = isl_printer_print_str(p, "];");
        p = isl_printer_end_line(p);

        for (int i = 0; i < data_pack_in / data_pack_out; i++) {
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "mem_data_split[");
          p = isl_printer_print_int(p, i);
          p = isl_printer_print_str(p, "].data = mem_data.data.s");
          for (int j = i * data_pack_out; j < i * data_pack_out + data_pack_out; j++) {
            p = isl_printer_print_str(p, vector_index[j]);
          }
          p = isl_printer_print_str(p, ";");
          p = isl_printer_end_line(p);
        }

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int p = 0; p < ");
        p = isl_printer_print_int(p, data_pack_in / data_pack_out);
        p = isl_printer_print_str(p, "; p++) {");
        p = isl_printer_end_line(p);
        p = isl_printer_indent(p, 2);
  
        /* fifo_data = mem_data(..,..); */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "fifo_data = mem_data_split[p];");                
        p = isl_printer_end_line(p);
          
        new_fifo_name = concat(ctx, fifo_name, "local_out");
        p = isl_printer_start_line(p);
        p = print_fifo_rw_intel(p, new_fifo_name, 0);
        p = isl_printer_print_str(p, "fifo_data);");
        p = isl_printer_end_line(p);
  
        p = isl_printer_indent(p, -2);
        p = print_str_new_line(p, "}");
  
        p = isl_printer_indent(p, -2);
        p = print_str_new_line(p, "}");

        free(new_fifo_name);        
      } else if (hls->target == CATAPULT_HW) {
        p = print_str_new_line(p, "#pragma hls_pipeline_init_interval 1");

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int i = 0; i < ");
        if (is_sparse)
          p = isl_printer_print_int(p, total_bound / eff_compress_ratio / data_pack_in);
        else
          p = isl_printer_print_int(p, total_bound / data_pack_in);
        p = isl_printer_print_str(p, "; i++) {");
        p = isl_printer_end_line(p);

        p = isl_printer_indent(p, 2);

        /* mem_data = array[]; */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "mem_data = ");
        p = isl_printer_print_str(p, module->io_groups[0]->array->name);
        p = isl_printer_print_str(p, "[i];");
        p = isl_printer_end_line(p);

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int p = 0; p < ");
        if (is_sparse)
          p = isl_printer_print_int(p, data_pack_in / (n_nzero + n_meta_data) / data_pack_out);
        else
          p = isl_printer_print_int(p, data_pack_in / data_pack_out);
        p = isl_printer_print_str(p, "; p++) {");
        p = isl_printer_end_line(p);
        p = isl_printer_indent(p, 2);

        if (is_sparse) {
          /* ap_uint<...> mem_data_tmp = mem_data(...); */
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "ac_int<");
          p = isl_printer_print_int(p, ele_size * (n_nzero + n_meta_data) * 8 * data_pack_out);
          p = isl_printer_print_str(p, ", false> mem_data_tmp = mem_data.slc<");
          p = isl_printer_print_int(p, ele_size * (n_nzero + n_meta_data) * 8 * data_pack_out - 1);
          p = isl_printer_print_str(p, ">(0);");
          p = isl_printer_end_line(p);

          /* mem_data = mem_data >> ...; */
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "mem_data = mem_data >> ");
          p = isl_printer_print_int(p, ele_size * (n_nzero + n_meta_data) * 8 * data_pack_out);
          p = isl_printer_print_str(p, ";");
          p = isl_printer_end_line(p);

          /* fifo_data.d = ... */
          for (int n = 0; n < data_pack_out; n++) {
            p = isl_printer_start_line(p);

            p = isl_printer_print_str(p, "fifo_data.d.set_slc(");
            p = isl_printer_print_int(p, n * ele_size * 8 * n_nzero);
            p = isl_printer_print_str(p, ", ");

            p = isl_printer_print_str(p, "mem_data_tmp.slc<");
            p = isl_printer_print_int(p, n * ele_size * 8 * n_nzero);
            p = isl_printer_print_str(p, ">(");
            p = isl_printer_print_int(p, n * ele_size * 8 * (n_nzero + n_meta_data));
            p = isl_printer_print_str(p, "));");

            p = isl_printer_end_line(p);
          }          

          /* fifo_data.i = ... */
          for (int n = 0; n < data_pack_out; n++) {
            p = isl_printer_start_line(p);
            
            p = isl_printer_print_str(p, "fifo_data.i.set_slc(");
            p = isl_printer_print_int(p, 8 * n);
            p = isl_printer_print_str(p, ", ");

            p = isl_printer_print_str(p, "mem_data_tmp.slc<8>(");
            p = isl_printer_print_int(p, n * ele_size * 8 * (n_nzero + n_meta_data) + ele_size * 8 * n_nzero);
            p = isl_printer_print_str(p, "));");

            p = isl_printer_end_line(p);
          }          
        } else {
          /* fifo_data = mem_data(..,..); */
          //p = isl_printer_start_line(p);
          //if (data_pack_out == 1) {
          //  p = isl_printer_print_str(p, "u.ui = (unsigned int)mem_data(");
          //  p = isl_printer_print_int(p, ele_size * data_pack_out * 8 - 1);
          //  p = isl_printer_print_str(p, ", 0);");
          //  p = isl_printer_end_line(p);

          //  p = print_str_new_line(p, "fifo_data = u.ut;");
          //} else {
          //  p = isl_printer_print_str(p, "fifo_data = mem_data(");
          //  p = isl_printer_print_int(p, ele_size * data_pack_out * 8 - 1);
          //  p = isl_printer_print_str(p, ", 0);");
          //}
          //p = isl_printer_end_line(p);
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "fifo_data = mem_data.slc<");
          p = isl_printer_print_int(p, ele_size * data_pack_out * 8);
          p = isl_printer_print_str(p, ">(0);");
          p = isl_printer_end_line(p);

          /* mem_data = mem_data >> .. */
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "mem_data = mem_data >> ");
          p = isl_printer_print_int(p, ele_size * data_pack_out * 8);
          p = isl_printer_print_str(p, ";");
          p = isl_printer_end_line(p);
        }
  
        new_fifo_name = concat(ctx, fifo_name, "local_out");
        p = isl_printer_start_line(p);        
        p = print_fifo_rw_catapult(p, new_fifo_name, 0);
        p = isl_printer_print_str(p, "fifo_data);");
        p = isl_printer_end_line(p);
  
        p = isl_printer_indent(p, -2);
        p = print_str_new_line(p, "}");
  
        p = isl_printer_indent(p, -2);
        p = print_str_new_line(p, "}");

        free(new_fifo_name);
      } else if (hls->target == TAPA_HW) {
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int i = 0; i < ");
        if (is_sparse)
          p = isl_printer_print_int(p, total_bound / eff_compress_ratio / data_pack_in);
        else
          p = isl_printer_print_int(p, total_bound / data_pack_in);
        p = isl_printer_print_str(p, "; i++) {");
        p = isl_printer_end_line(p);

        p = print_str_new_line(p, "#pragma HLS PIPELINE II=1");
        p = isl_printer_indent(p, 2);

        /* mem_data = array[]; */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "mem_data = ");
        if (axi_stream) {
          p = print_fifo_rw_tapa(p, fifo_name, 1);
          p = isl_printer_print_str(p, ";");
        } else {
          p = isl_printer_print_str(p, module->io_groups[0]->array->name);
          p = isl_printer_print_str(p, "[i];");
        }
        p = isl_printer_end_line(p);

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int p = 0; p < ");
        if (is_sparse)
          p = isl_printer_print_int(p, data_pack_in / (n_nzero + n_meta_data) / data_pack_out);
        else
          p = isl_printer_print_int(p, data_pack_in / data_pack_out);
        p = isl_printer_print_str(p, "; p++) {");
        p = isl_printer_end_line(p);
        p = isl_printer_indent(p, 2);

        if (is_sparse) {
          /* tapa::vec_t<T, size> mem_data_tmp = tapa::truncated<begin, end>(mem_data); */
          p = isl_printer_start_line(p);
          p = autosa_print_array_type_with_lane_sparse(p, module->io_groups[0]->array, data_pack_out);
          p = isl_printer_print_str(p, " mem_data_tmp = ");
          if (data_pack_out == 1) {
            p = isl_printer_print_str(p, "mem_data[p];");
          } else {
            p = isl_printer_print_str(p, "tapa::truncated<");
            p = isl_printer_print_int(p, data_pack_out);
            p = isl_printer_print_str(p, ">(mem_data, ");
            p = isl_printer_print_int(p, data_pack_out);
            p = isl_printer_print_str(p, " * p);");
          }

          /* fifo_data.d = ... */
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "fifo_data.d = ");
          for (int n = 1; n < data_pack_out; n++)
            p = isl_printer_print_str(p, "tapa::cat(");
          for (int n = 0; n < data_pack_out; n++) {
            if (n > 1) p = isl_printer_print_str(p, ")");
            if (n > 0) p = isl_printer_print_str(p, ", ");
            p = isl_printer_print_str(p, "mem_data_tmp[");
            p = isl_printer_print_int(p, n);
            p = isl_printer_print_str(p, "].d");
          }
          p = isl_printer_print_str(p, ");");
          p = isl_printer_end_line(p);

          /* fifo_data.i = ... */
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "fifo_data.i = ");
          for (int n = 1; n < data_pack_out; n++)
            p = isl_printer_print_str(p, "tapa::cat(");
          for (int n = 0; n < data_pack_out; n++) {
            if (n > 1) p = isl_printer_print_str(p, ")");
            if (n > 0) p = isl_printer_print_str(p, ", ");
            p = isl_printer_print_str(p, "mem_data_tmp[");
            p = isl_printer_print_int(p, n);
            p = isl_printer_print_str(p, "].i");
          }
          p = isl_printer_print_str(p, ");");
          p = isl_printer_end_line(p);
        } else {
          /* fifo_data = tapa::truncated<begin, end>(mem_data); */
          p = isl_printer_start_line(p);
          if (data_pack_out == 1) {
            p = print_str_new_line(p, "fifo_data = mem_data[p];");
          } else {
            p = isl_printer_print_str(p, "fifo_data = tapa::truncated<");
            p = isl_printer_print_int(p, data_pack_out);
            p = isl_printer_print_str(p, ">(mem_data, ");
            p = isl_printer_print_int(p, data_pack_out);
            p = isl_printer_print_str(p, " * p);");
          }
          p = isl_printer_end_line(p);
        }

        new_fifo_name = concat(ctx, fifo_name, "local_out");
        p = isl_printer_start_line(p);
        p = print_fifo_rw_xilinx(p, new_fifo_name, 0);
        p = isl_printer_print_str(p, "fifo_data);");
        p = isl_printer_end_line(p);

        p = isl_printer_indent(p, -2);
        p = print_str_new_line(p, "}");

        p = isl_printer_indent(p, -2);
        p = print_str_new_line(p, "}");

        free(new_fifo_name);
      }
    } else {
      char *new_fifo_name;
      if (hls->target == INTEL_HW)
        p = print_str_new_line(p, "#pragma loop_coalesce");
      else if (hls->target == CATAPULT_HW)
        p = print_str_new_line(p, "#pragma hls_pipeline_init_interval 1");

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "for (int i = 0; i < ");
      p = isl_printer_print_int(p, total_bound / data_pack_in);
      p = isl_printer_print_str(p, "; i++) {");
      p = isl_printer_end_line(p);
          
      if (hls->target == XILINX_HW || hls->target == TAPA_HW)
        p = print_str_new_line(p, "#pragma HLS PIPELINE II=1");
      p = isl_printer_indent(p, 2);

      /* [type] fifo_data; */
      p = isl_printer_start_line(p);
      //if (data_pack_out == 1) {
      //  p = isl_printer_print_str(p, module->io_groups[0]->array->type);
      //} else {
      p = autosa_print_array_type_with_lane(p, module->io_groups[0]->array, data_pack_out);
      //}
      p = isl_printer_print_str(p, " fifo_data;");
      p = isl_printer_end_line(p);      

      /* [type2] mem_data; */
      p = isl_printer_start_line(p);
      p = autosa_print_array_type_with_lane(p, module->io_groups[0]->array, data_pack_in);      
      p = isl_printer_print_str(p, " mem_data;");
      p = isl_printer_end_line(p);            
      
      p = isl_printer_start_line(p);      
      if (data_pack_out == 1) {
        if (hls->target == XILINX_HW) {
          p = isl_printer_print_str(p, "ap_uint<");
          p = isl_printer_print_int(p, module->io_groups[0]->array->size * 8);
          p = isl_printer_print_str(p, ">");
        } else if (hls->target == INTEL_HW) {
          p = isl_printer_print_str(p, module->io_groups[0]->array->type);
        } else if (hls->target == TAPA_HW) {
          p = isl_printer_print_str(p, module->io_groups[0]->array->type);
        } else if (hls->target == CATAPULT_HW) {
          p = isl_printer_print_str(p, "ac_int<");
          p = isl_printer_print_int(p, module->io_groups[0]->array->size * 8);
          p = isl_printer_print_str(p, ", false>");
        }
      } else {
        p = autosa_print_array_type_with_lane(p, module->io_groups[0]->array, data_pack_out);
      }
      p = isl_printer_print_str(p, " mem_data_split[");
      p = isl_printer_print_int(p, data_pack_in / data_pack_out);
      p = isl_printer_print_str(p, "];");
      p = isl_printer_end_line(p);

      if (hls->target == XILINX_HW || hls->target == TAPA_HW)
        p = print_str_new_line(p, "#pragma HLS ARRAY_PARTITION variable=mem_data_split complete");
      
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "for (int p = 0; p < ");
      p = isl_printer_print_int(p, data_pack_in / data_pack_out);
      p = isl_printer_print_str(p, "; p++) {");
      p = isl_printer_end_line(p);
      p = isl_printer_indent(p, 2);

      p = isl_printer_start_line(p);
      new_fifo_name = concat(ctx, fifo_name, "local_in");
      p = isl_printer_print_str(p, "fifo_data = ");
      if (hls->target == XILINX_HW)
        p = print_fifo_rw_xilinx(p, new_fifo_name, 1);
      else if (hls->target == TAPA_HW)
        p = print_fifo_rw_tapa(p, new_fifo_name, 1);
      else if (hls->target == INTEL_HW) 
        p = print_fifo_rw_intel(p, new_fifo_name, 1);
      else if (hls->target == CATAPULT_HW)
        p = print_fifo_rw_catapult(p, new_fifo_name, 1);
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);

      if (hls->target == XILINX_HW) {
        if (data_pack_out == 1) {
          /* union {unsigned int ui; [type] ut;} u; */
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "union {unsigned int ui; ");
          p = isl_printer_print_str(p, module->io_groups[0]->array->type);
          p = isl_printer_print_str(p, " ut;} u;");        
          p = isl_printer_end_line(p);

          p = print_str_new_line(p, "u.ut = fifo_data;");

          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "mem_data_split[p] = ap_uint<");
          p = isl_printer_print_int(p, module->io_groups[0]->array->size * 8);
          p = isl_printer_print_str(p, ">(u.ui);");
          p = isl_printer_end_line(p);
        } else {
          p = print_str_new_line(p, "mem_data_split[p] = fifo_data;");
        }
      } else if (hls->target == INTEL_HW) {
        p = print_str_new_line(p, "mem_data_split[p] = fifo_data;");
      } else if (hls->target == TAPA_HW) {
        p = print_str_new_line(p, "mem_data_split[p] = fifo_data;");
      } else if (hls->target == CATAPULT_HW) {
        p = print_str_new_line(p, "mem_data_split[p] = fifo_data;");
      }
      
      p = isl_printer_indent(p, -2);
      p = print_str_new_line(p, "}");

      if (hls->target == XILINX_HW) {
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "mem_data = (");
        for (int i = data_pack_in / data_pack_out - 1; i >= 0; i--) {
          if (i < data_pack_in / data_pack_out - 1)
            p = isl_printer_print_str(p, ", ");
          p = isl_printer_print_str(p, "mem_data_split[");
          p = isl_printer_print_int(p, i);
          p = isl_printer_print_str(p, "]");
        }
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);
      } else if (hls->target == INTEL_HW) {
        int first = 1;
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "mem_data.data = ");
        p = isl_printer_print_str(p, "(");
        p = isl_printer_print_str(p, module->io_groups[0]->array->type);
        p = isl_printer_print_int(p, data_pack_in);
        p = isl_printer_print_str(p, ")(");

        for (int i = 0; i < data_pack_in / data_pack_out; i++) {
          if (!first)
            p = isl_printer_print_str(p, ", ");
          if (data_pack_out > 1) {
            p = isl_printer_print_str(p, "(");
            p = isl_printer_print_str(p, module->io_groups[0]->array->type);
            p = isl_printer_print_int(p, data_pack_out);
            p = isl_printer_print_str(p, ")");
          }
          p = isl_printer_print_str(p, "mem_data_split[");
          p = isl_printer_print_int(p, i);
          p = isl_printer_print_str(p, "]");
          if (data_pack_out > 1)  {
            p = isl_printer_print_str(p, ".data");
          }
          first = 0;
        }
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);
      } else if (hls->target == TAPA_HW) {
        for (int n = 0; n < data_pack_in / data_pack_out; n++) {
          if (data_pack_out == 1) {
            p = isl_printer_start_line(p);
            p = isl_printer_print_str(p, "mem_data.set(");
            p = isl_printer_print_int(p, n);
            p = isl_printer_print_str(p, ", mem_data_split[");
            p = isl_printer_print_int(p, n);
            p = isl_printer_print_str(p, "]);");
            p = isl_printer_end_line(p);
          } else {
            for (int j = 0; j < data_pack_out; j++) {
              p = isl_printer_start_line(p);
              p = isl_printer_print_str(p, "mem_data.set(");
              p = isl_printer_print_int(p, n * data_pack_out + j);
              p = isl_printer_print_str(p, ", mem_data_split[");
              p = isl_printer_print_int(p, n);
              p = isl_printer_print_str(p, "][");
              p = isl_printer_print_int(p, j);
              p = isl_printer_print_str(p, "]);");
              p = isl_printer_end_line(p);
            }
          }
        }
      } else if (hls->target == CATAPULT_HW) {
        for (int i = 0; i < data_pack_in / data_pack_out; i++) {
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "mem_data.set_slc(");
          p = isl_printer_print_int(p, i * data_pack_out * module->io_groups[0]->array->size * 8);
          p = isl_printer_print_str(p, ", mem_data_split[");
          p = isl_printer_print_int(p, i);
          p = isl_printer_print_str(p, "]);");
          p = isl_printer_end_line(p);
        }
      }

      if (hls->target == XILINX_HW ||
          hls->target == TAPA_HW ||
          hls->target == CATAPULT_HW) {
        p = isl_printer_start_line(p);
        if (axi_stream) {
          //char *fifo_name;
          //isl_printer *p_str;
          //p_str = isl_printer_to_str(ctx);
          //p_str = isl_printer_print_str(p_str,"fifo_");
          //p_str = isl_printer_print_str(p_str, module->io_groups[0]->array->name);
          //fifo_name = isl_printer_get_str(p_str);
          //isl_printer_free(p_str);

          if (hls->target == XILINX_HW)
            p = print_fifo_rw_xilinx(p, fifo_name, 0);
          else if (hls->target == TAPA_HW)
            p = print_fifo_rw_tapa(p, fifo_name, 0);
          else if (hls->target == INTEL_HW)
            p = print_fifo_rw_intel(p, fifo_name, 0);
          else if (hls->target == CATAPULT_HW)
            p = print_fifo_rw_catapult(p, fifo_name, 0);
          p = isl_printer_print_str(p, "mem_data);");
          p = isl_printer_print_str(p, ";");

          //free(fifo_name);  
        } else {
          p = isl_printer_print_str(p, module->io_groups[0]->array->name);
          p = isl_printer_print_str(p, "[i] = mem_data;");
        }
        p = isl_printer_end_line(p);
      } else {
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "__burst_coalesced_store(&");
        p = isl_printer_print_str(p, module->io_groups[0]->array->name);
        p = isl_printer_print_str(p, "[i], mem_data);");
        p = isl_printer_end_line(p);
      }

      p = isl_printer_indent(p, -2);
      p = print_str_new_line(p, "}");

      free(new_fifo_name);
    }
  }

  free(fifo_name);
  return p;
}

/* Write the sparse-format macro definitions to the stream "out".
 *
 * The emitted text is, in order: an opening marker comment, the
 * definitions of VEC_LEN, NON_ZERO_NUM, COMPRESS_RATIO, META_DATA_NUM and
 * EFF_COMPRESS_RATIO, and a closing marker comment.  The numeric values
 * come from kernel->vec_len, kernel->n_nzero and kernel->n_meta_data;
 * the two ratio macros are emitted as expressions over the other macros.
 *
 * A dedicated printer is created on "out" and freed before returning.
 */
static void print_sparse_macros_to_file(struct autosa_kernel *kernel, FILE *out)
{
  isl_printer *p;

  p = isl_printer_to_file(kernel->ctx, out);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  p = print_str_new_line(p, "/* Sparse Macros */");

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "#define VEC_LEN ");
  p = isl_printer_print_int(p, kernel->vec_len);
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "#define NON_ZERO_NUM ");
  p = isl_printer_print_int(p, kernel->n_nzero);
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "#define COMPRESS_RATIO (VEC_LEN/NON_ZERO_NUM)");

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "#define META_DATA_NUM ");
  p = isl_printer_print_int(p, kernel->n_meta_data);
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "#define EFF_COMPRESS_RATIO (VEC_LEN/(NON_ZERO_NUM+META_DATA_NUM))");

  p = print_str_new_line(p, "/* Sparse Macros */");
  /* Extra line end after the closing marker; presumably this leaves a
   * blank separator line in the generated header.
   */
  p = isl_printer_end_line(p);

  isl_printer_free(p);
}

/* Print the macros for the sparse data structure.
 *
 * The macros are always written to the kernel header (hls->kernel_h).
 * When hls->hls is 0, the same set of macros is additionally written to
 * the host header (hls->host_h).
 *
 * Always returns isl_stat_ok; printer errors are not inspected.
 */
isl_stat print_sparse_macros(struct autosa_kernel *kernel, struct hls_info *hls)
{
  print_sparse_macros_to_file(kernel, hls->kernel_h);

  if (hls->hls == 0)
    print_sparse_macros_to_file(kernel, hls->host_h);

  return isl_stat_ok;
}

/* Print the argument list of a drain merge function, either as it appears
 * in the function declaration ("types" set) or as it appears in a call
 * ("types" not set).
 *
 * The arguments are printed, comma separated, in the following order:
 * - the module identifiers ("idx", "idy", "idz"), one per element of
 *   func->inst_ids
 * - the parameters of the space of kernel->arrays
 * - the host loop iterators (the set dimensions of kernel->space)
 * - the array accessed by the module, twice: first as the merge
 *   destination, then as the merge source
 *
 * In a declaration, identifiers and host iterators are typed with the
 * AST iterator type configured on the context, parameters are typed
 * "int", and the two array arguments are named "<array>_to" and
 * "<array>_from".  Their type is a plain element pointer when "hls" is
 * set and a reference to an aligned std::vector otherwise.
 * In a call, the array arguments are "dev_<array>[0]" and
 * "dev_<array>[idx]".
 */
__isl_give isl_printer *print_drain_merge_arguments(
    __isl_take isl_printer *p,
    struct autosa_kernel *kernel,
    struct autosa_array_ref_group *group,
    struct autosa_drain_merge_func *func,
    int types,
    int hls)
{
  static const char *const module_id_names[] = {"idx", "idy", "idz"};
  /* Name suffix (declaration) and subscript (call) of the two array
   * arguments, destination first. */
  static const char *const decl_suffix[2] = {"_to", "_from"};
  static const char *const call_index[2] = {"[0]", "[idx]"};
  /* Number of arguments printed so far; a separator is needed before
   * every argument except the very first one. */
  int n_args = 0;
  int n_ids, n_params, n_host_iters;
  isl_space *array_space;
  const char *iter_type;
  struct autosa_local_array_info *local_array;

  iter_type = isl_options_get_ast_iterator_type(kernel->ctx);

  /* Module identifiers. */
  n_ids = isl_id_list_n_id(func->inst_ids);
  for (int id = 0; id < n_ids; id++, n_args++) {
    if (n_args > 0)
      p = isl_printer_print_str(p, ", ");
    if (types) {
      p = isl_printer_print_str(p, iter_type);
      p = isl_printer_print_str(p, " ");
    }
    p = isl_printer_print_str(p, module_id_names[id]);
  }

  /* Parameters. */
  array_space = isl_union_set_get_space(kernel->arrays);
  n_params = isl_space_dim(array_space, isl_dim_param);
  for (int k = 0; k < n_params; k++, n_args++) {
    const char *param_name =
        isl_space_get_dim_name(array_space, isl_dim_param, k);

    if (n_args > 0)
      p = isl_printer_print_str(p, ", ");
    if (types)
      p = isl_printer_print_str(p, "int ");
    p = isl_printer_print_str(p, param_name);
  }
  isl_space_free(array_space);

  /* Host loop iterators. */
  n_host_iters = isl_space_dim(kernel->space, isl_dim_set);
  for (int k = 0; k < n_host_iters; k++, n_args++) {
    const char *iter_name;

    if (n_args > 0)
      p = isl_printer_print_str(p, ", ");
    iter_name = isl_space_get_dim_name(kernel->space, isl_dim_set, k);
    if (types) {
      p = isl_printer_print_str(p, iter_type);
      p = isl_printer_print_str(p, " ");
    }
    p = isl_printer_print_str(p, iter_name);
  }

  /* Arrays: destination first, then source. */
  local_array = group->local_array;
  for (int a = 0; a < 2; a++, n_args++) {
    if (n_args > 0)
      p = isl_printer_print_str(p, ", ");

    if (types) {
      if (hls) {
        p = isl_printer_print_str(p, local_array->array->type);
        p = isl_printer_print_str(p, " *");
      } else {
        p = isl_printer_print_str(p, "std::vector<");
        p = isl_printer_print_str(p, local_array->array->type);
        p = isl_printer_print_str(p, ", aligned_allocator<");
        p = isl_printer_print_str(p, local_array->array->type);
        p = isl_printer_print_str(p, ">> &");
      }
      p = isl_printer_print_str(p, local_array->array->name);
      p = isl_printer_print_str(p, decl_suffix[a]);
    } else {
      p = isl_printer_print_str(p, "dev_");
      p = isl_printer_print_str(p, local_array->array->name);
      p = isl_printer_print_str(p, call_index[a]);
    }
  }

  return p;
}

/* Context passed as the "user" pointer to print_module_stmt().
 * Callers in this file initialize it positionally as
 * {hls, NULL, NULL, NULL}, so the field order matters.
 */
struct print_hw_module_data
{
  /* Forwarded by print_module_stmt() to the autosa_kernel_print_* helpers. */
  struct hls_info *hls;
  /* Not read by print_module_stmt(). */
  struct autosa_prog *prog;
  /* print_module_stmt() reads module->options->autosa->double_buffer_style
   * for IO transfer / DRAM statements. */
  struct autosa_hw_module *module;
  /* Used for double buffer codegen. Modify the printed iterator prefix. */
  const char *iterator_prefix;
};

/* Print one "<prefix>_drain_merge" helper function for every entry of "funcs".
 * The output goes to hls->kernel_h when hls->hls is set and to hls->host_h
 * otherwise. Nothing is printed when "n_funcs" is zero.
 * Always returns isl_stat_ok.
 */
isl_stat print_drain_merge_funcs(
    struct autosa_kernel *kernel,
    struct autosa_drain_merge_func **funcs, int n_funcs,
    struct hls_info *hls)
{
  if (n_funcs == 0)
    return isl_stat_ok;

  isl_ctx *ctx = kernel->ctx;
  FILE *out = hls->hls ? hls->kernel_h : hls->host_h;
  isl_printer *p = isl_printer_to_file(ctx, out);

  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  for (int idx = 0; idx < n_funcs; ++idx)
  {
    struct autosa_drain_merge_func *func = funcs[idx];
    struct print_hw_module_data hw_data = {hls, NULL, NULL, NULL};
    isl_ast_print_options *opts;

    /* Function signature. */
    p = print_str_new_line(p, "/* Helper Function */");
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, hls->hls ? "inline void " : "void ");
    p = autosa_array_ref_group_print_prefix(func->group, p);
    p = isl_printer_print_str(p, "_drain_merge(");
    p = print_drain_merge_arguments(p, kernel, func->group, func, 1, hls->hls);
    p = isl_printer_print_str(p, "){");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, 2);

    /* Iterator declarations, wrapped in marker comments. */
    p = print_str_new_line(p, "/* Variable Declaration */");
    p = print_func_iterators(p, out, func);
    p = print_str_new_line(p, "/* Variable Declaration */");
    p = isl_printer_end_line(p);

    /* Function body, printed from the AST of the helper. */
    opts = isl_ast_print_options_alloc(ctx);
    opts = isl_ast_print_options_set_print_user(opts, &print_module_stmt, &hw_data);
    p = isl_ast_node_print(func->device_tree, p, opts);

    /* Closing brace and trailing marker. */
    p = isl_printer_indent(p, -2);
    p = print_str_new_line(p, "}");
    p = print_str_new_line(p, "/* Helper Function */");
    p = isl_printer_end_line(p);
  }
  isl_printer_free(p);

  return isl_stat_ok;
}

/* isl AST "print user" callback for hardware module code.
 *
 * The autosa_kernel_stmt attached to "node" as its annotation is printed by
 * the helper matching the statement type. "user" must point to a
 * struct print_hw_module_data. "print_options" is freed here.
 *
 * For IO transfer / DRAM statements the iterator prefix from "user" is only
 * forwarded when a module is available and its double_buffer_style option
 * is 0; otherwise NULL is passed. Callers such as print_drain_merge_funcs()
 * and print_host_serialize_funcs() leave the module pointer NULL, so it must
 * not be dereferenced unconditionally.
 *
 * Statement types without a dedicated printer leave "p" untouched.
 */
__isl_give isl_printer *print_module_stmt(__isl_take isl_printer *p,
                                          __isl_take isl_ast_print_options *print_options,
                                          __isl_keep isl_ast_node *node, void *user)
{
  isl_id *id;
  struct autosa_kernel_stmt *stmt;
  struct print_hw_module_data *hw_data = (struct print_hw_module_data *)(user);
  struct autosa_hw_module *module = hw_data->module;
  const char *iterator_prefix = NULL;

  id = isl_ast_node_get_annotation(node);
  stmt = (struct autosa_kernel_stmt *)isl_id_get_user(id);
  isl_id_free(id);

  isl_ast_print_options_free(print_options);

  switch (stmt->type)
  {
    case AUTOSA_KERNEL_STMT_DOMAIN:
      return autosa_kernel_print_domain(p, stmt);
    case AUTOSA_KERNEL_STMT_IO:
      return autosa_kernel_print_io(p, stmt, hw_data->hls);
    case AUTOSA_KERNEL_STMT_IO_TRANSFER:
    case AUTOSA_KERNEL_STMT_IO_DRAM:
      /* Guard against a missing module before reading its options. */
      if (module && module->options->autosa->double_buffer_style == 0)
        iterator_prefix = hw_data->iterator_prefix;
      return autosa_kernel_print_io_transfer_wrapper(
        p, stmt, hw_data->hls, iterator_prefix);
    case AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTER_TRANS:
      return autosa_kernel_print_inter_trans(p, stmt, hw_data->hls);
    case AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTRA_TRANS:
      return autosa_kernel_print_intra_trans(p, stmt, hw_data->hls);
    case AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTER_INTRA:
      return autosa_kernel_print_inter_intra(p, stmt, hw_data->hls);
    case AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTRA_INTER:
      return autosa_kernel_print_intra_inter(p, stmt, hw_data->hls);
    case AUTOSA_KERNEL_STMT_IO_MODULE_CALL_STATE_HANDLE:
      return autosa_kernel_print_state_handle(p, stmt, hw_data->hls);
    case AUTOSA_KERNEL_STMT_DRAIN_MERGE:
      return autosa_kernel_print_drain_merge(p, stmt, hw_data->hls);
    case AUTOSA_KERNEL_STMT_HOST_SERIALIZE:
      return autosa_kernel_print_host_serialize(p, stmt, hw_data->hls);
    default:
      break;
  }

  return p;
}

/* Print a host (de)serialization helper for every module in "modules" that
 * has a serialize_tree. Modules with module->in set get a
 * "host_serialize_<array>" function, the others "host_deserialize_<array>".
 * The output goes to hls->kernel_h when hls->hls is set and to hls->host_h
 * otherwise. Always returns isl_stat_ok.
 */
isl_stat print_host_serialize_funcs(
    struct autosa_kernel *kernel,
    struct autosa_hw_module **modules,
    int n_modules, struct hls_info *hls)
{
  isl_ctx *ctx = kernel->ctx;
  FILE *out = hls->hls ? hls->kernel_h : hls->host_h;
  isl_printer *p = isl_printer_to_file(ctx, out);

  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  for (int m = 0; m < n_modules; ++m) {
    struct autosa_hw_module *module = modules[m];
    struct print_hw_module_data hw_data = {hls, NULL, NULL, NULL};
    isl_ast_print_options *opts;

    /* Only modules carrying a serialization tree get a helper. */
    if (!module->serialize_tree)
      continue;

    /* Function signature. */
    p = print_str_new_line(p, "/* Helper Function */");
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, hls->hls ? "inline void " : "void ");
    p = isl_printer_print_str(p, module->in ? "host_serialize_" : "host_deserialize_");
    p = isl_printer_print_str(p, module->io_groups[0]->array->name);
    p = isl_printer_print_str(p, "(");
    p = print_host_serialize_arguments(p, kernel, module->io_groups[0], module, 1, hls->hls);
    p = isl_printer_print_str(p, "){");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, 2);

    /* Local counter used by the generated body. */
    p = print_str_new_line(p, "/* Variable Declaration */");
    p = print_str_new_line(p, "unsigned int cnt = 0;");
    p = print_str_new_line(p, "/* Variable Declaration */");
    p = isl_printer_end_line(p);

    /* Function body, printed from the serialization AST. */
    opts = isl_ast_print_options_alloc(ctx);
    opts = isl_ast_print_options_set_print_user(opts, &print_module_stmt, &hw_data);
    p = isl_ast_node_print(module->serialize_tree, p, opts);

    /* Closing brace and trailing marker. */
    p = isl_printer_indent(p, -2);
    p = print_str_new_line(p, "}");
    p = print_str_new_line(p, "/* Helper Function */");
    p = isl_printer_end_line(p);
  }
  isl_printer_free(p);

  return isl_stat_ok;
}

/* isl AST "print user" callback for CPU code.
 * The autosa_kernel_stmt stored in the annotation of "node" supplies the pet
 * statement and the reference-to-expression mapping handed to
 * pet_stmt_print_body(). "print_options" is freed; "user" is not used.
 */
__isl_give isl_printer *print_cpu_user(__isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options,
	__isl_keep isl_ast_node *node, void *user)
{
	isl_id *annotation = isl_ast_node_get_annotation(node);
	struct autosa_kernel_stmt *kernel_stmt =
		(struct autosa_kernel_stmt *)isl_id_get_user(annotation);

	isl_id_free(annotation);
	isl_ast_print_options_free(print_options);

	return pet_stmt_print_body(kernel_stmt->u.d.stmt->stmt, p,
				   kernel_stmt->u.d.ref2expr);
}


================================================
FILE: src/autosa_print.h
================================================
#ifndef _AUTOSA_PRINT_H
#define _AUTOSA_PRINT_H

#include <isl/printer.h>

#include "autosa_common.h"

/* Arrays */
__isl_give isl_printer *autosa_array_info_print_call_argument(
    __isl_take isl_printer *p, struct autosa_array_info *array, int n_ref, const char *prefix);
__isl_give isl_printer *autosa_array_ref_group_print_prefix(
    struct autosa_array_ref_group *group, __isl_take isl_printer *p);
__isl_give isl_printer *autosa_array_ref_group_print_fifo_name(
    struct autosa_array_ref_group *group, __isl_take isl_printer *p);
__isl_give isl_printer *autosa_print_types(__isl_take isl_printer *p,
                                           struct autosa_types *types, struct autosa_prog *prog);
__isl_give isl_printer *autosa_print_local_declarations(
    __isl_take isl_printer *p, struct autosa_prog *prog);
__isl_give isl_printer *autosa_array_info_print_data_size(
    __isl_take isl_printer *p, struct autosa_array_info *array);
__isl_give isl_printer *autosa_array_info_print_size(
    __isl_take isl_printer *p, struct autosa_array_info *array);
__isl_give isl_printer *autosa_array_info_print_serialize_data_size(
    __isl_take isl_printer *p, struct autosa_array_info *array);    
__isl_give isl_printer *autosa_array_info_print_serialize_size(
    __isl_take isl_printer *p, struct autosa_array_info *array);    
__isl_give isl_printer *autosa_print_array_type(__isl_take isl_printer *p,
                                                struct autosa_array_info *array);
__isl_give isl_printer *autosa_print_array_type_with_lane(
    __isl_take isl_printer *p,
    struct autosa_array_info *array, int n_lane);
__isl_give isl_printer *autosa_print_array_type_with_lane_sparse(
    __isl_take isl_printer *p,
    struct autosa_array_info *array, int n_lane);
__isl_give isl_printer *autosa_array_info_print_declaration_argument(
    __isl_take isl_printer *p, struct autosa_array_info *array, int n_lane,
    const char *memory_space, int n_ref);
/* NOTE(review): this prototype takes "struct polysa_array_info", while the
 * neighbouring array prototypes take "struct autosa_array_info" — possibly
 * a stale type name; confirm against the definition of this function. */
__isl_give isl_printer *autosa_module_array_info_print_call_argument(
    __isl_take isl_printer *p, struct polysa_array_info *array);
__isl_give isl_printer *autosa_print_var_initialization(
    __isl_take isl_printer *p, struct autosa_kernel_var *var, enum platform target);

/* Utils */
__isl_give isl_printer *print_str_new_line(__isl_take isl_printer *p, const char *str);
__isl_give isl_printer *autosa_print_macros(__isl_take isl_printer *p,
                                            __isl_keep isl_ast_node *node);

/* Kernel */
__isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
                                               struct autosa_prog *prog, struct autosa_kernel *kernel,
                                               int types, struct hls_info *hls);
__isl_give isl_printer *print_kernel_header(
    __isl_take isl_printer *p, struct autosa_prog *prog, 
    struct autosa_kernel *kernel, struct hls_info *hls, int types);

/* HW modules */
__isl_give isl_printer *print_module_iterators(
    __isl_take isl_printer *p, FILE *out, struct autosa_hw_module *module);
__isl_give isl_printer *print_module_arguments(
    __isl_take isl_printer *p,
    struct autosa_prog *prog,
    struct autosa_kernel *kernel,
    struct autosa_hw_module *module, int types,
    enum platform target,
    int inter, int arb, int boundary, int serialize);
__isl_give isl_printer *print_pe_dummy_module_arguments(
    __isl_take isl_printer *p,
    struct autosa_prog *prog,
    struct autosa_kernel *kernel,
    struct autosa_pe_dummy_module *pe_dummy_module,
    int types,
    enum platform target);
void print_top_gen_headers(
    struct autosa_prog *prog, struct autosa_hw_top_module *top, struct hls_info *hls);
__isl_give isl_printer *print_top_gen_arguments(__isl_take isl_printer *p,
                                                struct autosa_prog *prog, struct autosa_kernel *kernel, int types);
__isl_give isl_printer *autosa_kernel_print_module_call(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct autosa_prog *prog,
    enum platform target);
__isl_give isl_printer *autosa_kernel_print_module_call_inst(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct autosa_prog *prog,
    enum platform target);    
__isl_give isl_printer *print_func_iterators(
    __isl_take isl_printer *p,
    FILE *out,
    struct autosa_drain_merge_func *func);
__isl_give isl_printer *print_serialize_counter(
    __isl_take isl_printer *p, 
    struct autosa_hw_module *module);
__isl_give isl_printer *print_host_serialize_arguments(
    __isl_take isl_printer *p,
    struct autosa_kernel *kernel,
    struct autosa_array_ref_group *group,
    struct autosa_hw_module *module,
    int types,
    int hls);    

/* FIFOs */
__isl_give isl_printer *autosa_fifo_print_declaration_arguments(
    __isl_take isl_printer *p, struct autosa_array_ref_group *group, int n_lane,
    const char *suffix, enum platform target, int fifo_depth, const char *direction);
__isl_give isl_printer *autosa_fifo_print_call_argument(
    __isl_take isl_printer *p, struct autosa_array_ref_group *group,
    const char *suffix, enum platform target);
__isl_give isl_printer *autosa_kernel_print_fifo_decl(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct autosa_prog *prog, struct hls_info *hls);

/* Statements */
__isl_give isl_printer *autosa_kernel_print_domain(__isl_take isl_printer *p,
                                                   struct autosa_kernel_stmt *stmt);
__isl_give isl_printer *autosa_kernel_print_io(__isl_take isl_printer *p,
                                               struct autosa_kernel_stmt *stmt, struct hls_info *hls);
__isl_give isl_printer *autosa_kernel_print_io_transfer(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct hls_info *hls, const char *iterator_prefix);
__isl_give isl_printer *autosa_kernel_print_io_dram(__isl_take isl_printer *p,
                                                    struct autosa_kernel_stmt *stmt, struct hls_info *hls);
__isl_give isl_printer *autosa_kernel_print_inter_trans(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct hls_info *hls);
__isl_give isl_printer *autosa_kernel_print_intra_trans(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct hls_info *hls);
__isl_give isl_printer *autosa_kernel_print_intra_inter(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct hls_info *hls);
__isl_give isl_printer *autosa_kernel_print_inter_intra(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct hls_info *hls);
__isl_give isl_printer *autosa_kernel_print_state_handle(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct hls_info *hls);
__isl_give isl_printer *autosa_kernel_print_drain_merge(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt, struct hls_info *hls);
__isl_give isl_printer *autosa_kernel_print_host_serialize(
    __isl_take isl_printer *p,
    struct autosa_kernel_stmt *stmt,
    struct hls_info *hls);    
__isl_give isl_printer *print_module_serialize_body(
    __isl_take isl_printer *p, struct autosa_hw_module *module, struct hls_info *hls);    
/* isl AST "print user" callback: prints the autosa_kernel_stmt attached to
 * "node" as annotation by dispatching on the statement type. "user" is cast
 * to struct print_hw_module_data *; "print_options" is freed. */
__isl_give isl_printer *print_module_stmt(__isl_take isl_printer *p,
                                          __isl_take isl_ast_print_options *print_options,
                                          __isl_keep isl_ast_node *node, void *user);
/* isl AST "print user" callback: prints the body of the pet statement
 * attached to "node" via pet_stmt_print_body(). "print_options" is freed;
 * "user" is not used. */
__isl_give isl_printer *print_cpu_user(
    __isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options,
	__isl_keep isl_ast_node *node, void *user);

/* Xilinx-specific */
__isl_give isl_printer *print_fifo_type_xilinx(__isl_take isl_printer *p,
                                               struct autosa_array_ref_group *group, int n_lane);
__isl_give isl_printer *print_fifo_rw_xilinx(__isl_take isl_printer *p,
                                             const char *fifo_name, int read);

/* Intel-specific */
__isl_give isl_printer *print_fifo_type_intel(__isl_take isl_printer *p,
                                              struct autosa_array_ref_group *group, int n_lane);
__isl_give isl_printer *print_fifo_rw_intel(__isl_take isl_printer *p,
                                            const char *fifo_name, int read);

/* Catapult-specific */
__isl_give isl_printer *print_fifo_type_catapult(__isl_take isl_printer *p,
                                                 struct autosa_array_ref_group *group, int n_lane);
__isl_give isl_printer *print_fifo_rw_catapult(__isl_take isl_printer *p,
                                               const char *fifo_name, int read);                                                 

/* TAPA-specific */
__isl_give isl_printer *print_fifo_type_tapa(__isl_take isl_printer *p,
                                             struct autosa_array_ref_group *group,
                                             int n_lane, int fifo_depth, const char *suffix);
__isl_give isl_printer *print_fifo_rw_tapa(__isl_take isl_printer *p,
                                           const char *fifo_name, int read);

/* Sparse */
isl_stat print_sparse_macros(struct autosa_kernel *kernel, struct hls_info *hls);

/* Host functions */
/* Prints the argument list placed between the parentheses of a
 * "<prefix>_drain_merge" helper; print_drain_merge_funcs() calls it with
 * types = 1 and hls = hls->hls. */
__isl_give isl_printer *print_drain_merge_arguments(
    __isl_take isl_printer *p,
    struct autosa_kernel *kernel,
    struct autosa_array_ref_group *group,
    struct autosa_drain_merge_func *func,
    int types,
    int hls);
/* Prints one "<prefix>_drain_merge" helper per entry of "funcs" into
 * hls->kernel_h (when hls->hls is set) or hls->host_h. Returns isl_stat_ok;
 * prints nothing when "n_funcs" is 0. */
isl_stat print_drain_merge_funcs(
    struct autosa_kernel *kernel,
    struct autosa_drain_merge_func **funcs, int n_funcs,
    struct hls_info *hls);
/* Prints a "host_serialize_<array>" or "host_deserialize_<array>" helper for
 * every module that has a serialize_tree, into hls->kernel_h (when hls->hls
 * is set) or hls->host_h. Returns isl_stat_ok. */
isl_stat print_host_serialize_funcs(
    struct autosa_kernel *kernel,
    struct autosa_hw_module **modules,
    int n_modules, struct hls_info *hls);

#endif


================================================
FILE: src/autosa_schedule_tree.cpp
================================================
/* This file defines functions used to manipulate the schedule trees in AutoSA.
 */
#include <isl/ctx.h>
#include <isl/schedule_node.h>

#include "autosa_common.h"
#include "autosa_utils.h"
#include "autosa_schedule_tree.h"

/* Check whether "node" is a mark node whose identifier is named "name".
 * Returns 1 on a match, 0 when "node" is not a mark node or the name
 * differs, and -1 when "node" is NULL or its mark id cannot be obtained.
 */
int is_marked(__isl_keep isl_schedule_node *node, const char *name)
{
  if (!node)
    return -1;
  if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
    return 0;

  isl_id *id = isl_schedule_node_mark_get_id(node);
  if (!id)
    return -1;

  int same_name = (strcmp(isl_id_get_name(id), name) == 0);
  isl_id_free(id);

  return same_name;
}

/* Build an isl_multi_val in "space" whose i-th element is list[i].
 * "list" is read for as many entries as "space" has set dimensions.
 * Returns NULL when "space" is NULL.
 */
static __isl_give isl_multi_val *multi_val_from_int_list(
    __isl_take isl_space *space, int *list)
{
  if (!space)
    return NULL;

  /* Query the space before isl_multi_val_zero() takes it over. */
  isl_ctx *ctx = isl_space_get_ctx(space);
  int n_dim = isl_space_dim(space, isl_dim_set);
  isl_multi_val *result = isl_multi_val_zero(space);

  for (int pos = 0; pos < n_dim; ++pos)
    result = isl_multi_val_set_val(result, pos,
                                   isl_val_int_from_si(ctx, list[pos]));

  return result;
}

/* Turn the int array "tile_size" into an isl_multi_val living in the space
 * of the band node "node". Returns NULL when "node" is NULL.
 */
__isl_give isl_multi_val *construct_band_tile_sizes(
    __isl_keep isl_schedule_node *node, int *tile_size)
{
  return node
             ? multi_val_from_int_list(isl_schedule_node_band_get_space(node),
                                       tile_size)
             : NULL;
}

/* Free "prop" together with the partial schedule and the per-member arrays
 * it owns. A NULL "prop" is accepted. Always returns NULL.
 *
 * NOTE(review): only the "iter" array itself is released; whether the
 * entries stored in it need separate cleanup is not visible here — confirm.
 */
struct autosa_node_band_prop *autosa_node_band_prop_free(
    __isl_take struct autosa_node_band_prop *prop)
{
  if (!prop)
    return NULL;

  isl_multi_union_pw_aff_free(prop->mupa);
  free(prop->coincident);
  free(prop->space_time);
  free(prop->pe_opt);
  free(prop->sched_pos);
  free(prop->iter);

  free(prop);

  return NULL;
}

/* Extract the properties of the band node "node" into a freshly allocated
 * struct autosa_node_band_prop: partial schedule, number of members,
 * permutable flag and, per member, the coincident, space_time, pe_opt,
 * sched_pos and iter attributes.
 *
 * The result must be released with autosa_node_band_prop_free().
 * Returns NULL when "node" is NULL or an allocation fails.
 *
 * NOTE(review): the "iter" array is allocated with "void *" elements, which
 * assumes the struct declares it as "void **iter" — confirm against the
 * definition of struct autosa_node_band_prop.
 */
struct autosa_node_band_prop *extract_node_band_prop(__isl_keep isl_schedule_node *node)
{
  isl_ctx *ctx;
  struct autosa_node_band_prop *prop;

  if (!node)
    return NULL;

  ctx = isl_schedule_node_get_ctx(node);
  prop = isl_calloc_type(ctx, struct autosa_node_band_prop);
  if (!prop)
    return NULL;

  prop->mupa = isl_schedule_node_band_get_partial_schedule(node);
  prop->n_member = isl_schedule_node_band_n_member(node);
  prop->permutable = isl_schedule_node_band_get_permutable(node);

  prop->coincident = isl_calloc_array(ctx, int, prop->n_member);
  prop->space_time = isl_calloc_array(ctx, enum autosa_loop_type, prop->n_member);
  prop->pe_opt = isl_calloc_array(ctx, enum autosa_loop_type, prop->n_member);
  prop->sched_pos = isl_calloc_array(ctx, int, prop->n_member);
  /* The per-member iterator array was previously written without ever
   * being allocated. */
  prop->iter = isl_calloc_array(ctx, void *, prop->n_member);

  if (prop->n_member > 0 &&
      (!prop->coincident || !prop->space_time || !prop->pe_opt ||
       !prop->sched_pos || !prop->iter))
    return autosa_node_band_prop_free(prop);

  for (int i = 0; i < prop->n_member; i++)
  {
    prop->coincident[i] = isl_schedule_node_band_member_get_coincident(node, i);
    prop->space_time[i] = isl_schedule_node_band_member_get_space_time(node, i);
    prop->pe_opt[i] = isl_schedule_node_band_member_get_pe_opt(node, i);
    prop->sched_pos[i] = isl_schedule_node_band_member_get_sched_pos(node, i);
    prop->iter[i] = isl_schedule_node_band_member_get_iter(node, i);
  }

  return prop;
}

/* Is "node" a band node that is marked permutable and has at least one
 * member? Returns isl_bool_error when "node" is NULL.
 */
isl_bool is_permutable_node(__isl_keep isl_schedule_node *node)
{
  if (!node)
    return isl_bool_error;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_band ||
      !isl_schedule_node_band_get_permutable(node) ||
      isl_schedule_node_band_n_member(node) < 1)
    return isl_bool_false;

  return isl_bool_true;
}

/* Traversal callback that counts permutable band nodes.
 * "user" is an isl_val acting as the counter; it is incremented by one for
 * every node accepted by is_permutable_node().
 * Returns isl_bool_error for a NULL node and isl_bool_true otherwise, so
 * the traversal is never stopped by a non-permutable node.
 *
 * NOTE(review): the value returned by isl_val_add_ui() is only stored in a
 * local; the caller's counter is updated only if isl modifies the value in
 * place — confirm this is guaranteed for the counter passed in.
 */
static isl_bool is_permutable_node_cnt(
    __isl_keep isl_schedule_node *node, void *user)
{
  isl_val *counter = (isl_val *)user;

  if (!node)
    return isl_bool_error;
  if (is_permutable_node(node) != isl_bool_true)
    return isl_bool_true;

  counter = isl_val_add_ui(counter, 1);

  return isl_bool_true;
}

/* Does "schedule" contain exactly one permutable band node?
 *
 * Every descendant of the root is visited with is_permutable_node_cnt(),
 * which counts the permutable band nodes. That callback accepts every
 * non-NULL node, so the count is what decides the result.
 *
 * Returns isl_bool_true only if the traversal succeeded and the count is
 * exactly one. Any isl error (including a NULL "schedule") yields
 * isl_bool_false; previously an error value, being non-zero, could be
 * mistaken for success.
 */
isl_bool has_single_permutable_node(__isl_keep isl_schedule *schedule)
{
  isl_schedule_node *root;
  isl_val *n_permutable_node;
  isl_bool visited_all;
  isl_bool exactly_one;

  if (!schedule)
    return isl_bool_false;

  root = isl_schedule_get_root(schedule);
  n_permutable_node = isl_val_zero(isl_schedule_get_ctx(schedule));
  visited_all = isl_schedule_node_every_descendant(root,
                                                   &is_permutable_node_cnt,
                                                   n_permutable_node);
  isl_schedule_node_free(root);

  /* isl_bool_error is -1 and therefore truthy: compare explicitly. */
  exactly_one = isl_val_is_one(n_permutable_node);
  isl_val_free(n_permutable_node);

  if (visited_all == isl_bool_true && exactly_one == isl_bool_true)
    return isl_bool_true;

  return isl_bool_false;
}

/* Examines if the dependence is uniform based on the partial schedule
 * in the node. We will calculate the dependence vector and examine 
 * if each dimension is a constant.
 */
isl_bool is_dep_uniform_at_node(__isl_keep isl_schedule_node *node, void *user)
{
  isl_basic_map *dep = (isl_basic_map *)(user);
  if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
    return isl_bool_true;

  /* By this stage we know that if a node is a band node, it is a 
   * permutable band node to be analyzed. 
   */
  isl_multi_union_pw_aff *p_sc = isl_schedule_node_band_get_partial_schedule(node);
  isl_union_pw_multi_aff *contraction = isl_schedule_node_get_subtree_contraction(node);
  p_sc = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(p_sc, contraction);

  isl_bool is_uniform = isl_bool_true;
  for (int i = 0; i < isl_schedule_node_band_n_member(node); i++)
  {
    isl_union_pw_aff *p_sc_hyp = isl_multi_union_pw_aff_get_union_pw_aff(p_sc, i);
    /* Obtain the schedule for the src statment. */
    isl_space *space = isl_basic_map_get_space(dep);
    isl_space *src_space = isl_space_domain(isl_space_copy(space));
    isl_space *dest_space = isl_space_range(space);

    isl_pw_aff *src_sc;
    isl_pw_aff_list *p_sc_hyp_list = isl_union_pw_aff_get_pw_aff_list(p_sc_hyp);
    for (int j = 0; j < isl_union_pw_aff_n_pw_aff(p_sc_hyp); j++)
    {
      isl_pw_aff *single_sc = isl_pw_aff_list_get_pw_aff(p_sc_hyp_list, j);
      isl_space *single_sc_stmt = isl_space_domain(isl_pw_aff_get_space(single_sc));
      if (isl_space_is_equal(src_space, single_sc_stmt))
      {
        isl_space_free(single_sc_stmt);
        src_sc = single_sc;
        break;
      }
      isl_pw_aff_free(single_sc);
      isl_space_free(single_sc_stmt);
    }
    isl_pw_aff_list_free(p_sc_hyp_list);
    isl_space_free(src_space);

    /* Obtain the schedule for the dest statement. */
    isl_pw_aff *dest_sc;
    p_sc_hyp_list = isl_union_pw_aff_get_pw_aff_list(p_sc_hyp);
    for (int j = 0; j < isl_union_pw_aff_n_pw_aff(p_sc_hyp); j++)
    {
      isl_pw_aff *single_sc = isl_pw_aff_list_get_pw_aff(p_sc_hyp_list, j);
      isl_space *single_sc_stmt = isl_space_domain(isl_pw_aff_get_space(single_sc));
      if (isl_space_is_equal(dest_space, single_sc_stmt))
      {
        isl_space_free(single_sc_stmt);
        dest_sc = single_sc;
        break;
      }
      isl_pw_aff_free(single_sc);
      isl_space_free(single_sc_stmt);
    }
    isl_pw_aff_list_free(p_sc_hyp_list);
    isl_space_free(dest_space);

    /* Compute the dependence distance at the current hyperplane. */
    /* Step 1: Extend the scheduling function. */
    isl_size src_sc_dim = isl_pw_aff_dim(src_sc, isl_dim_in);
    isl_size dest_sc_dim = isl_pw_aff_dim(dest_sc, isl_dim_in);
    src_sc = isl_pw_aff_insert_dims(src_sc, isl_dim_in, src_sc_dim, dest_sc_dim);
    dest_sc = isl_pw_aff_insert_dims(dest_sc, isl_dim_in, 0, src_sc_dim);
    for (int j = 0; j < dest_sc_dim; j++)
    {
      isl_pw_aff_set_dim_id(src_sc, isl_dim_in, src_sc_dim + j, isl_pw_aff_get_dim_id(dest_sc, isl_dim_in, src_sc_dim + j));
    }
    for (int j = 0; j < src_sc_dim; j++)
    {
      isl_pw_aff_set_dim_id(dest_sc, isl_dim_in, j, isl_pw_aff_get_dim_id(src_sc, isl_dim_in, j));
    }

    isl_pw_aff *dis_sc = isl_pw_aff_sub(dest_sc, src_sc);

    /* Step 2: Convert the basic_map into basic_set. */
    isl_mat *eq_mat = isl_basic_map_equalities_matrix(dep,
                                                      isl_dim_in, isl_dim_out, isl_dim_div, isl_dim_param, isl_dim_cst);
    isl_mat *ieq_mat = isl_basic_map_inequalities_matrix(dep,
                                                         isl_dim_in, isl_dim_out, isl_dim_div, isl_dim_param, isl_dim_cst);

    isl_basic_set *dep_set = isl_basic_set_from_constraint_matrices(
        isl_space_domain(isl_pw_aff_get_space(dis_sc)),
        eq_mat, ieq_mat,
        isl_dim_set, isl_dim_div, isl_dim_param, isl_dim_cst);

    /* Step 3: Intersect the scheduling function with the domain. */
    isl_pw_aff *dis = isl_pw_aff_intersect_domain(dis_sc,
                                                  isl_set_from_basic_set(isl_basic_set_copy(dep_set)));

    isl_union_pw_aff_free(p_sc_hyp);
    isl_basic_set_free(dep_set);

    /* Examine if the dependence distance is constant. */
    if (!isl_pw_aff_is_cst(dis))
    {
      is_uniform = isl_bool_false;
      isl_pw_aff_free(dis);
      break;
    }

    isl_pw_aff_free(dis);
  }

  isl_multi_union_pw_aff_free(p_sc);
  return is_uniform;
}

/* Check whether the dependence "bmap" (of the form S1[] -> S2[]) has a
 * constant distance vector under the schedule passed in "user".
 *
 * The full schedule of the tree is applied to both the domain and the range
 * of the dependence. Identity functions over the scheduled source and sink
 * are then extended to a common input space and subtracted, the difference
 * is restricted to the scheduled dependence, and the result is tested with
 * isl_multi_pw_aff_is_cst().
 *
 * "bmap" is consumed. The return value is that of isl_multi_pw_aff_is_cst().
 */
isl_bool is_dep_uniform(__isl_take isl_basic_map *bmap, void *user)
{
  isl_schedule *sched = (isl_schedule *)user;

  /* Full schedule of the whole tree. */
  isl_schedule_node *top = isl_schedule_get_root(sched);
  isl_union_map *sched_map = isl_schedule_node_get_subtree_schedule_union_map(top);
  isl_schedule_node_free(top);

  /* Dependence expressed between schedule points. */
  isl_union_map *timed_dep = isl_union_map_from_map(isl_map_from_basic_map(bmap));
  timed_dep = isl_union_map_apply_domain(timed_dep, isl_union_map_copy(sched_map));
  timed_dep = isl_union_map_apply_range(timed_dep, sched_map);

  isl_map *timed_map = isl_map_from_union_map(timed_dep);
  isl_basic_map *timed_bmap = isl_basic_map_from_map(isl_map_copy(timed_map)); // TODO

  /* Identity functions built from the scheduled source and sink sets. */
  isl_map *src_ident = isl_set_identity(isl_map_domain(isl_map_copy(timed_map)));
  isl_multi_pw_aff *src_time = isl_multi_pw_aff_identity(isl_map_get_space(src_ident));
  isl_map_free(src_ident);

  isl_map *dst_ident = isl_set_identity(isl_map_range(timed_map));
  isl_multi_pw_aff *dst_time = isl_multi_pw_aff_identity(isl_map_get_space(dst_ident));
  isl_map_free(dst_ident);

  /* Add dims: source inputs first, sink inputs after them. */
  isl_size n_src = isl_multi_pw_aff_dim(src_time, isl_dim_in);
  isl_size n_dst = isl_multi_pw_aff_dim(dst_time, isl_dim_in);
  src_time = isl_multi_pw_aff_insert_dims(src_time, isl_dim_in, n_src, n_dst);
  dst_time = isl_multi_pw_aff_insert_dims(dst_time, isl_dim_in, 0, n_src);

  isl_multi_pw_aff *distance = isl_multi_pw_aff_sub(dst_time, src_time);

  /* Convert the basic map to a basic set over the combined input space. */
  isl_mat *eq = isl_basic_map_equalities_matrix(
      timed_bmap, isl_dim_in, isl_dim_out, isl_dim_div, isl_dim_param, isl_dim_cst);
  isl_mat *ineq = isl_basic_map_inequalities_matrix(
      timed_bmap, isl_dim_in, isl_dim_out, isl_dim_div, isl_dim_param, isl_dim_cst);
  isl_basic_map_free(timed_bmap);

  isl_basic_set *dep_points = isl_basic_set_from_constraint_matrices(
      isl_space_domain(isl_multi_pw_aff_get_space(distance)),
      eq, ineq,
      isl_dim_set, isl_dim_div, isl_dim_param, isl_dim_cst);

  distance = isl_multi_pw_aff_intersect_domain(distance,
                                               isl_set_from_basic_set(dep_points));

  isl_bool uniform = isl_multi_pw_aff_is_cst(distance);
  isl_multi_pw_aff_free(distance);

  return uniform;
}

/* Run is_dep_uniform() on every basic map of "map", forwarding "user".
 * The first basic map that is not reported uniform is printed to stdout,
 * followed by a newline, and isl_bool_false is returned.
 * Returns isl_bool_true if all basic maps are uniform.
 */
isl_bool is_dep_uniform_wrap(__isl_keep isl_map *map, void *user)
{
  isl_bool all_uniform = isl_bool_true;
  isl_basic_map_list *pieces = isl_map_get_basic_map_list(map);
  int n_piece = isl_map_n_basic_map(map);

  for (int k = 0; k < n_piece; k++)
  {
    /* is_dep_uniform() consumes the copy handed to it. */
    if (is_dep_uniform(isl_basic_map_list_get_basic_map(pieces, k), user) == isl_bool_true)
      continue;

    /* Print out the non-uniform dependence. */
    isl_basic_map *offender = isl_basic_map_list_get_basic_map(pieces, k);
    isl_printer *p = isl_printer_to_file(isl_map_get_ctx(map), stdout);
    p = isl_printer_print_basic_map(p, offender);
    printf("\n");
    isl_printer_free(p);
    isl_basic_map_free(offender);

    all_uniform = isl_bool_false;
    break;
  }

  isl_basic_map_list_free(pieces);
  return all_uniform;
}

/* Check the flow dependences and then the RAR dependences of "scop" with
 * is_dep_uniform_wrap() under "schedule".
 * Returns isl_bool_true only if both checks return isl_bool_true; the RAR
 * dependences are not examined once the flow dependences fail.
 */
isl_bool uniform_dep_check(__isl_keep isl_schedule *schedule, struct ppcg_scop *scop)
{
  isl_union_map *deps[2] = {scop->dep_flow, scop->dep_rar};

  for (int k = 0; k < 2; k++)
  {
    isl_bool uniform = isl_union_map_every_map(deps[k], &is_dep_uniform_wrap, schedule);
    if (uniform != isl_bool_true)
      return isl_bool_false;
  }

  return isl_bool_true;
}

/* Traversal callback tracking the deepest leaf.
 * "user" points to an int (initialized to 0 by the caller) that is raised
 * to the schedule depth of each leaf node visited, if that depth is larger.
 * Returns isl_bool_false for leaf nodes and isl_bool_true for all others.
 */
static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user)
{
  int *max_depth = (int *)user;

  if (isl_schedule_node_get_type(node) == isl_schedule_node_leaf)
  {
    int leaf_depth = isl_schedule_node_get_schedule_depth(node);

    if (*max_depth < leaf_depth)
      *max_depth = leaf_depth;
    return isl_bool_false;
  }

  return isl_bool_true;
}

/* Compute the dependence distance of dependence "dep" under the schedule "schedule".
 *
 * The dependence is mapped through the full schedule (the subtree schedule of
 * the root) on both its domain and its range. The distance is then formed as
 * "destination schedule point - source schedule point" over the combined
 * (source, destination) space, restricted to the mapped dependence, and
 * evaluated at one sample point of that dependence.
 *
 * Neither "dep" nor "schedule" is consumed. The returned vector has one
 * element per output dimension of the distance function and is owned by
 * the caller.
 */
__isl_give isl_vec *get_dep_dis_at_schedule(__isl_keep isl_basic_map *dep,
                                            __isl_keep isl_schedule *schedule)
{
  isl_schedule_node *root = isl_schedule_get_root(schedule);
  isl_ctx *ctx = isl_basic_map_get_ctx(dep);
  isl_union_map *full_sched = isl_schedule_node_get_subtree_schedule_union_map(root);
  isl_schedule_node_free(root);

  /* Extract the iterator num. */
  /* NOTE(review): iter_num is computed here but not read again in this function. */
  int iter_num = 0;
  isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth, &iter_num);

  /* Express both ends of the dependence in schedule coordinates.
   * The second application consumes full_sched.
   */
  isl_union_map *dep_sched = isl_union_map_apply_domain(isl_union_map_from_map(isl_map_from_basic_map(isl_basic_map_copy(dep))),
                                                        isl_union_map_copy(full_sched));
  dep_sched = isl_union_map_apply_range(dep_sched, full_sched);

  isl_map *dep_map = isl_map_from_union_map(dep_sched);
  isl_basic_map *dep_bmap = isl_basic_map_from_map(isl_map_copy(dep_map));

  /* Identity function on the source side of the scheduled dependence. */
  isl_set *src_dep_domain = isl_map_domain(isl_map_copy(dep_map));
  isl_map *src_dep_domain_map = isl_set_identity(src_dep_domain);
  isl_multi_pw_aff *src_mpa = isl_multi_pw_aff_identity(isl_map_get_space(src_dep_domain_map));
  isl_map_free(src_dep_domain_map);

  /* Identity function on the destination side; this consumes dep_map. */
  isl_set *dest_dep_domain = isl_map_range(dep_map);
  isl_map *dest_dep_domain_map = isl_set_identity(dest_dep_domain);
  isl_multi_pw_aff *dest_mpa = isl_multi_pw_aff_identity(isl_map_get_space(dest_dep_domain_map));
  isl_map_free(dest_dep_domain_map);

  /* Add dims. */
  /* Both functions are extended to the combined input space
   * [source dims, destination dims] so that they can be subtracted.
   */
  isl_size src_dim = isl_multi_pw_aff_dim(src_mpa, isl_dim_in);
  isl_size dest_dim = isl_multi_pw_aff_dim(dest_mpa, isl_dim_in);
  src_mpa = isl_multi_pw_aff_insert_dims(src_mpa, isl_dim_in, src_dim, dest_dim);
  dest_mpa = isl_multi_pw_aff_insert_dims(dest_mpa, isl_dim_in, 0, src_dim);

  isl_multi_pw_aff *dep_dis_mpa = isl_multi_pw_aff_sub(dest_mpa, src_mpa);

  /* Convert the basic map to basic_set. */
  /* The constraint matrices are extracted with the columns ordered
   * in, out, div, param, cst and reinterpreted over the domain space of
   * the distance function.
   */
  isl_mat *eq_mat = isl_basic_map_equalities_matrix(dep_bmap,
                                                    isl_dim_in, isl_dim_out, isl_dim_div, isl_dim_param, isl_dim_cst);
  isl_mat *ieq_mat = isl_basic_map_inequalities_matrix(dep_bmap,
                                                       isl_dim_in, isl_dim_out, isl_dim_div, isl_dim_param, isl_dim_cst);
  isl_basic_set *dep_bset = isl_basic_set_from_constraint_matrices(
      isl_space_domain(isl_multi_pw_aff_get_space(dep_dis_mpa)),
      eq_mat, ieq_mat,
      isl_dim_set, isl_dim_div, isl_dim_param, isl_dim_cst);

  dep_dis_mpa = isl_multi_pw_aff_intersect_domain(dep_dis_mpa,
                                                  isl_set_from_basic_set(isl_basic_set_copy(dep_bset)));
  isl_space *space = isl_multi_pw_aff_get_space(dep_dis_mpa);
  isl_vec *dep_dis = isl_vec_zero(ctx, isl_space_dim(space, isl_dim_out));
  /* Evaluate each component of the distance function at a sample point
   * of the dependence set.
   */
  for (int i = 0; i < isl_vec_size(dep_dis); i++)
  {
    isl_pw_aff *pa = isl_multi_pw_aff_get_pw_aff(dep_dis_mpa, i);
    isl_val *val = isl_pw_aff_eval(pa, isl_basic_set_sample_point(isl_basic_set_copy(dep_bset)));
    dep_dis = isl_vec_set_element_val(dep_dis, i, val);
  }

  isl_space_free(space);
  isl_basic_set_free(dep_bset);
  isl_basic_map_free(dep_bmap);
  isl_multi_pw_aff_free(dep_dis_mpa);

  return dep_dis;
}

/* Return the piece of "upa" whose domain space is equal to "stmt_space",
 * or NULL if "upa" has no such piece.
 * The returned isl_pw_aff is owned by the caller.
 */
static __isl_give isl_pw_aff *dep_dis_extract_stmt_sched(
    __isl_keep isl_union_pw_aff *upa, __isl_keep isl_space *stmt_space)
{
  isl_pw_aff *found = NULL;
  isl_pw_aff_list *pieces = isl_union_pw_aff_get_pw_aff_list(upa);
  isl_size n_piece = isl_union_pw_aff_n_pw_aff(upa);

  for (int j = 0; j < n_piece && !found; j++)
  {
    isl_pw_aff *piece = isl_pw_aff_list_get_pw_aff(pieces, j);
    isl_space *piece_stmt = isl_space_domain(isl_pw_aff_get_space(piece));

    if (isl_space_is_equal(stmt_space, piece_stmt))
      found = piece;
    else
      isl_pw_aff_free(piece);
    isl_space_free(piece_stmt);
  }
  isl_pw_aff_list_free(pieces);

  return found;
}

/* Compute the dependence distance vector of the dependence under the
 * partial schedule of the band node. The dependence "dep" is untagged.
 *
 * For each member of the band, the scheduling functions of the source and
 * the destination statement of "dep" are looked up, extended to the combined
 * (source, destination) space and subtracted. The difference is evaluated
 * at a sample point of the dependence.
 *
 * Neither "dep" nor "band" is consumed. The returned vector has one element
 * per band member and is owned by the caller.
 * Return NULL if "band" is not a band node, if the band has no schedule
 * piece for the source or destination statement, or on error.
 */
__isl_give isl_vec *get_dep_dis_at_node(__isl_keep isl_basic_map *dep, __isl_keep isl_schedule_node *band)
{
  if (isl_schedule_node_get_type(band) != isl_schedule_node_band)
    return NULL;

  /* Express the partial schedule in terms of the original statement
   * instances by undoing any contraction in the subtree.
   */
  isl_multi_union_pw_aff *p_sc = isl_schedule_node_band_get_partial_schedule(band);
  isl_union_pw_multi_aff *contraction = isl_schedule_node_get_subtree_contraction(band);
  p_sc = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(p_sc, contraction);

  int band_w = isl_schedule_node_band_n_member(band);
  isl_vec *dep_dis = isl_vec_zero(isl_basic_map_get_ctx(dep), band_w);
  for (int i = 0; i < band_w; i++)
  {
    isl_union_pw_aff *p_sc_hyp = isl_multi_union_pw_aff_get_union_pw_aff(p_sc, i);

    /* Obtain the schedules for the src and dest statements. */
    isl_space *space = isl_basic_map_get_space(dep);
    isl_space *src_space = isl_space_domain(isl_space_copy(space));
    isl_space *dest_space = isl_space_range(space);
    isl_pw_aff *src_sc = dep_dis_extract_stmt_sched(p_sc_hyp, src_space);
    isl_pw_aff *dest_sc = dep_dis_extract_stmt_sched(p_sc_hyp, dest_space);
    isl_space_free(src_space);
    isl_space_free(dest_space);

    /* Without both scheduling functions no distance can be computed.
     * Bail out here instead of passing an error size on to insert_dims.
     */
    if (!src_sc || !dest_sc)
    {
      isl_pw_aff_free(src_sc);
      isl_pw_aff_free(dest_sc);
      isl_union_pw_aff_free(p_sc_hyp);
      isl_multi_union_pw_aff_free(p_sc);
      return isl_vec_free(dep_dis);
    }

    /* Compute the dependence distance at the current hyperplane. */
    /* Step 1: Extend the scheduling function to the input space
     * [src dims, dest dims].
     */
    isl_size src_sc_dim = isl_pw_aff_dim(src_sc, isl_dim_in);
    isl_size dest_sc_dim = isl_pw_aff_dim(dest_sc, isl_dim_in);
    src_sc = isl_pw_aff_insert_dims(src_sc, isl_dim_in, src_sc_dim, dest_sc_dim);
    dest_sc = isl_pw_aff_insert_dims(dest_sc, isl_dim_in, 0, src_sc_dim);
    /* Give the inserted dimensions the ids of their counterparts.
     * isl_pw_aff_set_dim_id takes its argument and returns the result,
     * so the returned pointer has to be stored.
     */
    for (int j = 0; j < dest_sc_dim; j++)
    {
      src_sc = isl_pw_aff_set_dim_id(src_sc, isl_dim_in, src_sc_dim + j,
                                     isl_pw_aff_get_dim_id(dest_sc, isl_dim_in, src_sc_dim + j));
    }
    for (int j = 0; j < src_sc_dim; j++)
    {
      dest_sc = isl_pw_aff_set_dim_id(dest_sc, isl_dim_in, j,
                                      isl_pw_aff_get_dim_id(src_sc, isl_dim_in, j));
    }

    isl_pw_aff *dis_sc = isl_pw_aff_sub(dest_sc, src_sc);

    /* Step 2: Convert the basic_map into basic_set. */
    isl_mat *eq_mat = isl_basic_map_equalities_matrix(dep,
                                                      isl_dim_in, isl_dim_out, isl_dim_div, isl_dim_param, isl_dim_cst);
    isl_mat *ieq_mat = isl_basic_map_inequalities_matrix(dep,
                                                         isl_dim_in, isl_dim_out, isl_dim_div, isl_dim_param, isl_dim_cst);

    isl_basic_set *dep_set = isl_basic_set_from_constraint_matrices(
        isl_space_domain(isl_pw_aff_get_space(dis_sc)),
        eq_mat, ieq_mat,
        isl_dim_set, isl_dim_div, isl_dim_param, isl_dim_cst);

    /* Step 3: Intersect the scheduling function with the domain and
     * evaluate it at a sample point (this consumes dep_set).
     */
    isl_pw_aff *dis = isl_pw_aff_intersect_domain(dis_sc, isl_set_from_basic_set(isl_basic_set_copy(dep_set)));
    isl_val *val = isl_pw_aff_eval(dis, isl_basic_set_sample_point(dep_set));
    dep_dis = isl_vec_set_element_val(dep_dis, i, val);

    isl_union_pw_aff_free(p_sc_hyp);
  }

  isl_multi_union_pw_aff_free(p_sc);
  return dep_dis;
}

/* Swap the band members at positions "level1" and "level2" of the band
 * node "node".
 *
 * A new band with the two schedule dimensions exchanged is inserted above
 * the original band, the per-member properties (coincident, pe_opt,
 * space_time, sched_pos, iter) are copied over with the two positions
 * exchanged as well, and the original band is then removed.
 * The new band is marked permutable.
 * Return a pointer to the new band node.
 */
__isl_give isl_schedule_node *loop_interchange_at_node(
  __isl_take isl_schedule_node *node, isl_size level1, isl_size level2)
{
  /* Build the partial schedule with the two dimensions exchanged. */
  isl_multi_union_pw_aff *old_sc = isl_schedule_node_band_get_partial_schedule(node);
  isl_multi_union_pw_aff *swapped_sc = isl_multi_union_pw_aff_copy(old_sc);
  isl_union_pw_aff *hyperplane;

  hyperplane = isl_multi_union_pw_aff_get_union_pw_aff(old_sc, level2);
  swapped_sc = isl_multi_union_pw_aff_set_union_pw_aff(swapped_sc, level1, hyperplane);
  hyperplane = isl_multi_union_pw_aff_get_union_pw_aff(old_sc, level1);
  swapped_sc = isl_multi_union_pw_aff_set_union_pw_aff(swapped_sc, level2, hyperplane);
  isl_multi_union_pw_aff_free(old_sc);

  /* Remember the properties of the old band, then insert the new one. */
  struct autosa_node_band_prop *saved = extract_node_band_prop(node);
  node = isl_schedule_node_insert_partial_schedule(node, swapped_sc);
  node = isl_schedule_node_band_set_permutable(node, 1);

  /* First copy every member property in place ... */
  isl_size n_member = isl_schedule_node_band_n_member(node);
  for (int m = 0; m < n_member; m++)
  {
    node = isl_schedule_node_band_member_set_coincident(node, m, saved->coincident[m]);
    node = isl_schedule_node_band_member_set_pe_opt(node, m, saved->pe_opt[m]);
    node = isl_schedule_node_band_member_set_space_time(node, m, saved->space_time[m]);
    node = isl_schedule_node_band_member_set_sched_pos(node, m, saved->sched_pos[m]);
    node = isl_schedule_node_band_member_set_iter(node, m, saved->iter[m]);
  }

  /* ... then exchange the properties of the two interchanged members. */
  node = isl_schedule_node_band_member_set_coincident(node, level1, saved->coincident[level2]);
  node = isl_schedule_node_band_member_set_coincident(node, level2, saved->coincident[level1]);
  node = isl_schedule_node_band_member_set_pe_opt(node, level1, saved->pe_opt[level2]);
  node = isl_schedule_node_band_member_set_pe_opt(node, level2, saved->pe_opt[level1]);
  node = isl_schedule_node_band_member_set_space_time(node, level1, saved->space_time[level2]);
  node = isl_schedule_node_band_member_set_space_time(node, level2, saved->space_time[level1]);
  node = isl_schedule_node_band_member_set_sched_pos(node, level1, saved->sched_pos[level2]);
  node = isl_schedule_node_band_member_set_sched_pos(node, level2, saved->sched_pos[level1]);
  node = isl_schedule_node_band_member_set_iter(node, level1, saved->iter[level2]);
  node = isl_schedule_node_band_member_set_iter(node, level2, saved->iter[level1]);

  autosa_node_band_prop_free(saved);

  /* The old band now sits directly below the new one: remove it and
   * move back up to the new band.
   */
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_delete(node);
  node = isl_schedule_node_parent(node);

  return node;
}

/* Top-down traversal callback that records the first permutable band node
 * that is encountered.
 *
 * "user" points to an isl_schedule_node pointer that the caller initializes
 * to NULL. When a permutable band is found, a copy of it is stored there.
 *
 * Returning isl_bool_false from a top-down callback only skips the subtree
 * of the current node; sibling subtrees are still visited. Once a node has
 * been recorded, every later call therefore returns isl_bool_false without
 * touching the stored node, so that the first match is kept and no copy
 * is overwritten (and leaked).
 */
static isl_bool is_outermost_permutable_node_update(
    __isl_keep isl_schedule_node *node, void *user)
{
  isl_schedule_node **t_node = (isl_schedule_node **)(user);
  if (!node)
    return isl_bool_error;

  /* A result has already been recorded; nothing below can replace it. */
  if (*t_node)
    return isl_bool_false;

  if (is_permutable_node(node) == isl_bool_true)
  {
    *t_node = isl_schedule_node_copy(node);
    return isl_bool_false;
  }

  return isl_bool_true;
}

/* Walk the schedule tree of "schedule" top-down from its root and return
 * the node recorded by is_outermost_permutable_node_update().
 * Return NULL if no node was recorded.
 * The returned node is owned by the caller.
 */
__isl_give isl_schedule_node *get_outermost_permutable_node(
    __isl_keep isl_schedule *schedule)
{
  isl_schedule_node *outermost = NULL;
  isl_schedule_node *tree_root = isl_schedule_get_root(schedule);

  isl_schedule_node_foreach_descendant_top_down(
      tree_root, &is_outermost_permutable_node_update, &outermost);
  isl_schedule_node_free(tree_root);

  return outermost;
}

/* Traversal callback (used with a top-down traversal) that records the
 * first permutable band node that has no band node anywhere below it.
 *
 * "user" points to an isl_schedule_node pointer that the caller initializes
 * to NULL; a copy of the matching node is stored there once.
 * The callback always returns isl_bool_true for a non-NULL node, so the
 * traversal keeps descending.
 */
static isl_bool is_innermost_permutable_node_update(__isl_keep isl_schedule_node *node, void *user)
{
  isl_schedule_node **slot = (isl_schedule_node **)(user);

  if (node == NULL)
    return isl_bool_error;
  if (is_permutable_node(node) != isl_bool_true)
    return isl_bool_true;

  /* Check if there is any other band below it. */
  isl_schedule_node *first_child = isl_schedule_node_get_child(node, 0);
  isl_bool no_inner_band = isl_schedule_node_every_descendant(first_child,
                                                              &no_permutable_node, NULL);
  isl_schedule_node_free(first_child);

  if (no_inner_band && *slot == NULL)
    *slot = isl_schedule_node_copy(node);

  return isl_bool_true;
}

/* Walk the schedule tree of "schedule" top-down from its root and return
 * the node recorded by is_innermost_permutable_node_update(), i.e. the
 * first permutable band that has no band node below it.
 * Return NULL if no node was recorded.
 * The returned node is owned by the caller.
 */
__isl_give isl_schedule_node *get_innermost_permutable_node(__isl_keep isl_schedule *schedule)
{
  isl_schedule_node *innermost = NULL;
  isl_schedule_node *tree_root = isl_schedule_get_root(schedule);

  isl_schedule_node_foreach_descendant_top_down(
      tree_root, &is_innermost_permutable_node_update, &innermost);
  isl_schedule_node_free(tree_root);

  return innermost;
}

/* Tile the band node "node" with the tile sizes "sizes".
 *
 * The isl options "tile_scale_tile_loops" and "tile_shift_point_loops" are
 * temporarily forced to 0 and 1, respectively, for the duration of the
 * tiling and restored to their previous values afterwards.
 * Return the node produced by isl_schedule_node_band_tile().
 */
__isl_give isl_schedule_node *tile_band(
    __isl_take isl_schedule_node *node, __isl_take isl_multi_val *sizes)
{
  isl_ctx *ctx = isl_schedule_node_get_ctx(node);
  const int saved_scale_tile = isl_options_get_tile_scale_tile_loops(ctx);
  const int saved_shift_point = isl_options_get_tile_shift_point_loops(ctx);

  isl_options_set_tile_scale_tile_loops(ctx, 0);
  isl_options_set_tile_shift_point_loops(ctx, 1);

  node = isl_schedule_node_band_tile(node, sizes);

  isl_options_set_tile_scale_tile_loops(ctx, saved_scale_tile);
  isl_options_set_tile_shift_point_loops(ctx, saved_shift_point);

  return node;
}

/* Tile the band node "node" with the tile sizes in "sizes", which holds
 * one entry per band member.
 *
 * A tile size of "-1" is meant to leave the corresponding loop untiled.
 * This on-demand tiling is not implemented yet: if any entry is "-1",
 * an isl_error_unsupported error is raised and "node" is returned
 * unchanged.
 *
 * Otherwise the whole band is tiled and the space_time property of every
 * member of the resulting band is reset to autosa_loop_time.
 */
__isl_give isl_schedule_node *autosa_tile_band(
    __isl_take isl_schedule_node *node, __isl_keep int *sizes)
{
  int n_member = isl_schedule_node_band_n_member(node);

  /* Reject the request if any loop is to be left untiled. */
  for (int m = 0; m < n_member; m++)
  {
    if (sizes[m] == -1)
    {
      // TODO: tile on demand
      isl_die(isl_schedule_node_get_ctx(node), isl_error_unsupported,
              "on-demand tiling not supported", return node);
    }
  }

  /* tile_band() takes ownership of the tile sizes. */
  isl_multi_val *tile_sizes = construct_band_tile_sizes(node, sizes);
  node = tile_band(node, tile_sizes);

  /* Reset the space_time in the tile band */
  for (int m = 0; m < n_member; m++)
    node = isl_schedule_node_band_member_set_space_time(node, m, autosa_loop_time);

  return node;
}

/* Given two nested nodes,
 * N1
 * |
 * N2
 * Merge them into one node.
 * N
 * The input "node" points to N1.
 * Return a pointer to N.
 *
 * The merge is only performed if N1 has exactly one child and both N1 and
 * N2 are band nodes; otherwise "node" is returned unchanged.
 * The members of N come first from N1 and then from N2, each keeping its
 * coincident, space_time, pe_opt, sched_pos and iter properties.
 * N is marked permutable.
 */
static __isl_give isl_schedule_node *autosa_node_merge(
    __isl_take isl_schedule_node *node)
{
  isl_size n_children = isl_schedule_node_n_children(node);
  if (n_children == 0 || n_children > 1)
    return node;

  isl_schedule_node *parent = node;
  isl_schedule_node *child = isl_schedule_node_child(isl_schedule_node_copy(node), 0);
  if (isl_schedule_node_get_type(parent) != isl_schedule_node_band ||
      isl_schedule_node_get_type(child) != isl_schedule_node_band)
  {
    /* "child" is a separate reference and has to be released here. */
    isl_schedule_node_free(child);
    return node;
  }

  /* Save the node properties. */
  struct autosa_node_band_prop *parent_prop = extract_node_band_prop(parent);
  struct autosa_node_band_prop *child_prop = extract_node_band_prop(child);
  /* The child node itself is no longer needed. */
  isl_schedule_node_free(child);

  /* Merge the partial schedules of two nodes. */
  isl_union_pw_aff_list *upa_list = isl_union_pw_aff_list_alloc(
      isl_schedule_node_get_ctx(node), 0);
  isl_space *parent_space = isl_multi_union_pw_aff_get_space(parent_prop->mupa);
  isl_space *child_space = isl_multi_union_pw_aff_get_space(child_prop->mupa);

  for (int i = 0; i < parent_prop->n_member; i++)
  {
    isl_union_pw_aff *upa = isl_multi_union_pw_aff_get_union_pw_aff(parent_prop->mupa, i);
    upa_list = isl_union_pw_aff_list_add(
        upa_list, upa);
  }
  for (int i = 0; i < child_prop->n_member; i++)
  {
    isl_union_pw_aff *upa = isl_multi_union_pw_aff_get_union_pw_aff(child_prop->mupa, i);
    upa_list = isl_union_pw_aff_list_add(
        upa_list, upa);
  }

  /* The merged schedule space has the dimensions of both bands. */
  isl_space *mupa_space = isl_space_add_dims(parent_space, isl_dim_set, isl_space_dim(child_space, isl_dim_set));
  isl_space_free(child_space);

  isl_multi_union_pw_aff *mupa = isl_multi_union_pw_aff_from_union_pw_aff_list(
      mupa_space,
      upa_list);

  /* Insert one new node. */
  node = isl_schedule_node_insert_partial_schedule(node, mupa);

  /* Restore the node properties. */
  node = isl_schedule_node_band_set_permutable(node, 1);
  for (int i = 0; i < parent_prop->n_member; i++)
  {
    node = isl_schedule_node_band_member_set_coincident(
        node, i, parent_prop->coincident[i]);
  }
  for (int i = 0; i < parent_prop->n_member; i++)
  {
    node = isl_schedule_node_band_member_set_space_time(
        node, i, parent_prop->space_time[i]);
    node = isl_schedule_node_band_member_set_pe_opt(
        node, i, parent_prop->pe_opt[i]);
    node = isl_schedule_node_band_member_set_sched_pos(
        node, i, parent_prop->sched_pos[i]);
    node = isl_schedule_node_band_member_set_iter(
        node, i, parent_prop->iter[i]);
  }
  /* The members of the child band follow those of the parent band. */
  for (int i = 0; i < child_prop->n_member; i++)
  {
    node = isl_schedule_node_band_member_set_coincident(
        node, i + parent_prop->n_member, child_prop->coincident[i]);
  }
  for (int i = 0; i < child_prop->n_member; i++)
  {
    node = isl_schedule_node_band_member_set_space_time(
        node, i + parent_prop->n_member, child_prop->space_time[i]);
    node = isl_schedule_node_band_member_set_pe_opt(
        node, i + parent_prop->n_member, child_prop->pe_opt[i]);
    node = isl_schedule_node_band_member_set_sched_pos(
        node, i + parent_prop->n_member, child_prop->sched_pos[i]);
    node = isl_schedule_node_band_member_set_iter(
        node, i + parent_prop->n_member, child_prop->iter[i]);
  }

  /* Delete the old nodes (N1 and N2 now sit below the merged band). */
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_delete(node);
  node = isl_schedule_node_delete(node);
  node = isl_schedule_node_parent(node);

  /* Release the saved properties with the shared helper so that every
   * field of the property structure is covered.
   */
  autosa_node_band_prop_free(parent_prop);
  autosa_node_band_prop_free(child_prop);

  return node;
}

/* Tile only the loop at position "pos" of the band "node" with tile size
 * "tile_size".
 *
 * The band B is first split around the loop p:
 * B1
 * |
 * p
 * |
 * B2
 * Tiling p yields
 * B1
 * |
 * p_tile
 * |
 * p_point
 * |
 * B2
 * The point band is then moved below B2, and B1, p_tile and B2 are merged
 * into a single band B':
 * B'
 * |
 * p_point
 * Return a pointer to B'.
 */
__isl_give isl_schedule_node *autosa_node_band_tile_loop(
    __isl_take isl_schedule_node *node, int tile_size, int pos)
{
  int loop_size[1] = {tile_size};
  isl_multi_val *loop_tile_sizes;

  /* Isolate the loop to be tiled in a band of its own. */
  node = isl_schedule_node_band_split(node, pos);
  node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_band_split(node, 1);

  /* Tile the isolated loop; tile_band() takes ownership of the sizes. */
  loop_tile_sizes = construct_band_tile_sizes(node, loop_size);
  node = tile_band(node, loop_tile_sizes);

  /* Swap the order of the point band and the next band. */
  node = isl_schedule_node_child(node, 0);
  node = autosa_node_interchange(node);

  /* Merge the first three bands. */
  node = isl_schedule_node_parent(node);
  node = autosa_node_merge(node);
  node = isl_schedule_node_parent(node);
  node = autosa_node_merge(node);

  return node;
}

/* Schedule tree mapping callback: if "node" is a band node, set the pe_opt
 * property of each of its members to autosa_loop_default.
 * Nodes of any other type are returned untouched. "user" is not used.
 */
__isl_give isl_schedule_node *clear_pe_opt_prop(
    __isl_take isl_schedule_node *node, void *user)
{
  if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
    return node;

  int member = 0;
  while (member < isl_schedule_node_band_n_member(node))
  {
    node = isl_schedule_node_band_member_set_pe_opt(node, member,
                                                    autosa_loop_default);
    member++;
  }

  return node;
}

/* Apply the band properties saved in "prop" to the band node "node":
 * the permutable flag and, for each of the prop->n_member members,
 * the coincident, space_time, pe_opt, sched_pos and iter properties.
 * The partial schedule itself is not touched.
 *
 * "prop" is consumed: it is released through autosa_node_band_prop_free(),
 * the same helper the other users of extract_node_band_prop() in this file
 * rely on, so that every field of the structure is covered.
 */
__isl_give isl_schedule_node *restore_node_band_prop(
    __isl_take isl_schedule_node *node,
    __isl_take struct autosa_node_band_prop *prop)
{
  node = isl_schedule_node_band_set_permutable(node, prop->permutable);
  for (int i = 0; i < prop->n_member; i++)
  {
    node = isl_schedule_node_band_member_set_coincident(node, i, prop->coincident[i]);
  }
  for (int i = 0; i < prop->n_member; i++)
  {
    node = isl_schedule_node_band_member_set_space_time(node, i, prop->space_time[i]);
    node = isl_schedule_node_band_member_set_pe_opt(node, i, prop->pe_opt[i]);
    node = isl_schedule_node_band_member_set_sched_pos(node, i, prop->sched_pos[i]);
    node = isl_schedule_node_band_member_set_iter(node, i, prop->iter[i]);
  }

  autosa_node_band_prop_free(prop);

  return node;
}

/* Exchange a band node with its only child.
 * Before:
 * N1
 * |
 * N2
 * After:
 * N2
 * |
 * N1
 * "node" points to N1 on entry; a pointer to N2 is returned.
 * If N1 does not have exactly one child, "node" is returned unchanged.
 */
__isl_give isl_schedule_node *autosa_node_interchange(
    __isl_take isl_schedule_node *node)
{
  isl_size n_children = isl_schedule_node_n_children(node);

  if (n_children == 0 || n_children > 1)
    return node;

  /* Remember N1 (schedule and properties) and take it out of the tree;
   * "node" then refers to N2.
   */
  struct autosa_node_band_prop *saved = extract_node_band_prop(node);
  node = isl_schedule_node_delete(node);

  /* Re-create N1 directly below N2. */
  node = isl_schedule_node_child(node, 0);
  isl_multi_union_pw_aff *n1_sched = isl_multi_union_pw_aff_copy(saved->mupa);
  node = isl_schedule_node_insert_partial_schedule(node, n1_sched);

  /* Restore the node properties (this consumes "saved") and move back
   * up to N2.
   */
  node = restore_node_band_prop(node, saved);
  return isl_schedule_node_parent(node);
}

/* Given two nested nodes,
 * N2
 * |
 * N1
 * Interchange the two nodes to
 * N1
 * |
 * N2
 * The input "node" points to N1.
 * Return a pointer to node N1.
 * Besides, currently we only support interchanging band nodes and mark nodes.
 *
 * If "node" has no parent, it is returned unchanged. If "node" or its
 * parent is neither a band nor a mark node, an isl_error_unsupported error
 * is raised and "node" is returned unchanged.
 */
__isl_give isl_schedule_node *autosa_node_interchange_up(
    __isl_take isl_schedule_node *node)
{
  enum isl_schedule_node_type t;
  enum isl_schedule_node_type parent_t;
  isl_schedule_node *parent_node;
  struct autosa_node_band_prop *prop = NULL;
  isl_id *id = NULL;

  if (!isl_schedule_node_has_parent(node))
  {
    return node;
  }
  t = isl_schedule_node_get_type(node);
  if (!(t == isl_schedule_node_band || t == isl_schedule_node_mark))
  {
    isl_die(isl_schedule_node_get_ctx(node), isl_error_unsupported,
            "only band and mark nodes are supported", return node);
  }
  /* Only the type of the parent is needed. Release the temporary node
   * before the check so that it is not leaked on the error path.
   */
  parent_node = isl_schedule_node_parent(isl_schedule_node_copy(node));
  parent_t = isl_schedule_node_get_type(parent_node);
  isl_schedule_node_free(parent_node);
  if (!(parent_t == isl_schedule_node_band || parent_t == isl_schedule_node_mark))
  {
    isl_die(isl_schedule_node_get_ctx(node), isl_error_unsupported,
            "only band and mark nodes are supported", return node);
  }

  /* Save the current node. */
  if (t == isl_schedule_node_band)
  {
    prop = extract_node_band_prop(node);
  }
  else if (t == isl_schedule_node_mark)
  {
    id = isl_schedule_node_mark_get_id(node);
  }

  /* Delete the current node. */
  node = isl_schedule_node_delete(node);

  /* Insert the old node above its former parent. */
  node = isl_schedule_node_parent(node);
  if (t == isl_schedule_node_band)
  {
    node = isl_schedule_node_insert_partial_schedule(node,
                                                     isl_multi_union_pw_aff_copy(prop->mupa));
    /* This consumes "prop". */
    node = restore_node_band_prop(node, prop);
  }
  else if (t == isl_schedule_node_mark)
  {
    node = isl_schedule_node_insert_mark(node, id);
  }

  return node;
}

/* Return isl_bool_false if "node" is a band node (whether or not it is
 * marked permutable) and isl_bool_true for any other node type.
 * "user" is not used.
 */
isl_bool no_permutable_node(__isl_keep isl_schedule_node *node, void *user)
{
  const int is_band = isl_schedule_node_get_type(node) == isl_schedule_node_band;

  return is_band ? isl_bool_false : isl_bool_true;
}

/* Return isl_bool_false if "node" is a band node with at least one member
 * that is not marked coincident. Return isl_bool_true otherwise, which
 * includes every node that is not a band. "user" is not used.
 */
isl_bool all_parallel_node(__isl_keep isl_schedule_node *node, void *user)
{
  if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
    return isl_bool_true;

  int n_member = isl_schedule_node_band_n_member(node);
  int member = 0;

  while (member < n_member)
  {
    if (!isl_schedule_node_band_member_get_coincident(node, member))
      return isl_bool_false;
    member++;
  }

  return isl_bool_true;
}

/* This function tests if the loops above the "array" mark carry any flow
 * dependence that is associated with the I/O group "group".
 *
 * Only internal arrays (AUTOSA_INT_ARRAY) are examined; for any other
 * array type isl_bool_false is returned right away.
 *
 * Starting from the "array" mark of the kernel, the tree is walked upwards
 * until the kernel node (or the root) is reached. For each band node on
 * the way, every dependence of the group whose I/O type and direction match
 * those of the group is mapped through the partial schedule of the band.
 * If the mapped dependence does not relate equal schedule points in all
 * band members, the dependence is carried by that band and
 * isl_bool_true is returned.
 */
isl_bool is_flow_dep_carried_by_array_part_loops(__isl_keep isl_schedule *schedule,
                                                 struct autosa_array_ref_group *group, struct autosa_kernel *kernel)
{
  isl_bool carried = isl_bool_false;
  isl_schedule_node *node;
  isl_union_map *umap;

  /* Note: "!x == y" would negate x first; compare with "!=" instead. */
  if (group->local_array->array_type != AUTOSA_INT_ARRAY)
    return carried;
  node = isl_schedule_get_root(schedule);
  node = autosa_tree_move_down_to_array(node, kernel->core);
  while (node && isl_schedule_node_has_parent(node))
  {
    if (autosa_tree_node_is_kernel(node))
      break;
    if (isl_schedule_node_get_type(node) == isl_schedule_node_band)
    {
      umap = isl_schedule_node_band_get_partial_schedule_union_map(node);
      for (int i = 0; i < group->n_ref; i++)
      {
        struct autosa_stmt_access *ref = group->refs[i];
        for (int j = 0; j < ref->n_io_info; j++)
        {
          struct autosa_io_info *io_info = ref->io_info[j];
          if (io_info->io_type == group->io_type &&
              !isl_vec_cmp(io_info->dir, group->dir))
          {
            isl_map *test;
            isl_map *schedule_dep;
            int dim;
            int is_parallel;

            /* Keep only the domain factor of the dependence and express
             * both of its ends in terms of the band schedule.
             */
            isl_union_map *dep = isl_union_map_from_map(
                isl_map_factor_domain(
                    isl_map_from_basic_map(isl_basic_map_copy(io_info->dep->isl_dep))));
            dep = isl_union_map_apply_range(dep, isl_union_map_copy(umap));
            dep = isl_union_map_apply_domain(dep, isl_union_map_copy(umap));
            if (isl_union_map_is_empty(dep))
            {
              isl_union_map_free(dep);
              break;
            }
            /* The dependence is not carried if source and destination
             * agree on every member of the band.
             */
            schedule_dep = isl_map_from_union_map(dep);
            test = isl_map_universe(isl_map_get_space(schedule_dep));
            dim = isl_schedule_node_band_n_member(node);
            for (int n = 0; n < dim; n++)
            {
              test = isl_map_equate(test, isl_dim_in, n, isl_dim_out, n);
            }
            is_parallel = isl_map_is_subset(schedule_dep, test);
            isl_map_free(schedule_dep);
            isl_map_free(test);

            if (!is_parallel)
            {
              /* Dependence is carried by the array part loops. */
              carried = isl_bool_true;
              break;
            }
          }
        }
      }
      isl_union_map_free(umap);
    }
    node = isl_schedule_node_parent(node);
  }

  isl_schedule_node_free(node);
  return carried;
}

/* Test if the dependence "dep" is carried by the single-member band "node".
 *
 * The domain factor of "dep" is mapped through the partial schedule of the
 * band on both sides. The dependence counts as carried if the resulting
 * relation is not contained in the set of pairs with equal schedule value.
 *
 * Return 1 if the dependence is carried and 0 if it is not.
 * Return -1 if "node" is NULL, is not a band with exactly one member,
 * if "dep" is NULL, or if the mapped dependence is empty.
 */
int is_dep_carried_by_node(__isl_keep isl_basic_map *dep, __isl_keep isl_schedule_node *node)
{
  if (node == NULL || isl_schedule_node_get_type(node) != isl_schedule_node_band ||
      isl_schedule_node_band_n_member(node) != 1 || dep == NULL)
    return -1;

  /* Express both ends of the dependence in terms of the band schedule. */
  isl_union_map *band_sched = isl_schedule_node_band_get_partial_schedule_union_map(node);
  isl_map *dep_domain_factor = isl_map_factor_domain(
      isl_map_from_basic_map(isl_basic_map_copy(dep)));
  isl_union_map *sched_dep = isl_union_map_from_map(dep_domain_factor);
  sched_dep = isl_union_map_apply_range(sched_dep, isl_union_map_copy(band_sched));
  sched_dep = isl_union_map_apply_domain(sched_dep, band_sched);

  if (isl_union_map_is_empty(sched_dep))
  {
    isl_union_map_free(sched_dep);
    return -1;
  }

  /* Compare against the relation "source value == destination value". */
  isl_map *sched_dep_map = isl_map_from_union_map(sched_dep);
  isl_map *same_point = isl_map_universe(isl_map_get_space(sched_dep_map));
  same_point = isl_map_equate(same_point, isl_dim_in, 0, isl_dim_out, 0);

  int carried = !isl_map_is_subset(sched_dep_map, same_point);

  isl_map_free(sched_dep_map);
  isl_map_free(same_point);

  return carried;
}

/* User data passed to insert_node_at_depth(). */
struct insert_node_at_depth_data {
  /* Partial schedule of the band to insert (a copy is inserted). */
  isl_multi_union_pw_aff *mupa;
  /* Band properties (permutable flag and per-member properties)
   * applied to the inserted band.
   */
  struct autosa_node_band_prop *prop;
  /* Schedule depth at which the band should be inserted. */
  int depth;
};

/* Callback for isl_schedule_node_every_descendant():
 * return isl_bool_false if "node" carries the "inserted" mark and
 * isl_bool_true otherwise, so that the "every" test fails as soon as
 * such a mark is found. "user" is not used.
 */
static isl_bool has_inserted_mark(__isl_keep isl_schedule_node *node, void *user)
{
  return is_marked(node, "inserted") ? isl_bool_false : isl_bool_true;
}

/* Schedule tree mapping callback: remove "node" from the tree if it carries
 * the "inserted" mark; leave any other node untouched. "user" is not used.
 */
static __isl_give isl_schedule_node *delete_inserted_mark(__isl_take isl_schedule_node *node, void *user)
{
  if (!is_marked(node, "inserted"))
    return node;

  return isl_schedule_node_delete(node);
}

/* Callback for isl_schedule_node_every_descendant():
 * return isl_bool_false if "node" is a band node and isl_bool_true
 * otherwise, so that the "every" test fails as soon as a band is found.
 * "user" is not used.
 */
static isl_bool has_band_node(__isl_keep isl_schedule_node *node, void *user)
{
  const int is_band = isl_schedule_node_get_type(node) == isl_schedule_node_band;

  return is_band ? isl_bool_false : isl_bool_true;
}

/* Insert the node at the "depth" position. To prevent inserting the node 
 * multiple times, a "inserted" mark will be inserted before the node.
 * After the insertion, we will delete this "inserted" mark.
 * This function is not complete, might have bugs.
 *
 * This is a schedule tree mapping callback. "user" points to a
 * struct insert_node_at_depth_data holding the partial schedule and the
 * band properties of the node to insert as well as the target depth.
 * Only band nodes whose subtree does not yet contain an "inserted" mark
 * are considered; all other nodes are returned unchanged.
 */
static __isl_give isl_schedule_node *insert_node_at_depth(
  __isl_take isl_schedule_node *node, void *user)
{
  struct insert_node_at_depth_data *data = (struct insert_node_at_depth_data *)user;
  isl_id *id;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
    return node;

  /* Examine the subtree contains the "inserted" mark node */
  /* has_inserted_mark() returns false on such a mark, so a failed
   * "every" test means the insertion already happened below this node.
   */
  if (!isl_schedule_node_every_descendant(node, &has_inserted_mark, NULL)) {    
    return node;
  }

  if (isl_schedule_node_get_schedule_depth(node) < data->depth) {
    /* Split the node and insert at certain position. However, 
     * currently, we simply put it below the current node.
     * TODO: fix it
     */
    node = isl_schedule_node_child(node, 0);
  }

  /* Give up on this node if the requested depth is not reached exactly.
   * NOTE(review): when the branch above was taken, the node returned here
   * is the child of the node that was passed in.
   */
  if (isl_schedule_node_get_schedule_depth(node) != data->depth) {
//#ifdef _DEBUG
//    DBGSCHDNODE(stdout, node, isl_schedule_node_get_ctx(node));
//#endif        
    return node;
  }

//#ifdef _DEBUG
//  DBGSCHDNODE(stdout, node, isl_schedule_node_get_ctx(node));
//#endif

  /* Check if the node is right under the "latency" node.
   * If true, move the node to the mark node.
   */
  /* Each step moves up to the parent and moves back down again unless the
   * parent carries the mark that is tested for ("latency", then "simd").
   */
  node = isl_schedule_node_parent(node);
  if (!is_marked(node, "latency"))
    node = isl_schedule_node_child(node, 0);
  node = isl_schedule_node_parent(node);
  if (!is_marked(node, "simd"))
    node = isl_schedule_node_child(node, 0);

  /* Insert the node at current position */
  node = isl_schedule_node_insert_partial_schedule(node, isl_multi_union_pw_aff_copy(data->mupa));
  node = isl_schedule_node_band_set_permutable(node, data->prop->permutable);
  for (int i = 0; i < isl_schedule_node_band_n_member(node); i++) {
    node = isl_schedule_node_band_member_set_coincident(node, i, data->prop->coincident[i]);
    node = isl_schedule_node_band_member_set_pe_opt(node, i, data->prop->pe_opt[i]);
    node = isl_schedule_node_band_member_set_space_time(node, i, data->prop->space_time[i]);
    node = isl_schedule_node_band_member_set_sched_pos(node, i, data->prop->sched_pos[i]);
    node = isl_schedule_node_band_member_set_iter(node, i, data->prop->iter[i]);
  }

  /* Insert a "inserted" mark */
  /* The mark ends up above the new band and makes the check at the top of
   * this function skip every ancestor band.
   */
  id = isl_id_alloc(isl_schedule_node_get_ctx(node), "inserted", NULL);
  node = isl_schedule_node_insert_mark(node, id);

//#ifdef _DEBUG
//  DBGSCHDNODE(stdout, node, isl_schedule_node_get_ctx(node));
//#endif

  return node;
}

/* Sink the band node "node" down to the schedule depth "depth".
 *
 * The partial schedule and the band properties of "node" are saved,
 * the band is removed from the tree and the insert_node_at_depth()
 * callback is then run bottom-up over the remaining subtree to re-insert
 * the band.  The temporary "inserted" marks are removed afterwards by
 * the delete_inserted_mark() callback.
 * If "node" is not a band node, it is returned untouched.
 */
__isl_give isl_schedule_node *autosa_node_sink_to_depth(
  __isl_take isl_schedule_node *node, int depth)
{
  const bool is_band =
    isl_schedule_node_get_type(node) == isl_schedule_node_band;
  if (!is_band)
    return node;

  /* Remember everything needed to rebuild the band, then drop it. */
  isl_multi_union_pw_aff *band_sched =
    isl_schedule_node_band_get_partial_schedule(node);
  struct autosa_node_band_prop *band_prop = extract_node_band_prop(node);
  node = isl_schedule_node_delete(node);

  /* Re-insert the band inside the subtree. */
  struct insert_node_at_depth_data sink_data = {band_sched, band_prop, depth};
  node = isl_schedule_node_map_descendant_bottom_up(
    node, &insert_node_at_depth, &sink_data);

  /* Strip the bookkeeping "inserted" marks. */
  node = isl_schedule_node_map_descendant_bottom_up(
    node, &delete_inserted_mark, NULL);

  autosa_node_band_prop_free(band_prop);
  isl_multi_union_pw_aff_free(band_sched);

  return node;
}

/* State handed to the sink_node_to_mark() callback.
 * The fields are initialized positionally by autosa_node_sink_to_mark(),
 * so their order must not change.
 */
struct sink_node_to_mark_data {
  /* Partial schedule of the band that is being sunk; a copy is inserted
   * at every selected position.
   */
  isl_multi_union_pw_aff *mupa;
  /* Properties that are copied onto every newly inserted band. */
  struct autosa_node_band_prop *prop;
  /* Name of the mark that is inserted on top of every new band. */
  const char *name;  
  /* Set to true by the callback as soon as it has inserted a band. */
  bool inserted;
};

/* Callback for isl_schedule_node_map_descendant_bottom_up() used by
 * autosa_node_sink_to_mark().
 *
 * "user" points to a struct sink_node_to_mark_data.
 * At every position that qualifies (see below), a new band built from
 * data->mupa and data->prop is inserted, followed by a mark called
 * data->name and a mark called "inserted", and data->inserted is set.
 * Nodes that do not qualify are returned unchanged.
 */
static __isl_give isl_schedule_node *sink_node_to_mark(
  __isl_take isl_schedule_node *node, void *user)
{
  struct sink_node_to_mark_data *data = (struct sink_node_to_mark_data *)user;
  isl_id *id;
  isl_schedule_node *node_tmp;  

  /* Skip this node if the has_inserted_mark() test fails somewhere in
   * its subtree.
   * NOTE(review): presumably has_inserted_mark() returns false on an
   * "inserted" mark, i.e., the subtree was already handled - confirm.
   */
  if (!isl_schedule_node_every_descendant(node, &has_inserted_mark, NULL)) {    
    return node;
  }

  if (isl_schedule_node_get_type(node) == isl_schedule_node_band) {
    /* If this is a band node, then insert it under the band node. */
    node = isl_schedule_node_child(node, 0);
  } else if (isl_schedule_node_get_type(node) == isl_schedule_node_leaf) {
    /* If this is a leaf node, walk up the ancestors (stopping at the
     * "stop" mark or at the root) and skip the leaf if either
     * 1. a band node is found among the ancestors, or
     * 2. a sequence node is found for which the has_band_node() test
     *    holds on every descendant of each of its children.
     *    NOTE(review): presumably this means that none of the children
     *    contains a band - confirm against has_band_node().
     * In these cases the band is expected to be inserted at some other
     * position.
     */    
    bool insert = 1;
    node_tmp = isl_schedule_node_copy(node);
    while (!autosa_tree_node_is_mark(node_tmp, "stop") && isl_schedule_node_has_parent(node_tmp)) {
      node_tmp = isl_schedule_node_parent(node_tmp);
      if (isl_schedule_node_get_type(node_tmp) == isl_schedule_node_band) {
        insert = 0;        
        break;
      }
      if (isl_schedule_node_get_type(node_tmp) == isl_schedule_node_sequence) {
        // TODO: We haven't considered other nodes such as set yet.
        int n_child = 0;
        for (n_child = 0; n_child < isl_schedule_node_n_children(node_tmp); n_child++) {
          isl_schedule_node *node_child = isl_schedule_node_child(isl_schedule_node_copy(node_tmp), n_child);
          /* Stop scanning at the first child for which the
           * has_band_node() test fails on some descendant.
           */
          if (!isl_schedule_node_every_descendant(node_child, &has_band_node, NULL)) {                        
            isl_schedule_node_free(node_child);
            break;
          }          
          isl_schedule_node_free(node_child);
        }
        /* The scan ran to completion: every child passed the test. */
        if (n_child == isl_schedule_node_n_children(node_tmp)) {
          insert = 0;          
          break;
        }        
      } 
    }    
    isl_schedule_node_free(node_tmp);
    if (insert == 0)
      return node;
  } else {
    /* Only band and leaf nodes are candidate positions. */
    return node;
  }

  /* Count the ancestors that carry an existing "name" mark.
   * If there are any, move up until all of them have been passed,
   * such that the new band ends up above the outermost of these marks.
   */
  int mark_cnt = 0;
  node_tmp = isl_schedule_node_copy(node);
  while (isl_schedule_node_has_parent(node_tmp)) {
    node_tmp = isl_schedule_node_parent(node_tmp);
    if (is_marked(node_tmp, data->name))
      mark_cnt++;
  }
  isl_schedule_node_free(node_tmp);
  
  while (mark_cnt > 0) {
    node = isl_schedule_node_parent(node);
    if (is_marked(node, data->name))
      mark_cnt--;
  }

  /* Insert the band at the current position and restore its properties. */
  node = isl_schedule_node_insert_partial_schedule(node, isl_multi_union_pw_aff_copy(data->mupa));
  node = isl_schedule_node_band_set_permutable(node, data->prop->permutable);
  for (int i = 0; i < isl_schedule_node_band_n_member(node); i++) {
    node = isl_schedule_node_band_member_set_coincident(node, i, data->prop->coincident[i]);
    node = isl_schedule_node_band_member_set_pe_opt(node, i, data->prop->pe_opt[i]);
    node = isl_schedule_node_band_member_set_space_time(node, i, data->prop->space_time[i]);
    node = isl_schedule_node_band_member_set_sched_pos(node, i, data->prop->sched_pos[i]);
    node = isl_schedule_node_band_member_set_iter(node, i, data->prop->iter[i]);
  }

  /* Insert a "name" mark on top of the new band. */
  id = isl_id_alloc(isl_schedule_node_get_ctx(node), data->name, NULL);
  node = isl_schedule_node_insert_mark(node, id);

  /* Insert an "inserted" mark such that the ancestors visited later
   * by the bottom-up traversal can detect this insertion.
   */
  id = isl_id_alloc(isl_schedule_node_get_ctx(node), "inserted", NULL);
  node = isl_schedule_node_insert_mark(node, id);
  
  /* Tell the caller that at least one band has been inserted. */
  data->inserted = true;

  return node;
}

/* Sink the band node "node" towards the innermost positions of its
 * subtree and place a mark called "name" on top of every sunk copy.
 *
 * A temporary "stop" mark is inserted above "node" to delimit the
 * subtree, the band is removed and the sink_node_to_mark() callback
 * is run bottom-up over what remains.  If the callback did not insert
 * the band anywhere, the band (and a "name" mark) is put back at the
 * position it was removed from.  Finally, the "inserted" bookkeeping
 * marks and the "stop" mark are removed again.
 * If "node" is not a band node, it is returned untouched.
 */
__isl_give isl_schedule_node *autosa_node_sink_to_mark(
  __isl_take isl_schedule_node *node, const char *name)
{
  if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
    return node;

  /* Fence off the subtree with a "stop" mark and step back to the band. */
  isl_id *stop_id = isl_id_alloc(isl_schedule_node_get_ctx(node), "stop", NULL);
  node = isl_schedule_node_insert_mark(node, stop_id);
  node = isl_schedule_node_child(node, 0);

  /* Save what is needed to rebuild the band, then remove it. */
  isl_multi_union_pw_aff *band_sched =
    isl_schedule_node_band_get_partial_schedule(node);
  struct autosa_node_band_prop *band_prop = extract_node_band_prop(node);
  node = isl_schedule_node_delete(node);

  struct sink_node_to_mark_data data = {band_sched, band_prop, name, false};
  node = isl_schedule_node_map_descendant_bottom_up(
    node, &sink_node_to_mark, &data);

  if (!data.inserted) {
    /* Nothing was inserted below: restore the band right here. */
    node = isl_schedule_node_insert_partial_schedule(
      node, isl_multi_union_pw_aff_copy(band_sched));
    node = isl_schedule_node_band_set_permutable(node, band_prop->permutable);

    const int n_member = isl_schedule_node_band_n_member(node);
    for (int pos = 0; pos < n_member; pos++) {
      node = isl_schedule_node_band_member_set_coincident(
        node, pos, band_prop->coincident[pos]);
      node = isl_schedule_node_band_member_set_pe_opt(
        node, pos, band_prop->pe_opt[pos]);
      node = isl_schedule_node_band_member_set_space_time(
        node, pos, band_prop->space_time[pos]);
      node = isl_schedule_node_band_member_set_sched_pos(
        node, pos, band_prop->sched_pos[pos]);
      node = isl_schedule_node_band_member_set_iter(
        node, pos, band_prop->iter[pos]);
    }

    /* Put the "name" mark on top of the restored band. */
    isl_id *name_id = isl_id_alloc(isl_schedule_node_get_ctx(node), name, NULL);
    node = isl_schedule_node_insert_mark(node, name_id);
  }

  /* Remove the "inserted" bookkeeping marks. */
  node = isl_schedule_node_map_descendant_bottom_up(
    node, &delete_inserted_mark, NULL);

  /* Remove the "stop" mark, which is the parent of the current node. */
  node = isl_schedule_node_parent(node);
  node = isl_schedule_node_delete(node);

  autosa_node_band_prop_free(band_prop);
  isl_multi_union_pw_aff_free(band_sched);

  return node;
}

/* Reorder the members of the band node "node" according to their
 * "sched_pos" property: for every target position, each member whose
 * sched_pos equals that position is interchanged into it through
 * loop_interchange_at_node().
 */
__isl_give isl_schedule_node *reorder_band_by_dep_dis(__isl_take isl_schedule_node *node)
{
  const int n_member = isl_schedule_node_band_n_member(node);

  for (int target = 0; target < n_member; ++target) {
    int cur = 0;
    while (cur < n_member) {
      /* Move the member that belongs at "target" into place. */
      if (isl_schedule_node_band_member_get_sched_pos(node, cur) == target)
        node = loop_interchange_at_node(node, cur, target);
      ++cur;
    }
  }

  return node;
}

/* Callback for isl_schedule_node_map_descendant_bottom_up().
 * If "node" is a band node, set the "sched_pos" property of each of
 * its members to the position of that member inside the band.
 * "user" is not used.
 */
static __isl_give isl_schedule_node *band_sched_pos_setup(
  __isl_take isl_schedule_node *node, void *user)
{
  if (!node)
    return NULL;
  if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
    return node;

  const int n_member = isl_schedule_node_band_n_member(node);
  int pos = 0;
  while (pos < n_member) {
    node = isl_schedule_node_band_member_set_sched_pos(node, pos, pos);
    pos++;
  }

  return node;
}

/* Initialize the "sched_pos" property of every band in the subtree
 * rooted at "node" by applying band_sched_pos_setup() bottom-up.
 */
__isl_give isl_schedule_node *sched_pos_setup(__isl_take isl_schedule_node *node)
{
    return isl_schedule_node_map_descendant_bottom_up(
        node, &band_sched_pos_setup, NULL);
}

/* Check if "node" is a band with a single member whose schedule, restricted
 * to the domain elements reaching the node, takes a single constant value.
 * Return that value if so, and -1 otherwise.
 *
 * -1 is also returned if "node" is not a band, if the band does not have
 * exactly one member, if the singleton test fails with an error or
 * if the value cannot be extracted as a fixed value (NaN or error).
 * Note that the return value -1 cannot be distinguished from a schedule
 * that is constant and equal to -1.
 */
int get_band_single_schedule_val(__isl_keep isl_schedule_node *node)
{
  isl_union_map *umap;
  isl_union_set *domain;
  isl_set *set;
  int ret = -1;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
    return -1;
  if (isl_schedule_node_band_n_member(node) != 1)
    return -1;
  
  /* Collect the schedule values taken over the reaching domain. */
  umap = isl_schedule_node_band_get_partial_schedule_union_map(node);
  domain = isl_schedule_node_get_domain(node);
  umap = isl_union_map_intersect_domain(umap, domain);
  domain = isl_union_map_range(umap);
  set = isl_set_from_union_set(domain);

  /* isl_bool_error is non-zero, so compare against isl_bool_true
   * explicitly instead of relying on the truth value.
   */
  if (isl_set_is_singleton(set) == isl_bool_true) {
    isl_val *val;

    val = isl_set_plain_get_val_if_fixed(set, isl_dim_set, 0);
    /* The value is NaN if the dimension is not obviously fixed and
     * NULL on error; only read the numerator of a proper value.
     */
    if (isl_val_is_nan(val) == isl_bool_false)
      ret = isl_val_get_num_si(val);
    isl_val_free(val);
  }
  isl_set_free(set);

  return ret;
}

/* Compute the range of the prefix schedule of "node" and check whether
 * its last dimension takes a single fixed value.
 * Return that value if so, and -1 otherwise (including when the value
 * cannot be extracted as a fixed value).
 */
int get_last_sched_dim_val(__isl_keep isl_schedule_node *node)
{
  int result = -1;
  isl_union_map *prefix_rel = isl_schedule_node_get_prefix_schedule_relation(node);
  isl_set *last_dim = isl_set_from_union_set(isl_union_map_range(prefix_rel));

  /* Keep only the innermost schedule dimension. */
  const int n_dim = isl_set_dim(last_dim, isl_dim_set);
  if (n_dim > 1)
    last_dim = isl_set_project_out(last_dim, isl_dim_set, 0, n_dim - 1);

  last_dim = isl_set_coalesce(last_dim);
  if (isl_set_is_singleton(last_dim)) {
    isl_val *fixed = isl_set_plain_get_val_if_fixed(last_dim, isl_dim_set, 0);

    /* A NaN means that no fixed value could be derived. */
    if (!isl_val_is_nan(fixed))
      result = isl_val_get_num_si(fixed);
    isl_val_free(fixed);
  }
  isl_set_free(last_dim);

  return result;
}

/* Set the AST loop type of the band node "node" to isl_ast_loop_atomic
 * by means of ppcg_set_schedule_node_type().
 */
static __isl_give isl_schedule_node *atomic(__isl_take isl_schedule_node *node)
{
  node = ppcg_set_schedule_node_type(node, isl_ast_loop_atomic);
  return node;
}

/* Mark every band node among the ancestors of "node" atomic.
 * The tree is climbed recursively up to the root and descended again
 * along the same child positions, such that the returned pointer
 * refers to "node" in the updated schedule tree.
 */
__isl_give isl_schedule_node *autosa_atomic_ancestors(
  __isl_take isl_schedule_node *node)
{
  if (!node)
    return NULL;

  if (isl_schedule_node_has_parent(node)) {
    /* Remember how to get back to "node" from its parent. */
    const int child_pos = isl_schedule_node_get_child_position(node);
    isl_schedule_node *parent = isl_schedule_node_parent(node);

    if (isl_schedule_node_get_type(parent) == isl_schedule_node_band)
      parent = atomic(parent);
    parent = autosa_atomic_ancestors(parent);
    node = isl_schedule_node_child(parent, child_pos);
  }

  return node;
}

/* Check whether "node" is the io mark of level "io_level", i.e.,
 * a mark node whose identifier is called "io_L[io_level]"
 * (e.g. "io_L2" for io_level equal to 2).
 *
 * Return isl_bool_true if so, isl_bool_false if "node" is not a mark
 * node or carries a different (or no) name, and isl_bool_error if
 * "node" is NULL, if its identifier cannot be obtained or if
 * the expected name cannot be constructed.
 */
isl_bool isl_schedule_node_is_io_mark(__isl_keep isl_schedule_node *node, int io_level)
{
  isl_id *mark;
  const char *name;
  isl_printer *p;
  char *io_mark;
  isl_bool is_io_mark;

  if (!node)
    return isl_bool_error;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
    return isl_bool_false;

  mark = isl_schedule_node_mark_get_id(node);
  if (!mark)
    return isl_bool_error;

  /* Build the expected mark name "io_L<io_level>". */
  p = isl_printer_to_str(isl_schedule_node_get_ctx(node));
  p = isl_printer_print_str(p, "io_L");
  p = isl_printer_print_int(p, io_level);
  io_mark = isl_printer_get_str(p);
  isl_printer_free(p);

  /* "name" is owned by "mark", so compare before releasing "mark". */
  name = isl_id_get_name(mark);
  if (!io_mark)
    is_io_mark = isl_bool_error;
  else if (!name)
    is_io_mark = isl_bool_false;
  else
    is_io_mark = !strcmp(name, io_mark) ? isl_bool_true : isl_bool_false;

  free(io_mark);
  isl_id_free(mark);

  return is_io_mark;
}

/* Check whether "node" itself or one of its ancestors (the root
 * excluded) is a mark node called "simd".
 * Return 1 if so and 0 otherwise.
 */
int is_node_under_simd(__isl_keep isl_schedule_node *node)
{
  int found = 0;
  isl_schedule_node *walker = isl_schedule_node_copy(node);

  /* Climb towards the root, inspecting every mark on the way. */
  while (isl_schedule_node_has_parent(walker))
  {
    if (isl_schedule_node_get_type(walker) == isl_schedule_node_mark)
    {
      isl_id *mark_id = isl_schedule_node_mark_get_id(walker);
      const int is_simd = strcmp(isl_id_get_name(mark_id), "simd") == 0;

      isl_id_free(mark_id);
      if (is_simd)
      {
        found = 1;
        break;
      }
    }
    walker = isl_schedule_node_parent(walker);
  }

  isl_schedule_node_free(walker);

  return found;
}

/* Check whether "node" itself or one of its ancestors (the root
 * excluded) is a mark node called "latency".
 * Return 1 if so and 0 otherwise.
 */
int is_node_under_latency(__isl_keep isl_schedule_node *node)
{
  int found = 0;
  isl_schedule_node *walker = isl_schedule_node_copy(node);

  /* Climb towards the root, inspecting every mark on the way. */
  while (isl_schedule_node_has_parent(walker))
  {
    if (isl_schedule_node_get_type(walker) == isl_schedule_node_mark)
    {
      isl_id *mark_id = isl_schedule_node_mark_get_id(walker);
      const int is_latency = strcmp(isl_id_get_name(mark_id), "latency") == 0;

      isl_id_free(mark_id);
      if (is_latency)
      {
        found = 1;
        break;
      }
    }
    walker = isl_schedule_node_parent(walker);
  }

  isl_schedule_node_free(walker);

  return found;
}

/* Compute, for each member of the band node "node", an upper bound on
 * the values taken by the partial schedule over the domain elements
 * reaching the node, and return these bounds in an array with one
 * entry per band member.
 * Entry "i" is set to compute_set_max(set, i) + 1, which is presumably
 * the maximal value of dimension "i" plus one (see compute_set_max()).
 *
 * The array is allocated with malloc() and should be released by the
 * caller using free().  NULL is returned if the allocation fails or
 * if the number of band members cannot be determined.
 */
int *extract_band_upper_bounds(__isl_keep isl_schedule_node *node)
{
  isl_union_map *umap;
  isl_union_set *uset;
  isl_set *set;
  int *ubs;
  int n;

  /* Collect the schedule values taken over the reaching domain. */
  umap = isl_schedule_node_band_get_partial_schedule_union_map(node);
  uset = isl_schedule_node_get_domain(node);
  umap = isl_union_map_intersect_domain(umap, uset);
  uset = isl_union_map_range(umap);
  set = isl_set_from_union_set(uset);

  n = isl_schedule_node_band_n_member(node);
  /* Never hand a negative (error) count to malloc(). */
  ubs = (n < 0) ? NULL : (int *)malloc((size_t)n * sizeof(int));
  if (ubs) {
    for (int i = 0; i < n; i++) {
      ubs[i] = compute_set_max(set, i) + 1;
    }
  }
  isl_set_free(set);

  return ubs;
}

/* Construct an isl_multi_aff in "space" whose elements are
 * the parameters with the identifiers listed in "names", in that order.
 * Parameters that are not yet part of "space" are added to it first.
 */
static __isl_give isl_multi_aff *parameter_vector(__isl_take isl_space *space,
                                                  __isl_keep isl_id_list *names)
{
  isl_local_space *domain_ls;
  isl_multi_aff *result;
  int n_name;
  int idx;

  if (!names)
    space = isl_space_free(space);

  n_name = isl_id_list_n_id(names);

  /* First pass: make sure every name is a parameter of "space". */
  for (idx = 0; idx < n_name; ++idx)
  {
    isl_id *id = isl_id_list_get_id(names, idx);

    if (isl_space_find_dim_by_id(space, isl_dim_param, id) >= 0)
    {
      isl_id_free(id);
      continue;
    }

    int new_pos = isl_space_dim(space, isl_dim_param);
    space = isl_space_add_dims(space, isl_dim_param, 1);
    space = isl_space_set_dim_id(space, isl_dim_param, new_pos, id);
  }

  result = isl_multi_aff_zero(isl_space_copy(space));
  domain_ls = isl_local_space_from_space(isl_space_domain(space));

  /* Second pass: let element "idx" be the parameter called names[idx]. */
  idx = 0;
  while (idx < n_name)
  {
    isl_id *id = isl_id_list_get_id(names, idx);
    int param_pos = isl_space_find_dim_by_id(space, isl_dim_param, id);
    isl_aff *param_aff;

    isl_id_free(id);
    param_aff = isl_aff_var_on_domain(isl_local_space_copy(domain_ls),
                                      isl_dim_param, param_pos);
    result = isl_multi_aff_set_aff(result, idx, param_aff);
    ++idx;
  }
  isl_local_space_free(domain_ls);

  return result;
}

/* Return the subset of the domain elements of "node" for which
 * the partial schedule of the band node "node" is equal to the sequence
 * of parameters called "names".
 * The band is expected to have at most as many members as there are
 * elements in "names".  If it has fewer, the partial schedule is
 * padded in front with zeros, i.e., the first elements of "names"
 * are equated to zero.
 * If "names" is empty, the universe domain of "node" is returned.
 */
__isl_give isl_union_set *set_schedule_eq(
    __isl_keep isl_schedule_node *node, __isl_keep isl_id_list *names)
{
  if (!node)
    return NULL;

  const int n_name = isl_id_list_n_id(names);
  if (n_name == 0)
    return isl_schedule_node_get_universe_domain(node);

  /* Number of leading zeros needed to match the length of "names". */
  const int n_pad = n_name - isl_schedule_node_band_n_member(node);

  isl_multi_union_pw_aff *sched = isl_schedule_node_band_get_partial_schedule(node);

  /* Build an "n_pad"-dimensional zero function over the parameters. */
  isl_space *pad_space = isl_multi_union_pw_aff_get_space(sched);
  pad_space = isl_space_params(pad_space);
  pad_space = isl_space_set_from_params(pad_space);
  pad_space = isl_space_add_dims(pad_space, isl_dim_set, n_pad);
  isl_multi_aff *zero_ma = isl_multi_aff_zero(pad_space);

  /* Prefix the partial schedule with the zeros. */
  isl_union_set *universe = isl_schedule_node_get_universe_domain(node);
  isl_multi_union_pw_aff *zero_part =
      isl_multi_union_pw_aff_multi_aff_on_domain(
          isl_union_set_copy(universe), zero_ma);
  sched = isl_multi_union_pw_aff_range_product(zero_part, sched);

  /* Subtract the parameter vector and keep the elements mapped to zero. */
  isl_space *sched_space = isl_multi_union_pw_aff_get_space(sched);
  isl_multi_aff *param_ma = parameter_vector(sched_space, names);
  isl_multi_union_pw_aff *param_part =
      isl_multi_union_pw_aff_multi_aff_on_domain(universe, param_ma);
  sched = isl_multi_union_pw_aff_sub(sched, param_part);

  return isl_multi_union_pw_aff_zero_union_set(sched);
}

/* Return the complement of set_schedule_eq() within the domain of
 * the partial schedule of the band node "node", i.e., the elements of
 * that domain that are not part of the set returned by
 * set_schedule_eq() for the same "node" and "names".
 */
__isl_give isl_union_set *set_schedule_neq(
    __isl_keep isl_schedule_node *node, __isl_keep isl_id_list *names)
{
  if (!node)
    return NULL;

  isl_union_set *eq_part = set_schedule_eq(node, names);
  isl_union_map *band_sched =
      isl_schedule_node_band_get_partial_schedule_union_map(node);
  isl_union_set *band_domain = isl_union_map_domain(band_sched);

  return isl_union_set_subtract(band_domain, eq_part);
}

/* Build the schedule constraints for "prog" from the dependences
 * in prog->scop and the array order dependences in prog->array_order.
 *
 * Without live range reordering, the union of the flow and the false
 * dependences is used as validity, proximity and coincidence
 * constraints alike.
 *
 * With live range reordering, live ranges on arrays should not be run
 * in parallel since that would require array expansion, so the array
 * order dependences are added to the coincidence constraints.
 * Live ranges derived from scalars may be run in parallel since
 * the scalars are forced to be mapped to private memory in
 * check_scalar_live_ranges.
 * The false dependences are left out of the validity constraints
 * because they would prevent reordering.  Instead, the forced
 * (external false) dependences, which make sure that reads of
 * potentially live-in data precede any later write and that writes of
 * potentially live-out data follow any earlier write, are added to
 * the validity and the coincidence constraints.
 * The false dependences are still added to the proximity constraints
 * for consistency with the case without live range reordering.
 * The coincidence constraints are then formed by the flow dependences,
 * the forced dependences and the array order dependences.
 * The independences are removed from the first two; they have already
 * been removed from the array order dependences on a per array basis
 * in collect_order_dependences.
 *
 * In both cases, if the AutoSA option is set, the RAR dependences are
 * added to the validity constraints (only).
 */
static __isl_give isl_schedule_constraints *construct_schedule_constraints(
    struct autosa_prog *prog)
{
  isl_union_map *validity, *proximity, *coincidence;
  isl_schedule_constraints *sc;

  sc = isl_schedule_constraints_on_domain(
      isl_union_set_copy(prog->scop->domain));
  sc = isl_schedule_constraints_set_context(
      sc, isl_set_copy(prog->scop->context));

  if (!prog->scop->options->live_range_reordering)
  {
    /* Flow and false dependences serve all three purposes. */
    isl_union_map *flow_dep = isl_union_map_copy(prog->scop->dep_flow);
    isl_union_map *all_dep = isl_union_map_copy(prog->scop->dep_false);

    all_dep = isl_union_map_union(all_dep, flow_dep);
    all_dep = isl_union_map_coalesce(all_dep);
    proximity = isl_union_map_copy(all_dep);
    coincidence = isl_union_map_copy(all_dep);
    validity = all_dep;
  }
  else
  {
    sc = isl_schedule_constraints_set_conditional_validity(
        sc,
        isl_union_map_copy(prog->scop->tagged_dep_flow),
        isl_union_map_copy(prog->scop->tagged_dep_order));

    /* validity = flow + forced */
    proximity = isl_union_map_copy(prog->scop->dep_flow);
    validity = isl_union_map_copy(proximity);
    validity = isl_union_map_union(
        validity, isl_union_map_copy(prog->scop->dep_forced));

    /* proximity = flow + false */
    proximity = isl_union_map_union(
        proximity, isl_union_map_copy(prog->scop->dep_false));

    /* coincidence = (flow + forced - independence) + array order */
    coincidence = isl_union_map_copy(validity);
    coincidence = isl_union_map_subtract(
        coincidence, isl_union_map_copy(prog->scop->independence));
    coincidence = isl_union_map_union(
        coincidence, isl_union_map_copy(prog->array_order));
  }

  /* Add the RAR into the validity constraints for AutoSA. */
  if (prog->scop->options->autosa->autosa)
  {
    validity = isl_union_map_union(
        validity, isl_union_map_copy(prog->scop->dep_rar));
  }

  sc = isl_schedule_constraints_set_validity(sc, validity);
  sc = isl_schedule_constraints_set_coincidence(sc, coincidence);
  sc = isl_schedule_constraints_set_proximity(sc, proximity);

  return sc;
}

/* Compute a schedule for the scop of "gen".
 *
 * The schedule constraints are derived from the dependences in
 * gen->prog->scop through construct_schedule_constraints() and passed,
 * together with the input schedule gen->prog->scop->schedule and
 * the options, to ppcg_compute_schedule().
 * During the schedule construction, some statement instances
 * may be grouped first based on the input schedule.
 */
__isl_give isl_schedule *compute_schedule(struct autosa_gen *gen)
{
  isl_schedule_constraints *sc = construct_schedule_constraints(gen->prog);

  return ppcg_compute_schedule(sc, gen->prog->scop->schedule, gen->options);
}

/* Mark the band node "node" permutable if it has exactly one member;
 * leave it untouched otherwise.  "sc" is not used.
 */
static __isl_give isl_schedule_node *band_set_permutable(
    __isl_take isl_schedule_node *node,
    __isl_keep isl_schedule_constraints *sc)
{
  if (isl_schedule_node_band_n_member(node) != 1)
    return node;

  return isl_schedule_node_band_set_permutable(node, 1);
}

/* Return the coincidence constraints of "sc" that relate pairs of
 * instances scheduled together by the ancestors of "node", i.e.,
 * pairs that are assigned the same value by the prefix schedule.
 * At schedule depth zero the prefix schedule carries no information,
 * so domain and range of the constraints are instead intersected with
 * the domain elements that reach "node".
 */
static __isl_give isl_union_map *get_local_coincidence(
    __isl_keep isl_schedule_node *node,
    __isl_keep isl_schedule_constraints *sc)
{
  isl_union_map *local = isl_schedule_constraints_get_coincidence(sc);
  isl_union_pw_multi_aff *contraction =
      isl_schedule_node_get_subtree_contraction(node);

  if (isl_schedule_node_get_schedule_depth(node) != 0)
  {
    /* Keep the pairs with equal prefix schedule value. */
    isl_multi_union_pw_aff *prefix =
        isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node);

    prefix = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(prefix,
                                                                contraction);
    return isl_union_map_eq_at_multi_union_pw_aff(local, prefix);
  }

  /* Depth zero: restrict both sides to the reaching domain elements. */
  isl_union_set *reaching = isl_schedule_node_get_domain(node);

  reaching = isl_union_set_preimage_union_pw_multi_aff(reaching, contraction);
  local = isl_union_map_intersect_domain(local, isl_union_set_copy(reaching));
  local = isl_union_map_intersect_range(local, reaching);

  return local;
}

/* Determine, for each member of the band node "node", whether it is
 * coincident with respect to the outer nodes and mark it accordingly.
 *
 * A member is coincident if every local coincidence constraint
 * (see get_local_coincidence()) relates instances that are assigned
 * the same value by that member.  This is tested by checking that
 * imposing equal member values does not remove any element from
 * the set of local coincidence constraints.
 * If one of the tests fails with an error, "node" is freed and
 * NULL is returned.
 */
static __isl_give isl_schedule_node *band_set_coincident(
    __isl_take isl_schedule_node *node,
    __isl_keep isl_schedule_constraints *sc)
{
  isl_union_map *local = get_local_coincidence(node, sc);
  isl_multi_union_pw_aff *band_sched =
      isl_schedule_node_band_get_partial_schedule(node);
  isl_union_pw_multi_aff *contraction =
      isl_schedule_node_get_subtree_contraction(node);
  int failed = 0;

  band_sched = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(band_sched,
                                                                  contraction);

  const int n_member = isl_schedule_node_band_n_member(node);
  for (int pos = 0; pos < n_member; ++pos)
  {
    /* Restrict the constraints to pairs with equal value at "pos". */
    isl_union_pw_aff *member =
        isl_multi_union_pw_aff_get_union_pw_aff(band_sched, pos);
    isl_multi_union_pw_aff *member_sched =
        isl_multi_union_pw_aff_from_union_pw_aff(member);
    isl_union_map *equal_at_pos = isl_union_map_eq_at_multi_union_pw_aff(
        isl_union_map_copy(local), member_sched);
    const int is_subset = isl_union_map_is_subset(local, equal_at_pos);

    isl_union_map_free(equal_at_pos);

    if (is_subset < 0)
    {
      failed = 1;
      break;
    }
    node = isl_schedule_node_band_member_set_coincident(node, pos,
                                                        is_subset);
  }
  if (failed)
    node = isl_schedule_node_free(node);

  isl_multi_union_pw_aff_free(band_sched);
  isl_union_map_free(local);

  return node;
}

/* Callback that sets the properties of "node" if it is a non-empty band.
 *
 * A band with exactly one member is marked permutable and the band
 * members are marked coincident based on the coincidence constraints
 * of the isl_schedule_constraints object passed in through "user".
 * Any other node is returned unchanged.
 */
static __isl_give isl_schedule_node *set_band_properties(
    __isl_take isl_schedule_node *node, void *user)
{
  isl_schedule_constraints *sc = (isl_schedule_constraints *)user;
  const bool is_band =
      isl_schedule_node_get_type(node) == isl_schedule_node_band;

  if (!is_band || isl_schedule_node_band_n_member(node) == 0)
    return node;

  node = band_set_permutable(node, sc);
  return band_set_coincident(node, sc);
}

/* Return a copy of the original schedule of the scop of "gen" with
 * the band properties filled in by set_band_properties(): single
 * member bands are marked permutable and band members are marked
 * coincident based on the coincidence constraints.
 * The bands are explicitly marked permutable so that they will be
 * considered by mark_outer_permutable.
 */
static __isl_give isl_schedule *determine_properties_original_schedule(
    struct autosa_gen *gen)
{
  isl_schedule_constraints *sc = construct_schedule_constraints(gen->prog);
  isl_schedule *annotated = isl_schedule_copy(gen->prog->scop->schedule);

  annotated = isl_schedule_map_schedule_node_bottom_up(
      annotated, &set_band_properties, sc);
  isl_schedule_constraints_free(sc);

  return annotated;
}

/* Depending on the "reschedule" option, either compute a new schedule
 * or determine the properties of the original schedule.
 * "user" points to the struct autosa_gen.
 */
static __isl_give isl_schedule *compute_or_set_properties(void *user)
{
  struct autosa_gen *gen = (struct autosa_gen *)user;

  return gen->options->reschedule
             ? compute_schedule(gen)
             : determine_properties_original_schedule(gen);
}

/* Obtain a schedule for the scop of "gen" through ppcg_get_schedule(),
 * which either reads it from a file or falls back on
 * compute_or_set_properties() to compute one or to determine
 * the properties of the original schedule.
 */
__isl_give isl_schedule *get_schedule(struct autosa_gen *gen)
{
  isl_schedule *schedule;

  schedule = ppcg_get_schedule(gen->ctx, gen->options,
                               &compute_or_set_properties, gen);
  return schedule;
}

/* Check whether all validity constraints of "sc" have a non-negative
 * distance along every member of the band node "node".
 *
 * The validity constraints are restricted to the domain elements
 * reaching "node" and, for each band member, it is checked that
 * every remaining constraint relates a pair of instances for which
 * the member value at the source is smaller than or equal to
 * the value at the sink.
 * The prefix schedule is not taken into account, so this test is
 * only meant for the outermost band(s).
 * isl_bool_false is returned if "node" is not a band, if it has no
 * members, or if one of the subset tests fails (including on error).
 */
static isl_bool is_dep_non_neg_at_node(
  __isl_keep isl_schedule_node *node, __isl_keep isl_schedule_constraints *sc)
{
  if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
    return isl_bool_false;
  if (isl_schedule_node_band_n_member(node) == 0)
    return isl_bool_false;

  /* Validity constraints between the reaching domain elements. */
  isl_union_map *local = isl_schedule_constraints_get_validity(sc);
  isl_union_pw_multi_aff *contraction =
    isl_schedule_node_get_subtree_contraction(node);
  isl_union_set *reaching = isl_schedule_node_get_domain(node);
  reaching = isl_union_set_preimage_union_pw_multi_aff(reaching, contraction);
  local = isl_union_map_intersect_domain(local, isl_union_set_copy(reaching));
  local = isl_union_map_intersect_range(local, reaching);

  /* Partial schedule of the band, expressed on the original instances. */
  isl_multi_union_pw_aff *band_sched =
    isl_schedule_node_band_get_partial_schedule(node);
  contraction = isl_schedule_node_get_subtree_contraction(node);
  band_sched = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(band_sched,
                                                                  contraction);

  const int n_member = isl_schedule_node_band_n_member(node);
  int pos = 0;
  while (pos < n_member)
  {
    isl_union_pw_aff *member =
      isl_multi_union_pw_aff_get_union_pw_aff(band_sched, pos);
    isl_multi_union_pw_aff *member_sched =
      isl_multi_union_pw_aff_from_union_pw_aff(member);

    /* Constraints with equal or increasing value at member "pos". */
    isl_union_map *same = isl_union_map_eq_at_multi_union_pw_aff(
      isl_union_map_copy(local), isl_multi_union_pw_aff_copy(member_sched));
    isl_union_map *forward = isl_union_map_lex_lt_at_multi_union_pw_aff(
      isl_union_map_copy(local), member_sched);
    isl_union_map *non_neg = isl_union_map_union(same, forward);

    const int is_subset = isl_union_map_is_subset(local, non_neg);
    isl_union_map_free(non_neg);

    if (is_subset <= 0)
      break;
    ++pos;
  }

  isl_multi_union_pw_aff_free(band_sched);
  isl_union_map_free(local);

  /* All members passed iff the loop ran to completion. */
  return (pos == n_member) ? isl_bool_true : isl_bool_false;
}

/* Try to merge the outer bands of "schedule" into a single band.
 *
 * Starting from the first child of the root, the chain of directly
 * nested band nodes is followed.  The first band on which all
 * dependence distances are non-negative (see is_dep_non_neg_at_node())
 * is kept as is; every later band with that property is merged
 * with its parent through autosa_node_merge().
 * The descent stops as soon as a non-band node is encountered.
 * Afterwards, if the parent of that node is a band, its members
 * are marked coincident based on the coincidence constraints.
 */
__isl_give isl_schedule *merge_outer_bands(__isl_take isl_schedule *schedule, struct autosa_gen *gen)
{
  bool seen_non_neg_band = false;
  isl_schedule_node *node = isl_schedule_get_root(schedule);
  isl_schedule_constraints *sc;

  isl_schedule_free(schedule);
  sc = construct_schedule_constraints(gen->prog);

  /* Follow the chain of band nodes below the domain node. */
  for (node = isl_schedule_node_child(node, 0);
       isl_schedule_node_get_type(node) == isl_schedule_node_band;
       node = isl_schedule_node_child(node, 0)) {
    if (!is_dep_non_neg_at_node(node, sc))
      continue;
    if (!seen_non_neg_band) {
      seen_non_neg_band = true;
      continue;
    }
    /* Merge the node with the parent band node. */
    node = isl_schedule_node_parent(node);
    node = autosa_node_merge(node); // TODO: delete the partial schedule space name
  }

  /* Set the coincidence on the band right above the stopping point. */
  node = isl_schedule_node_parent(node);
  if (isl_schedule_node_get_type(node) == isl_schedule_node_band)
    node = band_set_coincident(node, sc);

  isl_schedule *merged = isl_schedule_node_get_schedule(node);
  isl_schedule_node_free(node);
  isl_schedule_constraints_free(sc);

  return merged;
}

/* Check, by means of is_marked(), whether "node" is a mark node
 * with an identifier called "array".
 */
static int node_is_array(__isl_keep isl_schedule_node *node)
{
  const int has_mark = is_marked(node, "array");

  return has_mark;
}

/* Check whether "node" is a mark node whose identifier is "anchor".
 * The result of is_marked() is passed on unchanged.
 */
static int node_is_anchor(__isl_keep isl_schedule_node *node)
{
  int marked = is_marked(node, "anchor");

  return marked;
}

/* Check whether "node" is a mark node whose identifier is "local".
 * The result of is_marked() is passed on unchanged.
 */
static int node_is_local(__isl_keep isl_schedule_node *node)
{
  int marked = is_marked(node, "local");

  return marked;
}

/* Check whether "node" is a mark node whose identifier is "pe".
 * The result of is_marked() is passed on unchanged.
 */
static int node_is_pe(__isl_keep isl_schedule_node *node)
{
  int marked = is_marked(node, "pe");

  return marked;
}

/* Check whether "node" is a mark node whose identifier is "kernel".
 * The result of is_marked() is passed on unchanged.
 */
static int node_is_kernel(__isl_keep isl_schedule_node *node)
{
  int marked = is_marked(node, "kernel");

  return marked;
}

/* Check whether "node" is a mark node whose identifier is the string
 * passed in "mark".
 * The result of is_marked() is passed on unchanged.
 */
static int node_is_mark(__isl_keep isl_schedule_node *node, const char *mark)
{
  int marked = is_marked(node, mark);

  return marked;
}

/* Is "node" a mark node with an identifier called "io_L[x]", i.e.,
 * an identifier whose name starts with "io_L"?
 *
 * Return 1 if it is, 0 if it is not (including the case where "node"
 * is not a mark node or its identifier has no name) and -1 on error.
 */
static int node_is_io_mark(__isl_keep isl_schedule_node *node)
{
  static const char prefix[] = "io_L";
  isl_id *mark;
  const char *name;
  int is_io;

  if (!node)
    return -1;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
    return 0;

  mark = isl_schedule_node_mark_get_id(node);
  if (!mark)
    return -1;

  name = isl_id_get_name(mark);
  /* strncmp() returns 0 on a match, so its result has to be negated
   * to obtain a "yes"; returning it directly would report io marks as
   * "no" and other marks as "yes" or as an error.
   */
  is_io = name && !strncmp(name, prefix, strlen(prefix));

  isl_id_free(mark);

  return is_io;
}

/* Given that "node" is a filter node, check whether its filter shares
 * any element with "core", i.e., whether it is the branch that holds
 * the "array" mark.
 *
 * Return 1 if the filter intersects "core", 0 if the two are disjoint
 * and -1 if the disjointness test fails.
 */
static int node_is_core(__isl_keep isl_schedule_node *node,
                        __isl_keep isl_union_set *core)
{
  isl_union_set *node_filter = isl_schedule_node_filter_get_filter(node);
  int no_overlap = isl_union_set_is_disjoint(node_filter, core);

  isl_union_set_free(node_filter);

  return (no_overlap < 0) ? -1 : !no_overlap;
}

/* Step from "node" into the child branch that contains the domain
 * elements in "core".
 *
 * A node that is not a sequence is simply replaced by its first child.
 * For a sequence node, the filter children are inspected in order and
 * the child of the first filter that intersects "core" is returned.
 * If node_is_core() fails or no filter matches, "node" is freed and
 * the result of isl_schedule_node_free() is returned.
 */
static __isl_give isl_schedule_node *core_child(
    __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
{
  int n_child;
  int pos = 0;

  if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence)
    return isl_schedule_node_child(node, 0);

  n_child = isl_schedule_node_n_children(node);
  while (pos < n_child)
  {
    int in_core;

    /* Visit the filter at position "pos". */
    node = isl_schedule_node_child(node, pos);
    in_core = node_is_core(node, core);

    if (in_core < 0)
      return isl_schedule_node_free(node);
    if (in_core > 0)
      return isl_schedule_node_child(node, 0);

    /* Not this one; go back up to the sequence and try the next filter. */
    node = isl_schedule_node_parent(node);
    ++pos;
  }

  isl_die(isl_schedule_node_get_ctx(node), isl_error_internal,
          "core child not found", return isl_schedule_node_free(node));
}

/* Descend from the "kernel" mark (or from any node whose schedule depth
 * does not exceed "depth") to a band node at schedule depth "depth".
 * The "array" mark is assumed to sit at a schedule depth of at least
 * "depth"; the branch holding it is selected through the domain
 * elements in "core".
 *
 * When "depth" falls strictly inside a band node, that band is split
 * so that its second part starts exactly at "depth".
 * Once the depth has been reached, the descent continues until
 * a band node, a "local" mark or an "array" mark is found.
 * If a mark test fails, the node is freed.
 */
__isl_give isl_schedule_node *autosa_tree_move_down_to_depth(
    __isl_take isl_schedule_node *node, int depth,
    __isl_keep isl_union_set *core)
{
  int at_local;
  int at_array = 0;

  /* Phase 1: descend until the requested schedule depth is reached. */
  while (node && isl_schedule_node_get_schedule_depth(node) < depth)
  {
    if (isl_schedule_node_get_type(node) == isl_schedule_node_band)
    {
      int start = isl_schedule_node_get_schedule_depth(node);
      int width = isl_schedule_node_band_n_member(node);

      /* The band straddles "depth": cut it at that position. */
      if (start + width > depth)
        node = isl_schedule_node_band_split(node, depth - start);
    }
    node = core_child(node, core);
  }

  /* Phase 2: skip nodes that are neither a band nor one of the marks. */
  for (;;)
  {
    at_local = node_is_local(node);
    if (at_local != 0)
      break;
    at_array = node_is_array(node);
    if (at_array != 0)
      break;
    if (isl_schedule_node_get_type(node) == isl_schedule_node_band)
      break;
    node = core_child(node, core);
  }

  if (at_local < 0 || at_array < 0)
    node = isl_schedule_node_free(node);

  return node;
}

/* Descend along the branch selected by the domain elements in "core"
 * until the "array" mark is found.
 * If the mark test fails, the node is freed.
 */
__isl_give isl_schedule_node *autosa_tree_move_down_to_array(
    __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
{
  int found;

  for (found = node_is_array(node); found == 0; found = node_is_array(node))
    node = core_child(node, core);

  return (found < 0) ? isl_schedule_node_free(node) : node;
}

/* Starting from a node underneath the "array" mark, climb the tree
 * until the "array" mark is found.
 * If the mark test fails, the node is freed.
 */
__isl_give isl_schedule_node *autosa_tree_move_up_to_array(
    __isl_take isl_schedule_node *node)
{
  int found;

  for (found = node_is_array(node); found == 0; found = node_is_array(node))
    node = isl_schedule_node_parent(node);

  return (found < 0) ? isl_schedule_node_free(node) : node;
}

/* Descend from a node between the "kernel" and "local" marks until
 * the "local" mark is found.  The branch to follow is selected
 * through the domain elements in "core".
 * If the mark test fails, the node is freed.
 */
__isl_give isl_schedule_node *autosa_tree_move_down_to_local(
    __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
{
  int found;

  for (found = node_is_local(node); found == 0; found = node_is_local(node))
    node = core_child(node, core);

  return (found < 0) ? isl_schedule_node_free(node) : node;
}

/* Descend until the "kernel" mark is found.
 * AutoSA identifies a single kernel only, located on the linear branch
 * below the domain node, so it is sufficient to always follow
 * the first child.
 * If the mark test fails, the node is freed.
 */
__isl_give isl_schedule_node *autosa_tree_move_down_to_kernel(
    __isl_take isl_schedule_node *node)
{
  int found;

  for (found = node_is_kernel(node); found == 0; found = node_is_kernel(node))
    node = isl_schedule_node_child(node, 0);

  return (found < 0) ? isl_schedule_node_free(node) : node;
}

/* Starting from a node underneath the "kernel" mark, climb the tree
 * until the "kernel" mark is found.
 * If the mark test fails, the node is freed.
 */
__isl_give isl_schedule_node *autosa_tree_move_up_to_kernel(
    __isl_take isl_schedule_node *node)
{
  int found;

  for (;;)
  {
    found = autosa_tree_node_is_kernel(node);
    if (found != 0)
      break;
    node = isl_schedule_node_parent(node);
  }

  return (found < 0) ? isl_schedule_node_free(node) : node;
}

/* Descend from a node between the "kernel" and "pe" marks until
 * the "pe" mark is found.  The branch to follow is selected
 * through the domain elements in "core".
 * If the mark test fails, the node is freed.
 */
__isl_give isl_schedule_node *autosa_tree_move_down_to_pe(
    __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
{
  int found;

  for (found = node_is_pe(node); found == 0; found = node_is_pe(node))
    node = core_child(node, core);

  return (found < 0) ? isl_schedule_node_free(node) : node;
}

/* Starting from a node underneath the "pe" mark, climb the tree
 * until the "pe" mark is found.
 * If the mark test fails, the node is freed.
 */
__isl_give isl_schedule_node *autosa_tree_move_up_to_pe(
    __isl_take isl_schedule_node *node)
{
  int found;

  for (found = node_is_pe(node); found == 0; found = node_is_pe(node))
    node = isl_schedule_node_parent(node);

  return (found < 0) ? isl_schedule_node_free(node) : node;
}

/* Descend until a mark node whose identifier is the string "mark"
 * is found.  The branch to follow is selected through the domain
 * elements in "core".
 * If the mark test fails, the node is freed.
 */
__isl_give isl_schedule_node *autosa_tree_move_down_to_mark(
    __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core, const char *mark)
{
  int found;

  for (found = node_is_mark(node, mark); found == 0;
       found = node_is_mark(node, mark))
    node = core_child(node, core);

  return (found < 0) ? isl_schedule_node_free(node) : node;
}

/* Climb the tree until a mark node whose identifier is the string
 * "mark" is found.
 * If the mark test fails, the node is freed.
 */
__isl_give isl_schedule_node *autosa_tree_move_up_to_mark(
    __isl_take isl_schedule_node *node, const char *mark)
{
  int found;

  for (found = node_is_mark(node, mark); found == 0;
       found = node_is_mark(node, mark))
    node = isl_schedule_node_parent(node);

  return (found < 0) ? isl_schedule_node_free(node) : node;
}

/* Descend from a node between the "kernel" and "pe" marks towards
 * the first "io_L[x]" mark.  The branch to follow is selected through
 * the domain elements in "core".
 * The descent continues for as long as node_is_io_mark() returns 0;
 * if it returns a negative value, the node is freed.
 */
__isl_give isl_schedule_node *autosa_tree_move_down_to_first_io_mark(
    __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
{
  int found;

  for (found = node_is_io_mark(node); found == 0;
       found = node_is_io_mark(node))
    node = core_child(node, core);

  return (found < 0) ? isl_schedule_node_free(node) : node;
}

/* Move down the branch between "kernel" and "pe" until
 * the "io_L[io_level]" mark is reached, where the branch containing the io
 * mark is identified by the domain elements in "core".
 *
 * The descent stops at the first node without children.
 * If the mark is not found, or if an error occurs, "node" is freed
 * and NULL is returned.
 */
__isl_give isl_schedule_node *autosa_tree_move_down_to_io_mark(
    __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core, int io_level)
{
  int is_mark;
  isl_printer *p;
  char *mark;

  if (!node)
    return NULL;

  /* Build the name of the requested mark: "io_L<io_level>". */
  p = isl_printer_to_str(isl_schedule_node_get_ctx(node));
  p = isl_printer_print_str(p, "io_L");
  p = isl_printer_print_int(p, io_level);
  mark = isl_printer_get_str(p);
  isl_printer_free(p);
  /* Without a name there is nothing to compare the marks against. */
  if (!mark)
    return isl_schedule_node_free(node);

  while ((is_mark = node_is_mark(node, mark)) == 0) {
    /* Bottom of the tree reached without finding the mark. */
    if (!isl_schedule_node_has_children(node))
      break;
    node = core_child(node, core);
  }

  /* Both "not found" (0) and "error" (< 0) result in a NULL node. */
  if (is_mark <= 0)
    node = isl_schedule_node_free(node);
  free(mark);

  return node;
}

/* Starting from a node underneath the "anchor" mark, climb the tree
 * until the "anchor" mark is found.
 * If the mark test fails, the node is freed.
 */
__isl_give isl_schedule_node *autosa_tree_move_up_to_anchor(
    __isl_take isl_schedule_node *node)
{
  int found;

  for (found = node_is_anchor(node); found == 0;
       found = node_is_anchor(node))
    node = isl_schedule_node_parent(node);

  return (found < 0) ? isl_schedule_node_free(node) : node;
}

/* Check whether "node" is a mark node whose identifier is "kernel".
 * The result of is_marked() is passed on unchanged.
 */
int autosa_tree_node_is_kernel(__isl_keep isl_schedule_node *node)
{
  int marked = is_marked(node, "kernel");

  return marked;
}

/* Check whether "node" is a mark node whose identifier is the string
 * passed in "mark".
 * If "mark" is NULL, any mark node qualifies, regardless of its identifier.
 */
int autosa_tree_node_is_mark(__isl_keep isl_schedule_node *node, const char *mark)
{
  if (mark != NULL)
    return is_marked(node, mark);

  /* No name requested: only the node type matters. */
  return (isl_schedule_node_get_type(node) == isl_schedule_node_mark);
}

/* Put a mark node carrying a fresh "local" identifier in front of "node"
 * and return the result of isl_schedule_node_insert_mark().
 */
static __isl_give isl_schedule_node *insert_local(
    __isl_take isl_schedule_node *node)
{
  isl_id *local_id;

  local_id = isl_id_alloc(isl_schedule_node_get_ctx(node), "local", NULL);

  return isl_schedule_node_insert_mark(node, local_id);
}

/* Add a "local" mark right in front of the "array" mark, unless
 * a "local" mark already appears on the linear branch between "node"
 * and the "array" mark.
 *
 * While walking down, this function also verifies that the subtree at
 * "node" does hold an "array" mark and that the path leading to it
 * never branches; otherwise an error is raised and the node is freed.
 * On success, the node at the original position of "node" is returned.
 */
__isl_give isl_schedule_node *autosa_tree_insert_local_before_array(
    __isl_take isl_schedule_node *node)
{
  int start_depth, end_depth;
  int has_local = 0;

  if (!node)
    return NULL;

  /* Remember where we started so that we can climb back afterwards. */
  start_depth = isl_schedule_node_get_tree_depth(node);

  while (1)
  {
    int at_array;
    int n_child;

    /* Once a "local" mark has been seen there is no need to look again. */
    if (has_local == 0)
    {
      has_local = node_is_local(node);
      if (has_local < 0)
        return isl_schedule_node_free(node);
    }

    at_array = node_is_array(node);
    if (at_array < 0)
      return isl_schedule_node_free(node);
    if (at_array)
      break;

    n_child = isl_schedule_node_n_children(node);
    if (n_child == 0)
    {
      isl_die(isl_schedule_node_get_ctx(node),
              isl_error_invalid,
              "no array marker found",
              return isl_schedule_node_free(node));
    }
    if (n_child > 1)
    {
      isl_die(isl_schedule_node_get_ctx(node),
              isl_error_invalid,
              "expecting single array marker",
              return isl_schedule_node_free(node));
    }

    node = isl_schedule_node_child(node, 0);
  }

  if (has_local == 0)
    node = insert_local(node);

  /* Climb back by the number of levels that were descended. */
  end_depth = isl_schedule_node_get_tree_depth(node);

  return isl_schedule_node_ancestor(node, end_depth - start_depth);
}


================================================
FILE: src/autosa_schedule_tree.h
================================================
/* Helpers for recognizing mark nodes in a schedule tree and for moving
 * between them.
 */
#ifndef _AUTOSA_SCHEDULE_TREE_H
#define _AUTOSA_SCHEDULE_TREE_H

#include <isl/schedule_node.h>

/* Mark node predicates.
 * autosa_tree_node_is_mark() accepts any mark node when "mark" is NULL.
 */
int autosa_tree_node_is_kernel(__isl_keep isl_schedule_node *node);
int autosa_tree_node_is_mark(__isl_keep isl_schedule_node *node, const char *mark);
/* NOTE(review): presumably tests for the "io_L[io_level]" mark; the
 * definition is not part of the schedule tree source shown with this
 * header - confirm.
 */
isl_bool isl_schedule_node_is_io_mark(__isl_keep isl_schedule_node *node, int io_level);

/* Tree navigation.
 * Each function takes ownership of "node" and returns the node that was
 * reached.  The "move_down" variants taking a "core" argument use its
 * domain elements to select the branch to follow at sequence nodes.
 */
__isl_give isl_schedule_node *autosa_tree_move_down_to_depth(
    __isl_take isl_schedule_node *node, int depth,
    __isl_keep isl_union_set *core);
__isl_give isl_schedule_node *autosa_tree_move_down_to_array(
    __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core);
__isl_give isl_schedule_node *autosa_tree_move_up_to_array(
    __isl_take isl_schedule_node *node);
__isl_give isl_schedule_node *autosa_tree_move_down_to_local(
    __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core);
__isl_give isl_schedule_node *autosa_tree_move_down_to_kernel(
    __isl_take isl_schedule_node *node);
__isl_give isl_schedule_node *autosa_tree_move_up_to_kernel(
    __isl_take isl_schedule_node *node);
__isl_give isl_schedule_node *autosa_tree_move_down_to_pe(
    __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core);
__isl_give isl_schedule_node *autosa_tree_move_up_to_pe(
    __isl_take isl_schedule_node *node);
__isl_give isl_schedule_node *autosa_tree_move_down_to_mark(
    __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core, const char *mark);
__isl_give isl_schedule_node *autosa_tree_move_up_to_mark(
    __isl_take isl_schedule_node *node, const char *mark);
__isl_give isl_schedule_node *autosa_tree_move_down_to_first_io_mark(
    __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core);
/* Returns NULL if the "io_L[io_level]" mark is not found. */
__isl_give isl_schedule_node *autosa_tree_move_down_to_io_mark(
    __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core, int io_level);
__isl_give isl_schedule_node *autosa_tree_move_up_to_anchor(
    __isl_take isl_schedule_node *node);

/* Insert a "local" mark in front of the "array" mark found underneath
 * "node", unless one is already present on the way down.
 */
__isl_give isl_schedule_node *autosa_tree_insert_local_before_array(
    __isl_take isl_schedule_node *node);

#endif


================================================
FILE: src/autosa_t2s.cpp
================================================


================================================
FILE: src/autosa_tapa_cpp.cpp
================================================
#include <isl/ctx.h>

#include "autosa_tapa_cpp.h"
#include "autosa_common.h"
#include "autosa_comm.h"
#include "autosa_print.h"
#include "autosa_trans.h"
#include "autosa_codegen.h"
#include "autosa_utils.h"

#include <set>

/* Bundle of pointers describing the code generation context
 * (HLS output info, program and top module).
 * NOTE(review): judging by the name, this is passed as user data while
 * printing host code; the users are outside this part of the file - confirm.
 */
struct print_host_user_data
{
  struct hls_info *hls;
  struct autosa_prog *prog;
  struct autosa_hw_top_module *top;
};

/* Bundle of pointers describing the code generation context
 * (HLS output info, program and hardware module).
 * NOTE(review): judging by the name, this is passed as user data while
 * printing a hardware module; the users are outside this part of the
 * file - confirm.
 */
struct print_hw_module_data
{
  struct hls_info *hls;
  struct autosa_prog *prog;
  struct autosa_hw_module *module;
  /* Used for double buffer codegen. Modify the printed iterator prefix. */
  const char *iterator_prefix;
};

/* Print the includes for TAPA host to "fp":
 * the <tapa.h> include followed by a using-declaration for
 * tapa::aligned_allocator.
 * Every emitted line is newline-terminated so that whatever is written
 * to "fp" next starts on a line of its own.
 */
static void print_tapa_host_header(FILE *fp)
{
  fprintf(fp, "#include <tapa.h>\n");
  fprintf(fp, "using tapa::aligned_allocator;\n");
}

/* Open one output file for writing.
 *
 * "name" holds the base name of the input in its first "len" characters
 * and "path" holds the output directory prefix in its first "len_dir"
 * characters; both buffers have room for PATH_MAX characters.
 * "suffix" is appended to the base name and the resulting file name is
 * appended to the directory prefix.  On return, "name" holds
 * "<base><suffix>" and "path" holds the complete path.
 *
 * Print an error message and exit if the path does not fit in the
 * buffers or if the file cannot be opened.
 */
static FILE *hls_open_output_file(char *path, int len_dir,
                                  char *name, int len, const char *suffix)
{
  FILE *fp;
  int n;

  if (len < 0 || len >= PATH_MAX || len_dir < 0 || len_dir >= PATH_MAX)
  {
    printf("[AutoSA] Error: The output file path is too long\n");
    exit(1);
  }

  n = snprintf(name + len, PATH_MAX - len, "%s", suffix);
  if (n < 0 || n >= PATH_MAX - len)
  {
    printf("[AutoSA] Error: The output file name is too long: %s\n", name);
    exit(1);
  }
  n = snprintf(path + len_dir, PATH_MAX - len_dir, "%s", name);
  if (n < 0 || n >= PATH_MAX - len_dir)
  {
    printf("[AutoSA] Error: The output file path is too long: %s\n", path);
    exit(1);
  }

  fp = fopen(path, "w");
  if (!fp)
  {
    printf("[AutoSA] Error: Can't open the file: %s\n", path);
    exit(1);
  }

  return fp;
}

/* Open the host .cpp and .h files, the kernel .h and .cpp files and
 * the top_gen .cpp and .h files for writing.
 * Add the necessary includes.
 *
 * All files are created in "<output_dir>/src/" and are named after the
 * base name of "input".  The program exits with an error message if any
 * of the files cannot be opened.
 */
static void hls_open_files(struct hls_info *info, const char *input)
{
  char name[PATH_MAX];
  char dir[PATH_MAX];
  int len, len_dir;
  isl_printer *p_str;
  char *file_path;

  p_str = isl_printer_to_str(info->ctx);
  p_str = isl_printer_print_str(p_str, info->output_dir);
  p_str = isl_printer_print_str(p_str, "/src/");
  file_path = isl_printer_get_str(p_str);
  isl_printer_free(p_str);
  if (!file_path)
  {
    printf("[AutoSA] Error: Can't construct the output directory path\n");
    exit(1);
  }
  len = ppcg_extract_base_name(name, input);
  /* Add the prefix */
  len_dir = snprintf(dir, sizeof(dir), "%s", file_path);
  free(file_path);
  if (len_dir < 0 || (size_t)len_dir >= sizeof(dir))
  {
    printf("[AutoSA] Error: The output directory path is too long\n");
    exit(1);
  }

  info->host_c = hls_open_output_file(dir, len_dir, name, len, "_host.cpp");
  info->host_h = hls_open_output_file(dir, len_dir, name, len, "_host.h");

  fprintf(info->host_h, "template <typename T1, typename T2> "
          "inline T1 min(T1 x, T2 y) { return (x < T1(y)) ? x : T1(y); }\n");
  fprintf(info->host_h, "template <typename T1, typename T2> "
          "inline T1 max(T1 x, T2 y) { return (x > T1(y)) ? x : T1(y); }\n");
  fprintf(info->host_h, "\n");
  print_tapa_host_header(info->host_h);
  /* "name" still refers to the host header at this point. */
  fprintf(info->host_c, "#include \"%s\"\n", name);

  info->kernel_c = hls_open_output_file(dir, len_dir, name, len,
                                        "_kernel_modules.cpp");
  info->kernel_h = hls_open_output_file(dir, len_dir, name, len, "_kernel.h");

  /* "name" now refers to the kernel header. */
  fprintf(info->host_c, "#include <assert.h>\n");
  fprintf(info->host_c, "#include <stdio.h>\n");
  fprintf(info->host_c, "#include \"%s\"\n\n", name);
  fprintf(info->kernel_c, "#include \"%s\"\n", name);

  info->top_gen_c = hls_open_output_file(dir, len_dir, name, len,
                                         "_top_gen.cpp");
  info->top_gen_h = hls_open_output_file(dir, len_dir, name, len,
                                         "_top_gen.h");

  /* "name" now refers to the top_gen header. */
  fprintf(info->top_gen_c, "#include <isl/printer.h>\n");
  fprintf(info->top_gen_c, "#include \"%s\"\n", name);

  fprintf(info->kernel_h, "#include <tapa.h>\n");
  fprintf(info->kernel_h, "#include <ap_int.h>\n");
  fprintf(info->kernel_h, "\n");

  fprintf(info->kernel_c, "template <typename T1, typename T2> "
          "inline T1 min(T1 x, T2 y) { return (x < T1(y)) ? x : T1(y); }\n");
  fprintf(info->kernel_c, "template <typename T1, typename T2> "
          "inline T1 max(T1 x, T2 y) { return (x > T1(y)) ? x : T1(y); }\n");
  fprintf(info->kernel_c, "\n");
}

/* Close all output files and create the empty marker file
 * "<output_dir>/src/completed".
 *
 * If the marker file cannot be created, an error message is printed;
 * the output files have already been closed at that point.
 */
static void hls_close_files(struct hls_info *info)
{
  isl_printer *p_str;
  char *complete;
  FILE *f;

  fclose(info->kernel_c);
  fclose(info->kernel_h);
  fclose(info->host_c);
  fclose(info->host_h);
  fclose(info->top_gen_c);
  fclose(info->top_gen_h);

  p_str = isl_printer_to_str(info->ctx);
  p_str = isl_printer_print_str(p_str, info->output_dir);
  p_str = isl_printer_print_str(p_str, "/src/completed");
  complete = isl_printer_get_str(p_str);
  isl_printer_free(p_str);
  if (!complete)
  {
    printf("[AutoSA] Error: Can't construct the path of the completion marker\n");
    return;
  }

  f = fopen(complete, "w");
  /* fclose() must not be called on a NULL stream. */
  if (f)
    fclose(f);
  else
    printf("[AutoSA] Error: Can't open the file: %s\n", complete);
  free(complete);
}

/* Insert "factor" into the array "data_pack_factors" of length "*n_factor",
 * which is kept in ascending order, unless it is already present.
 * Return the (possibly reallocated) array and update "*n_factor".
 * Print an error message and exit if the array cannot be extended.
 */
static int *insert_data_pack_factor(int *data_pack_factors,
                                    int *n_factor, int factor)
{
  int pos = 0;
  int *extended;

  /* Find the first element that is not smaller than "factor". */
  while (pos < *n_factor && data_pack_factors[pos] < factor)
    pos++;
  if (pos < *n_factor && data_pack_factors[pos] == factor)
    return data_pack_factors;

  extended = (int *)realloc(data_pack_factors,
                            sizeof(int) * (*n_factor + 1));
  if (!extended)
  {
    printf("[AutoSA] Error: Can't allocate memory for the data pack factors\n");
    exit(1);
  }
  data_pack_factors = extended;

  /* Shift the larger elements up by one to make room at "pos". */
  for (int j = *n_factor; j > pos; j--)
    data_pack_factors[j] = data_pack_factors[j - 1];
  data_pack_factors[pos] = factor;
  *n_factor = *n_factor + 1;

  return data_pack_factors;
}

/* Extract the data pack factors for each I/O buffer allocated for the current
 * I/O group.
 * Only insert the data pack factor that is not found in the current list
 * "data_pack_factors".
 * The list is in ascending order.
 *
 * The default packing factor of the group is only considered when it is
 * greater than one; the factors of the I/O buffers are always considered.
 * Return the (possibly reallocated) list and update "*n_factor".
 */
static int *extract_data_pack_factors(int *data_pack_factors,
                                      int *n_factor, struct autosa_array_ref_group *group)
{
  /* Test if the group default packing factor needs to be inserted */
  if (group->n_lane > 1)
    data_pack_factors = insert_data_pack_factor(data_pack_factors,
                                                n_factor, group->n_lane);

  for (int i = 0; i < group->n_io_buffer; i++)
  {
    struct autosa_io_buffer *buf = group->io_buffers[i];

    data_pack_factors = insert_data_pack_factor(data_pack_factors,
                                                n_factor, buf->n_lane);
  }

  return data_pack_factors;
}

/* Examine the local buffers of each array group.
 * Extract the data pack factors and build the data types
 * required by the program.
 */
static isl_stat print_data_types_tapa(
    struct autosa_hw_top_module *top, struct hls_info *hls)
{
  isl_printer *p;
  struct autosa_kernel *kernel;

  kernel = top->kernel;
  p = isl_printer_to_file(kernel->ctx, hls->kernel_h);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  p = print_str_new_line(p, "/* Data Type */");

  /* Print the primitive data type. */
  for (int i = 0; i < kernel->n_array; i++) {
    struct autosa_local_array_info *local = &kernel->array[i];
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "typedef ");
    p = isl_printer_print_str(p, local->array->type);
    p = isl_printer_print_str(p, " ");
    p = isl_printer_print_str(p, local->array->name);
    p = isl_printer_print_str(p, "_t1;");
    p = isl_printer_end_line(p);
  }

  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *local = &kernel->array[i];
    int *data_pack_factors = (int *)malloc(sizeof(int));
    int n_factor = 1;
    /* First insert the default data pack factor for the array. */
    data_pack_factors[0] = local->n_lane;

    /* IO group */
    for (int n = 0; n < local->n_io_group; n++)
    {
      struct autosa_array_ref_group *group = local->io_groups[n];
      data_pack_factors = extract_data_pack_factors(data_pack_factors, &n_factor, group);
    }
    /* Drain group */
    if (local->drain_group)
      data_pack_factors = extract_data_pack_factors(data_pack_factors, &n_factor, local->drain_group);

    if (local->is_sparse) {
      std::set<int> tmp_lanes;
      for (int n = 0; n < n_factor; n++) {
        tmp_lanes.insert(data_pack_factors[n] * kernel->n_nzero);
        tmp_lanes.insert(data_pack_factors[n]);
      }
      for (auto it = tmp_lanes.begin(); it != tmp_lanes.end(); ++it) {
        int f = *it;
        if (local->array->size * 8 * f > 1024) {
          printf("[AutoSA] Warning: The data width %d is greater than 1024-bit. The type definition is not generated.\n", local->array->size * 8 * f);
          continue;
        }
        if (f > 1) {
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "typedef vec_t<");
          p = isl_printer_print_str(p, local->array->type);
          p = isl_printer_print_str(p, ", ");
          p = isl_printer_print_int(p, f);
          p = isl_printer_print_str(p, "> ");
          p = isl_printer_print_str(p, local->array->name);
          p = isl_printer_print_str(p, "_t");
          p = isl_printer_print_int(p, f);
          p = isl_printer_print_str(p, ";");
          p = isl_printer_end_line(p);
        }
      }

      for (int n = 0; n < n_factor; n++) {
        if (data_pack_factors[n] * kernel->n_nzero * local->array->size * 8 > 1024)
          continue;
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "typedef struct ");
        p = isl_printer_print_str(p, local->array->name);
        p = isl_printer_print_str(p, "_s_t");
        p = isl_printer_print_int(p, data_pack_factors[n]);
        p = isl_printer_print_str(p, " {");
        p = isl_printer_end_line(p);

        p = isl_printer_indent(p, 2);

        p = isl_printer_start_line(p);
        if (data_pack_factors[n] == 1 && kernel->n_nzero == 1) {
          p = isl_printer_print_str(p, local->array->type);
        } else {
          p = isl_printer_print_str(p, local->array->name);
          p = isl_printer_print_str(p, "_t");
          p = isl_printer_print_int(p, data_pack_factors[n] * kernel->n_nzero);
        }
        p = isl_printer_print_str(p, " d;");
        p = isl_printer_end_line(p);

        p = isl_printer_start_line(p);
        if (data_pack_factors[n] == 1 && kernel->n_nzero == 1) {
          p = isl_printer_print_str(p, "unsigned char");
        } else {
          p = isl_printer_print_str(p, "tapa::vec_t<");
          p = isl_printer_print_int(p, 8 * data_pack_factors[n]);
          p = isl_printer_print_str(p, ">");
        }
        p = isl_printer_print_str(p, " i;");
        p = isl_printer_end_line(p);

        p = isl_printer_indent(p, -2);
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "} ");
        p = isl_printer_print_str(p, local->array->name);
        p = isl_printer_print_str(p, "_s_t");
        p = isl_printer_print_int(p, data_pack_factors[n]);
        p = isl_printer_print_str(p, ";");
        p = isl_printer_end_line(p);
      }
    } else {
      for (int n = 0; n < n_factor; n++)
      {
        if (data_pack_factors[n] != 1)
        {
          p = isl_printer_start_line(p);
          p = isl_printer_print_str(p, "typedef tapa::vec_t<");
          p = isl_printer_print_str(p, local->array->type);
          p = isl_printer_print_str(p, ", ");
          p = isl_printer_print_int(p, data_pack_factors[n]);
          p = isl_printer_print_str(p, "> ");
          p = isl_printer_print_str(p, local->array->name);
          p = isl_printer_print_str(p, "_t");
          p = isl_printer_print_int(p, data_pack_factors[n]);
          p = isl_printer_print_str(p, ";");
          p = isl_printer_end_line(p);
        }
      }
    }
    free(data_pack_factors);
  }
  p = print_str_new_line(p, "/* Data Type */");
  p = isl_printer_end_line(p);
  isl_printer_free(p);

  return isl_stat_ok;
}

static __isl_give isl_printer *declare_and_allocate_arrays(
    __isl_take isl_printer *p, struct autosa_prog *prog,
    struct autosa_kernel *kernel, struct autosa_hw_top_module *top)
{
  p = print_str_new_line(p, "// Allocate memory in host memory");
  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *local_array = &kernel->array[i];
    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    if (local_array->n_mem_ports > 1)
    {
      /* Create multiple host buffers. */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "std::vector<std::vector<");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, ", tapa::aligned_allocator<");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, ">>> ");
      p = isl_printer_print_str(p, "dev_");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->host_serialize)
        p = isl_printer_print_str(p, "_unserialized");
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "for (int i = 0; i < ");
      p = isl_printer_print_int(p, local_array->n_mem_ports);
      p = isl_printer_print_str(p, "; i++) {");
      p = isl_printer_end_line(p);
      p = isl_printer_indent(p, 2);

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "std::vector<");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, ", tapa::aligned_allocator<");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, ">> ");
      p = isl_printer_print_str(p, "dev_");
      p = isl_printer_print_str(p, local_array->array->name);
      p = isl_printer_print_str(p, "_tmp");
      p = isl_printer_print_str(p, "(");
      p = autosa_array_info_print_data_size(p, local_array->array);
      p = isl_printer_print_str(p, ");");
      p = isl_printer_end_line(p);

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "dev_");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->host_serialize)
        p = isl_printer_print_str(p, "_unserialized");
      p = isl_printer_print_str(p, ".push_back(dev_");
      p = isl_printer_print_str(p, local_array->array->name);
      p = isl_printer_print_str(p, "_tmp);");
      p = isl_printer_end_line(p);

      p = isl_printer_indent(p, -2);
      p = print_str_new_line(p, "}");

      if (local_array->host_serialize) {
        /* Allocate additional serialize buffer. */
        /* Create multiple host buffers. */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "std::vector<std::vector<");
        p = isl_printer_print_str(p, local_array->array->type);
        p = isl_printer_print_str(p, ", tapa::aligned_allocator<");
        p = isl_printer_print_str(p, local_array->array->type);
        p = isl_printer_print_str(p, ">>> ");
        p = isl_printer_print_str(p, "dev_");
        p = isl_printer_print_str(p, local_array->array->name);

        p = isl_printer_print_str(p, ";");
        p = isl_printer_end_line(p);

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int i = 0; i < ");
        p = isl_printer_print_int(p, local_array->n_mem_ports);
        p = isl_printer_print_str(p, "; i++) {");
        p = isl_printer_end_line(p);
        p = isl_printer_indent(p, 2);

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "std::vector<");
        p = isl_printer_print_str(p, local_array->array->type);
        p = isl_printer_print_str(p, ", tapa::aligned_allocator<");
        p = isl_printer_print_str(p, local_array->array->type);
        p = isl_printer_print_str(p, ">> ");
        p = isl_printer_print_str(p, "dev_");
        p = isl_printer_print_str(p, local_array->array->name);
        p = isl_printer_print_str(p, "_tmp");
        p = isl_printer_print_str(p, "(");
        p = isl_printer_print_pw_qpolynomial(p, local_array->serialize_bound);
        if (local_array->is_sparse) {
          p = isl_printer_print_str(p, " / ");
          p = isl_printer_print_double(p, (double)local_array->eff_compress_ratio);
        }
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "dev_");
        p = isl_printer_print_str(p, local_array->array->name);
        p = isl_printer_print_str(p, ".push_back(dev_");
        p = isl_printer_print_str(p, local_array->array->name);
        p = isl_printer_print_str(p, "_tmp);");
        p = isl_printer_end_line(p);

        p = isl_printer_indent(p, -2);
        p = print_str_new_line(p, "}");
      }
    }
    else
    {
      /* Create a single host buffer. */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "std::vector<");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, ", tapa::aligned_allocator<");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, ">> ");
      p = isl_printer_print_str(p, "dev_");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->host_serialize)
        p = isl_printer_print_str(p, "_unserialized");
      p = isl_printer_print_str(p, "(");
      p = autosa_array_info_print_data_size(p, local_array->array);
      p = isl_printer_print_str(p, ");");
      p = isl_printer_end_line(p);

      if (local_array->host_serialize) {
        /* Create a single host buffer. */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "std::vector<");
        p = isl_printer_print_str(p, local_array->array->type);
        p = isl_printer_print_str(p, ", tapa::aligned_allocator<");
        p = isl_printer_print_str(p, local_array->array->type);
        p = isl_printer_print_str(p, ">> ");
        p = isl_printer_print_str(p, "dev_");
        p = isl_printer_print_str(p, local_array->array->name);
        p = isl_printer_print_str(p, "(");
        p = isl_printer_print_pw_qpolynomial(p, local_array->serialize_bound);
        if (local_array->is_sparse) {
          p = isl_printer_print_str(p, " / ");
          p = isl_printer_print_double(p, (double)local_array->eff_compress_ratio);
        }
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);
      }
    }
  }
  p = isl_printer_end_line(p);

  /* Initialize buffer. */
  p = print_str_new_line(p, "// Initialize host buffers");

  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *local_array = &kernel->array[i];
    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    if (local_array->n_mem_ports > 1 && local_array->array->copy_in)
    {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "for (int i = 0; i < ");
      p = isl_printer_print_int(p, local_array->n_mem_ports);
      p = isl_printer_print_str(p, "; i++) {");
      p = isl_printer_end_line(p);
      p = isl_printer_indent(p, 2);

      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "std::copy(reinterpret_cast<");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, " *>(");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->is_sparse)
        p = isl_printer_print_str(p, "_s");
      p = isl_printer_print_str(p, "), reinterpret_cast<");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, " *>(");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->is_sparse)
        p = isl_printer_print_str(p, "_s");
      p = isl_printer_print_str(p, ") + ");
      p = autosa_array_info_print_data_size(p, local_array->array);
      p = isl_printer_print_str(p, ", dev_");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->host_serialize)
        p = isl_printer_print_str(p, "_unserialized");
      p = isl_printer_print_str(p, "[i]");
      p = isl_printer_print_str(p, ".begin());");
      p = isl_printer_end_line(p);

      p = isl_printer_indent(p, -2);
      p = print_str_new_line(p, "}");
    }
    else if (local_array->array->copy_in)
    {
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "std::copy(reinterpret_cast<");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, " *>(");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->is_sparse)
        p = isl_printer_print_str(p, "_s");
      p = isl_printer_print_str(p, "), reinterpret_cast<");
      p = isl_printer_print_str(p, local_array->array->type);
      p = isl_printer_print_str(p, " *>(");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->is_sparse)
        p = isl_printer_print_str(p, "_s");
      p = isl_printer_print_str(p, ") + ");
      p = autosa_array_info_print_data_size(p, local_array->array);
      p = isl_printer_print_str(p, ", dev_");
      p = isl_printer_print_str(p, local_array->array->name);
      if (local_array->host_serialize)
        p = isl_printer_print_str(p, "_unserialized");
      p = isl_printer_print_str(p, ".begin());");
      p = isl_printer_end_line(p);
    }
  }

  /* Perform data serialization if needed. */
  for (int i = 0; i < top->n_hw_modules; i++) {
    struct autosa_hw_module *module = top->hw_modules[i];
    if (module->serialize_tree && module->in) {
      struct autosa_array_ref_group *group = module->io_groups[0];
      struct autosa_local_array_info *local_array = group->local_array;
      if (local_array->n_mem_ports > 1 && local_array->array->copy_in)
      {
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "for (int i = 0; i < ");
        p = isl_printer_print_int(p, local_array->n_mem_ports);
        p = isl_printer_print_str(p, "; i++) {");
        p = isl_printer_end_line(p);
        p = isl_printer_indent(p, 2);

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, module->in? "host_serialize_" : "host_deserialize_");
        p = isl_printer_print_str(p, local_array->array->name);
        p = isl_printer_print_str(p, "(");
        p = print_host_serialize_arguments(p, kernel, group, module, 0, 0);  // TODO: add hbm support later.
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);

        p = isl_printer_indent(p, -2);
        p = print_str_new_line(p, "}");
      } else
      {
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, module->in? "host_serialize_" : "host_deserialize_");
        p = isl_printer_print_str(p, local_array->array->name);
        p = isl_printer_print_str(p, "(");
        p = print_host_serialize_arguments(p, kernel, group, module, 0, 0);
        p = isl_printer_print_str(p, ");");
        p = isl_printer_end_line(p);
      }
    }
  }
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "// Explicitly create TAPA mmap objects");
  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *local_array = &kernel->array[i];
    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "std::vector<");
    if (local_array->array->copy_in) {
      if (local_array->array->copy_out)
        p = isl_printer_print_str(p, "tapa::read_write_mmap<");
      else
        p = isl_printer_print_str(p, "tapa::read_only_mmap<");
    } else if (local_array->array->copy_out)
      p = isl_printer_print_str(p, "tapa::write_only_mmap<");
    else
      p = isl_printer_print_str(p, "tapa::placeholder_mmap<");
    p = isl_printer_print_str(p, local_array->array->type);
    p = isl_printer_print_str(p, ">> buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
  }

  p = print_str_new_line(p, "// Set the direction of the TAPA mmap objects");
  for (int i = 0; i < kernel->n_array; i++)
  {
    struct autosa_local_array_info *local_array = &kernel->array[i];
    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    //for (int j = 0; j < local_array->n_mem_ports; j++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "for (int i = 0; i < ");
    p = isl_printer_print_int(p, local_array->n_mem_ports);
    p = isl_printer_print_str(p, "; i++) {");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, 2);

    p = isl_printer_start_line(p);
    if (local_array->array->copy_in && local_array->array->copy_out) {
      p = isl_printer_print_str(p, "tapa::read_write_mmap<");
    } else {
      if (local_array->array->copy_in)
        p = isl_printer_print_str(p, "tapa::read_only_mmap<");
      else if (local_array->array->copy_out)
        p = isl_printer_print_str(p, "tapa::write_only_mmap<");
    }
    p = isl_printer_print_str(p, local_array->array->type);
    p = isl_printer_print_str(p, "> buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, "_tmp(");
    p = isl_printer_print_str(p, "dev_");
    p = isl_printer_print_str(p, local_array->array->name);
    if (local_array->n_mem_ports > 1) {
      p = isl_printer_print_str(p, "[i]");
    }
    p = isl_printer_print_str(p, ");");
    p = isl_printer_end_line(p);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, ".push_back(std::move(buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, "_tmp));");
    p = isl_printer_end_line(p);

    p = isl_printer_indent(p, -2);
    p = print_str_new_line(p, "}");
  }
  p = isl_printer_end_line(p);

  return p;
}


/* Emit the host code that prepares the device before the transformed code
 * runs: first the declarations of the local variables of "prog", then the
 * declaration and allocation of the array buffers of "kernel".
 * "hls" is accepted but not used.
 */
static __isl_give isl_printer *init_device_tapa(__isl_take isl_printer *p,
                                                struct autosa_prog *prog,
                                                struct autosa_kernel *kernel,
                                                int hls,
                                                struct autosa_hw_top_module *top)
{
  isl_printer *out = autosa_print_local_declarations(p, prog);

  return declare_and_allocate_arrays(out, prog, kernel, top);
}

/* Emit the host code that runs after the transformed code has executed.
 *
 * For every output module of "top" that has a serialize tree, a call to
 * "host_deserialize_<array>" is printed.  Afterwards, for every array of
 * "prog" that requires device allocation and is copied out, a std::copy
 * from the host buffer back into the user array is printed.
 * "kernel" and "hls" are accepted but not used; the kernel passed to the
 * serialize-argument printer is taken from "top".
 */
static __isl_give isl_printer *clear_device_tapa(__isl_take isl_printer *p,
                                                   struct autosa_prog *prog,
                                                   struct autosa_kernel *kernel,
                                                   int hls,
                                                   struct autosa_hw_top_module *top)
{
  /* Text preceding the buffer name for the begin and the end iterator. */
  static const char *const range_lead[2] = {"std::copy(dev_", ".begin(), dev_"};
  int m, a, side;

  /* Deserialize the buffer data if necessary. */
  for (m = 0; m < top->n_hw_modules; ++m) {
    struct autosa_hw_module *hw_module = top->hw_modules[m];
    struct autosa_array_ref_group *io_group;

    if (!hw_module->serialize_tree || hw_module->in)
      continue;

    io_group = hw_module->io_groups[0];
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "host_deserialize_");
    p = isl_printer_print_str(p, io_group->local_array->array->name);
    p = isl_printer_print_str(p, "(");
    p = print_host_serialize_arguments(p, top->kernel, io_group, hw_module, 0, 0);  // TODO: add hbm support later.
    p = isl_printer_print_str(p, ");");
    p = isl_printer_end_line(p);
  }

  /* Restore buffer */
  p = print_str_new_line(p, "// Restore data from host buffers");
  for (a = 0; a < prog->n_array; ++a) {
    struct autosa_array_info *info = &prog->array[a];

    if (!autosa_array_requires_device_allocation(info) || !info->copy_out)
      continue;

    p = isl_printer_start_line(p);
    /* The source buffer expression is printed twice: once for .begin()
     * and once for .end().
     */
    for (side = 0; side < 2; ++side) {
      p = isl_printer_print_str(p, range_lead[side]);
      p = isl_printer_print_str(p, info->name);
      if (info->local_array->host_serialize)
        p = isl_printer_print_str(p, "_unserialized");
      /* With several memory ports only the first buffer is read back. */
      if (info->local_array->n_mem_ports > 1)
        p = isl_printer_print_str(p, "[0]");
    }
    p = isl_printer_print_str(p, ".end(), reinterpret_cast<");
    p = isl_printer_print_str(p, info->type);
    p = isl_printer_print_str(p, " *>(");
    p = isl_printer_print_str(p, info->name);
    p = isl_printer_print_str(p, "));");
    p = isl_printer_end_line(p);
  }

  return p;
}

/* Emit the host loop that merges the drained results of "func".
 *
 * The generated loop variable "idx" runs over the memory ports of the
 * group of "func", from mem_port_id up to (but excluding)
 * mem_port_id + n_mem_ports, and each iteration calls the group's
 * "<prefix>_drain_merge" function.  "prog" is not used.
 */
static __isl_give isl_printer *drain_merge_tapa(
    __isl_take isl_printer *p, struct autosa_prog *prog,
    struct autosa_drain_merge_func *func,
    int hls)
{
  struct autosa_array_ref_group *io_group = func->group;

  p = print_str_new_line(p, "// Merge results");

  /* Loop header. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int idx = ");
  p = isl_printer_print_int(p, io_group->mem_port_id);
  p = isl_printer_print_str(p, "; idx < ");
  p = isl_printer_print_int(p, io_group->mem_port_id + io_group->n_mem_ports);
  p = isl_printer_print_str(p, "; idx++) {");
  p = isl_printer_end_line(p);

  /* Loop body: a single call to the merge function. */
  p = isl_printer_indent(p, 2);
  p = isl_printer_start_line(p);
  p = autosa_array_ref_group_print_prefix(io_group, p);
  p = isl_printer_print_str(p, "_drain_merge(");
  p = print_drain_merge_arguments(p, func->kernel, io_group, func, 0, hls);
  p = isl_printer_print_str(p, ");");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, -2);

  /* Closing brace followed by an end-of-line call. */
  p = print_str_new_line(p, "}");
  return isl_printer_end_line(p);
}

/* Print a host statement that has no annotation: a device initialization,
 * a device clean-up, a drain merge or an array copy node.
 *
 * The statement is identified by the name of the identifier that forms the
 * first argument of the user expression of "node":
 * - "init_device": the user pointer is the autosa_kernel; init_device_tapa
 *   is called.
 * - "clear_device": the user pointer is the autosa_kernel; clear_device_tapa
 *   is called.
 * - "drain_merge": the user pointer is the autosa_drain_merge_func;
 *   drain_merge_tapa is called.
 * - any other name: the user pointer is treated as the autosa_array_info of
 *   the array involved.  Nothing is printed for such a node.
 *
 * The printer is freed and NULL is returned if the identifier has no name,
 * or if the name is not one of the three above and the user pointer is NULL.
 */
static __isl_give isl_printer *print_device_node_tapa(__isl_take isl_printer *p,
                                                        __isl_keep isl_ast_node *node,
                                                        struct autosa_prog *prog,
                                                        int hls,
                                                        struct autosa_hw_top_module *top)
{
  isl_ast_expr *expr, *arg;
  isl_id *id;
  const char *name;
  void *user;

  expr = isl_ast_node_user_get_expr(node);
  arg = isl_ast_expr_get_op_arg(expr, 0);
  id = isl_ast_expr_get_id(arg);
  /* Read both fields before dropping our reference to the id.  The name is
   * only inspected after it has been checked for NULL below.
   */
  name = isl_id_get_name(id);
  user = isl_id_get_user(id);
  isl_id_free(id);
  isl_ast_expr_free(arg);
  isl_ast_expr_free(expr);

  if (!name)
    return isl_printer_free(p);
  if (!strcmp(name, "init_device"))
    return init_device_tapa(p, prog, (struct autosa_kernel *)user, hls, top);
  if (!strcmp(name, "clear_device"))
    return clear_device_tapa(p, prog, (struct autosa_kernel *)user, hls, top);
  if (!strcmp(name, "drain_merge"))
    return drain_merge_tapa(p, prog, (struct autosa_drain_merge_func *)user, hls);
  /* Remaining nodes carry an array; a missing array is an error. */
  if (!user)
    return isl_printer_free(p);

  return p;
}

/* Print the header of the given kernel to both gen->hls.kernel_h
 * and gen->hls.kernel_c.
 */
static void print_kernel_headers_tapa(struct autosa_prog *prog,
                                        struct autosa_kernel *kernel, struct hls_info *hls)
{
  isl_printer *p;

  p = isl_printer_to_file(prog->ctx, hls->kernel_h);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  p = print_kernel_header(p, prog, kernel, hls, 1);
  p = isl_printer_print_str(p, ";");
  p = isl_printer_end_line(p);

  isl_printer_free(p);
}

/* Print a user node of the host code to "p".
 *
 * "user" points to a print_host_user_data structure.
 *
 * A node without annotation is handed to print_device_node_tapa.
 * A node whose annotation is called "user" is an original user statement;
 * its user pointer is an autosa_kernel_stmt that is printed through
 * autosa_kernel_print_domain.
 * Any other annotation marks a kernel launch; its user pointer is the
 * autosa_kernel.  In that case a "tapa::task().invoke(kernel0, ...)"
 * statement is printed to "p" and the kernel prototype is written to the
 * kernel header file.
 *
 * "print_options" is freed without being used.
 */
static __isl_give isl_printer *print_host_user_tapa(__isl_take isl_printer *p,
                                                      __isl_take isl_ast_print_options *print_options,
                                                      __isl_keep isl_ast_node *node, void *user)
{
  struct print_host_user_data *data = (struct print_host_user_data *)user;
  struct hls_info *hls;
  struct autosa_hw_top_module *top;
  struct autosa_kernel *kernel;
  isl_id *annotation;
  void *payload;
  int is_user;

  isl_ast_print_options_free(print_options);

  hls = data->hls;
  top = data->top;

  annotation = isl_ast_node_get_annotation(node);
  if (!annotation)
    return print_device_node_tapa(p, node, data->prog, hls->hls, top);

  is_user = strcmp(isl_id_get_name(annotation), "user") == 0;
  payload = isl_id_get_user(annotation);
  isl_id_free(annotation);

  if (is_user)
    return autosa_kernel_print_domain(p, (struct autosa_kernel_stmt *)payload);

  kernel = (struct autosa_kernel *)payload;

  p = print_str_new_line(p, "// Launch the kernel");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "tapa::task().invoke(kernel0, ");
  p = print_kernel_arguments(p, data->prog, kernel, 0, hls);
  p = isl_printer_print_str(p, ");");
  p = isl_printer_end_line(p);

  /* Print the top kernel header. */
  print_kernel_headers_tapa(data->prog, kernel, data->hls);

  return p;
}

/* Print the signature of the given module, without a trailing ";" or
 * end of line.
 *
 * If the module has instance ids and the use_cplusplus_template option is
 * set, a "template<int p0, int p1, ...>" line with one parameter per
 * instance id is printed first.
 * The function name is the module name, followed by "_intra_trans" when
 * "inter" is 0 or "_inter_trans" when "inter" is 1, and by "_boundary"
 * when "boundary" is set.
 */
static __isl_give isl_printer *print_module_header_tapa(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_hw_module *module,
    int inter, int boundary)
{
  const int n_inst_ids = isl_id_list_n_id(module->inst_ids);

  if (n_inst_ids > 0 && prog->scop->options->autosa->use_cplusplus_template) {
    /* Print the index template */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "template<");
    for (int k = 0; k < n_inst_ids; ++k) {
      if (k > 0)
        p = isl_printer_print_str(p, ", ");
      p = isl_printer_print_str(p, "int p");
      p = isl_printer_print_int(p, k);
    }
    p = isl_printer_print_str(p, ">");
    p = isl_printer_end_line(p);
  }

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "void ");
  p = isl_printer_print_str(p, module->name);
  switch (inter) {
  case 0:
    p = isl_printer_print_str(p, "_intra_trans");
    break;
  case 1:
    p = isl_printer_print_str(p, "_inter_trans");
    break;
  default:
    break;
  }
  if (boundary)
    p = isl_printer_print_str(p, "_boundary");

  p = isl_printer_print_str(p, "(");
  p = print_module_arguments(p, prog, module->kernel, module, 1, TAPA_HW, inter, -1, boundary, 0);
  return isl_printer_print_str(p, ")");
}

/* Print the signature of the given module to hls->kernel_h and then to
 * hls->kernel_c.  In the header file the signature is terminated by ";"
 * and an end of line; in the source file it is left open so that the
 * caller can append the body.
 * If "inter" is -1, this is a normal module call.
 * If "inter" is 0, this is a intra_trans module call.
 * If "inter" is 1, this is a inter_trans module call.
 */
static isl_stat print_module_headers_tapa(
    struct autosa_prog *prog, struct autosa_hw_module *module,
    struct hls_info *hls, int inter, int boundary)
{
  /* Pass 0 writes the declaration, pass 1 the start of the definition. */
  for (int to_source = 0; to_source < 2; ++to_source) {
    isl_printer *out;

    out = isl_printer_to_file(prog->ctx,
                              to_source ? hls->kernel_c : hls->kernel_h);
    out = isl_printer_set_output_format(out, ISL_FORMAT_C);
    out = print_module_header_tapa(out, prog, module, inter, boundary);
    if (!to_source) {
      out = isl_printer_print_str(out, ";");
      out = isl_printer_end_line(out);
    }
    isl_printer_free(out);
  }

  return isl_stat_ok;
}

/* Print the HLS pragmas that accompany the declaration of the local buffer
 * "<var name><suffix>".  "suffix" may be NULL.
 *
 * "use_memory" selects the memory resource
 * (0: FF 1: LUTRAM 2: BRAM 3: URAM).
 */
static __isl_give isl_printer *tapa_print_var_pragmas(
    __isl_take isl_printer *p, struct autosa_kernel_var *var,
    struct autosa_hw_module *module, int use_memory, const char *suffix)
{
  const char *core;
  int single_port;

  if (use_memory && var->n_part != 1) {
    /* Cyclic partitioning along the last dimension. */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "#pragma HLS ARRAY_PARTITION variable=");
    p = isl_printer_print_str(p, var->name);
    if (suffix)
      p = isl_printer_print_str(p, suffix);
    p = isl_printer_print_str(p, " dim=");
    p = isl_printer_print_int(p, isl_vec_size(var->size));
    p = isl_printer_print_str(p, " factor=");
    p = isl_printer_print_int(p, var->n_part);
    p = isl_printer_print_str(p, " cyclic");
    p = isl_printer_end_line(p);
  } else if (use_memory == 0) {
    /* Complete partitioning along all dimensions. */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "#pragma HLS ARRAY_PARTITION variable=");
    p = isl_printer_print_str(p, var->name);
    if (suffix)
      p = isl_printer_print_str(p, suffix);
    p = isl_printer_print_str(p, " dim=0 complete");
    p = isl_printer_end_line(p);
  }

  if (!use_memory)
    return p;

  /* The 1P cores are chosen for I/O modules whose inter and intra data
   * pack factors are equal; every other case gets the 2P cores.
   */
  single_port = module->type == IO_MODULE &&
                module->data_pack_inter == module->data_pack_intra;
  if (use_memory == 1)
    core = single_port ? " core=RAM_1P_LUTRAM" : " core=RAM_2P_LUTRAM";
  else if (use_memory == 2)
    core = single_port ? " core=RAM_1P_BRAM" : " core=RAM_2P_BRAM";
  else
    core = single_port ? " core=RAM_1P_URAM" : " core=RAM_2P_URAM";

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "#pragma HLS RESOURCE variable=");
  p = isl_printer_print_str(p, var->name);
  if (suffix)
    p = isl_printer_print_str(p, suffix);
  p = isl_printer_print_str(p, core);
  p = isl_printer_end_line(p);

  if (var->array->local_array->is_sparse) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "#pragma HLS DATA_PACK variable=");
    p = isl_printer_print_str(p, var->name);
    if (suffix)
      p = isl_printer_print_str(p, suffix);
    p = isl_printer_end_line(p);
  }

  return p;
}

/* Finish a buffer declaration whose element type has already been printed
 * on the current line: print " <var name><suffix>[d0][d1]...;" followed by
 * the pragmas of the buffer.  "suffix" may be NULL.
 */
static __isl_give isl_printer *tapa_print_var_decl_tail(
    __isl_take isl_printer *p, struct autosa_kernel_var *var,
    struct autosa_hw_module *module, int use_memory, const char *suffix)
{
  p = isl_printer_print_str(p, " ");
  p = isl_printer_print_str(p, var->name);
  if (suffix)
    p = isl_printer_print_str(p, suffix);
  for (int d = 0; d < isl_vec_size(var->size); ++d) {
    isl_val *extent = isl_vec_get_element_val(var->size, d);

    p = isl_printer_print_str(p, "[");
    p = isl_printer_print_val(p, extent);
    isl_val_free(extent);
    p = isl_printer_print_str(p, "]");
  }
  p = isl_printer_print_str(p, ";");
  p = isl_printer_end_line(p);

  return tapa_print_var_pragmas(p, var, module, use_memory, suffix);
}

/* Print the declaration of the local variable "var" of "module", together
 * with its HLS pragmas.
 * The local variable can be mapped to different memory resources:
 * FF, LUTRAM, BRAM, URAM.
 *
 * If "double_buffer" is set, two buffers are declared, named
 * "<var>_ping" and "<var>_pong"; otherwise a single buffer "<var>" is
 * declared.
 */
static __isl_give isl_printer *print_module_var_tapa(
    __isl_take isl_printer *p,
    struct autosa_kernel_var *var, int double_buffer,
    struct autosa_hw_module *module)
{
  /* 0: FF 1: LUTRAM 2: BRAM 3: URAM */
  const int use_memory =
      extract_memory_type(module, var, module->options->autosa->uram);

  /* First (or only) buffer: the element type is always a named typedef. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, var->array->name);
  if (var->array->local_array->is_sparse && module->type != PE_MODULE)
    p = isl_printer_print_str(p, "_s_t");
  else
    p = isl_printer_print_str(p, "_t");
  p = isl_printer_print_int(p, var->n_lane);
  p = tapa_print_var_decl_tail(p, var, module, use_memory,
                               double_buffer ? "_ping" : NULL);

  if (!double_buffer)
    return p;

  /* Print pong buffer */
  p = isl_printer_start_line(p);
  if (!var->array->local_array->is_sparse && var->n_lane == 1) {
    /* A single dense lane uses the plain element type. */
    p = isl_printer_print_str(p, var->array->type);
  } else {
    p = isl_printer_print_str(p, var->array->name);
    if (var->array->local_array->is_sparse)
      p = isl_printer_print_str(p, "_s_t");
    else
      p = isl_printer_print_str(p, "_t");
    p = isl_printer_print_int(p, var->n_lane);
  }

  return tapa_print_var_decl_tail(p, var, module, use_memory, "_pong");
}

/* Print the local declarations of "module".
 *
 * Nothing is printed unless "inter" is -1.  In that case every entry of
 * module->var is declared through print_module_var_tapa().  For a double
 * buffered module, the flags "arb", "inter_trans_en" and "intra_trans_en"
 * are declared next (the initial value of the two enable flags depends on
 * module->in), followed by one "<type> <name>, <name>_prev;" line for each
 * set dimension of module->intra_space (module->in set) or
 * module->inter_space (module->in not set).
 */
static __isl_give isl_printer *print_module_vars_tapa(__isl_take isl_printer *p,
                                                        struct autosa_hw_module *module, int inter)
{
  if (inter != -1)
    return p;

  for (int v = 0; v < module->n_var; ++v)
    p = print_module_var_tapa(p, &module->var[v], module->double_buffer, module);

  if (!module->double_buffer)
    return p;

  const char *iter_type = isl_options_get_ast_iterator_type(module->kernel->ctx);
  const char *flag_decls[3] = {
      "bool arb = 0;",
      module->in ? "bool inter_trans_en = 1;" : "bool inter_trans_en = 0;",
      module->in ? "bool intra_trans_en = 0;" : "bool intra_trans_en = 1;"};

  for (int k = 0; k < 3; ++k)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, flag_decls[k]);
    p = isl_printer_end_line(p);
  }

  /* iterators */
  isl_space *iter_space = module->in ? module->intra_space : module->inter_space;
  int n_dim = isl_space_dim(iter_space, isl_dim_set);
  for (int d = 0; d < n_dim; ++d)
  {
    const char *dim_name = isl_space_get_dim_name(iter_space, isl_dim_set, d);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, iter_type);
    p = isl_printer_print_str(p, " ");
    p = isl_printer_print_str(p, dim_name);
    p = isl_printer_print_str(p, ", ");
    p = isl_printer_print_str(p, dim_name);
    p = isl_printer_print_str(p, "_prev;");
    p = isl_printer_end_line(p);
  }

  return p;
}

/* Print the for loop "node", preceded by a line holding the
 * "#pragma HLS PIPELINE II=1" directive.
 */
static __isl_give isl_printer *print_for_with_pipeline(
    __isl_keep isl_ast_node *node, __isl_take isl_printer *p,
    __isl_take isl_ast_print_options *print_options)
{
  static const char pragma_text[] = "#pragma HLS PIPELINE II=1";

  p = isl_printer_end_line(
      isl_printer_print_str(isl_printer_start_line(p), pragma_text));

  return isl_ast_node_for_print(node, p, print_options);
}

/* Print the for loop "node", preceded by a line holding the
 * "#pragma HLS UNROLL" directive.
 */
static __isl_give isl_printer *print_for_with_unroll(
    __isl_keep isl_ast_node *node, __isl_take isl_printer *p,
    __isl_take isl_ast_print_options *print_options)
{
  static const char pragma_text[] = "#pragma HLS UNROLL";

  p = isl_printer_end_line(
      isl_printer_print_str(isl_printer_start_line(p), pragma_text));

  return isl_ast_node_for_print(node, p, print_options);
}

/* Print callback for for nodes.
 *
 * The annotation of "node", if any, is expected to carry a
 * struct autosa_ast_node_userinfo.  If it is marked is_pipeline, the loop
 * is printed with a pipeline pragma; otherwise, if it is marked is_unroll,
 * with an unroll pragma; otherwise the loop is printed unchanged.
 * "user" is not used.
 */
static __isl_give isl_printer *print_for_tapa(__isl_take isl_printer *p,
                                                __isl_take isl_ast_print_options *print_options,
                                                __isl_keep isl_ast_node *node, void *user)
{
  bool want_pipeline = false;
  bool want_unroll = false;
  isl_id *annotation = isl_ast_node_get_annotation(node);

  if (annotation)
  {
    struct autosa_ast_node_userinfo *node_info =
        (struct autosa_ast_node_userinfo *)isl_id_get_user(annotation);

    if (node_info)
    {
      want_pipeline = node_info->is_pipeline ? true : false;
      want_unroll = node_info->is_unroll ? true : false;
    }
  }

  /* Pipelining takes precedence over unrolling. */
  if (want_pipeline)
    p = print_for_with_pipeline(node, p, print_options);
  else if (want_unroll)
    p = print_for_with_unroll(node, p, print_options);
  else
    p = isl_ast_node_for_print(node, p, print_options);

  isl_id_free(annotation);

  return p;
}

/* Print the intra_trans module.
 *
 * Nothing is printed if the module has no intra_tree.  Otherwise the
 * function header is emitted by print_module_headers_tapa() and the body
 * is printed from module->intra_tree, the whole definition being
 * bracketed by "Module Definition" marker comments.
 */
static __isl_give isl_printer *autosa_print_intra_trans_module(
    __isl_take isl_printer *p,
    struct autosa_hw_module *module, struct autosa_prog *prog,
    struct hls_info *hls, int boundary)
{
  if (!module->intra_tree)
    return p;

  isl_ctx *ctx = isl_printer_get_ctx(p);
  struct print_hw_module_data stmt_data = {hls, prog, module, NULL};
  isl_ast_print_options *options;

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  print_module_headers_tapa(prog, module, hls, 0, boundary);
  fprintf(hls->kernel_c, " {\n");
  /* Without double buffering the module is inlined to reduce the overheads.
   * A double buffered module is never inlined, since that might cause
   * deadlocks.
   */
  fprintf(hls->kernel_c, "%s",
          module->double_buffer ? "#pragma HLS INLINE OFF\n"
                                : "#pragma HLS INLINE\n");

  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  if (!prog->scop->options->autosa->use_cplusplus_template)
    p = print_module_iterators(p, hls->kernel_c, module);
  p = print_module_vars_tapa(p, module, 0);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  if (module->double_buffer)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "if (!intra_trans_en) return;");
    p = isl_printer_end_line(p);
    p = isl_printer_end_line(p);
  }

  /* For local reduce, print the buffer initialization. */
  for (int v = 0; v < module->n_var; ++v)
  {
    struct autosa_kernel_var *var = &module->var[v];

    if (!var->init_required)
      continue;
    p = autosa_print_var_initialization(p, var, hls->target);
  }
  p = isl_printer_end_line(p);

  options = isl_ast_print_options_alloc(ctx);
  options = isl_ast_print_options_set_print_user(options,
                                                 &print_module_stmt, &stmt_data);
  options = isl_ast_print_options_set_print_for(options,
                                                &print_for_tapa, &stmt_data);

  p = isl_ast_node_print(module->intra_tree, p, options);
  p = isl_printer_indent(p, -2);

  fprintf(hls->kernel_c, "}\n");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = isl_printer_end_line(p);

  return p;
}

/* Print the inter_trans module.
 *
 * The body is printed from module->boundary_inter_tree when "boundary" is
 * set and from module->inter_tree otherwise; nothing is printed if the
 * selected tree is missing.  The function header is emitted by
 * print_module_headers_tapa().
 */
static __isl_give isl_printer *autosa_print_inter_trans_module(
    __isl_take isl_printer *p,
    struct autosa_hw_module *module, struct autosa_prog *prog,
    struct hls_info *hls, int boundary)
{
  isl_ast_node *tree = boundary ? module->boundary_inter_tree : module->inter_tree;

  if (!tree)
    return p;

  isl_ctx *ctx = isl_printer_get_ctx(p);
  struct print_hw_module_data stmt_data = {hls, prog, module, NULL};
  isl_ast_print_options *options;

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  print_module_headers_tapa(prog, module, hls, 1, boundary);
  fprintf(hls->kernel_c, " {\n");
  /* Only modules without double buffering are inlined. */
  fprintf(hls->kernel_c, "%s",
          module->double_buffer ? "#pragma HLS INLINE OFF\n"
                                : "#pragma HLS INLINE\n");

  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  if (!prog->scop->options->autosa->use_cplusplus_template)
    p = print_module_iterators(p, hls->kernel_c, module);
  p = print_module_vars_tapa(p, module, 1);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  if (module->double_buffer)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "if (!inter_trans_en) return;");
    p = isl_printer_end_line(p);
    p = isl_printer_end_line(p);
  }

  options = isl_ast_print_options_alloc(ctx);
  options = isl_ast_print_options_set_print_user(options,
                                                 &print_module_stmt, &stmt_data);
  options = isl_ast_print_options_set_print_for(options,
                                                &print_for_tapa, &stmt_data);

  p = isl_ast_node_print(tree, p, options);
  p = isl_printer_indent(p, -2);

  fprintf(hls->kernel_c, "}\n");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = isl_printer_end_line(p);

  return p;
}

/* Print either the signature ("types" set) or a call ("types" not set) of
 * the core function of "module".
 *
 * The function name is module->name followed by "_intra_trans" ("inter" is
 * 0) or "_inter_trans" ("inter" is 1), then "_boundary" and "_serialize"
 * if the corresponding flags are set.
 * If the module has instance ids and the use_cplusplus_template option is
 * enabled, a signature is preceded by a "template<int p0, ...>" line and a
 * call carries the matching "<p0, ...>" argument list.
 * For a call, the arguments start on a new line with an extra indentation
 * of 2 that is removed again before returning.  The line holding the
 * closing parenthesis is not ended here.
 */
static __isl_give isl_printer *print_module_core_header_tapa(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_hw_module *module,
    int inter, int boundary, int serialize, int types)
{
  const int n_inst = isl_id_list_n_id(module->inst_ids);
  const bool use_template =
      n_inst > 0 && prog->scop->options->autosa->use_cplusplus_template;

  if (use_template && types) {
    /* Print the template */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "template<");
    for (int k = 0; k < n_inst; ++k) {
      p = isl_printer_print_str(p, k == 0 ? "int p" : ", int p");
      p = isl_printer_print_int(p, k);
    }
    p = isl_printer_print_str(p, ">");
    p = isl_printer_end_line(p);
  }

  p = isl_printer_start_line(p);
  if (types)
    p = isl_printer_print_str(p, "void ");
  p = isl_printer_print_str(p, module->name);

  switch (inter) {
  case 0:
    p = isl_printer_print_str(p, "_intra_trans");
    break;
  case 1:
    p = isl_printer_print_str(p, "_inter_trans");
    break;
  default:
    break;
  }
  if (boundary)
    p = isl_printer_print_str(p, "_boundary");
  if (serialize)
    p = isl_printer_print_str(p, "_serialize");

  if (use_template && !types) {
    p = isl_printer_print_str(p, "<");
    for (int k = 0; k < n_inst; ++k) {
      p = isl_printer_print_str(p, k == 0 ? "p" : ", p");
      p = isl_printer_print_int(p, k);
    }
    p = isl_printer_print_str(p, ">");
  }

  p = isl_printer_print_str(p, "(");
  if (!types) {
    /* Call site: put the arguments on their own, indented line. */
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, 2);
    p = isl_printer_start_line(p);
  }
  p = print_module_arguments(p, prog, module->kernel, module, types,
                             TAPA_HW, inter, -1, boundary, serialize);
  p = isl_printer_print_str(p, ")");
  if (!types)
    p = isl_printer_indent(p, -2);

  return p;
}

/* Print the core function header of "module" to "p".
 * This simply forwards to print_module_core_header_tapa(); "hls" is not
 * used.
 */
static __isl_give isl_printer *print_module_core_headers_tapa(
    __isl_take isl_printer *p, struct autosa_prog *prog,
    struct autosa_hw_module *module, struct hls_info *hls,
    int inter, int boundary, int serialize, int types)
{
  return print_module_core_header_tapa(p, prog, module, inter, boundary,
                                       serialize, types);
}

/* Print the signature of the wrapper function of "module".
 *
 * The function name is module->name followed by "_intra_trans" ("inter" is
 * 0) or "_inter_trans" ("inter" is 1), "_boundary" if "boundary" is set,
 * and finally "_wrapper".  If the module has instance ids and the
 * use_cplusplus_template option is enabled, the signature is preceded by a
 * "template<int p0, ...>" line.  The line holding the closing parenthesis
 * is not ended here.
 */
static __isl_give isl_printer *print_module_wrapper_header_tapa(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_hw_module *module,
    int inter, int boundary)
{
  const int n_inst = isl_id_list_n_id(module->inst_ids);

  if (n_inst > 0 && prog->scop->options->autosa->use_cplusplus_template) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "template<");
    for (int k = 0; k < n_inst; ++k) {
      p = isl_printer_print_str(p, k == 0 ? "int p" : ", int p");
      p = isl_printer_print_int(p, k);
    }
    p = isl_printer_print_str(p, ">");
    p = isl_printer_end_line(p);
  }

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "void ");
  p = isl_printer_print_str(p, module->name);

  switch (inter) {
  case 0:
    p = isl_printer_print_str(p, "_intra_trans");
    break;
  case 1:
    p = isl_printer_print_str(p, "_inter_trans");
    break;
  default:
    break;
  }
  if (boundary)
    p = isl_printer_print_str(p, "_boundary");

  p = isl_printer_print_str(p, "_wrapper(");
  p = print_module_arguments(p, prog, module->kernel, module, 1,
                             TAPA_HW, inter, -1, boundary, 0);
  p = isl_printer_print_str(p, ")");

  return p;
}

/* Print the wrapper function signature of "module" twice: first as a
 * prototype (terminated by ";") to hls->kernel_h and then as the start of
 * the definition to hls->kernel_c.  Each output uses its own temporary
 * printer.  Always returns isl_stat_ok.
 */
static isl_stat print_module_wrapper_headers_tapa(
    struct autosa_prog *prog, struct autosa_hw_module *module,
    struct hls_info *hls, int inter, int boundary)
{
  FILE *const outputs[2] = {hls->kernel_h, hls->kernel_c};

  for (int k = 0; k < 2; ++k) {
    const bool is_prototype = (k == 0);
    isl_printer *out = isl_printer_to_file(prog->ctx, outputs[k]);

    out = isl_printer_set_output_format(out, ISL_FORMAT_C);
    out = print_module_wrapper_header_tapa(out, prog, module, inter, boundary);
    if (is_prototype)
      out = isl_printer_print_str(out, ";");
    out = isl_printer_end_line(out);
    isl_printer_free(out);
  }

  return isl_stat_ok;
}

/* Print the serialization module that connects the external memory to the
 * top-level I/O module.
 *
 * The function signature is printed through
 * print_module_core_headers_tapa() with the "serialize" flag set, so the
 * function name ends in "_serialize".  The body consists of the module
 * iterators (unless the use_cplusplus_template option is enabled) followed
 * by the output of print_module_serialize_body().  The definition is
 * bracketed by "Module Definition" marker comments.
 */
static __isl_give isl_printer *autosa_print_serialize_module(
  __isl_take isl_printer *p,
  struct autosa_hw_module *module, struct autosa_prog *prog,
  struct hls_info *hls, int boundary)
{
  /* Print core. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = print_module_core_headers_tapa(p, prog, module, hls, -1, boundary, 1, 1); // TODO
  fprintf(hls->kernel_c, " {\n");
  fprintf(hls->kernel_c, "#pragma HLS INLINE OFF\n");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  if (!prog->scop->options->autosa->use_cplusplus_template) {
    p = print_module_iterators(p, hls->kernel_c, module);
  }
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  p = print_module_serialize_body(p, module, hls);
  p = isl_printer_indent(p, -2);
  fprintf(hls->kernel_c, "}\n");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = isl_printer_end_line(p);
  return p;
}

/* Print the default module.
 *
 * The core function is printed from module->device_tree, or from
 * module->boundary_tree when "boundary" is set.  Nothing is printed if the
 * selected tree is missing.
 * For PE modules and for non-PE modules with level 1, we additionally print
 * a wrapper function, which only calls the core function, to speed up the
 * HLS synthesis.
 * For the rest of the modules, wrapper is disabled.
 * If module->to_mem is set and the host_serialize option is enabled, the
 * serialize module is printed last.
 */
static __isl_give isl_printer *autosa_print_default_module(
  __isl_take isl_printer *p,
  struct autosa_hw_module *module, struct autosa_prog *prog,
  struct hls_info *hls, int boundary)
{
  /* Bail out if there is no tree to print for the requested variant. */
  if (!boundary) {
    if (!module->device_tree)
      return p;
  } else {
    if (!module->boundary_tree)
      return p;
  }

  bool wrapper = 0;
  struct print_hw_module_data hw_data = {hls, prog, module, NULL};
  isl_ast_print_options *print_options;
  isl_ctx *ctx = isl_printer_get_ctx(p);

  /* Print wrapper for PE and L1 IO module */
  if (module->type == PE_MODULE || (module->type != PE_MODULE && module->level == 1))
    wrapper = 1;

  /* Print core. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  /* Signature of the core function (with types, no serialize suffix). */
  p = print_module_core_headers_tapa(p, prog, module, hls, -1, boundary, 0, 1);
  fprintf(hls->kernel_c, " {\n");
  /* The core is marked INLINE only for a boundary module that also gets a
   * wrapper; in every other case it is marked INLINE OFF.
   */
  if (!boundary || !wrapper)
    fprintf(hls->kernel_c, "#pragma HLS INLINE OFF\n");
  else
    fprintf(hls->kernel_c, "#pragma HLS INLINE\n");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  if (!prog->scop->options->autosa->use_cplusplus_template) {
    p = print_module_iterators(p, hls->kernel_c, module);
  }
  /* With block sparsity enabled, declare one "fifo_data_<array>" variable
   * for each I/O group whose local array is of type AUTOSA_EXT_ARRAY.
   * Its type is "<array>_s_t<n_lane>" for sparse arrays and
   * "<array>_t<n_lane>" otherwise.
   */
  if (prog->scop->options->autosa->block_sparse) {
    for (int i = 0; i < module->n_io_group; i++) {
      struct autosa_array_ref_group *group = module->io_groups[i];
      if (group->local_array->array_type == AUTOSA_EXT_ARRAY) {
        int n_lane = get_io_group_n_lane(module, NULL, group);
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, group->array->name);
        if (group->local_array->is_sparse)
          p = isl_printer_print_str(p, "_s_t");
        else
          p = isl_printer_print_str(p, "_t");
        p = isl_printer_print_int(p, n_lane);
        p = isl_printer_print_str(p, " fifo_data_");
        p = isl_printer_print_str(p, group->array->name);
        p = isl_printer_print_str(p, ";");
        p = isl_printer_end_line(p);
      }
    }
  }
  /* inter == -1: print the full set of local variable declarations. */
  p = print_module_vars_tapa(p, module, -1);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  /* Modules with a credit channel that are not input modules write to it
   * before the body.
   */
  if (module->credit && !module->in)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "credit.write(1);");
    p = isl_printer_end_line(p);
  }

  /* User statements go through print_module_stmt(); for loops go through
   * print_for_tapa(), which adds the HLS loop pragmas.
   */
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_user(print_options,
                                                       &print_module_stmt, &hw_data);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &print_for_tapa, &hw_data);

  if (!boundary)
    p = isl_ast_node_print(module->device_tree, p, print_options);
  else
    p = isl_ast_node_print(module->boundary_tree, p, print_options);

  /* Input modules with a credit channel read from it after the body. */
  if (module->credit && module->in)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "int token = credit.read();");
    p = isl_printer_end_line(p);
  }

  p = isl_printer_indent(p, -2);

  fprintf(hls->kernel_c, "}\n");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = isl_printer_end_line(p);

  if (wrapper) {
    /* Print wrapper. */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "/* Module Definition */");
    p = isl_printer_end_line(p);

    /* Prototype goes to hls->kernel_h, the start of the definition to
     * hls->kernel_c; both are written through separate printers.
     */
    print_module_wrapper_headers_tapa(prog, module, hls, -1, boundary);

    fprintf(hls->kernel_c, " {\n");
    p = isl_printer_indent(p, 2);

    /* The wrapper body is a single call of the core function
     * (types == 0 prints a call instead of a signature).
     */
    p = print_module_core_headers_tapa(p, prog, module, hls, -1, boundary, 0, 0);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);

    p = isl_printer_indent(p, -2);
    fprintf(hls->kernel_c, "}\n");
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "/* Module Definition */");
    p = isl_printer_end_line(p);

    p = isl_printer_end_line(p);
  }

  /* If the module serialization is enabled, we will print out an extra module
   * for serializing the data. */
  if (module->to_mem && module->options->autosa->host_serialize) {
    p = autosa_print_serialize_module(p, module, prog, hls, boundary);
  }

  return p;
}

/* Print either the signature ("types" set) or a call ("types" not set) of
 * the core function of the PE dummy module "module".
 *
 * The function name is "<array>[_<nr>|_drain]_PE_dummy_{in,out}", where
 * "_<nr>" is added for an I/O group of an array with more than one I/O
 * group and "_drain" for a drain group.  The line is not ended here.
 */
static __isl_give isl_printer *print_pe_dummy_module_core_header_tapa(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_pe_dummy_module *module, int types)
{
  struct autosa_array_ref_group *io_group = module->io_group;
  const bool is_io_group = io_group->group_type == AUTOSA_IO_GROUP;

  p = isl_printer_start_line(p);
  if (types)
    p = isl_printer_print_str(p, "void ");

  /* Group name. */
  p = isl_printer_print_str(p, io_group->array->name);
  if (is_io_group && io_group->local_array->n_io_group > 1)
  {
    p = isl_printer_print_str(p, "_");
    p = isl_printer_print_int(p, io_group->nr);
  }
  else if (!is_io_group && io_group->group_type == AUTOSA_DRAIN_GROUP)
  {
    p = isl_printer_print_str(p, "_drain");
  }

  p = isl_printer_print_str(p, "_PE_dummy");
  p = isl_printer_print_str(p, module->in ? "_in" : "_out");
  p = isl_printer_print_str(p, "(");
  p = print_pe_dummy_module_arguments(p, prog, module->module->kernel,
                                      module, types, TAPA_HW);
  p = isl_printer_print_str(p, ")");

  return p;
}

/* Print the core function header of the PE dummy module "module" to "p".
 * This simply forwards to print_pe_dummy_module_core_header_tapa(); "hls"
 * is not used.
 */
static __isl_give isl_printer *print_pe_dummy_module_core_headers_tapa(
    __isl_take isl_printer *p, struct autosa_prog *prog,
    struct autosa_pe_dummy_module *module, struct hls_info *hls, int types)
{
  return print_pe_dummy_module_core_header_tapa(p, prog, module, types);
}

/* Print the signature of the wrapper function of the PE dummy module
 * "module".
 *
 * The function name is "<array>[_<nr>|_drain]_PE_dummy_{in,out}_wrapper",
 * where "_<nr>" is added for an I/O group of an array with more than one
 * I/O group and "_drain" for a drain group.  The line is not ended here.
 */
static __isl_give isl_printer *print_pe_dummy_module_wrapper_header_tapa(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_pe_dummy_module *module)
{
  struct autosa_array_ref_group *io_group = module->io_group;
  const bool is_io_group = io_group->group_type == AUTOSA_IO_GROUP;

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "void ");

  /* Group name. */
  p = isl_printer_print_str(p, io_group->array->name);
  if (is_io_group && io_group->local_array->n_io_group > 1)
  {
    p = isl_printer_print_str(p, "_");
    p = isl_printer_print_int(p, io_group->nr);
  }
  else if (!is_io_group && io_group->group_type == AUTOSA_DRAIN_GROUP)
  {
    p = isl_printer_print_str(p, "_drain");
  }

  p = isl_printer_print_str(p, "_PE_dummy");
  p = isl_printer_print_str(p, module->in ? "_in" : "_out");
  p = isl_printer_print_str(p, "_wrapper(");
  p = print_pe_dummy_module_arguments(p, prog, module->module->kernel,
                                      module, 1, TAPA_HW);
  p = isl_printer_print_str(p, ")");

  return p;
}

/* Print the wrapper function signature of the PE dummy module "module"
 * twice: first as a prototype (terminated by ";") to hls->kernel_h and
 * then as the start of the definition to hls->kernel_c.  Each output uses
 * its own temporary printer.  Always returns isl_stat_ok.
 */
static isl_stat print_pe_dummy_module_wrapper_headers_tapa(
    struct autosa_prog *prog, struct autosa_pe_dummy_module *module,
    struct hls_info *hls)
{
  FILE *const outputs[2] = {hls->kernel_h, hls->kernel_c};

  for (int k = 0; k < 2; ++k) {
    const bool is_prototype = (k == 0);
    isl_printer *out = isl_printer_to_file(prog->ctx, outputs[k]);

    out = isl_printer_set_output_format(out, ISL_FORMAT_C);
    out = print_pe_dummy_module_wrapper_header_tapa(out, prog, module);
    if (is_prototype)
      out = isl_printer_print_str(out, ";");
    out = isl_printer_end_line(out);
    isl_printer_free(out);
  }

  return isl_stat_ok;
}

/* Print the PE dummy module "pe_dummy_module".
 *
 * The core function is printed from pe_dummy_module->device_tree, with
 * the iterators of the underlying hardware module
 * (pe_dummy_module->module) as its only declarations, and is bracketed by
 * "Module Definition" marker comments.
 * The wrapper code path is kept but is currently never taken, since
 * "wrapper" is fixed to 0 below.  "boundary" is not used in this function.
 */
static __isl_give isl_printer *autosa_print_default_pe_dummy_module(
    __isl_take isl_printer *p,
    struct autosa_pe_dummy_module *pe_dummy_module,
    struct autosa_prog *prog, struct hls_info *hls, int boundary)
{
  /* For dummy module, we disable wrapper by default due to the relatively
   * high overheads.
   */
  bool wrapper = 0;
  struct autosa_hw_module *module = pe_dummy_module->module;
  /* Only the first three fields are initialized explicitly; the remaining
   * ones are zero-initialized by the aggregate initializer.
   */
  struct print_hw_module_data hw_data = {hls, prog, module};
  isl_ast_print_options *print_options;
  isl_ctx *ctx = isl_printer_get_ctx(p);

  /* Print core. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  /* Signature of the core function (types == 1). */
  p = print_pe_dummy_module_core_headers_tapa(p, prog,
                                              pe_dummy_module, hls, 1);

  fprintf(hls->kernel_c, " {\n");
  /* No INLINE pragma is emitted unless a wrapper is printed as well. */
  if (wrapper)
    fprintf(hls->kernel_c, "#pragma HLS INLINE\n");

  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  if (!prog->scop->options->autosa->use_cplusplus_template) {
    p = print_module_iterators(p, hls->kernel_c, module);
  }
  p = print_str_new_line(p, "/* Variable Declaration */");

  p = isl_printer_end_line(p);

  /* User statements go through print_module_stmt(); for loops go through
   * print_for_tapa(), which adds the HLS loop pragmas.
   */
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_user(print_options,
                                                       &print_module_stmt, &hw_data);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &print_for_tapa, &hw_data);

  p = isl_ast_node_print(pe_dummy_module->device_tree, p, print_options);

  p = isl_printer_indent(p, -2);

  fprintf(hls->kernel_c, "}\n");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = isl_printer_end_line(p);

  /* Print wrapper. */
  if (wrapper) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "/* Module Definition */");
    p = isl_printer_end_line(p);

    /* Prototype goes to hls->kernel_h, the start of the definition to
     * hls->kernel_c; both are written through separate printers.
     */
    print_pe_dummy_module_wrapper_headers_tapa(prog, pe_dummy_module, hls);

    fprintf(hls->kernel_c, " {\n");
    p = isl_printer_indent(p, 2);
    /* The wrapper body is a single call of the core function
     * (types == 0 prints a call instead of a signature).
     */
    p = print_pe_dummy_module_core_headers_tapa(p, prog, pe_dummy_module, hls, 0);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, -2);
    fprintf(hls->kernel_c, "}\n");
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "/* Module Definition */");
    p = isl_printer_end_line(p);

    p = isl_printer_end_line(p);
  }

  return p;
}

/* State shared by the callbacks that collect the loop structure of a
 * double buffered module (count_module_for, count_module_for_alt and
 * extract_module_for).
 *
 * The fields are grouped per region (outer, inter, intra); "inter" selects
 * the group that the callbacks update.
 */
struct print_db_module_while_data {
  int inter; // -1: outer 0: intra 1: inter
  /* Set to 1 by count_module_for_alt once an if node has been visited. */
  int under_if;
  /* Set to 1 by count_module_for_alt once a user node has been visited. */
  int reach_user;

  /* String printer that extract_module_for uses to format the loop update
   * lines; its indentation tracks the loop depth.
   */
  isl_printer *p_for;
  /* String printer created and freed together with p_for.
   * NOTE(review): no other use is visible in this part of the file.
   */
  isl_printer *p_user;
  /* Outer */
  /* Loop update lines ("<it>++;", "if (<it> == <ub> + 1) {", "<it> = <lb>;")
   * produced by extract_module_for.  The lines of each visited loop are
   * inserted at the front, so the most recently visited loop comes first.
   */
  std::vector<char *> outer_for_logic;
  /* Printed iterator, lower bound (loop init) and upper bound (second
   * argument of the loop condition) of each visited loop, in visiting
   * order.  The strings come from isl_printer_get_str() and are released
   * with free() in print_double_buffer_module_vars_while.
   */
  std::vector<char *> outer_iterator_name;
  std::vector<char *> outer_iterator_lb;
  std::vector<char *> outer_iterator_ub;
  /* Number of for nodes counted by count_module_for while inter == -1. */
  int outer_for_level;
  /* Inter */
  /* Same layout as the outer group, filled while inter == 1. */
  std::vector<char *> inter_for_logic;
  std::vector<char *> inter_iterator_name;
  std::vector<char *> inter_iterator_lb;
  std::vector<char *> inter_iterator_ub;
  /* Incremented by count_module_for (inter == 1) and count_module_for_alt. */
  int inter_for_level;
  /* Intra */
  /* Same layout as the outer group, filled while inter == 0. */
  std::vector<char *> intra_for_logic;
  std::vector<char *> intra_iterator_name;
  std::vector<char *> intra_iterator_lb;
  std::vector<char *> intra_iterator_ub;
  /* Number of for nodes counted by count_module_for while inter == 0. */
  int intra_for_level;
};

/* Declare one "int" iterator for each entry of "names":
 *
 *   int <name> = <lb>; [UB: <ub>]
 *
 * where the bracketed part is printed as a C comment.  "lbs" and "ubs" are
 * expected to hold at least as many entries as "names".
 * Every printed string is released with free() and the corresponding
 * vector entry is reset to NULL, so that no dangling pointer is left
 * behind.  The sizes of the vectors are not changed.
 */
static __isl_give isl_printer *print_db_while_iterator_decls(
  __isl_take isl_printer *p, std::vector<char *> &names,
  std::vector<char *> &lbs, std::vector<char *> &ubs)
{
  for (size_t i = 0; i < names.size(); i++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "int ");
    p = isl_printer_print_str(p, names[i]);
    free(names[i]);
    names[i] = NULL;
    p = isl_printer_print_str(p, " = ");
    p = isl_printer_print_str(p, lbs[i]);
    free(lbs[i]);
    lbs[i] = NULL;
    p = isl_printer_print_str(p, "; ");
    p = isl_printer_print_str(p, "/* UB: ");
    p = isl_printer_print_str(p, ubs[i]);
    free(ubs[i]);
    ubs[i] = NULL;
    p = isl_printer_print_str(p, " */");
    p = isl_printer_end_line(p);
  }

  return p;
}

/* Print the variable declarations of a double buffered module that is
 * generated in the while-loop style.
 *
 * The following declarations are printed, in this order:
 * - the module iterators (unless the use_cplusplus_template option is
 *   enabled),
 * - each local buffer in module->var, with an extra leading dimension of
 *   size 2; the element type is the array type when n_lane is 1 and
 *   "<array>_t<n_lane>" otherwise,
 * - the state flags arb, inter_trans_en, intra_trans_en, inter_done and
 *   intra_done, whose initial values depend on module->in,
 * - the outer, inter and intra loop iterators collected in "data",
 * - the flag last_run.
 *
 * The iterator name and bound strings stored in "data" are freed while
 * they are printed and must not be used afterwards.
 */
static __isl_give isl_printer *print_double_buffer_module_vars_while(
  __isl_take isl_printer *p, struct autosa_hw_module *module,
  struct hls_info *hls,
  struct print_db_module_while_data *data)
{
  /* Inst ids */
  if (!module->options->autosa->use_cplusplus_template) {
    p = print_module_iterators(p, hls->kernel_c, module);
  }
  /* Local buffer */
  for (int i = 0; i < module->n_var; i++) {
    struct autosa_kernel_var *var = &module->var[i];
    p = isl_printer_start_line(p);
    if (var->n_lane == 1)
      p = isl_printer_print_str(p, var->array->type);
    else
    {
      p = isl_printer_print_str(p, var->array->name);
      p = isl_printer_print_str(p, "_t");
      p = isl_printer_print_int(p, var->n_lane);
    }
    p = isl_printer_print_str(p, " ");
    p = isl_printer_print_str(p, var->name);
    /* Leading dimension of 2: one copy per buffer of the double buffer. */
    p = isl_printer_print_str(p, "[2]");
    for (int j = 0; j < isl_vec_size(var->size); j++) {
      isl_val *v;

      p = isl_printer_print_str(p, "[");
      v = isl_vec_get_element_val(var->size, j);
      p = isl_printer_print_val(p, v);
      isl_val_free(v);
      p = isl_printer_print_str(p, "]");
    }
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
  }

  /* State handle variables */
  p = print_str_new_line(p, "bool arb = 0;");
  p = print_str_new_line(p, module->in? "bool inter_trans_en = 1;" : "bool inter_trans_en = 0;");
  p = print_str_new_line(p, module->in? "bool intra_trans_en = 0;" : "bool intra_trans_en = 1;");
  p = print_str_new_line(p, module->in? "bool inter_done = 0;" : "bool inter_done = 1;");
  p = print_str_new_line(p, module->in? "bool intra_done = 1;" : "bool intra_done = 0;");
  /* Iterators */
  p = print_db_while_iterator_decls(p, data->outer_iterator_name,
                                    data->outer_iterator_lb,
                                    data->outer_iterator_ub);
  p = print_db_while_iterator_decls(p, data->inter_iterator_name,
                                    data->inter_iterator_lb,
                                    data->inter_iterator_ub);
  p = print_db_while_iterator_decls(p, data->intra_iterator_name,
                                    data->intra_iterator_lb,
                                    data->intra_iterator_ub);

  p = print_str_new_line(p, "bool last_run = false;");

  return p;
}

/* Count the for level.
 *
 * Print callback for for nodes.  "user" points to a
 * struct print_db_module_while_data; the level counter selected by its
 * "inter" field (-1: outer, 0: intra, 1: inter) is incremented.  The loop
 * itself is not printed, only its body is visited.
 */
static __isl_give isl_printer *count_module_for(__isl_take isl_printer *p,
                                                __isl_take isl_ast_print_options *print_options,
                                                __isl_keep isl_ast_node *node, void *user)
{
  struct print_db_module_while_data *counts =
      (struct print_db_module_while_data *)user;
  isl_ast_node *loop_body;

  switch (counts->inter) {
  case -1:
    counts->outer_for_level++;
    break;
  case 0:
    counts->intra_for_level++;
    break;
  case 1:
    counts->inter_for_level++;
    break;
  default:
    break;
  }

  loop_body = isl_ast_node_for_get_body(node);
  p = isl_ast_node_print(loop_body, p, print_options);
  isl_ast_node_free(loop_body);

  return p;
}

/* Count the for level. A different implementation.
 * Currently only used for inter_trans module.
 * Since if branches may be present, only the loops of one branch are
 * counted: once an if node has been seen, for nodes stop being counted as
 * soon as the first user node has been reached.
 * We assume that the two branches have equal depth.
 *
 * "user" points to a struct print_db_module_while_data.  Always returns
 * isl_bool_true.
 */
static isl_bool count_module_for_alt(__isl_keep isl_ast_node *node, void *user) {
  struct print_db_module_while_data *counts =
      (struct print_db_module_while_data *)user;

  switch (isl_ast_node_get_type(node)) {
  case isl_ast_node_if:
    counts->under_if = 1;
    break;
  case isl_ast_node_for: {
    const bool before_any_if = counts->under_if == 0;
    const bool in_first_branch = counts->under_if == 1 && counts->reach_user == 0;

    if (before_any_if || in_first_branch)
      counts->inter_for_level++;
    break;
  }
  case isl_ast_node_user:
    counts->reach_user = 1;
    break;
  default:
    break;
  }

  return isl_bool_true;
}

/* Extract the loop information.
 *
 * Print callback for for nodes.  "user" points to a
 * struct print_db_module_while_data whose "inter" field (-1: outer,
 * 0: intra, 1: inter) selects the group of vectors that is updated.
 *
 * For the loop "node", the printed iterator, the printed upper bound
 * (second argument of the loop condition) and the printed lower bound
 * (loop init) are appended to the selected name, ub and lb vectors.
 * In addition, the three lines
 *
 *   <it>++;
 *   if (<it> == <ub> + 1) {
 *       <it> = <lb>;
 *
 * are formatted with data->p_for and inserted at the front of the selected
 * for_logic vector, so that the lines of inner loops precede those of
 * outer loops.  The closing brace is not generated here.
 * NOTE(review): the "== <ub> + 1" test presumably relies on the loop
 * condition having the form "<it> <= <ub>" -- confirm against the caller.
 *
 * The indentation of data->p_for is lowered by 4 for every visited loop.
 * The loop itself is not printed to "p", only its body is visited.
 */
static __isl_give isl_printer *extract_module_for(__isl_take isl_printer *p,
                                                  __isl_take isl_ast_print_options *print_options,
                                                  __isl_keep isl_ast_node *node, void *user)
{
  struct print_db_module_while_data *data = (struct print_db_module_while_data *)user;
  isl_ast_expr *iterator, *init, *cond, *ub;
  isl_printer *p_local, *p_str;
  std::vector<char *> text_lines;
  std::vector<char *> *names = NULL, *lbs = NULL, *ubs = NULL, *logic = NULL;
  isl_ast_node *body;

  /* Select the vectors that receive the extracted strings. */
  if (data->inter == -1) {
    names = &data->outer_iterator_name;
    lbs = &data->outer_iterator_lb;
    ubs = &data->outer_iterator_ub;
    logic = &data->outer_for_logic;
  } else if (data->inter == 0) {
    names = &data->intra_iterator_name;
    lbs = &data->intra_iterator_lb;
    ubs = &data->intra_iterator_ub;
    logic = &data->intra_for_logic;
  } else if (data->inter == 1) {
    names = &data->inter_iterator_name;
    lbs = &data->inter_iterator_lb;
    ubs = &data->inter_iterator_ub;
    logic = &data->inter_for_logic;
  }

  p_local = data->p_for;

  /* Extract the lower bound and upper bound. */
  iterator = isl_ast_node_for_get_iterator(node);
  init = isl_ast_node_for_get_init(node);
  cond = isl_ast_node_for_get_cond(node);
  ub = isl_ast_expr_op_get_arg(cond, 1);

  /* A single string printer is reused for the three expressions; it is
   * flushed after each of them to start from an empty buffer again.
   */
  p_str = isl_printer_to_str(isl_ast_node_get_ctx(node));
  p_str = isl_printer_set_output_format(p_str, ISL_FORMAT_C);
  p_str = isl_printer_print_ast_expr(p_str, iterator);
  if (names)
    names->push_back(isl_printer_get_str(p_str));
  p_str = isl_printer_flush(p_str);

  p_str = isl_printer_print_ast_expr(p_str, ub);
  if (ubs)
    ubs->push_back(isl_printer_get_str(p_str));
  p_str = isl_printer_flush(p_str);

  p_str = isl_printer_print_ast_expr(p_str, init);
  if (lbs)
    lbs->push_back(isl_printer_get_str(p_str));
  isl_printer_free(p_str);

  p_local = isl_printer_indent(p_local, -4);

  /* "<it>++;" */
  p_local = isl_printer_start_line(p_local);
  p_local = isl_printer_print_ast_expr(p_local, iterator);
  p_local = isl_printer_print_str(p_local, "++;");
  p_local = isl_printer_end_line(p_local);
  text_lines.push_back(isl_printer_get_str(p_local));
  p_local = isl_printer_flush(p_local);

  /* "if (<it> == <ub> + 1) {" */
  p_local = isl_printer_start_line(p_local);
  p_local = isl_printer_print_str(p_local, "if (");
  p_local = isl_printer_print_ast_expr(p_local, iterator);
  p_local = isl_printer_print_str(p_local, " == ");
  p_local = isl_printer_print_ast_expr(p_local, ub);
  p_local = isl_printer_print_str(p_local, " + 1) {");
  p_local = isl_printer_end_line(p_local);
  text_lines.push_back(isl_printer_get_str(p_local));
  p_local = isl_printer_flush(p_local);

  /* "<it> = <lb>;", indented one level deeper. */
  p_local = isl_printer_indent(p_local, 4);
  p_local = isl_printer_start_line(p_local);
  p_local = isl_printer_print_ast_expr(p_local, iterator);
  p_local = isl_printer_print_str(p_local, " = ");
  p_local = isl_printer_print_ast_expr(p_local, init);
  p_local = isl_printer_print_str(p_local, ";");
  p_local = isl_printer_end_line(p_local);
  text_lines.push_back(isl_printer_get_str(p_local));
  p_local = isl_printer_flush(p_local);

  if (logic) {
    /* Ownership of the strings moves to the selected vector. */
    logic->insert(logic->begin(), text_lines.begin(), text_lines.end());
  } else {
    /* No destination selected: release the lines instead of leaking them. */
    for (size_t i = 0; i < text_lines.size(); i++)
      free(text_lines[i]);
  }

  isl_ast_expr_free(iterator);
  isl_ast_expr_free(init);
  isl_ast_expr_free(cond);
  isl_ast_expr_free(ub);

  p_local = isl_printer_indent(p_local, -4);
  /* Keep the shared printer in sync before visiting nested loops; the isl
   * printer functions return NULL after an error, in which case the old
   * pointer must no longer be used.
   */
  data->p_for = p_local;

  body = isl_ast_node_for_get_body(node);
  p = isl_ast_node_print(body, p, print_options);
  isl_ast_node_free(body);

  return p;
}

static void extract_double_buffer_module_while_data(
  struct autosa_hw_module *module, int boundary,
  struct print_db_module_while_data *data)
{
  isl_ast_print_options *print_options;
  isl_ctx *ctx = module->kernel->ctx;
  isl_printer *p_for, *p_user, *p;
  const char *for_logic, *user_logic;

  /* Outer module */
  data->inter = -1;
  p = isl_printer_to_str(ctx);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  p_for = isl_printer_to_str(ctx);
  p_for = isl_printer_set_output_format(p_for, ISL_FORMAT_C);
  p_user = isl_printer_to_str(ctx);
  p_user = isl_printer_set_output_format(p_user, ISL_FORMAT_C);
  data->p_for = p_for;
  data->p_user = p_user;
  data->outer_for_level = 0;

  /* Count the for level first. */
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &count_module_for, data);
  if (!boundary)
    p = isl_ast_node_print(module->device_tree, p, print_options);
  else
    p = isl_ast_node_print(module->boundary_tree, p, print_options);

  /* Extract the for and user logic. */
  data->p_for = isl_printer_indent(data->p_for, 4 * data->outer_for_level);
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &extract_module_for, data);
  if (!boundary)
    p = isl_ast_node_print(module->device_tree, p, print_options);
  else
    p = isl_ast_node_print(module->boundary_tree, p, print_options);
  isl_printer_free(p);
  isl_printer_free(data->p_for);
  isl_printer_free(data->p_user);

  /* Intra module */
  data->inter = 0;
  p = isl_printer_to_str(ctx);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  p_for = isl_printer_to_str(ctx);
  p_for = isl_printer_set_output_format(p_for, ISL_FORMAT_C);
  p_user = isl_printer_to_str(ctx);
  p_user = isl_printer_set_output_format(p_user, ISL_FORMAT_C);
  data->p_for = p_for;
  data->p_user = p_user;
  data->intra_for_level = 0;

  /* Count the for level first. */
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &count_module_for, data);
  p = isl_ast_node_print(module->intra_tree, p, print_options);

  /* Extract the for logic. */
  data->p_for = isl_printer_indent(data->p_for, 4 * data->intra_for_level);
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &extract_module_for, data);
  p = isl_ast_node_print(module->intra_tree, p, print_options);
  isl_printer_free(p);
  isl_printer_free(data->p_for);
  isl_printer_free(data->p_user);

  /* Inter module */
  data->inter = 1;
  data->under_if = 0;
  data->reach_user = 0;
  p = isl_printer_to_str(ctx);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  p_for = isl_printer_to_str(ctx);
  p_for = isl_printer_set_output_format(p_for, ISL_FORMAT_C);
  p_user = isl_printer_to_str(ctx);
  p_user = isl_printer_set_output_format(p_user, ISL_FORMAT_C);
  data->p_for = p_for;
  data->p_user = p_user;
  data->inter_for_level = 0;

  /* Count the for level first. */
  if (!boundary) {
    isl_ast_node_foreach_descendant_top_down(module->inter_tree, &count_module_for_alt, data);
  } else {
    isl_ast_node_foreach_descendant_top_down(module->boundary_inter_tree, &count_module_for_alt, data);
  }

  /* Extract the for logic. */
  data->p_for = isl_printer_indent(data->p_for, 4 * data->inter_for_level);
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &extract_module_for, data);
  if (!boundary)
    p = isl_ast_node_print(module->inter_tree, p, print_options);
  else
    p = isl_ast_node_print(module->boundary_inter_tree, p, print_options);
  isl_printer_free(p);
  isl_printer_free(data->p_for);
  isl_printer_free(data->p_user);
}

/* print_for callback that emits no loop header: only the body of the for
 * node "node" is printed to "p", using the same "print_options".
 * "user" is not used.
 */
static __isl_give isl_printer *print_null_for(__isl_take isl_printer *p,
                                              __isl_take isl_ast_print_options *print_options,
                                              __isl_keep isl_ast_node *node, void *user)
{
  isl_ast_node *loop_body = isl_ast_node_for_get_body(node);

  p = isl_ast_node_print(loop_body, p, print_options);
  isl_ast_node_free(loop_body);

  return p;
}

/* Print the statements of the inter_trans part of "module" in double buffer
 * mode. The boundary inter tree is printed when "boundary" is non-zero,
 * the regular inter tree otherwise. For loop headers are suppressed through
 * print_null_for; user statements are printed by print_module_stmt.
 */
static __isl_give isl_printer *autosa_print_inter_trans_module_double_buffer(
  __isl_take isl_printer *p,
  struct autosa_hw_module *module, struct autosa_prog *prog,
  struct hls_info *hls, int boundary)
{
  struct print_hw_module_data hw_data = {hls, prog, module, "inter_c"};
  isl_ast_print_options *opts;
  isl_ast_node *tree;

  if (boundary == 0)
    tree = module->inter_tree;
  else
    tree = module->boundary_inter_tree;

  opts = isl_ast_print_options_alloc(isl_printer_get_ctx(p));
  opts = isl_ast_print_options_set_print_user(opts, &print_module_stmt, &hw_data);
  opts = isl_ast_print_options_set_print_for(opts, &print_null_for, &hw_data);

  p = isl_ast_node_print(tree, p, opts);

  return isl_printer_end_line(p);
}

/* Print the statements of the intra_trans part of "module" in double buffer
 * mode. For loop headers are suppressed through print_null_for; user
 * statements are printed by print_module_stmt.
 * "boundary" is accepted for symmetry with the inter_trans printer but is
 * not consulted: the intra tree is always printed.
 */
static __isl_give isl_printer *autosa_print_intra_trans_module_double_buffer(
  __isl_take isl_printer *p,
  struct autosa_hw_module *module, struct autosa_prog *prog,
  struct hls_info *hls, int boundary)
{
  struct print_hw_module_data hw_data = {hls, prog, module, "intra_c"};
  isl_ast_print_options *opts;

  opts = isl_ast_print_options_alloc(isl_printer_get_ctx(p));
  opts = isl_ast_print_options_set_print_user(opts, &print_module_stmt, &hw_data);
  opts = isl_ast_print_options_set_print_for(opts, &print_null_for, &hw_data);

  p = isl_ast_node_print(module->intra_tree, p, opts);

  return isl_printer_end_line(p);
}

/* Print the double buffer module using while loops instead of for loops.
 * First, we will change the buffer to
 * local_buffer[2][...][...].
 *
 * Specifically, when handling a code structure:
 * [outer for loops]
 * for ...
 *   for ...
 * [outer for loops]
 * {
 *   if (arb) {
 *     ld(local_buffer_ping, ld_en);
 *     st(local_buffer_pong, st_en);
 *   } else {
 *     ld(local_buffer_pong, ld_en);
 *     st(local_buffer_ping, st_en);
 *   }
 *   [state handle logic]
 *   arb = !arb;
 *   [state handle logic]
 * }
 * [last batch]
 * if (arb) {
 *   st(local_buffer_pong, st_en);
 * } else {
 *   st(local_buffer_ping, st_en);
 * }
 * [last batch]
 * We will convert it to a new code structure:
 * while (1) {
 *   if (inter_trans_en) {
 *     [inter_trans statements]
 *     [inter loop counter updates]
 *       inter_done = 1;
 *       inter_trans_en = 0;
 *   }
 *   if (intra_trans_en) {
 *     [intra_trans statements]
 *     [intra loop counter updates]
 *       intra_done = 1;
 *       intra_trans_en = 0;
 *   }
 *   if (inter_done && intra_done) {
 *     if (last_run) break;
 *     intra_trans_en = 1;
 *     inter_trans_en = 1;
 *     intra_done = 0;
 *     inter_done = 0;
 *     arb = !arb;
 *     [outer loop counter updates]
 *       [last batch: disable the side that has no more work]
 *       last_run = true;
 *   }
 * }
 *
 * The loop counter updates are the text snippets collected by
 * extract_double_buffer_module_while_data(). They form a chain of nested
 * "if (iter == ub + 1) {" blocks, each nested 4 columns deeper than the
 * previous one, which are left open; this function prints the innermost
 * statements and then closes every block.
 *
 * Note that this only works if each for loop structure is a perfectly
 * nested loop so that we could convert to a while loop.
 *
 * If "boundary" is set, the boundary variant of the module is printed.
 * Nothing is printed if the corresponding tree is missing.
 */
static __isl_give isl_printer *print_double_buffer_module_while(
  __isl_take isl_printer *p, struct autosa_hw_module *module,
  struct autosa_prog *prog, struct hls_info *hls, int boundary)
{
  if (!boundary) {
    if (!module->device_tree)
      return p;
  } else {
    if (!module->boundary_tree)
      return p;
  }

  struct print_db_module_while_data print_data;

  /* Extract the code snippets. */
  extract_double_buffer_module_while_data(module, boundary, &print_data);

  /* Print header */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  print_module_headers_tapa(prog, module, hls, -1, boundary);
  p = print_str_new_line(p, "{");
  p = isl_printer_indent(p, 2);

  /* Print variables */
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = print_double_buffer_module_vars_while(p, module, hls, &print_data);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  /* Print content */
  p = print_str_new_line(p, "while (1) {");
  p = print_str_new_line(p, "#pragma HLS PIPELINE II=1");
  p = isl_printer_indent(p, 2);

  /* Print inter_trans */
  p = print_str_new_line(p, "if (inter_trans_en) {");
  p = isl_printer_indent(p, 2);
  /* Print the module logic */
  p = autosa_print_inter_trans_module_double_buffer(p, module, prog, hls, boundary);
  /* Print the loop counter. Each snippet already carries its own relative
   * indentation and line ending; ownership of the string ends here.
   */
  for (size_t i = 0; i < print_data.inter_for_logic.size(); i++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, print_data.inter_for_logic[i]);
    free(print_data.inter_for_logic[i]);
  }
  p = isl_printer_indent(p, 4 * print_data.inter_for_level);
  p = print_str_new_line(p, "inter_done = 1;");
  p = print_str_new_line(p, "inter_trans_en = 0;");
  /* Close the nested counter blocks. They were opened 4 columns apart, so
   * unindent by 4 per level to line up each brace with its "if" and to
   * return the printer to the indentation it had before the counters.
   */
  for (int i = 0; i < print_data.inter_for_level; i++) {
    p = isl_printer_indent(p, -4);
    p = print_str_new_line(p, "}");
  }

  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  /* Print intra_trans */
  p = print_str_new_line(p, "if (intra_trans_en) {");
  p = isl_printer_indent(p, 2);
  /* Print the module logic */
  p = autosa_print_intra_trans_module_double_buffer(p, module, prog, hls, boundary);
  /* Print the loop counter */
  for (size_t i = 0; i < print_data.intra_for_logic.size(); i++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, print_data.intra_for_logic[i]);
    free(print_data.intra_for_logic[i]);
  }
  p = isl_printer_indent(p, 4 * print_data.intra_for_level);
  p = print_str_new_line(p, "intra_done = 1;");
  p = print_str_new_line(p, "intra_trans_en = 0;");
  for (int i = 0; i < print_data.intra_for_level; i++) {
    p = isl_printer_indent(p, -4);
    p = print_str_new_line(p, "}");
  }

  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  /* Print state_handle */
  p = print_str_new_line(p, "if (inter_done && intra_done) {");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "if (last_run) break;");
  p = print_str_new_line(p, "intra_trans_en = 1;");
  p = print_str_new_line(p, "inter_trans_en = 1;");
  p = print_str_new_line(p, "intra_done = 0;");
  p = print_str_new_line(p, "inter_done = 0;");
  p = print_str_new_line(p, "arb = !arb;");
  /* Print the loop counter */
  for (size_t i = 0; i < print_data.outer_for_logic.size(); i++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, print_data.outer_for_logic[i]);
    free(print_data.outer_for_logic[i]);
  }
  p = isl_printer_indent(p, 4 * print_data.outer_for_level);
  /* Last batch: for an input module the inter side is switched off and
   * marked done, for an output module the intra side.
   */
  p = print_str_new_line(p, module->in? "inter_trans_en = 0;" : "intra_trans_en = 0;");
  p = print_str_new_line(p, module->in? "inter_done = 1;" : "intra_done = 1;");
  p = print_str_new_line(p, "last_run = true;");
  for (int i = 0; i < print_data.outer_for_level; i++) {
    p = isl_printer_indent(p, -4);
    p = print_str_new_line(p, "}");
  }

  /* Close "if (inter_done && intra_done)". */
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  /* Close "while (1)". */
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  /* Close the module body. */
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  /* If the module serialization is enabled, we will print out an extra module
   * for serializing the data. */
  if (module->to_mem && module->options->autosa->host_serialize) {
    p = autosa_print_serialize_module(p, module, prog, hls, boundary);
  }

  return p;
}

/* Print the host code for "tree" to "p" and the definitions of all hardware
 * modules in "modules" to the kernel file hls->kernel_c.
 *
 * Before the host AST, the data pack types, the sparse macros (only if
 * block_sparse is enabled), the drain merge helpers and the host
 * serialization helpers are printed by their respective helpers.
 */
static __isl_give isl_printer *autosa_print_host_code(__isl_take isl_printer *p,
                                                      struct autosa_prog *prog, __isl_keep isl_ast_node *tree,
                                                      struct autosa_hw_module **modules, int n_modules,
                                                      struct autosa_hw_top_module *top,
                                                      struct autosa_drain_merge_func **drain_merge_funcs, int n_drain_merge_funcs,
                                                      struct hls_info *hls)
{
  isl_ctx *ctx = isl_ast_node_get_ctx(tree);
  struct print_host_user_data data = {hls, prog, top};
  isl_ast_print_options *host_options;
  isl_printer *p_module;

  /* Print the data pack types in the program. */
  print_data_types_tapa(top, hls);

  /* Print the macros for sparse data structure */
  if (prog->scop->options->autosa->block_sparse)
    print_sparse_macros(top->kernel, hls);

  /* Print the helper functions in the program. */
  print_drain_merge_funcs(top->kernel, drain_merge_funcs, n_drain_merge_funcs, hls);

  /* Print the host data serialization function. */
  print_host_serialize_funcs(top->kernel, modules, n_modules, hls); // TODO

  /* Set up the printing of the default AST. */
  host_options = isl_ast_print_options_alloc(ctx);
  host_options = isl_ast_print_options_set_print_user(host_options,
                                                      &print_host_user_tapa, &data);

  /* Print the macro definitions, followed by the host AST itself. */
  p = autosa_print_macros(p, tree);
  p = isl_ast_node_print(tree, p, host_options);

  /* Print the hw module ASTs. */
  p_module = isl_printer_to_file(ctx, hls->kernel_c);
  p_module = isl_printer_set_output_format(p_module, ISL_FORMAT_C);

  for (int i = 0; i < n_modules; i++)
  {
    struct autosa_hw_module *module = modules[i];

    /* Double buffered modules in style 0 are printed as while loops. */
    if (module->double_buffer && module->options->autosa->double_buffer_style == 0)
    {
      p_module = print_double_buffer_module_while(p_module, module, prog, hls, 0);
      if (module->boundary)
        p_module = print_double_buffer_module_while(p_module, module, prog, hls, 1);
      continue;
    }

    if (module->is_filter && module->is_buffer)
    {
      /* Definitions of the intra_trans and inter_trans function calls. */
      p_module = autosa_print_intra_trans_module(p_module, module, prog, hls, 0);
      p_module = autosa_print_inter_trans_module(p_module, module, prog, hls, 0);
      if (module->boundary)
        p_module = autosa_print_inter_trans_module(p_module, module, prog, hls, 1);
    }

    p_module = autosa_print_default_module(p_module, module, prog, hls, 0);

    /* Definition of the boundary variant. */
    if (module->boundary)
      p_module = autosa_print_default_module(p_module, module, prog, hls, 1);

    /* Definitions of the PE dummy modules, if any. */
    for (int j = 0; j < module->n_pe_dummy_modules; j++)
    {
      p_module = autosa_print_default_pe_dummy_module(
          p_module, module->pe_dummy_modules[j], prog, hls, 0);
    }
  }
  isl_printer_free(p_module);

  return p;
}

/* Print to "p" the generator statements that, when later executed, emit the
 * signature line "void kernel0(<kernel arguments>)" of the top function
 * followed by its opening brace.
 */
static __isl_give isl_printer *print_top_module_headers_tapa(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_hw_top_module *top, struct hls_info *hls)
{
  struct autosa_kernel *kernel = top->kernel;

  /* Signature line. */
  p = print_str_new_line(p, "p = isl_printer_start_line(p);");

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"void kernel");
  p = isl_printer_print_int(p, 0);
  p = isl_printer_print_str(p, "(");
  p = print_kernel_arguments(p, prog, kernel, 1, hls);
  p = isl_printer_print_str(p, ")\");");
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  /* Opening brace on a line of its own. */
  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = print_str_new_line(p, "p = isl_printer_print_str(p, \"{\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  return p;
}

/* Return a newly allocated copy of the part of "fifo_decl_name" that
 * precedes the first '.', or of the whole string if it contains no '.'.
 * The result comes from isl_printer_get_str; callers in this file release
 * it with free().
 */
static char *extract_fifo_name_from_fifo_decl_name(isl_ctx *ctx, char *fifo_decl_name)
{
  isl_printer *p_str = isl_printer_to_str(ctx);
  char *name;

  for (const char *cur = fifo_decl_name; *cur != '\0' && *cur != '.'; cur++)
  {
    /* Append one character at a time as a 1-char string. */
    char buf[2] = {*cur, '\0'};
    p_str = isl_printer_print_str(p_str, buf);
  }

  name = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  return name;
}

/* Return a newly allocated copy of the part of "fifo_decl_name" that
 * follows the first '.'.
 *
 * If "fifo_decl_name" contains no '.', nothing is appended and the result
 * is whatever isl_printer_get_str yields for an untouched string printer
 * (presumably an empty string). The separator is only skipped when it is
 * actually present, so the scan never moves past the terminating NUL.
 *
 * The result comes from isl_printer_get_str; callers in this file release
 * it with free().
 */
static char *extract_fifo_width_from_fifo_decl_name(isl_ctx *ctx, char *fifo_decl_name)
{
  const char *cur = fifo_decl_name;
  isl_printer *p_str = isl_printer_to_str(ctx);
  char *name = NULL;

  /* Locate the separator (or the end of the string). */
  while (*cur != '\0' && *cur != '.')
    cur++;

  /* Step over the separator only if one was found. */
  if (*cur == '.')
    cur++;

  for (; *cur != '\0'; cur++)
  {
    char buf[2];
    buf[0] = *cur;
    buf[1] = '\0';
    p_str = isl_printer_print_str(p_str, buf);
  }

  name = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  return name;
}

/* print_user callback for the fifo declaration trees of the top module.
 * The autosa_kernel_stmt attached to "node" as annotation is printed
 * through autosa_kernel_print_fifo_decl if it is a fifo declaration;
 * any other statement type prints nothing.
 * "user" points to a print_hw_module_data.
 */
static __isl_give isl_printer *print_top_module_fifo_stmt(__isl_take isl_printer *p,
                                                          __isl_take isl_ast_print_options *print_options,
                                                          __isl_keep isl_ast_node *node, void *user)
{
  struct print_hw_module_data *data = (struct print_hw_module_data *)(user);
  isl_id *annotation = isl_ast_node_get_annotation(node);
  struct autosa_kernel_stmt *stmt =
      (struct autosa_kernel_stmt *)isl_id_get_user(annotation);

  isl_id_free(annotation);
  isl_ast_print_options_free(print_options);

  if (stmt->type != AUTOSA_KERNEL_STMT_FIFO_DECL)
    return p;

  return autosa_kernel_print_fifo_decl(p, stmt, data->prog, data->hls);
}

/* print_user callback for the module call trees of the top module.
 * The autosa_kernel_stmt attached to "node" as annotation is printed
 * through autosa_kernel_print_module_call if it is a module call;
 * any other statement type prints nothing.
 * "user" points to a print_hw_module_data.
 */
static __isl_give isl_printer *print_top_module_call_stmt(
  __isl_take isl_printer *p,
  __isl_take isl_ast_print_options *print_options,
  __isl_keep isl_ast_node *node, void *user)
{
  struct print_hw_module_data *data = (struct print_hw_module_data *)(user);
  isl_id *annotation = isl_ast_node_get_annotation(node);
  struct autosa_kernel_stmt *stmt =
      (struct autosa_kernel_stmt *)isl_id_get_user(annotation);

  isl_id_free(annotation);
  isl_ast_print_options_free(print_options);

  if (stmt->type != AUTOSA_KERNEL_STMT_MODULE_CALL)
    return p;

  return autosa_kernel_print_module_call(p, stmt, data->prog, data->hls->target);
}

/* This function prints the code that prints out the top function that
 * calls the hardware modules and declares the fifos.
 */
static void print_top_gen_host_code(
    struct autosa_prog *prog, __isl_keep isl_ast_node *node,
    struct autosa_hw_top_module *top, struct hls_info *hls)
{
  isl_ast_print_options *print_options;
  isl_ctx *ctx = isl_ast_node_get_ctx(node);
  isl_printer *p;
  int fifo_depth = prog->scop->options->autosa->fifo_depth;
  struct print_hw_module_data hw_data = {hls, prog, NULL};

  /* Print the top module ASTs. */
  p = isl_printer_to_file(ctx, hls->top_gen_c);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);

  print_top_gen_headers(prog, top, hls);
  fprintf(hls->top_gen_c, " {\n");
  p = isl_printer_indent(p, 2);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "FILE *fd = fopen(\"");
  p = isl_printer_print_str(p, hls->output_dir);
  p = isl_printer_print_str(p, "/resource_est/design_info.dat\", \"w\");");
  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "int fifo_cnt;");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "isl_ctx *ctx = isl_ctx_alloc();");
  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "isl_printer *p = isl_printer_to_file(ctx, f);");
  p = isl_printer_end_line(p);
  p = isl_printer_end_line(p);

  p = print_top_module_headers_tapa(p, prog, top, hls);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_indent(p, 2);");
  p = isl_printer_end_line(p);

  /* Print FIFO declarations */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_start_line(p);");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* FIFO Declaration */\");");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_end_line(p);");
  p = isl_printer_end_line(p);
  p = isl_printer_end_line(p);

  /* Print the serialize fifos if existing. */
  for (int i = 0; i < top->n_hw_modules; i++) {
    struct autosa_hw_module *module = top->hw_modules[i];
    struct autosa_array_ref_group *group = module->io_groups[0];
    if (module->is_serialized) {
      /* Generate fifo decl counter. */
      char *fifo_name;
      int fifo_w;  // bytes
      fifo_w = module->data_pack_inter * group->array->size;
      isl_printer *p_str;
      p_str = isl_printer_to_str(ctx);
      p_str = autosa_array_ref_group_print_fifo_name(group, p_str);
      p_str = isl_printer_print_str(p_str, "_");
      p_str = isl_printer_print_str(p_str, module->name);
      p_str = isl_printer_print_str(p_str, "_serialize");
      fifo_name = isl_printer_get_str(p_str);
      isl_printer_free(p_str);

      p = print_str_new_line(p, "fifo_cnt = 1;");
      p = print_str_new_line(p, "p = isl_printer_start_line(p);");
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* ");
      p = isl_printer_print_str(p, module->name);
      p = isl_printer_print_str(p, "_serialize fifo */ ");
      p = print_fifo_type_tapa(p, group, module->data_pack_inter, fifo_depth, NULL);
      p = isl_printer_print_str(p, " ");
      p = isl_printer_print_str(p, fifo_name);
      p = isl_printer_print_str(p, ";\");");
      p = isl_printer_end_line(p);
      p = print_str_new_line(p, "p = isl_printer_end_line(p);");

      if (group->local_array->is_sparse) {
        p = print_str_new_line(p, "p = isl_printer_start_line(p);");
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"#pragma HLS DATA_PACK variable=");
        p = isl_printer_print_str(p, fifo_name);
        p = isl_printer_print_str(p, "\");");
        p = isl_printer_end_line(p);
        p = print_str_new_line(p, "p = isl_printer_end_line(p);");
      }

      /* fifo:fifo_name:fifo_cnt:fifo_width */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "fprintf(fd, \"fifo:");
      p = isl_printer_print_str(p, fifo_name);
      p = isl_printer_print_str(p, ":\%d:");
      p = isl_printer_print_int(p, fifo_w);
      p = isl_printer_print_str(p, "\\n\", fifo_cnt);");
      p = isl_printer_end_line(p);

      p = isl_printer_end_line(p);
      free(fifo_name);
    }
  }

  for (int i = 0; i < top->n_fifo_decls; i++) {
    /* Generate fifo decl counter. */
    char *fifo_decl_name = top->fifo_decl_names[i];
    char *fifo_name = extract_fifo_name_from_fifo_decl_name(ctx, fifo_decl_name);
    char *fifo_w = extract_fifo_width_from_fifo_decl_name(ctx, fifo_decl_name);
    p = print_str_new_line(p, "fifo_cnt = 0;");

    /* Print AST */
    print_options = isl_ast_print_options_alloc(ctx);
    print_options = isl_ast_print_options_set_print_user(print_options,
                                                         &print_top_module_fifo_stmt, &hw_data);

    p = isl_ast_node_print(top->fifo_decl_wrapped_trees[i],
                           p, print_options);

    /* fifo:fifo_name:fifo_cnt:fifo_width */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "fprintf(fd, \"fifo:");
    p = isl_printer_print_str(p, fifo_name);
    p = isl_printer_print_str(p, ":\%d:");
    p = isl_printer_print_str(p, fifo_w);
    p = isl_printer_print_str(p, "\\n\", fifo_cnt);");
    p = isl_printer_end_line(p);

    p = isl_printer_end_line(p);

    free(fifo_name);
    free(fifo_w);
  }

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_start_line(p);");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* FIFO Declaration */\");");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_end_line(p);");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_end_line(p);");
  p = isl_printer_end_line(p);

  int n_module_names = 0;
  char **module_names = NULL;
  for (int i = 0; i < top->n_hw_modules; i++)
  {
    /* Generate module call counter. */
    struct autosa_hw_module *module = top->hw_modules[i];
    char *module_name;

    if (module->is_filter && module->is_buffer)
    {
      module_name = concat(ctx, module->name, "intra_trans");

      n_module_names++;
      module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
      module_names[n_module_names - 1] = module_name;

      module_name = concat(ctx, module->name, "inter_trans");

      n_module_names++;
      module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
      module_names[n_module_names - 1] = module_name;

      if (module->boundary)
      {
        module_name = concat(ctx, module->name, "inter_trans_boundary");

        n_module_names++;
        module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
        module_names[n_module_names - 1] = module_name;
      }
    }

    module_name = strdup(module->name);

    n_module_names++;
    module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
    module_names[n_module_names - 1] = module_name;

    if (module->boundary)
    {
      module_name = concat(ctx, module->name, "boundary");

      n_module_names++;
      module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
      module_names[n_module_names - 1] = module_name;
    }

    if (module->n_pe_dummy_modules > 0)
    {
      for (int j = 0; j < module->n_pe_dummy_modules; j++)
      {
        struct autosa_pe_dummy_module *dummy_module = module->pe_dummy_modules[j];
        struct autosa_array_ref_group *group = dummy_module->io_group;
        isl_printer *p_str = isl_printer_to_str(ctx);
        p_str = autosa_array_ref_group_print_prefix(group, p_str);
        p_str = isl_printer_print_str(p_str, "_PE_dummy");
        p_str = isl_printer_print_str(p_str, dummy_module->in? "_in" : "_out");
        module_name = isl_printer_get_str(p_str);
        isl_printer_free(p_str);

        n_module_names++;
        module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
        module_names[n_module_names - 1] = module_name;
      }
    }

    if (module->is_serialized) {
      if (module->boundary)
        module_name = concat(ctx, module->name, "boundary_serialize");
      else
        module_name = concat(ctx, module->name, "serialize");

      n_module_names++;
      module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
      module_names[n_module_names - 1] = module_name;
    }
  }
  for (int i = 0; i < n_module_names; i++)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "int ");
    p = isl_printer_print_str(p, module_names[i]);
    p = isl_printer_print_str(p, "_cnt = 0;");
    p = isl_printer_end_line(p);
  }

  p = print_str_new_line(p, "p = isl_printer_print_str(p, \"  tapa::task()\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  /* Print module calls. */
  for (int i = 0; i < top->n_module_calls; i++)
  {
    /* Print AST */
    print_options = isl_ast_print_options_alloc(ctx);
    print_options = isl_ast_print_options_set_print_user(print_options,
                                                         &print_top_module_call_stmt, &hw_data);

    p = isl_ast_node_print(top->module_call_wrapped_trees[i],
                           p, print_options);
  }

  p = print_str_new_line(p, "p = isl_printer_print_str(p, \"  ;\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  /* module:module_name:module_cnt. */
  for (int i = 0; i < n_module_names; i++)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "fprintf(fd, \"module:");
    p = isl_printer_print_str(p, module_names[i]);
    p = isl_printer_print_str(p, ":\%d\\n\", ");
    p = isl_printer_print_str(p, module_names[i]);
    p = isl_printer_print_str(p, "_cnt);");
    p = isl_printer_end_line(p);
  }
  p = isl_printer_end_line(p);

  for (int i = 0; i < n_module_names; i++)
  {
    free(module_names[i]);
  }
  free(module_names);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_indent(p, -2);");
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = print_str_new_line(p, "p = isl_printer_print_str(p, \"}\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "fclose(fd);");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "isl_printer_free(p);");
  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "isl_ctx_free(ctx);");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, -2);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "}");
  p = isl_printer_end_line(p);
  p = isl_printer_end_line(p);

  /* For internal testing only. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "int main()");
  p = isl_printer_end_line(p);

  p = ppcg_start_block(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "FILE *f = fopen(\"");
  p = isl_printer_print_str(p, hls->output_dir);
  p = isl_printer_print_str(p, "/src/top.cpp\", \"w\");");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "top_generate(f);");
  p = isl_printer_end_line(p);

  p = ppcg_end_block(p);
  p = isl_printer_free(p);

  return;
}

/* Given an autosa_prog "prog" and the corresponding transformed AST
 * "tree", print the host code to "p", the type definitions and hardware
 * module definitions to the kernel file, and the top module generator to
 * its own file.
 * "types" collects the types for which a definition has already been
 * printed.
 * "user" points to the hls_info describing the output files.
 */
static __isl_give isl_printer *print_hw(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, __isl_keep isl_ast_node *tree,
    struct autosa_hw_module **modules, int n_modules,
    struct autosa_hw_top_module *top_module,
    struct autosa_drain_merge_func **drain_merge_funcs, int n_drain_merge_funcs,
    struct autosa_types *types, void *user)
{
  struct hls_info *hls = (struct hls_info *)user;
  isl_ctx *ctx = isl_printer_get_ctx(p);
  isl_printer *p_types;

  /* Type definitions go to the kernel file first. */
  p_types = isl_printer_to_file(ctx, hls->kernel_c);
  p_types = isl_printer_set_output_format(p_types, ISL_FORMAT_C);
  p_types = autosa_print_types(p_types, types, prog);
  isl_printer_free(p_types);

  /* Print the host code and the hardware module definitions. */
  p = autosa_print_host_code(p, prog, tree, modules, n_modules, top_module,
                             drain_merge_funcs, n_drain_merge_funcs, hls);

  /* Print the separate top module code generation function. */
  print_top_gen_host_code(prog, tree, top_module, hls);

  return p;
}

/* Generate systolic arrays for TAPA C++.
 *
 * The output files are opened from "input" and options->autosa->output_dir,
 * generate_sa is run with print_hw as the printing callback, and the files
 * are closed again. The return value of generate_sa is passed through.
 */
int generate_autosa_tapa_cpp(isl_ctx *ctx, struct ppcg_options *options,
                                 const char *input)
{
  struct hls_info hls;
  int status;

  hls.ctx = ctx;
  hls.target = TAPA_HW;
  hls.hls = false;
  hls.hcl = false;
  hls.output_dir = options->autosa->output_dir;

  hls_open_files(&hls, input);
  status = generate_sa(ctx, input, hls.host_c, options, &print_hw, &hls);
  hls_close_files(&hls);

  return status;
}


================================================
FILE: src/autosa_tapa_cpp.h
================================================
#ifndef _AUTOSA_TAPA_CPP_H
#define _AUTOSA_TAPA_CPP_H

#include <pet.h>
#include "ppcg_options.h"
#include "ppcg.h"

#ifdef __cplusplus
extern "C"
{
#endif

/* Generate systolic arrays as TAPA C++ for the program in "input",
 * using the isl context "ctx" and the settings in "options".
 * Returns the status reported by the underlying code generation.
 */
int generate_autosa_tapa_cpp(isl_ctx *ctx, struct ppcg_options *options,
        const char *input);

#ifdef __cplusplus
}
#endif

#endif


================================================
FILE: src/autosa_trans.cpp
================================================
#include <string>
#include <exception>
//#include <chrono>
//using namespace std::chrono;

#include "autosa_trans.h"
#include "autosa_utils.h"
#include "autosa_schedule_tree.h"
#include "autosa_comm.h"
#include "autosa_codegen.h"
#include "autosa_print.h"
#include "cpu.h"

/* Check whether "schedule" can be transformed into a systolic array.
 *
 * Two conditions are tested:
 * - the first child of the schedule root is a band node
 *   (the single outermost permutable band),
 * - uniform_dep_check() accepts the dependences of "scop".
 *
 * Returns isl_bool_true when both hold; otherwise a std::runtime_error
 * describing the violated condition is thrown.
 */
isl_bool sa_legality_check(__isl_keep isl_schedule *schedule, struct ppcg_scop *scop)
{
    /* Inspect the node directly below the root. */
    isl_schedule_node *first_child =
        isl_schedule_node_child(isl_schedule_get_root(schedule), 0);
    const bool starts_with_band =
        (isl_schedule_node_get_type(first_child) == isl_schedule_node_band);
    isl_schedule_node_free(first_child);

    if (!starts_with_band)
        throw std::runtime_error("[AutoSA] Error: Single outermost permutable band not found.");

    /* All flow and rar dependences have to be uniform. */
    isl_bool deps_are_uniform = uniform_dep_check(schedule, scop);
    if (deps_are_uniform < 1)
        throw std::runtime_error("[AutoSA] Error: Non-uniform dependence detected.");

    return isl_bool_true;
}

/* Load the tuning configuration file "config_file" and parse it as JSON.
 *
 * If the file cannot be opened, an error is printed and the program exits
 * with status 1.  If the file size cannot be determined or the buffer
 * cannot be allocated, NULL is returned.  Otherwise the result of
 * cJSON_Parse() on the file contents is returned (NULL when the contents
 * are not valid JSON).
 */
static cJSON *load_tuning_config(char *config_file)
{
    FILE *f = fopen(config_file, "rb");
    if (!f)
    {
        printf("[AutoSA] Error: Can't open configuration file: %s\n", config_file);
        exit(1);
    }

    char *buffer = NULL;
    cJSON *config = NULL;
    long length = -1;

    /* Determine the file size; ftell() reports -1 on failure, which must
     * never be used as a buffer size or index.
     */
    if (fseek(f, 0, SEEK_END) == 0)
        length = ftell(f);
    if (length >= 0 && fseek(f, 0, SEEK_SET) == 0)
        buffer = (char *)malloc((size_t)length + 1);

    if (buffer)
    {
        /* Terminate after the bytes that were actually read, so that a
         * short read never exposes uninitialized memory to the parser.
         */
        size_t n_read = fread(buffer, 1, (size_t)length, f);
        buffer[n_read] = '\0';
    }
    fclose(f);

    if (buffer)
    {
        config = cJSON_Parse(buffer);
        free(buffer);
    }

    return config;
}

/* Generate asyncrhonized systolic arrays with the given dimension.
 * For sync arrays, time loops are placed inside the space loops.
 * We will first select space loop candidates from the outermost loop band 
 * which carry dependences with distance less than or equal to 1. 
 * Then we will enumerate different space loop combinations by picking up "dim" 
 * space loops from the candidate pool.
 */
struct autosa_kernel **sa_space_time_transform_at_dim_async(
    __isl_keep isl_schedule *schedule, struct ppcg_scop *scop,
    isl_size dim, isl_size *num_sa, isl_size num_sa_offset)
{
    struct autosa_kernel **sas = NULL;

    /* Select space loop candidates.
     * Space loops carry dependences with distance less or equal to 1.
     */
    isl_schedule_node *band = get_outermost_permutable_node(schedule);
    isl_size band_w = isl_schedule_node_band_n_member(band);
    isl_size *is_space_loop = (isl_size *)malloc(band_w * sizeof(isl_size));
    isl_union_map *dep_flow = scop->dep_flow;
    isl_union_map *dep_rar = scop->dep_rar;
    isl_union_map *dep_total = isl_union_map_union(isl_union_map_copy(dep_flow),
                                                   isl_union_map_copy(dep_rar));
    isl_basic_map_list *deps = isl_union_map_get_basic_map_list(dep_total);
    isl_size ndeps = isl_union_map_n_basic_map(dep_total);

    for (int h = 0; h < band_w; h++)
    {
        int n;
        for (n = 0; n < ndeps; n++)
        {
            isl_basic_map *dep = isl_basic_map_list_get_basic_map(deps, n);
            isl_vec *dep_dis = get_dep_dis_at_node(dep, band);
            isl_val *val = isl_vec_get_element_val(dep_dis, h);
            if (!(isl_val_is_one(val) || isl_val_is_zero(val)))
            {
                isl_vec_free(dep_dis);
                isl_val_free(val);
                isl_basic_map_free(dep);
                break;
            }

            isl_val_free(val);
            isl_vec_free(dep_dis);
            isl_basic_map_free(dep);
        }
        is_space_loop[h] = (n == ndeps);
    }

    /* Perform loop permutation to generate all candidates. */
    if (dim == 1)
    {
        for (int i = 0; i < band_w; i++)
        {
            if (is_space_loop[i])
            {                  
                TuningProgram *tuning_program = new TuningProgram;      
                tuning_program->id = *num_sa + num_sa_offset;
                tuning_program->load_param_names(scop->options->autosa->param_names);
                isl_schedule *new_schedule = isl_schedule_dup(schedule);
                new_schedule = tuning_program->init_from_schedule(new_schedule);
                isl_schedule_node *band = get_outermost_permutable_node(new_schedule);
                isl_schedule_free(new_schedule);

                /* Make the loop i the outermost loop. */
                for (int d = i; d > 0; d--)
                {                    
                    band = loop_interchange_at_node(band, d, d - 1);
                }
                new_schedule = isl_schedule_node_get_schedule(band);
                isl_schedule_node_free(band);

                /* Update the hyperplane types. */
                struct autosa_kernel *sa = autosa_kernel_from_schedule(new_schedule);
                sa->scop = scop;
                sa->type = AUTOSA_SA_TYPE_ASYNC;

                /* Update the array dimension. */
                sa->n_sa_dim = dim;
                sa->array_part_w = 0;
                sa->space_w = dim;
                // TODO: incorrect, to fix.
                sa->time_w = band_w - dim;
                sa->tuning_program = tuning_program;                                

                /* Add the new variant into the list. */
                sas = (struct autosa_kernel **)realloc(sas, (*num_sa + 1) *
                                                                sizeof(struct autosa_kernel *));
                sas[*num_sa] = sa;
                *num_sa = *num_sa + 1;
            }
        }
    }
    else if (dim == 2)
    {
        for (int i = 0; i < band_w; i++)
        {
            if (is_space_loop[i])
            {
                for (int j = i + 1; j < band_w; j++)
                {
                    if (is_space_loop[j])
                    {
                        TuningProgram *tuning_program = new TuningProgram;                        
                        tuning_program->id = *num_sa + num_sa_offset;
                        tuning_program->load_param_names(scop->options->autosa->param_names);                        
                        isl_schedule *new_schedule = isl_schedule_dup(schedule);
                        new_schedule = tuning_program->init_from_schedule(new_schedule);
                        isl_schedule_node *band = get_outermost_permutable_node(new_schedule);                        
                        isl_schedule_free(new_schedule);

                        /* Make the loop i, j the outermost loops. */
                        for (int d = j; d > 0; d--)
                        {                            
                            band = loop_interchange_at_node(band, d, d - 1);
                        }
                        for (int d = i + 1; d > 0; d--)
                        {                         
                            band = loop_interchange_at_node(band, d, d - 1);
                        }
                        new_schedule = isl_schedule_node_get_schedule(band);
                        isl_schedule_node_free(band);

                        /* Update the hyperplane types. */
                        struct autosa_kernel *sa = autosa_kernel_from_schedule(new_schedule);
                        sa->scop = scop;
                        sa->type = AUTOSA_SA_TYPE_ASYNC;

                        /* Update the array dimension. */
                        sa->n_sa_dim = dim;
                        sa->array_part_w = 0;
                        sa->space_w = dim;
                        // TODO: incorrect, to fix.
                        sa->time_w = band_w - dim;
                        sa->tuning_program = tuning_program;

                        /* Add the new variant into the list. */
                        sas = (struct autosa_kernel **)realloc(sas, (*num_sa + 1) *
                                                                        sizeof(struct autosa_kernel *));
                        sas[*num_sa] = sa;
                        *num_sa = *num_sa + 1;
                    }
                }
            }
        }
    }
    else if (dim == 3)
    {
        for (int i = 0; i < band_w; i++)
        {
            if (is_space_loop[i])
            {
                for (int j = i + 1; j < band_w; j++)
                {
                    if (is_space_loop[j])
                    {
                        for (int k = j + 1; k < band_w; k++)
                        {
                            if (is_space_loop[k])
                            {
                                TuningProgram *tuning_program = new TuningProgram;                                
                                tuning_program->id = *num_sa + num_sa_offset;
                                tuning_program->load_param_names(scop->options->autosa->param_names);                                
                                isl_schedule *new_schedule = isl_schedule_dup(schedule);
                                new_schedule = tuning_program->init_from_schedule(new_schedule);
                                isl_schedule_node *band = get_outermost_permutable_node(new_schedule);
                                isl_schedule_free(new_schedule);

                                /* Make the loop i, j, k the outermost loops. */
                                for (int d = k; d > 0; d--)
                                {                                    
                                    band = loop_interchange_at_node(band, d, d - 1);
                                }
                                for (int d = j + 1; d > 0; d--)
                                {                                    
                                    band = loop_interchange_at_node(band, d, d - 1);
                                }
                                for (int d = i + 2; d > 0; d--)
                                {                                 
                                    band = loop_interchange_at_node(band, d, d - 1);
                                }
                                new_schedule = isl_schedule_node_get_schedule(band);
                                isl_schedule_node_free(band);

                                /* Update the hyperplane types. */
                                struct autosa_kernel *sa = autosa_kernel_from_schedule(new_schedule);
                                sa->scop = scop;
                                sa->type = AUTOSA_SA_TYPE_ASYNC;

                                /* Update the array dimension. */
                                sa->n_sa_dim = dim;
                                sa->array_part_w = 0;
                                sa->space_w = dim;
                                // TODO: incorrect, to fix.
                                sa->time_w = band_w - dim;
                                sa->tuning_program = tuning_program;

                                /* Add the new variant into the list. */
                                sas = (struct autosa_kernel **)realloc(sas, (*num_sa + 1) *
                                                                                sizeof(struct autosa_kernel *));
                                sas[*num_sa] = sa;
                                *num_sa = *num_sa + 1;
                            }
                        }
                    }
                }
            }
        }
    }

    isl_basic_map_list_free(deps);
    isl_union_map_free(dep_total);
    isl_schedule_node_free(band);
    free(is_space_loop);

    return sas;
}

/* Generate syncrhonized systolic arrays with the given dimension.
 * For sync arrays, time loops are placed outside the space loops.
 * We will first select space loop candidates from the innermost loop band 
 * which carry dependences with distance less than or equal to 1. 
 * Then we will enumerate different space loop combinations by picking up "dim" 
 * space loops from the candidate pool.
 */
struct autosa_kernel **sa_space_time_transform_at_dim_sync(
    __isl_keep isl_schedule *schedule, struct ppcg_scop *scop,
    isl_size dim, isl_size *num_sa)
{
    struct autosa_kernel **sas = NULL;

    /* Select space loop candidates.
   * Space loops carry dependences with distance less or equal to 1.
   */
    isl_schedule_node *band = get_innermost_permutable_node(schedule);
    isl_size band_w = isl_schedule_node_band_n_member(band);
    isl_size *is_space_loop = (isl_size *)malloc(band_w * sizeof(isl_size));
    isl_union_map *dep_flow = scop->dep_flow;
    isl_union_map *dep_rar = scop->dep_rar;
    isl_union_map *dep_total = isl_union_map_union(isl_union_map_copy(dep_flow),
                                                   isl_union_map_copy(dep_rar));
    isl_basic_map_list *deps = isl_union_map_get_basic_map_list(dep_total);
    isl_size ndeps = isl_union_map_n_basic_map(dep_total);

    for (int h = 0; h < band_w; h++)
    {
        int n;
        for (n = 0; n < ndeps; n++)
        {
            isl_basic_map *dep = isl_basic_map_list_get_basic_map(deps, n);
            isl_vec *dep_dis = get_dep_dis_at_node(dep, band);
            isl_val *val = isl_vec_get_element_val(dep_dis, h);
            if (!(isl_val_is_one(val) || isl_val_is_zero(val)))
            {
                isl_vec_free(dep_dis);
                isl_val_free(val);
                isl_basic_map_free(dep);
                break;
            }

            isl_val_free(val);
            isl_vec_free(dep_dis);
            isl_basic_map_free(dep);
        }
        is_space_loop[h] = (n == ndeps);
    }

    /* Perform loop permutation to generate all candidates. */
    if (dim == 1)
    {
        for (int i = 0; i < band_w; i++)
        {
            if (is_space_loop[i])
            {
                isl_schedule *new_schedule = isl_schedule_dup(schedule);
                isl_schedule_node *band = get_innermost_permutable_node(new_schedule);
                isl_schedule_free(new_schedule);

                /* Make the loop i the innermost loop. */
                for (int d = i; d < band_w - 1; d++)
                {
                    //isl_schedule_node *band = get_innermost_permutable_node(new_schedule);
                    //isl_schedule_free(new_schedule);
                    //new_schedule = loop_interchange_at_node(band, d, d + 1);
                    band = loop_interchange_at_node(band, d, d + 1);
                }
                new_schedule = isl_schedule_node_get_schedule(band);
                isl_schedule_node_free(band);

                /* Update the hyperplane types. */
                struct autosa_kernel *sa = autosa_kernel_from_schedule(new_schedule);
                sa->scop = scop;
                sa->type = AUTOSA_SA_TYPE_SYNC;

                /* Update the array dimension. */
                sa->n_sa_dim = dim;
                sa->array_part_w = 0;
                sa->space_w = dim;
                // TODO: this is incorrect, we need to consider other loop bands.
                sa->time_w = band_w - dim;

                /* Add the new variant into the list. */
                sas = (struct autosa_kernel **)realloc(sas, (*num_sa + 1) *
                                                                sizeof(struct autosa_kernel *));
                sas[*num_sa] = sa;
                *num_sa = *num_sa + 1;
            }
        }
    }
    else if (dim == 2)
    {
        for (int i = 0; i < band_w; i++)
        {
            if (is_space_loop[i])
            {
                for (int j = i + 1; j < band_w; j++)
                {
                    if (is_space_loop[j])
                    {
                        isl_schedule *new_schedule = isl_schedule_dup(schedule);
                        isl_schedule_node *band = get_innermost_permutable_node(new_schedule);
                        isl_schedule_free(new_schedule);

                        /* Make the loop i, j the innermost loops. */
                        for (int d = i; d < band_w - 1; d++)
                        {
                            //isl_schedule_node *band = get_innermost_permutable_node(new_schedule);
                            //isl_schedule_free(new_schedule);
                            //new_schedule = loop_interchange_at_node(band, d, d + 1);
                            band = loop_interchange_at_node(band, d, d + 1);
                        }
                        for (int d = j - 1; d < band_w - 1; d++)
                        {
                            //isl_schedule_node *band = get_innermost_permutable_node(new_schedule);
                            //isl_schedule_free(new_schedule);
                            //new_schedule = loop_interchange_at_node(band, d, d + 1);
                            band = loop_interchange_at_node(band, d, d + 1);
                        }
                        new_schedule = isl_schedule_node_get_schedule(band);
                        isl_schedule_node_free(band);

                        /* Update the hyperplane types. */
                        struct autosa_kernel *sa = autosa_kernel_from_schedule(new_schedule);
                        sa->scop = scop;
                        sa->type = AUTOSA_SA_TYPE_SYNC;

                        /* Update the array dimension. */
                        sa->n_sa_dim = dim;
                        sa->array_part_w = 0;
                        sa->space_w = dim;
                        // TODO: incorrect, to fix.
                        sa->time_w = band_w - dim;

                        /* Add the new variant into the list. */
                        sas = (struct autosa_kernel **)realloc(sas, (*num_sa + 1) *
                                                                        sizeof(struct autosa_kernel *));
                        sas[*num_sa] = sa;
                        *num_sa = *num_sa + 1;
                    }
                }
            }
        }
    }
    else if (dim == 3)
    {
        for (int i = 0; i < band_w; i++)
        {
            if (is_space_loop[i])
            {
                for (int j = i + 1; j < band_w; j++)
                {
                    if (is_space_loop[j])
                    {
                        for (int k = j + 1; k < band_w; k++)
                        {
                            if (is_space_loop[k])
                            {
                                isl_schedule *new_schedule = isl_schedule_dup(schedule);
                                isl_schedule_node *band = get_innermost_permutable_node(new_schedule);
                                isl_schedule_free(new_schedule);

                                /* Make the loop i, j, k the innermost loops. */
                                for (int d = i; d < band_w - 1; d++)
                                {
                                    //isl_schedule_node *band = get_innermost_permutable_node(new_schedule);
                                    //isl_schedule_free(new_schedule);
                                    //new_schedule = loop_interchange_at_node(band, d, d + 1);
                                    band = loop_interchange_at_node(band, d, d + 1);
                                }
                                for (int d = j - 1; d < band_w - 1; d++)
                                {
                                    //isl_schedule_node *band = get_innermost_permutable_node(new_schedule);
                                    //isl_schedule_free(new_schedule);
                                    //new_schedule = loop_interchange_at_node(band, d, d + 1);
                                    band = loop_interchange_at_node(band, d, d + 1);
                                }
                                for (int d = k - 2; d < band_w - 1; d++)
                                {
                                    //isl_schedule_node *band = get_innermost_permutable_node(new_schedule);
                                    //isl_schedule_free(new_schedule);
                                    //new_schedule = loop_interchange_at_node(band, d, d + 1);
                                    band = loop_interchange_at_node(band, d, d + 1);
                                }
                                new_schedule = isl_schedule_node_get_schedule(band);
                                isl_schedule_node_free(band);

                                /* Update the hyperplane types. */
                                struct autosa_kernel *sa = autosa_kernel_from_schedule(new_schedule);
                                sa->scop = scop;
                                sa->type = AUTOSA_SA_TYPE_SYNC;

                                /* Update the array dimension. */
                                sa->n_sa_dim = dim;
                                sa->array_part_w = 0;
                                sa->space_w = dim;
                                sa->time_w = band_w - dim;

                                /* Add the new variant into the list. */
                                sas = (struct autosa_kernel **)realloc(sas, (*num_sa + 1) *
                                                                                sizeof(struct autosa_kernel *));
                                sas[*num_sa] = sa;
                                *num_sa = *num_sa + 1;
                            }
                        }
                    }
                }
            }
        }
    }

    isl_basic_map_list_free(deps);
    isl_union_map_free(dep_total);
    isl_schedule_node_free(band);
    free(is_space_loop);

    return sas;
}

/* Generate the systolic array candidates with "dim" space dimensions.
 * The systolic array type configured in the options of "scop" selects
 * between the synchronous and the asynchronous generator; NULL is
 * returned for any other type.
 * "num_sa_offset" is only forwarded to the asynchronous generator.
 */
struct autosa_kernel **sa_space_time_transform_at_dim(
    __isl_keep isl_schedule *schedule, struct ppcg_scop *scop,
    isl_size dim, isl_size *num_sa, isl_size num_sa_offset)
{
    struct autosa_kernel **candidates = NULL;

    if (scop->options->autosa->sa_type == AUTOSA_SA_TYPE_ASYNC)
        candidates = sa_space_time_transform_at_dim_async(
            schedule, scop, dim, num_sa, num_sa_offset);
    else if (scop->options->autosa->sa_type == AUTOSA_SA_TYPE_SYNC)
        candidates = sa_space_time_transform_at_dim_sync(
            schedule, scop, dim, num_sa);

    return candidates;
}

/* Apply space-time transformation to generate different systolic array
 * candidates.
 *
 * 1D, 2D and 3D arrays are explored in this order, as long as the
 * dimension does not exceed the "max_sa_dim" option or the width of the
 * outermost permutable band of "schedule".
 * The candidates of all explored dimensions are collected in one list;
 * "space_time_id" and "id" of each kernel are set to its position in
 * that list.  The number of candidates is stored in "*num_sa".
 * "schedule" is freed.
 * If the band has no members (or its width cannot be determined),
 * "*num_sa" is set to 0 and NULL is returned.
 * A std::runtime_error is thrown if the list cannot be grown.
 */
struct autosa_kernel **sa_space_time_transform(__isl_take isl_schedule *schedule,
                                               struct ppcg_scop *scop, isl_size *num_sa)
{
    struct autosa_kernel **sa_list = NULL;
    isl_size n_sa = 0;
    isl_schedule_node *band = get_outermost_permutable_node(schedule);
    isl_size band_w = isl_schedule_node_band_n_member(band);
    if (band_w <= 0)
    {
        /* Release the band node as well; it is not used past this point. */
        isl_schedule_node_free(band);
        isl_schedule_free(schedule);
        *num_sa = 0;
        return NULL;
    }

    /* Explore 1D, 2D and 3D systolic arrays. */
    for (isl_size dim = 1; dim <= 3; dim++)
    {
        if (scop->options->autosa->max_sa_dim < dim || band_w < dim)
            continue;

        if (scop->options->autosa->verbose)
        {
            printf("[AutoSA] Explore %dD systolic array.\n", dim);
        }
        isl_size n_sa_dim = 0;
        struct autosa_kernel **sa_dim_list = sa_space_time_transform_at_dim(
            schedule, scop, dim, &n_sa_dim, n_sa);
        if (scop->options->autosa->verbose)
        {
            printf("[AutoSA] %d candidates generated.\n", n_sa_dim);
        }

        /* Grow the overall list.  A NULL result is only an error when a
         * non-empty list was requested.
         */
        size_t n_bytes = (n_sa + n_sa_dim) * sizeof(struct autosa_kernel *);
        struct autosa_kernel **grown =
            (struct autosa_kernel **)realloc(sa_list, n_bytes);
        if (!grown && n_bytes > 0)
            throw std::runtime_error("[AutoSA] Error: Out of memory while collecting systolic array candidates.");
        sa_list = grown;

        for (int i = 0; i < n_sa_dim; i++)
        {
            sa_list[n_sa + i] = sa_dim_list[i];
            sa_list[n_sa + i]->space_time_id = n_sa + i;
        }
        free(sa_dim_list);
        n_sa += n_sa_dim;
    }

    isl_schedule_free(schedule);
    isl_schedule_node_free(band);
    *num_sa = n_sa;
    /* Assign the kernel id */
    for (int i = 0; i < n_sa; i++)
    {
        sa_list[i]->id = i;
    }

    return sa_list;
}

/* Callback for isl_schedule_node_map_descendant_bottom_up.
 * For a band node, mark every member as a time loop (autosa_loop_time)
 * and reset its pe_opt property to autosa_loop_default.
 * Nodes of any other type are returned unchanged.
 * "user" is not used.
 */
static __isl_give isl_schedule_node *init_band_node_sa_properties(
    __isl_take isl_schedule_node *node, void *user)
{
    (void)user;

    if (!node)
        return NULL;
    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
        return node;

    /* Initialize the SA properties of each band member. */
    const int n_member = isl_schedule_node_band_n_member(node);
    for (int pos = 0; pos < n_member; ++pos)
    {
        node = isl_schedule_node_band_member_set_space_time(node, pos, autosa_loop_time);
        node = isl_schedule_node_band_member_set_pe_opt(node, pos, autosa_loop_default);
    }

    return node;
}

/* Initialize the space_time and pe_opt properties of every band node in
 * the schedule tree of "sa" by applying init_band_node_sa_properties()
 * bottom-up.  The schedule of "sa" is replaced by the updated schedule.
 * Always returns isl_stat_ok.
 */
isl_stat sa_loop_init(struct autosa_kernel *sa)
{
    isl_schedule_node *tree = isl_schedule_node_map_descendant_bottom_up(
        isl_schedule_get_root(sa->schedule),
        &init_band_node_sa_properties, sa);
    isl_schedule *initialized = isl_schedule_node_get_schedule(tree);
    isl_schedule_node_free(tree);

    /* Swap the old schedule for the initialized one. */
    isl_schedule_free(sa->schedule);
    sa->schedule = initialized;

    return isl_stat_ok;
}

/* Set up the space_time properties.
 * As all the loops are initialized to be time loops in sa_loop_init(),
 * only the space loops are to be set.
 *
 * For a synchronous array, the last "space_w" members of the innermost
 * permutable band are marked as space loops; for an asynchronous array,
 * the first "space_w" members of the outermost permutable band are.
 * The coincidence of each space loop is recorded in sa->space_parallel.
 * The schedule of "sa" is replaced by the updated schedule.
 *
 * Returns isl_stat_ok on success.  If the array type is neither sync nor
 * async, or the sync band has fewer than "space_w" members, "sa" is left
 * untouched and isl_stat_error is returned.
 */
isl_stat sa_space_time_loop_setup(struct autosa_kernel *sa)
{
    isl_schedule_node *node;
    int first_space;

    if (sa->type == AUTOSA_SA_TYPE_SYNC)
    {
        node = get_innermost_permutable_node(sa->schedule);
        first_space = isl_schedule_node_band_n_member(node) - sa->space_w;
        if (first_space < 0)
        {
            /* Band too narrow (or its width could not be determined). */
            isl_schedule_node_free(node);
            return isl_stat_error;
        }
    }
    else if (sa->type == AUTOSA_SA_TYPE_ASYNC)
    {
        node = get_outermost_permutable_node(sa->schedule);
        first_space = 0;
    }
    else
    {
        /* Unknown array type: there is no band to update. */
        return isl_stat_error;
    }

    for (int dim = 0; dim < sa->space_w; dim++)
    {
        int pos = first_space + dim;
        node = isl_schedule_node_band_member_set_space_time(node, pos, autosa_loop_space);
        sa->space_parallel[dim] = isl_schedule_node_band_member_get_coincident(node, pos);
    }

    isl_schedule *schedule = isl_schedule_node_get_schedule(node);
    isl_schedule_node_free(node);
    isl_schedule_free(sa->schedule);
    sa->schedule = schedule;

    return isl_stat_ok;
}

/* Internal struct used for sa_candidates_smart_pick.
 * It is passed as the "user" argument of sa_candidates_smart_pick_update().
 */
struct sa_candidates_smart_pick_update_data
{
    /* Running score; incremented by one for every rewarded dependence. */
    int score;
    /* Kernel whose schedule is examined. */
    struct autosa_kernel *sa;
    /* Type of the dependences being scored (compared against
     * AUTOSA_DEP_RAR and AUTOSA_DEP_RAW). */
    enum autosa_dep_type dep_type;
};

/* Internal struct used for not_carried_at_space.
 * NOTE: the field order matters for positional initializers such as
 * {NULL, dep}.
 */
struct dep_space_test_internal_data
{
    /* Output: dependence distances at the space loops, set by
     * not_carried_at_space() when a band with space loops is visited. */
    isl_vec *dirvec;
    /* Input: the dependence to be tested. */
    isl_basic_map *dep;
};

/* This function tests if the current node contains any space loop.
 * If so, test if the dependence is carried by the space loops, and store
 * the dependence distances at the space loops in data->dirvec.
 * If the dependence is carried at a space loop (positive distance),
 * return false, else return true.
 * Nodes that are not bands, and bands without space loops, yield true.
 *
 * The space loops of a band are taken to be the "n_space_dim" members
 * starting at the first space loop.
 * NOTE(review): data->dirvec is overwritten without freeing a previous
 * value; this presumably relies on at most one band per schedule tree
 * containing space loops - confirm against the callers.
 */
static isl_bool not_carried_at_space(__isl_keep isl_schedule_node *node, void *user)
{
    struct dep_space_test_internal_data *data =
        (struct dep_space_test_internal_data *)user;

    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
        return isl_bool_true;

    /* Examine if there is any space loop in the current loop band. */
    int n_dim = isl_schedule_node_band_n_member(node);
    int n_space_dim = 0;
    int space_dim_start = 0;
    for (int i = 0; i < n_dim; i++)
    {
        if (isl_schedule_node_band_member_get_space_time(node, i) == autosa_loop_space)
        {
            if (n_space_dim == 0)
                space_dim_start = i;
            n_space_dim++;
        }
    }
    if (n_space_dim == 0)
        return isl_bool_true;

    /* Only now build the untagged dependence; it is not needed for nodes
     * that were rejected above.
     */
    isl_basic_map *untagged_dep = isl_basic_map_from_map(
        isl_map_factor_domain(isl_map_from_basic_map(isl_basic_map_copy(data->dep))));
    isl_vec *disvec = get_dep_dis_at_node(untagged_dep, node);
    isl_vec *dirvec = isl_vec_zero(isl_schedule_node_get_ctx(node), n_space_dim);
    int carried = 0;
    for (int i = 0; i < n_space_dim; i++)
    {
        isl_val *val = isl_vec_get_element_val(disvec, space_dim_start + i);
        long dist = isl_val_get_num_si(val);
        dirvec = isl_vec_set_element_si(dirvec, i, dist);
        if (dist > 0)
            carried = 1;
        isl_val_free(val);
    }
    data->dirvec = dirvec;
    isl_vec_free(disvec);
    isl_basic_map_free(untagged_dep);

    return carried ? isl_bool_false : isl_bool_true;
}

/* Update the score for the array.
 * Every basic map of "map" is treated as one dependence.  One credit is
 * added to the score if the dependence is a RAR that is carried by the
 * space loops, or a RAW that is not carried by the space loops.
 * "user" points to a sa_candidates_smart_pick_update_data.
 * Always returns isl_bool_true.
 */
static isl_bool sa_candidates_smart_pick_update(__isl_keep isl_map *map, void *user)
{
    struct sa_candidates_smart_pick_update_data *pick =
        (struct sa_candidates_smart_pick_update_data *)user;
    isl_basic_map_list *pieces = isl_map_get_basic_map_list(map);
    isl_schedule_node *root = isl_schedule_get_root(pick->sa->schedule);

    for (int idx = 0; idx < isl_map_n_basic_map(map); ++idx)
    {
        struct dep_space_test_internal_data probe;
        probe.dirvec = NULL;
        probe.dep = isl_basic_map_list_get_basic_map(pieces, idx);

        /* The dependence is carried at space as soon as one descendant
         * of the root reports it. */
        int space_carried = !isl_schedule_node_every_descendant(
            root, not_carried_at_space, &probe);

        bool earns_credit = space_carried
                                ? (pick->dep_type == AUTOSA_DEP_RAR)
                                : (pick->dep_type == AUTOSA_DEP_RAW);
        if (earns_credit)
            pick->score += 1;

        isl_vec_free(probe.dirvec);
        isl_basic_map_free(probe.dep);
    }

    isl_schedule_node_free(root);
    isl_basic_map_list_free(pieces);

    return isl_bool_true;
}

/* Select one systolic array design based on heuristics. 
 * Heuristic:
 * We favor designs with the following features:
 * - RAR carried by space loops. 
 * - RAW carried by time loops. 
 * We compute a score for each design and select the one with the highest score.
 * The score is computed as :
 * score = 1 * (RAR carried by space || RAW carried by time loop)
 * Namely, for each dependnece, if it is a RAR carried by space or a RAW carried by 
 * time loops, it will contriute one credit to the total score.
 * Besides, between 1D and 2D systolic arrays, we prefer 2D systolic arrays for now.
 */
struct autosa_kernel *sa_candidates_smart_pick(
    struct autosa_kernel **sa_list, __isl_keep isl_size num_sa)
{
    assert(num_sa > 0);
    int max_score = -1;
    struct autosa_kernel *sa_opt;
    int opt_id;
    isl_union_map *dep_rar, *dep_flow;

    for (int i = 0; i < num_sa; i++)
    {
        struct autosa_kernel *sa = sa_list[i];
        struct sa_candidates_smart_pick_update_data data;
        data.score = 0;
        data.sa = sa;
        /* Initialize the autosa_loop_types. */
        sa_loop_init(sa);
        /* Set up the space_time properties. */
        sa_space_time_loop_setup(sa);

        dep_rar = sa->scop->tagged_dep_rar;
        dep_flow = sa->scop->tagged_dep_flow;

        data.dep_type = AUTOSA_DEP_RAR;
        isl_union_map_every_map(dep_rar, &sa_candidates_smart_pick_update, &data);
        data.dep_type = AUTOSA_DEP_RAW;
        isl_union_map_every_map(dep_flow, &sa_candidates_smart_pick_update, &data);
        /* Add one more credit for 2D arrays. */
        if (sa->n_sa_dim == 2)
            data.score += 1;
        if (data.score > max_score)
        {
            opt_id = i;
            max_score = data.score;
        }        
    }

    //sa_opt = autosa_kernel_copy(sa_list[opt_id]);
    sa_opt = sa_list[opt_id];

    for (int i = 0; i < num_sa; i++) {
        if (i == opt_id)
            continue;
        else
            autosa_kernel_free(sa_list[i]);
    }
    free(sa_list);

    return sa_opt;
}

/* Return the systolic array design with index "sa_id" in "sa_list" and
 * free the rest, including the list itself.
 * If "sa_id" does not refer to an element of the list, an error is printed,
 * all designs are freed and NULL is returned.
 */
struct autosa_kernel *sa_candidates_manual_pick(struct autosa_kernel **sa_list,
                                                isl_size num_sa, int sa_id)
{
    struct autosa_kernel *sa_opt = NULL;

    /* Guard against indexing outside of the candidate list. */
    if (sa_id >= 0 && sa_id < num_sa)
        sa_opt = sa_list[sa_id];
    else
        printf("[AutoSA] Error: Systolic array design %d is out of range (%d designs available).\n",
               sa_id, (int)num_sa);

    for (int i = 0; i < num_sa; i++)
    {
        if (i != sa_id)
            autosa_kernel_free(sa_list[i]);
    }
    free(sa_list);

    return sa_opt;
}

/* Create the array of autosa_local_array_info structures "array"
 * inside "kernel". The number of elements in this array is 
 * the same as the number of arrays in "prog".
 * Initialize the "array" field of each local array to point 
 * to the corresponding array in "prog".
 */
static struct autosa_kernel *autosa_kernel_create_local_arrays(
    struct autosa_kernel *kernel, struct autosa_prog *prog)
{
    int i;
    isl_ctx *ctx;

    if (!kernel)
        return NULL;

    ctx = isl_set_get_ctx(prog->context);
    //kernel->array = isl_calloc_array(ctx,
    //                                 struct autosa_local_array_info, prog->n_array);
    /* Initialize local_array_info */
    kernel->array = new autosa_local_array_info[prog->n_array];
    if (!kernel->array)
        return (struct autosa_kernel *)autosa_kernel_free(kernel);
    kernel->n_array = prog->n_array;

    for (i = 0; i < prog->n_array; i++)
    {
        kernel->array[i].array = &prog->array[i];
        prog->array[i].local_array = &kernel->array[i];
        /* Initialize the fields. */
        kernel->array[i].n_io_group_refs = 0;
        kernel->array[i].n_mem_ports = 0;
        kernel->array[i].host_serialize = 0;
        kernel->array[i].serialize_bound = NULL;
        /* Initiaze the sparse information */
        kernel->array[i].is_sparse = 0;
        kernel->array[i].vec_len = 0;
        kernel->array[i].n_nzero = 0;
        kernel->array[i].compress_ratio = 0.0f;
        kernel->array[i].n_meta_data = 0;
        kernel->array[i].eff_compress_ratio = 0.0f;
        kernel->array[i].global = 0;
    }

    return kernel;
}

/* Internal data struct used for sa_io_update.
 * It is handed to data_transfer_update() through
 * isl_union_map_every_map() for one access and one dependence type at a time.
 */
struct data_transfer_opt_data
{
    /* The access that is paired with the dependences; its ref_id is compared
     * against the source and sink of each dependence. */
    struct autosa_stmt_access *access;
    /* The kernel whose schedule is searched for space loops. */
    struct autosa_kernel *kernel;
    /* The type recorded for each dependence that is paired with "access". */
    enum autosa_dep_type dep_type;
    /* Set to isl_bool_true by data_transfer_update() once an I/O info entry
     * has been added for "access"; reset by the caller. */
    isl_bool is_update;
};

/* Pair the access in "data" with the dependence "dep".
 *
 * If neither the source nor the sink of "dep" refers to the access
 * (compared through the reference ids), nothing is done.
 * Otherwise, a new autosa_io_info entry is appended to the access:
 * - io_type is AUTOSA_EXT_IO if the dependence is carried by a space loop
 *   and AUTOSA_INT_IO otherwise,
 * - dep holds a copy of "dep" and the dependence type from "data",
 * - dir holds the dependence vector on the space loops (as collected by
 *   not_carried_at_space) and old_dir a duplicate of it.
 * data->is_update is set to isl_bool_true whenever an entry is added.
 *
 * Return isl_stat_ok on success and isl_stat_error if the new entry
 * could not be allocated, in which case the access is left unchanged.
 */
isl_stat data_transfer_update(__isl_keep isl_basic_map *dep, struct data_transfer_opt_data *data)
{
    struct autosa_stmt_access *access = data->access;
    struct autosa_kernel *kernel = data->kernel;

    /* Test if the access is associated with the current dep. */
    isl_space *space = isl_basic_map_get_space(dep);
    isl_space *src_space = isl_space_unwrap(isl_space_domain(isl_space_copy(space)));
    isl_space *dest_space = isl_space_unwrap(isl_space_range(space));
    isl_id *src_id = isl_space_get_tuple_id(src_space, isl_dim_out);
    isl_id *dest_id = isl_space_get_tuple_id(dest_space, isl_dim_out);
    isl_space_free(src_space);
    isl_space_free(dest_space);

    bool is_related = (src_id == access->ref_id || dest_id == access->ref_id);
    isl_id_free(src_id);
    isl_id_free(dest_id);
    if (!is_related)
        return isl_stat_ok;

    /* Test if the dependence is carried at the space loop. */
    struct dep_space_test_internal_data internal_data = {NULL, dep};
    isl_schedule_node *node = isl_schedule_get_root(kernel->schedule);
    int is_carried_at_space = !isl_schedule_node_every_descendant(
        node, not_carried_at_space, &internal_data);
    isl_schedule_node_free(node);

    /* Allocate the new entry before touching the access, such that the
     * access stays consistent if any of the allocations fails.
     */
    struct autosa_io_info *io_info =
        (struct autosa_io_info *)calloc(1, sizeof(struct autosa_io_info));
    struct autosa_dep *io_dep =
        (struct autosa_dep *)calloc(1, sizeof(struct autosa_dep));
    struct autosa_io_info **io_list = (struct autosa_io_info **)realloc(
        access->io_info, sizeof(struct autosa_io_info *) * (access->n_io_info + 1));
    if (io_list)
        access->io_info = io_list; /* The old pointer may have been released. */
    if (!io_info || !io_dep || !io_list)
    {
        free(io_info);
        free(io_dep);
        isl_vec_free(internal_data.dirvec);
        return isl_stat_error;
    }

    io_dep->isl_dep = isl_basic_map_copy(dep);
    io_dep->type = data->dep_type;

    /* Exterior I/O if carried by a space loop, interior I/O otherwise. */
    io_info->io_type = is_carried_at_space ? AUTOSA_EXT_IO : AUTOSA_INT_IO;
    io_info->dep = io_dep;
    io_info->dir = internal_data.dirvec;
    io_info->old_dir = isl_vec_dup(internal_data.dirvec);

    access->io_info[access->n_io_info] = io_info;
    access->n_io_info++;

    data->is_update = isl_bool_true;

    return isl_stat_ok;
}

/* Callback for isl_union_map_every_map().
 * Split "map" into its basic maps and hand each of them, as a single
 * dependence, to data_transfer_update() together with the
 * data_transfer_opt_data struct passed in "user".
 * Always return isl_bool_true such that all maps are visited.
 */
static isl_bool data_transfer_update_wrap(__isl_keep isl_map *map, void *user)
{
    struct data_transfer_opt_data *opt_data = (struct data_transfer_opt_data *)user;
    isl_basic_map_list *dep_list = isl_map_get_basic_map_list(map);
    int idx = 0;

    while (idx < isl_map_n_basic_map(map))
    {
        isl_basic_map *bmap = isl_basic_map_list_get_basic_map(dep_list, idx);

        data_transfer_update(bmap, opt_data);
        isl_basic_map_free(bmap);
        idx++;
    }

    isl_basic_map_list_free(dep_list);
    return isl_bool_true;
}

/* Extract the communication pairs <access, dep> of the kernel "sa".
 * Every access of every array is paired with the RAR, RAW and WAW
 * dependences it is associated with. For each pair, data_transfer_update()
 * records
 * - the I/O type: exterior I/O or interior I/O,
 * - the I/O direction: the dependence vector on the space loops.
 * The array type is set to AUTOSA_EXT_ARRAY when an access is paired with
 * a RAR dependence and to AUTOSA_INT_ARRAY when it is paired with a RAW
 * dependence; it remains AUTOSA_UNKNOWN_ARRAY otherwise.
 */
static isl_stat sa_io_update(struct autosa_kernel *sa)
{
    /* First pass: reset the I/O info of all accesses and the lane counts. */
    for (int a = 0; a < sa->n_array; a++)
    {
        struct autosa_local_array_info *local = &sa->array[a];

        for (int r = 0; r < sa->array[a].array->n_ref; r++)
        {
            struct autosa_stmt_access *ref = sa->array[a].array->refs[r];

            ref->n_io_info = 0;
            ref->io_info = NULL;
        }
        local->n_lane = 0;
        local->array->n_lane = 0;
    }

    /* Second pass: pair each access with its dependences. */
    for (int a = 0; a < sa->n_array; a++)
    {
        struct autosa_local_array_info *local = &sa->array[a];

        local->array_type = AUTOSA_UNKNOWN_ARRAY;
        for (int r = 0; r < local->array->n_ref; r++)
        {
            struct autosa_stmt_access *ref = local->array->refs[r];
            isl_union_map *rar = sa->scop->tagged_dep_rar;
            isl_union_map *flow = sa->scop->tagged_dep_flow;
            isl_union_map *waw = sa->scop->tagged_dep_waw;
            struct data_transfer_opt_data query =
                {ref, sa, AUTOSA_DEP_UNKNOWN, isl_bool_false};

            /* Read-after-read dependences. */
            query.dep_type = AUTOSA_DEP_RAR;
            isl_union_map_every_map(rar, &data_transfer_update_wrap, &query);
            if (query.is_update == isl_bool_true)
            {
                query.is_update = isl_bool_false;
                local->array_type = AUTOSA_EXT_ARRAY;
            }

            /* Read-after-write (flow) dependences. */
            query.dep_type = AUTOSA_DEP_RAW;
            isl_union_map_every_map(flow, &data_transfer_update_wrap, &query);
            if (query.is_update == isl_bool_true)
            {
                query.is_update = isl_bool_false;
                local->array_type = AUTOSA_INT_ARRAY;
            }

            /* Write-after-write dependences. */
            query.dep_type = AUTOSA_DEP_WAW;
            isl_union_map_every_map(waw, &data_transfer_update_wrap, &query);
        }
    }

    return isl_stat_ok;
}

/* Copy the upper bounds of the first "n_sa_dim" members of the band "node",
 * as computed by extract_band_upper_bounds(), into "sa_dims".
 */
void extract_sa_dims_from_node(__isl_keep isl_schedule_node *node, int *sa_dims, int n_sa_dim)
{
    int *bounds = extract_band_upper_bounds(node);
    int dim = 0;

    while (dim < n_sa_dim)
    {
        sa_dims[dim] = bounds[dim];
        dim++;
    }

    free(bounds);
}

/* Write the tuning information of the array partitioning band "node" with
 * "tile_len" members to "<output_dir>/tuning.json".
 * The information is stored under the key "stage" and contains the upper
 * bounds of the tilable loops. For the first-level stage ("is_L2" false),
 * the number of systolic array dimensions is added. For the second-level
 * stage, the coincidence of each loop is added.
 * Return isl_stat_error if the information could not be written.
 */
static isl_stat array_part_dump_tuning_info(struct autosa_kernel *sa,
                                            __isl_keep isl_schedule_node *node,
                                            int tile_len, const char *stage, bool is_L2)
{
    int *ubs = extract_band_upper_bounds(node);
    if (!ubs)
        return isl_stat_error;

    cJSON *tuning = cJSON_CreateObject();
    cJSON *stage_json = cJSON_CreateObject();
    cJSON_AddItemToObject(tuning, stage, stage_json);

    cJSON *loops_json = cJSON_CreateArray();
    cJSON_AddItemToObject(stage_json, "tilable_loops", loops_json);
    for (int i = 0; i < tile_len; i++)
        cJSON_AddItemToArray(loops_json, cJSON_CreateNumber(ubs[i]));
    free(ubs);

    if (is_L2)
    {
        cJSON *coincident_json = cJSON_CreateArray();
        cJSON_AddItemToObject(stage_json, "coincident", coincident_json);
        for (int i = 0; i < tile_len; i++)
        {
            int coincident = (int)isl_schedule_node_band_member_get_coincident(node, i);
            cJSON_AddItemToArray(coincident_json, cJSON_CreateNumber(coincident));
        }
    }
    else
    {
        /* Add the sa_dim */
        cJSON_AddItemToObject(stage_json, "n_sa_dim", cJSON_CreateNumber(sa->n_sa_dim));
    }

    isl_printer *p_str = isl_printer_to_str(sa->ctx);
    p_str = isl_printer_print_str(p_str, sa->options->autosa->output_dir);
    p_str = isl_printer_print_str(p_str, "/tuning.json");
    char *tuning_path = isl_printer_get_str(p_str);
    isl_printer_free(p_str);

    char *content = cJSON_Print(tuning);
    cJSON_Delete(tuning);

    /* The output directory may be missing or not writable. */
    isl_stat status = isl_stat_error;
    FILE *fp = tuning_path ? fopen(tuning_path, "w") : NULL;
    if (fp && content)
    {
        fprintf(fp, "%s", content);
        status = isl_stat_ok;
    }
    if (fp && fclose(fp) != 0)
        status = isl_stat_error;
    if (status != isl_stat_ok)
        printf("[AutoSA] Error: Cannot write the tuning information to %s.\n",
               tuning_path ? tuning_path : "tuning.json");

    free(content);
    free(tuning_path);
    return status;
}

/* Skip array partitioning: insert an "array" mark above the band "node"
 * and store the resulting schedule in "sa".
 * If "update_sa_dims" is set, the systolic array dimensions are extracted
 * from the band below the mark.
 */
static isl_stat array_part_skip(struct autosa_kernel *sa,
                                __isl_take isl_schedule_node *node, bool update_sa_dims)
{
    isl_id *id = isl_id_alloc(sa->ctx, "array", NULL);
    node = isl_schedule_node_insert_mark(node, id);
    if (update_sa_dims)
    {
        node = isl_schedule_node_child(node, 0);
        extract_sa_dims_from_node(node, sa->sa_dim, sa->n_sa_dim);
    }

    isl_schedule_free(sa->schedule);
    sa->schedule = isl_schedule_node_get_schedule(node);
    isl_schedule_node_free(node);
    return isl_stat_ok;
}

/* Apply array partitioning.
 * Apply loop tiling on the band that contains the space loops.
 * In addition, if L2 array partitioning is enabled, we will tile the tile loops
 * from the previous array partitioning again to generate two-level tiling.
 * TODO: Reorganize the array partitioning loops and place them following the
 * ascending order of the dependence distances.
 *
 * en: enable signal for array partitioning.
 * mode: opt mode for array partitioning ("manual" or anything else for auto).
 * L2_en: enable signal for L2 array partitioning.
 * L2_mode: opt mode for L2 array partitioning. If NULL, "mode" is used.
 *
 * In manual mode, if the tiling factors have not been specified yet,
 * the tuning information is written to "tuning.json" in the output
 * directory and the program exits.
 * Return isl_stat_error if the systolic array type is not supported or
 * if no tiling factors could be obtained.
 */
isl_stat sa_array_partitioning_optimize(struct autosa_kernel *sa,
                                        bool en, char *mode, bool L2_en, char *L2_mode)
{
    int tile_len;
    isl_schedule *schedule;
    int *tile_size = NULL;
    isl_id *id;

    /* Fetch the band that contains the space loops. */
    isl_schedule_node *node;
    if (sa->type == AUTOSA_SA_TYPE_SYNC)
    {
        node = get_innermost_permutable_node(sa->schedule);
    }
    else if (sa->type == AUTOSA_SA_TYPE_ASYNC)
    {
        node = get_outermost_permutable_node(sa->schedule);
    }
    else
    {
        isl_die(sa->ctx, isl_error_invalid,
                "systolic array type not supported", return isl_stat_error);
    }

    if (!en)
    {
        /* Array partitioning is disabled, we will simply add an "array" mark before
         * the space band and return.
         */
        return array_part_skip(sa, node, false);
    }

    printf("[AutoSA] Apply array partitioning.\n");

    /* Mark the loop properties. */
    for (int i = 0; i < isl_schedule_node_band_n_member(node); i++)
    {
        node = isl_schedule_node_band_member_set_pe_opt(node, i, autosa_loop_array_part);
    }
    schedule = isl_schedule_node_get_schedule(node);

    if (sa->scop->options->autosa->verbose)
    {
        /* Display the candidate loops. */
        isl_printer *p = isl_printer_to_file(sa->ctx, stdout);
        p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
        p = isl_printer_print_schedule(p, schedule);
        printf("\n");
        isl_printer_free(p);
    }
    isl_schedule_free(schedule);

    tile_len = isl_schedule_node_band_n_member(node);
    if (sa->scop->options->autosa->tuning_method == 1)
    {
        /* Select one tiling factor in between (1, ub).
         * Avoid 1 as such a tiling factor will eliminate the opt chances for the
         * later stages.
         * Avoid ub as it will generate a loop with a single iteration that will
         * be eliminated.
         */
        tile_size = extract_band_upper_bounds(node);
        for (int i = 0; tile_size && i < tile_len; i++)
        {
            std::vector<int> factors = get_factors(tile_size[i]);
            if (factors.size() < 3)
            {
                printf("[AutoSA] Error: Cannot find legal tiling factors for auto-tuning template!\n");
                exit(1);
            }
            tile_size[i] = factors[factors.size() - 2];
        }
    }
    else if (!strcmp(mode, "manual"))
    {
        /* Manual mode */
        tile_size = read_array_part_tile_sizes(sa, tile_len);
        if (!tile_size)
        {
            /* User hasn't specified the tiling factors for array partitioning yet,
             * we will dump out the number and upper bounds of array_part loops
             * and exit the program. */
            isl_stat dumped = array_part_dump_tuning_info(
                sa, node, tile_len, "array_part", false);
            exit(dumped == isl_stat_ok ? 0 : 1);
        }
    }
    else
    {
        /* Auto mode.
         * Perform the array partitioning following the default policy. */
        tile_size = read_default_array_part_tile_sizes(sa, tile_len);
    }

    /* Tile the band. */
    if (!tile_size)
    {
        isl_schedule_node_free(node);
        return isl_stat_error;
    }
    /* Examine if all tiling factors are -1, in that case, we will skip array
     * partitioning.
     */
    int c;
    for (c = 0; c < tile_len; c++)
    {
        if (tile_size[c] != -1)
            break;
    }
    if (c == tile_len)
    {
        free(tile_size);
        return array_part_skip(sa, node, true);
    }
    /* For now, our codegen doesn't support arrays with size one at any dim.
     * We will examine if array size is one at any dimension, and return if found.
     */
    for (int i = 0; i < sa->n_sa_dim; i++)
    {
        if ((sa->type == AUTOSA_SA_TYPE_SYNC && tile_size[tile_len - sa->n_sa_dim + i] == 1) ||
            (sa->type == AUTOSA_SA_TYPE_ASYNC && tile_size[i] == 1))
        {
            printf("[AutoSA] Tiling factor 1 for array partitioning is not supported. Array partitioning is skipped.\n");
            /* Skip the array partition. */
            free(tile_size);
            return array_part_skip(sa, node, true);
        }
    }

    sa->array_part_w = tile_len;
    node = autosa_tile_band(node, tile_size);
    if (sa->scop->options->autosa->tuning_method == 1)
        node = sa->tuning_program->tile(node, 0, "array_part");

    free(tile_size);
    node = isl_schedule_node_child(node, 0);
    extract_sa_dims_from_node(node, sa->sa_dim, sa->n_sa_dim);
    node = isl_schedule_node_parent(node);

    /* Reorder the array part loops based on the dependence distance. */
    node = reorder_band_by_dep_dis(node);

    /* Add the array marker */
    node = isl_schedule_node_child(node, 0);
    id = isl_id_alloc(sa->ctx, "array", NULL);
    node = isl_schedule_node_insert_mark(node, id);
    node = isl_schedule_node_parent(node);

    /* TODO: Examine if there is any flow dep carried in the array_part band.
     * For this case, we need to implement a credit-based dependence queue to
     * force the possible data dependence between two array partitions
     * (credit rd/wr for the I/O modules, credit fifos, and disabling
     * double buffering). This feature is not implemented yet.
     */

    /* If two-level buffering is enabled, we will need to apply a second-level tiling
     * on the tile band from the previous array partitioning.
     * Namely, after array partitioning, we get two bands:
     * T
     * |
     * P
     * To support two-level buffering, we will tile the band T again:
     * T1
     * |
     * T2
     * |
     * P
     */
    if (sa->options->autosa->two_level_buffer)
    {
        if (L2_en)
        {
            /* The second-level stage follows its own opt mode. */
            const char *level2_mode = L2_mode ? L2_mode : mode;

            /* Tile the band again */
            printf("[AutoSA] Two-level buffering is set. Apply second-level array partitioning.\n");
            tile_len = isl_schedule_node_band_n_member(node);
            if (!strcmp(level2_mode, "manual"))
            {
                tile_size = read_array_part_L2_tile_sizes(sa, tile_len);
                if (!tile_size)
                {
                    /* Dump out the number of and upper bounds of array_part loops and exit the program. */
                    isl_stat dumped = array_part_dump_tuning_info(
                        sa, node, tile_len, "array_part_L2", true);
                    exit(dumped == isl_stat_ok ? 0 : 1);
                }
            }
            else
            {
                /* Perform second-level array partitioning following the default policy:
                 * tile each loop by its upper bound. */
                tile_size = extract_band_upper_bounds(node);
            }

            if (!tile_size)
            {
                isl_schedule_node_free(node);
                return isl_stat_error;
            }
            node = autosa_tile_band(node, tile_size);
            free(tile_size);

            /* Add the second-level array mark */
            node = isl_schedule_node_child(node, 0);
            id = isl_id_alloc(sa->ctx, "array_L2", NULL);
            node = isl_schedule_node_insert_mark(node, id);
            node = isl_schedule_node_parent(node);
        }
        else
        {
            /* Disable the L2 array partitioning */
            sa->options->autosa->two_level_buffer = 0;
        }
    }

    /* Clean up the band pe_opt properties. */
    schedule = isl_schedule_node_get_schedule(node);
    isl_schedule_node_free(node);
    schedule = isl_schedule_map_schedule_node_bottom_up(
        schedule, &clear_pe_opt_prop, NULL);

    isl_schedule_free(sa->schedule);
    sa->schedule = schedule;

    return isl_stat_ok;
}

/* Callback that inserts an "hls_pipeline" mark under the last time loop.
 * Only innermost band nodes (bands without a permutable node below them)
 * are considered. "user" points to the autosa_kernel.
 * - Async array: the mark is inserted below the band if its last member
 *   is a time loop.
 * - Sync array: if the first member of the band is not a time loop, move
 *   up to the closest ancestor band. The band is then split after its last
 *   time loop (unless that loop is already the last member) and the mark
 *   is inserted below it.
 */
static __isl_give isl_schedule_node *add_hls_pipeline(
    __isl_take isl_schedule_node *node, void *user)
{
    struct autosa_kernel *kernel = (struct autosa_kernel *)user;
    isl_ctx *ctx = kernel->ctx;

    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
        return node;

    /* Examine if the node is innermost */
    node = isl_schedule_node_child(node, 0);
    isl_bool is_innermost = isl_schedule_node_every_descendant(
        node, &no_permutable_node, NULL);
    node = isl_schedule_node_parent(node);
    if (!is_innermost)
        return node;

    if (kernel->type == AUTOSA_SA_TYPE_ASYNC)
    {
        int last = isl_schedule_node_band_n_member(node) - 1;

        if (isl_schedule_node_band_member_get_space_time(node, last) != autosa_loop_time)
            return node;

        isl_id *mark = isl_id_alloc(ctx, "hls_pipeline", NULL);
        node = isl_schedule_node_child(node, 0);
        node = isl_schedule_node_insert_mark(node, mark);
        return isl_schedule_node_parent(node);
    }

    if (kernel->type != AUTOSA_SA_TYPE_SYNC)
        return node;

    /* Go to the innermost band with time loops. */
    if (isl_schedule_node_band_member_get_space_time(node, 0) != autosa_loop_time)
    {
        do
        {
            node = isl_schedule_node_parent(node);
        } while (isl_schedule_node_get_type(node) != isl_schedule_node_band &&
                 isl_schedule_node_has_parent(node));
    }
    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
        return node;

    /* Locate the last time loop of the band. */
    int n_member = isl_schedule_node_band_n_member(node);
    int pos = n_member - 1;
    while (pos >= 0 &&
           isl_schedule_node_band_member_get_space_time(node, pos) != autosa_loop_time)
        pos--;
    if (pos < 0)
        return node;

    isl_id *mark = isl_id_alloc(ctx, "hls_pipeline", NULL);
    if (pos != n_member - 1)
        node = isl_schedule_node_band_split(node, pos + 1);
    node = isl_schedule_node_child(node, 0);
    node = isl_schedule_node_insert_mark(node, mark);
    node = isl_schedule_node_parent(node);

    return node;
}

/* Internal struct used for latency_opt_check */
struct latency_opt_check_data
{
    /* The kernel under examination; its type (sync/async) selects which
     * band is inspected. */
    struct autosa_kernel *kernel;
    /* Cleared to 0 by latency_opt_check() when the innermost time loop is
     * coincident. The initial value is set by the caller
     * (presumably 1 - TODO confirm against the caller). */
    int is_required;
};

/* Callback that checks if the innermost time loop is parallel (coincident).
 * If it is, the loop can be used for latency hiding and no further
 * optimization is needed, which is reported by clearing "is_required" in
 * the latency_opt_check_data struct that "user" points to.
 * Only innermost band nodes are considered.
 * - Async array: the last member of the band is examined.
 * - Sync array: if the first member of the band is not a time loop, move
 *   up to the closest ancestor band and examine its last time loop.
 * The schedule itself is not modified.
 */
static __isl_give isl_schedule_node *latency_opt_check(
    __isl_take isl_schedule_node *node, void *user)
{
    struct latency_opt_check_data *check = (struct latency_opt_check_data *)user;
    struct autosa_kernel *kernel = check->kernel;

    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
        return node;

    /* Examine if the node is innermost */
    node = isl_schedule_node_child(node, 0);
    isl_bool is_innermost = isl_schedule_node_every_descendant(
        node, &no_permutable_node, NULL);
    node = isl_schedule_node_parent(node);
    if (!is_innermost)
        return node;

    if (kernel->type == AUTOSA_SA_TYPE_ASYNC)
    {
        int last = isl_schedule_node_band_n_member(node) - 1;

        if (isl_schedule_node_band_member_get_coincident(node, last) &&
            isl_schedule_node_band_member_get_space_time(node, last) == autosa_loop_time)
            check->is_required = 0;
        return node;
    }

    if (kernel->type != AUTOSA_SA_TYPE_SYNC)
        return node;

    /* Go to the innermost band with time loops. */
    if (isl_schedule_node_band_member_get_space_time(node, 0) != autosa_loop_time)
    {
        do
        {
            node = isl_schedule_node_parent(node);
        } while (isl_schedule_node_get_type(node) != isl_schedule_node_band &&
                 isl_schedule_node_has_parent(node));
    }
    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
        return node;

    /* Only the last time loop of the band matters. */
    for (int pos = isl_schedule_node_band_n_member(node) - 1; pos >= 0; pos--)
    {
        if (isl_schedule_node_band_member_get_space_time(node, pos) != autosa_loop_time)
            continue;
        if (isl_schedule_node_band_member_get_coincident(node, pos))
            check->is_required = 0;
        break;
    }

    return node;
}

/* Callback that marks every parallel (coincident) member of a band node
 * as a latency hiding candidate by setting its pe_opt property to
 * autosa_loop_latency. Other node types are returned untouched.
 * "user" points to the autosa_kernel and is currently not used.
 */
static isl_schedule_node *detect_latency_hiding_loop(__isl_take isl_schedule_node *node, void *user)
{
    struct autosa_kernel *sa = (struct autosa_kernel *)user;

    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
        return node;

    int member = 0;
    while (member < isl_schedule_node_band_n_member(node))
    {
        if (isl_schedule_node_band_member_get_coincident(node, member))
            node = isl_schedule_node_band_member_set_pe_opt(
                node, member, autosa_loop_latency);
        member++;
    }

    return node;
}

/* Callback for a schedule-tree traversal.
 * For a band node, test every descendant of its first child with
 * no_permutable_node. If the test holds for all of them (i.e. this is
 * the last band node), a "latency" mark is inserted right above the band.
 * The returned node is the one produced by isl_schedule_node_insert_mark,
 * or the input node when nothing is inserted.
 */
static __isl_give isl_schedule_node *add_latency_mark(
    __isl_take isl_schedule_node *node, void *user)
{
    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
        return node;

    /* Step into the subtree below the band, inspect it, and step back. */
    node = isl_schedule_node_child(node, 0);
    isl_bool is_last_band = isl_schedule_node_every_descendant(
        node, &no_permutable_node, NULL);
    node = isl_schedule_node_parent(node);

    if (!is_last_band)
        return node;

    isl_ctx *ctx = isl_schedule_node_get_ctx(node);
    isl_id *mark_id = isl_id_alloc(ctx, "latency", NULL);
    return isl_schedule_node_insert_mark(node, mark_id);
}

/* Sink the current node (latency hiding loop) as the last time loop.
 * If the array is async, then sink the node to the bottom.
 * If the array is sync, then lift it up and insert it as the last loop
 * in the time band.
 *
 * node: the band holding the latency hiding point loop (taken).
 * sa:   the kernel; sa->type selects the async/sync strategy and
 *       sa->options->autosa->isl_sink selects how the async sink is done.
 *
 * Async: with isl_sink the band is sunk with isl_schedule_node_band_sink and
 * "latency" marks are added through add_latency_mark; otherwise the node is
 * sunk with autosa_node_sink_to_mark(node, "latency").
 *
 * Sync: the parent of "node" is expected to be the band that contains the
 * space loops. That band is split in front of its first space loop (unless
 * the space loop is already the first member), interchanged with its child,
 * and a "latency" mark is inserted above the result. The returned node is two
 * levels below the inserted mark.
 *
 * Throws std::runtime_error if the parent band holds no space loop. The
 * position of the space loop used to be read uninitialized in that case.
 * For any other sa->type the node is returned unchanged.
 */
__isl_give isl_schedule_node *autosa_latency_node_band_sink_time(
    __isl_take isl_schedule_node *node, struct autosa_kernel *sa)
{
    if (sa->type == AUTOSA_SA_TYPE_ASYNC)
    {
        if (sa->options->autosa->isl_sink)
        {
            node = isl_schedule_node_band_sink(node);
            /* Add the "latency" mark. */
            node = isl_schedule_node_map_descendant_bottom_up(
                node, &add_latency_mark, NULL);
        }
        else
        {
            node = autosa_node_sink_to_mark(node, "latency");
        }
    }
    else if (sa->type == AUTOSA_SA_TYPE_SYNC)
    {
        /* Move up to the node that contains the space loop.
         * The current node should be right below the space band.
         */
        node = isl_schedule_node_parent(node);

        /* Find the position of the first space loop. */
        int n_member = isl_schedule_node_band_n_member(node);
        int space_pos = -1;
        for (int i = 0; i < n_member; i++)
        {
            if (isl_schedule_node_band_member_get_space_time(node, i) == autosa_loop_space)
            {
                space_pos = i;
                break;
            }
        }
        if (space_pos < 0)
        {
            /* No space loop above the latency hiding loop: the tree does not
             * have the expected shape, so there is no valid position to use.
             */
            isl_schedule_node_free(node);
            throw std::runtime_error("[AutoSA] Error: Cannot find the space loop above the latency hiding loop!");
        }

        if (space_pos > 0)
        {
            /* Split off the time loops that precede the first space loop. */
            node = isl_schedule_node_band_split(node, space_pos);
            node = isl_schedule_node_child(node, 0);
        }
        /* Interchange the current node with the child node. */
        node = autosa_node_interchange(node);
        /* Insert the "latency" mark. */
        isl_id *id = isl_id_alloc(sa->ctx, "latency", NULL);
        node = isl_schedule_node_insert_mark(node, id);
        node = isl_schedule_node_child(node, 0);
        node = isl_schedule_node_child(node, 0);
    }

    return node;
}

/* Given each node band, tile the candidate loop and permute it innermost in the time
 * loop band.
 * If the tile size is no greater than 1, the candidate loop is skipped.
 * For each point loop, a "latency" mark is added.
 *
 * node: the node visited by the schedule-tree traversal (taken).
 * user: a struct autosa_pe_opt_tile_data. tile_size[n_touched_loop] supplies
 *       the factor of the next candidate, n_touched_loop is advanced for every
 *       candidate met and n_tiled_loop for every candidate actually tiled.
 *
 * Band members are visited from the last to the first when the options
 * reverse_order and isl_sink are both set or both clear, and from the first to
 * the last otherwise. The function returns as soon as one member was tiled;
 * the caller is expected to restart the traversal for the remaining ones.
 *
 * Non-band nodes and bands without members are returned unchanged. The latter
 * guard keeps the member walk from indexing outside the band, which in the
 * forward direction would also never terminate.
 */
static __isl_give isl_schedule_node *autosa_latency_tile_band_loop(
    __isl_take isl_schedule_node *node, void *user)
{
    struct autosa_pe_opt_tile_data *data = (struct autosa_pe_opt_tile_data *)user;
    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
        return node;

    int n = isl_schedule_node_band_n_member(node);
    if (n <= 0)
        return node;

    /* Reverse visit <=> reverse_order and isl_sink agree. */
    int reverse_visit = (!data->sa->options->autosa->reverse_order ==
                         !data->sa->options->autosa->isl_sink);
    int i = reverse_visit ? n - 1 : 0;

    while (1)
    {
        if (isl_schedule_node_band_member_get_pe_opt(node, i) == autosa_loop_latency)
        {
            int loop_tile_size;
            loop_tile_size = data->tile_size[data->n_touched_loop];
            (data->n_touched_loop)++;
            /* If latency hiding is applied on the space loops, we need to update
             * the SA dimensions.
             */
            if (isl_schedule_node_band_member_get_space_time(node, i) == autosa_loop_space)
            {
                /* Figure out the dim position: the number of space loops
                 * in front of member "i". */
                int touched_space_loop = 0;
                for (int j = 0; j < i; j++)
                {
                    if (isl_schedule_node_band_member_get_space_time(node, j) == autosa_loop_space)
                        touched_space_loop++;
                }
                data->sa->sa_dim[touched_space_loop] /= loop_tile_size;
                if (data->sa->sa_dim[touched_space_loop] == 1) {
                    throw std::runtime_error("[AutoSA] Error: Array dimension as 1 is not supported!");
                }
            }

            /* Skip loop tile size as 1 */
            if (loop_tile_size > 1)
            {
                /* Tile the current loop and permute it to be the innermost time loop.
                 * Specifically, tile the loop in the band at "i"th position with the
                 * size "loop_tile_size".
                 * The returned node points at the tile loop. */
                node = autosa_node_band_tile_loop(node, loop_tile_size, i);
                /* Reset the candidate loop in the tile loop the pe_opt property to default. */
                node = isl_schedule_node_band_member_set_pe_opt(node, i, autosa_loop_default);
                /* Reset the point loop space_time property to time loop. */
                node = isl_schedule_node_child(node, 0);
                node = isl_schedule_node_band_member_set_space_time(node, 0, autosa_loop_time);
                /* Reset the point loop pe_opt property to default. */
                node = isl_schedule_node_band_member_set_pe_opt(node, 0, autosa_loop_default);
                if (data->sa->scop->options->autosa->tuning_method == 1) {
                    /* Record the tiling in the tuning program; this is done
                     * on the tile band, so step up and back down again. */
                    node = isl_schedule_node_parent(node);
                    node = data->sa->tuning_program->tile(node, i, 1, "latency", {}, -1);
                    node = isl_schedule_node_child(node, 0);
                }
                /* Move the single loop node to the bottom of the time band. */
                node = autosa_latency_node_band_sink_time(node, data->sa);
                (data->n_tiled_loop)++;
                return node;
            }
            else
            {
                /* Reset the pe_opt property */
                node = isl_schedule_node_band_member_set_pe_opt(node, i, autosa_loop_default);
            }
        }
        if (reverse_visit) {
            if (i == 0)
                break;
            i--;
        } else {
            if (i == n - 1)
                break;
            i++;
        }
    }

    return node;
}

/* Internal struct for count_latency_hiding_loop. */
struct count_latency_hiding_loop_data
{
    /* Number of latency hiding candidate loops found so far. */
    int tile_len;
    /* Heap array with "tile_len" entries, grown with realloc: entry k holds
     * the upper bound extracted for the k-th candidate loop, in visiting order.
     */
    int *ubs;
    /* Kernel whose options (reverse_order, isl_sink) select the order in
     * which the members of each band are visited.
     */
    struct autosa_kernel *kernel;
};

/* Count the number of latency hiding candidate loops.
 * Extract the loop upper bounds of the candidate loops.
 */
//static isl_bool count_latency_hiding_loop(
//    __isl_keep isl_schedule_node *node, void *user)
//{
//    struct count_latency_hiding_loop_data *data =
//        (struct count_latency_hiding_loop_data *)user;
//    isl_schedule_node *node_copy;
//
//    if (isl_schedule_node_get_type(node) == isl_schedule_node_band)
//    {
//        int n = isl_schedule_node_band_n_member(node);
//        for (int i = 0; i < n; i++)
//        {
//            if (isl_schedule_node_band_member_get_pe_opt(node, i) == autosa_loop_latency)
//            {
//                data->tile_len = data->tile_len + 1;
//                /* Extract the loop upper bound */
//                node_copy = isl_schedule_node_copy(node);
//                if (i > 0)
//                {
//                    node_copy = isl_schedule_node_band_split(node_copy, i);
//                    node_copy = isl_schedule_node_child(node_copy, 0);
//                }
//                if (n - i - 1 > 0)
//                {
//                    node_copy = isl_schedule_node_band_split(node_copy, 1);
//                }
//                int *ubs = extract_band_upper_bounds(node_copy);
//                data->ubs = (int *)realloc(data->ubs, sizeof(int) * data->tile_len);
//                data->ubs[data->tile_len - 1] = ubs[0];
//                isl_schedule_node_free(node_copy);
//                free(ubs);
//            }
//        }
//    }
//
//    return isl_bool_true;
//}

/* Count the number of latency hiding candidate loops and extract their
 * loop upper bounds.
 *
 * node: the node visited by the schedule-tree traversal (taken, returned
 *       unchanged).
 * user: a struct count_latency_hiding_loop_data. For every band member whose
 *       pe_opt property is autosa_loop_latency, tile_len is incremented and
 *       the upper bound of that member is appended to ubs.
 *
 * Band members are visited in the same order as in
 * autosa_latency_tile_band_loop: from the last to the first when the options
 * reverse_order and isl_sink are both set or both clear, from the first to
 * the last otherwise, so that ubs[k] matches the k-th loop to be tiled.
 *
 * Non-band nodes and bands without members are skipped. The latter guard
 * keeps the member walk from indexing outside the band, which in the forward
 * direction would also never terminate.
 * The program exits with status 1 if the bound array cannot be grown.
 */
static __isl_give isl_schedule_node *count_latency_hiding_loop(
    __isl_take isl_schedule_node *node, void *user)
{
    struct count_latency_hiding_loop_data *data =
        (struct count_latency_hiding_loop_data *)user;
    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
        return node;

    int n = isl_schedule_node_band_n_member(node);
    if (n <= 0)
        return node;

    /* Reverse visit <=> reverse_order and isl_sink agree. */
    int reverse_visit = (!data->kernel->options->autosa->reverse_order ==
                         !data->kernel->options->autosa->isl_sink);
    int i = reverse_visit ? n - 1 : 0;

    while (1) {
        if (isl_schedule_node_band_member_get_pe_opt(node, i) == autosa_loop_latency) {
            /* Isolate member "i" in a band of its own on a copy of the tree,
             * so that its upper bound is the first extracted one. */
            isl_schedule_node *node_copy = isl_schedule_node_copy(node);
            if (i > 0)
            {
                node_copy = isl_schedule_node_band_split(node_copy, i);
                node_copy = isl_schedule_node_child(node_copy, 0);
            }
            if (n - i - 1 > 0)
            {
                node_copy = isl_schedule_node_band_split(node_copy, 1);
            }
            int *ubs = extract_band_upper_bounds(node_copy);

            /* Grow through a temporary so that a failed realloc is detected
             * instead of being dereferenced. */
            int *grown = (int *)realloc(data->ubs, sizeof(int) * (data->tile_len + 1));
            if (!grown) {
                printf("[AutoSA] Error: Out of memory while collecting latency hiding loops!\n");
                exit(1);
            }
            data->ubs = grown;
            data->ubs[data->tile_len] = ubs[0];
            data->tile_len = data->tile_len + 1;

            isl_schedule_node_free(node_copy);
            free(ubs);
        }
        if (reverse_visit) {
            if (i == 0)
                break;
            i--;
        } else {
            if (i == n - 1)
                break;
            i++;
        }
    }

    return node;
}

/* Perform the latency hiding in either "Manual" or "Auto" mode.
 * We will tile each loop with a tiling factor greater than one, and place
 * the point loop as the innermost time loop.
 * A "latency" mark is placed before this loop.
 * A "hls_pipeline" mark is placed under this loop.
 *
 * node: the subtree to transform (taken).
 * sa:   the kernel; sa->lat_hide_len is multiplied by every tiling factor
 *       other than -1.
 * mode: "manual" reads the factors with read_latency_tile_sizes, anything
 *       else uses read_default_latency_tile_sizes. Ignored when the option
 *       tuning_method is 1, in which case the second smallest entry returned
 *       by get_factors for each loop bound is used.
 *
 * In manual mode without user-supplied factors, the candidate loop bounds are
 * written to "<output_dir>/tuning.json" and the program exits with status 0
 * (status 1 if that file cannot be opened).
 *
 * Returns the transformed node, or NULL (after freeing "node") when no tile
 * sizes are available.
 */
static __isl_give isl_schedule_node *autosa_latency_tile_loop(
    __isl_take isl_schedule_node *node, struct autosa_kernel *sa, char *mode)
{
    int tile_len;
    int *tile_size;
    struct count_latency_hiding_loop_data data;
    data.tile_len = 0;
    data.ubs = NULL;
    data.kernel = sa;

    /* Count the candidate loop number and extract the loop upper bounds. */
    node = isl_schedule_node_map_descendant_bottom_up(node, &count_latency_hiding_loop, &data);
    tile_len = data.tile_len;

    if (sa->scop->options->autosa->tuning_method == 1) {
        /* Select one tiling factor in between (1, ub).
         * Avoid 1 as such a tiling factor will be skipped and the AST loop will
         * be degenerated.
         * Avoid ub as generating space dim with 1 is not supported.
         */
        /* The bound array is reused as the tile size array and is released
         * through "tile_size" at the end of this function. */
        tile_size = data.ubs;
        for (int k = 0; k < tile_len; k++) {
            int size = tile_size[k];
            std::vector<int> factors = get_factors(size);
            if (factors.size() < 3) {
                printf("[AutoSA] Error: Cannot find legal tiling factors for auto-tuning template!\n");
                exit(1);
            }
            tile_size[k] = factors[1];
        }
    } else {
        if (!strcmp(mode, "manual"))
        {
            tile_size = read_latency_tile_sizes(sa, tile_len);
            if (!tile_size)
            {
                /* Dump out the number and upper bounds of latency loops and exit the program. */
                int *ubs = data.ubs;
                FILE *fp;
                char *content;
                cJSON *tuning, *latency_json, *loops_json;
                char *tuning_path;
                isl_printer *p_str;

                tuning = cJSON_CreateObject();
                latency_json = cJSON_CreateObject();
                cJSON_AddItemToObject(tuning, "latency", latency_json);
                loops_json = cJSON_CreateArray();
                cJSON_AddItemToObject(latency_json, "tilable_loops", loops_json);
                for (int k = 0; k < tile_len; k++)
                {
                    cJSON *loop = cJSON_CreateNumber(ubs[k]);
                    cJSON_AddItemToArray(loops_json, loop);
                }
                p_str = isl_printer_to_str(sa->ctx);
                p_str = isl_printer_print_str(p_str, sa->options->autosa->output_dir);
                p_str = isl_printer_print_str(p_str, "/tuning.json");
                tuning_path = isl_printer_get_str(p_str);
                fp = fopen(tuning_path, "w");
                if (!fp)
                {
                    /* Writing through a NULL stream is undefined; report
                     * the failure instead. */
                    printf("[AutoSA] Error: Cannot open %s for writing!\n",
                           tuning_path ? tuning_path : "tuning.json");
                    exit(1);
                }
                content = cJSON_Print(tuning);
                if (content)
                    fprintf(fp, "%s", content);
                fclose(fp);
                cJSON_Delete(tuning);
                isl_printer_free(p_str);
                free(tuning_path);
                exit(0);
            }
        }
        else
        {
            /* Perform the latency hiding following the default policy. */
            tile_size = read_default_latency_tile_sizes(sa, tile_len);
        }
        free(data.ubs);
    }

    if (!tile_size)
    {
        isl_schedule_node_free(node);
        return NULL;
    }

    /* Accumulate the latency hiding length; -1 entries are left out. */
    for (int k = 0; k < tile_len; k++)
    {
        if (tile_size[k] != -1)
            sa->lat_hide_len *= tile_size[k];
    }

    /* Examine if all the tiling factors are 1, in that case, we will
     * skip the tiling and split off the last time dimension to add a
     * hls_pipeline mark. */
    int needs_tiling = 0;
    for (int k = 0; k < tile_len; k++)
    {
        if (tile_size[k] > 1)
        {
            needs_tiling = 1;
            break;
        }
    }

    if (!needs_tiling)
    {
        node = isl_schedule_node_map_descendant_bottom_up(node,
                                                          &add_hls_pipeline, sa);
    }
    else
    {
        /* Tile the candidate loops. Each pass tiles at most one loop per
         * band, so the traversal is repeated until all loops were visited. */
        struct autosa_pe_opt_tile_data tile_data = {0, 0, tile_len, tile_size, sa};
        while (tile_data.n_touched_loop != tile_len)
        {
            int touched_before = tile_data.n_touched_loop;
            node = isl_schedule_node_map_descendant_bottom_up(
                node, &autosa_latency_tile_band_loop, &tile_data);
            if (!node)
                break;
            if (tile_data.n_touched_loop == touched_before)
            {
                /* A pass that visits no candidate leaves the tree unchanged,
                 * so repeating it can never make progress. */
                printf("[AutoSA] Warning: Latency hiding stopped before all candidate loops were visited.\n");
                break;
            }
        }
    }

    free(tile_size);
    return node;
}

/* Latency hiding stage.
 * Every parallel loop (considering only RAW) is a latency hiding candidate.
 * The candidates are tiled and their point loops are permuted to become the
 * innermost time loops.
 *
 * sa:   kernel whose schedule (sa->schedule) is replaced by the result.
 * en:   enable signal for the current stage. When false, only the
 *       hls_pipeline marks are added through add_hls_pipeline.
 * mode: manual/auto, forwarded to autosa_latency_tile_loop.
 *
 * Always returns isl_stat_ok.
 */
isl_stat sa_latency_hiding_optimize(struct autosa_kernel *sa, bool en, char *mode)
{
    isl_schedule *sched = sa->schedule;
    isl_schedule_node *cursor = isl_schedule_get_root(sched);

    if (!en)
    {
        /* Stage disabled: the innermost time loops are still supposed to be
         * pipelined on hardware, so peel off the last time loop and attach
         * an hls_pipeline mark.
         */
        cursor = isl_schedule_node_map_descendant_bottom_up(cursor,
                                                            &add_hls_pipeline, sa);

        isl_schedule_free(sa->schedule);
        sa->schedule = isl_schedule_node_get_schedule(cursor);
        isl_schedule_node_free(cursor);
        return isl_stat_ok;
    }

    printf("[AutoSA] Apply latency hiding.\n");
    sa->lat_hide_len = 1;

    /* Start from the array marker. */
    cursor = autosa_tree_move_down_to_array(cursor, sa->core);

    /* If the innermost time loop is already parallel, latency hiding is
     * not strictly needed; this is only reported, the stage still runs.
     */
    struct latency_opt_check_data check;
    check.kernel = sa;
    check.is_required = 1;
    cursor = isl_schedule_node_map_descendant_bottom_up(cursor,
                                                        &latency_opt_check, &check);
    if (!check.is_required)
    {
        printf("[AutoSA] The innermost time loop is parallel. Latency hiding is optional.\n");
    }

    /* Flag all candidate loops. */
    cursor = isl_schedule_node_map_descendant_bottom_up(
        cursor, &detect_latency_hiding_loop, sa);

    /* Drop the old schedule and show the annotated one when verbose. */
    isl_schedule_free(sched);
    sched = isl_schedule_node_get_schedule(cursor);
    if (sa->scop->options->autosa->verbose)
    {
        isl_printer *printer = isl_printer_to_file(sa->ctx, stdout);
        printer = isl_printer_set_yaml_style(printer, ISL_YAML_STYLE_BLOCK);
        printer = isl_printer_print_schedule(printer, sched);
        printf("\n");
        isl_printer_free(printer);
    }
    isl_schedule_free(sched);

    /* Tile each candidate used for latency hiding, move its point loop to
     * the innermost position of the time band and mark it. */
    cursor = autosa_latency_tile_loop(cursor, sa, mode);

    /* Rebuild the schedule and wipe the pe_opt properties of all bands. */
    sched = isl_schedule_node_get_schedule(cursor);
    isl_schedule_node_free(cursor);
    sched = isl_schedule_map_schedule_node_bottom_up(
        sched, &clear_pe_opt_prop, NULL);

    sa->schedule = sched;

    return isl_stat_ok;
}

/* Internal struct used in SIMD vectorization. */
struct simd_vectorization_data
{
    struct autosa_kernel *kernel;
    float *scores;
    int *legal;
    float best_score;
    int layout_trans;
    int n_loops;
    int loop_cnt;
    char *mode;
    int *ubs;
    int *tile_size;
    char *buffer;
    int buffer_offset;
    int has_space_candidate;
    int n_legal_loops;
};

/* Internal struct used in is_stride_coalesced. */
struct stride_coalesced_data
{
    /* Kernel whose program is searched for the statements under test. */
    struct autosa_kernel *kernel;
    /* Prefix schedule of the leaf currently examined; set and freed by
     * is_stride_coalesced_at_node, only valid during that call.
     */
    isl_union_map *prefix;
    /* Running sum of (1 - layout_trans) over the accepted accesses. */
    float score;
    /* Number of non-scalar accesses accepted as stride-0/1. */
    float num_accs;
    /* Number of accepted accesses that require a layout transformation. */
    float num_layout_trans;
};

/* Callback for isl_union_set_every_set: check that every array reference of
 * the statement with iteration domain "set" is a stride-0 or stride-1 access
 * in at least one array dimension.
 *
 * For each non-scalar access the array dimensions are tried from the last to
 * the first, first for stride-0 and, if none matches, for stride-1. The
 * matching dimension is stored in access->simd_dim, and access->layout_trans
 * is set to 1 unless that dimension is the last one. The counters in "user"
 * (a struct stride_coalesced_data) are updated accordingly.
 *
 * Returns isl_bool_false at the first access without such a dimension,
 * isl_bool_true otherwise.
 */
static isl_bool is_stride_coalesced_stmt(__isl_keep isl_set *set, void *user)
{
    struct stride_coalesced_data *data = (struct stride_coalesced_data *)user;

    /* The tuple id of the domain identifies the statement. */
    isl_space *space = isl_set_get_space(set);
    isl_id *stmt_id = isl_space_get_tuple_id(space, isl_dim_set);
    isl_space_free(space);

    /* Restrict the prefix schedule to this statement. */
    isl_union_set *stmt_domain = isl_union_set_from_set(isl_set_copy(set));
    isl_map *prefix = isl_map_from_union_map(isl_union_map_intersect_domain(
        isl_union_map_copy(data->prefix), stmt_domain));

    struct autosa_stmt *stmt = find_stmt(data->kernel->prog, stmt_id);
    isl_id_free(stmt_id);

    for (struct autosa_stmt_access *ref = stmt->accesses; ref; ref = ref->next)
    {
        /* Scalars carry no stride information. */
        if (ref->n_index == 0)
            continue;

        /* Express the access in terms of the schedule dimensions. */
        isl_map *acc = isl_map_copy(ref->access);
        acc = isl_map_apply_domain(acc, isl_map_copy(prefix));

        isl_bool stride_zero = isl_bool_false;
        isl_bool stride_one = isl_bool_false;

        /* Innermost array dimension first. */
        int dim = ref->n_index - 1;
        while (dim >= 0)
        {
            stride_zero = access_is_stride_zero(acc, dim);
            if (stride_zero)
                break;
            dim--;
        }
        if (!stride_zero)
        {
            dim = ref->n_index - 1;
            while (dim >= 0)
            {
                stride_one = access_is_stride_one(acc, dim);
                if (stride_one)
                    break;
                dim--;
            }
        }

        isl_map_free(acc);

        if (!stride_zero && !stride_one)
        {
            isl_map_free(prefix);
            return isl_bool_false;
        }

        /* A layout transformation is needed whenever the matching
         * dimension is not already the innermost one. */
        ref->layout_trans = (dim == ref->n_index - 1) ? 0 : 1;
        ref->simd_dim = dim;

        data->score = data->score + (1 - ref->layout_trans);
        data->num_accs = data->num_accs + 1;
        data->num_layout_trans = data->num_layout_trans + ref->layout_trans;
    }

    isl_map_free(prefix);
    return isl_bool_true;
}

/* Callback for isl_schedule_node_every_descendant.
 * Only leaf nodes are inspected: every statement instance set in the domain
 * of the leaf is checked with is_stride_coalesced_stmt, using the prefix
 * schedule of the leaf. Any other node type yields isl_bool_true.
 * "user" is a struct stride_coalesced_data; its "prefix" field is set for
 * the duration of the check and freed before returning.
 */
static isl_bool is_stride_coalesced_at_node(__isl_keep isl_schedule_node *node,
                                            void *user)
{
    struct stride_coalesced_data *data = (struct stride_coalesced_data *)user;

    if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf)
        return isl_bool_true;

    isl_union_set *leaf_domain = isl_schedule_node_get_domain(node);
    data->prefix = isl_schedule_node_get_prefix_schedule_union_map(node);

    /* Examine each statement under the loop. */
    isl_bool all_coalesced = isl_union_set_every_set(
        leaf_domain, &is_stride_coalesced_stmt, data);

    isl_union_map_free(data->prefix);
    isl_union_set_free(leaf_domain);

    return all_coalesced;
}

/* Examine if all the array references under "node" are stride-0/stride-1
 * and score the loop:
 * score = Sum_{all_array_references_under_the_loop}{
 *           is_access_stride-0/1 * (1 - is_layout_transformation_required)}
 *         + num_of_accs / num_of_required_layout_transform
 * where the last term is (num_of_accs + 1) when no layout transformation is
 * required, so that such loops are preferred.
 * Each array reference is tried with every array dimension permuted
 * innermost, so that no opportunity is missed; the dimension to be permuted
 * is logged when a layout transformation is required.
 * All references of one array that need a layout transformation must agree
 * on the dimension to permute; otherwise the loop is rejected.
 *
 * On success the layout transformations are printed, *layout_transform is
 * set to 1 if any is required (it is never cleared here), the per-access
 * scratch fields layout_trans/simd_dim are reset to -1, and the score is
 * returned. On failure -1 is returned.
 *
 * NOTE(review): the failure paths leave layout_trans/simd_dim as written by
 * the stride check; confirm that no later consistency check can observe
 * these stale values.
 */
static float is_stride_coalesced(__isl_keep isl_schedule_node *node,
                                 struct autosa_kernel *kernel, int *layout_transform)
{
    struct stride_coalesced_data data;
    data.kernel = kernel;
    data.score = 0;
    data.num_accs = 0;
    data.num_layout_trans = 0;

    isl_bool coalesced = isl_schedule_node_every_descendant(
        node, &is_stride_coalesced_at_node, &data);

    /* We penalize the loop with more layout transformation required. */
    if (data.num_layout_trans == 0)
        data.score += (data.num_accs + 1);
    else
        data.score += (data.num_accs / data.num_layout_trans);

    if (!coalesced)
        return -1;

    /* Make sure all the array references of the same array
     * have the same dimension for layout transformation.
     */
    for (int a = 0; a < kernel->n_array; a++)
    {
        struct autosa_local_array_info *local = &kernel->array[a];
        int agreed_dim = -1;

        for (int r = 0; r < local->array->n_ref; r++)
        {
            struct autosa_stmt_access *ref = local->array->refs[r];

            if (ref->layout_trans != 1)
                continue;
            if (agreed_dim == -1)
                agreed_dim = ref->simd_dim;
            else if (agreed_dim != ref->simd_dim)
                return -1;
        }
    }

    /* Print out the layout transform information and reset the
     * scratch fields of every access that was examined. */
    isl_printer *p = isl_printer_to_file(kernel->ctx, stdout);
    for (int a = 0; a < kernel->n_array; a++)
    {
        struct autosa_local_array_info *local = &kernel->array[a];

        for (int r = 0; r < local->array->n_ref; r++)
        {
            struct autosa_stmt_access *ref = local->array->refs[r];

            if (ref->layout_trans == -1)
                continue;

            if (ref->layout_trans == 1)
            {
                printf("[AutoSA] Array reference ");
                if (ref->read)
                    printf("(R): ");
                else
                    printf("(W): ");
                p = isl_printer_print_map(p, ref->access);
                printf("\n");
                printf("[AutoSA] Layout transform: Permute dim (%d) to the innermost\n", ref->simd_dim);
                *layout_transform = 1;
            }
            ref->layout_trans = -1;
            ref->simd_dim = -1;
        }
    }
    isl_printer_free(p);

    return data.score;
}

/* A loop is identified to be vectorizable if it is:
 * - a parallel or reduction loop
 * - with stride-0/1 access.
 * Only time loops are considered, unless the option simd_touch_space is set.
 * For each candidate loop, we compute the score:
 * score = (2 * is_loop_parallel + 4 * is_loop_reduction)
 *           + Sum_{all_array_references_under_the_loop}{
 *              is_access_stride-0/1 * (1 - is_layout_transformation_required)}
 *           + num_of_accs / num_of_required_layout_transform
 * The heuristics are:
 * - We prefer reduction loop to parallel loop.
 * - We prefer array references without requirements of layout transformation.
 *
 * node: the node visited by the schedule-tree traversal (taken).
 * user: a struct simd_vectorization_data. For every accepted candidate the
 *       score, legality and upper bound are appended to scores/legal/ubs,
 *       and n_loops, n_legal_loops, best_score, layout_trans and
 *       has_space_candidate are updated.
 *
 * Accepted candidates get their pe_opt property set to autosa_loop_simd.
 * Nodes under a "latency" mark and non-band nodes are returned unchanged.
 *
 * In manual mode the user is asked on stdin whether a non-coincident loop is
 * a reduction loop, unless answers were pre-loaded in data->buffer. One line
 * may hold several 'y'/'n' answers, consumed one per loop. A failed read
 * (e.g. end of input) is treated as "not a reduction loop".
 */
static isl_schedule_node *detect_simd_vectorization_loop(
    __isl_take isl_schedule_node *node, void *user)
{
    struct simd_vectorization_data *data = (struct simd_vectorization_data *)user;
    struct autosa_kernel *sa = data->kernel;
    isl_ctx *ctx = isl_schedule_node_get_ctx(node);
    float score;
    isl_schedule_node *cur_node;
    int is_latency;
    int n_member;
    int simd_touch_space;

    /* If the current node is under the latency mark, return
     * as we don't use latency hiding loop as candidates.
     */
    is_latency = is_node_under_latency(node);
    if (is_latency)
        return node;

    simd_touch_space = sa->options->autosa->simd_touch_space;

    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
        return node;

    n_member = isl_schedule_node_band_n_member(node);
    for (int i = 0; i < n_member; i++)
    {
        if (!simd_touch_space && isl_schedule_node_band_member_get_space_time(node, i) != autosa_loop_time) {
            /* We consider only time loops */
            continue;
        }

        /* Two types of loops that we are interested in:
         * - Parallel loop.
         * - Reduction loop in the innermost loop band.
         *   This limit is currently relaxed, we will look at all loop bands
         *   for reduction loops as the current isl dep analysis can't
         *   differentiate reduction dependences and might separate one
         *   permutable loop band into two loop bands.
         */
        int is_parallel = 0;
        int is_reduction = 0;
        int layout_transform = 0;
        float score_i;

        if (!isl_schedule_node_band_member_get_coincident(node, i) && !strcmp(data->mode, "manual"))
        {
            /* At present, we can't analyze reduction loop by AutoSA.
             * We will print each node and follow the user guidance.
             * Besides, reduction loops are only examined in the manual mode.
             * In the auto mode, only parallel loops are examined.
             */
            printf("[AutoSA] Detecting the reduction loop.\n");
            printf("[AutoSA] Band member position: %d\n", i);
            /* If the SIMD info is pre-loaded, we don't ask for user inputs. */
            if (data->buffer == NULL)
            {
                isl_printer *p;
                p = isl_printer_to_file(ctx, stdout);
                p = isl_printer_end_line(p);
                p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
                p = isl_printer_print_schedule_node(p, node);
                isl_printer_free(p);
                printf("[AutoSA] Please input if the current loop is a reduction loop [y/n]: ");

                size_t bufsize = 100;
                char *buffer = (char *)malloc(bufsize * sizeof(char));
                ssize_t characters = getline(&buffer, &bufsize, stdin);
                if (!buffer)
                {
                    printf("[AutoSA] Error: Failed to read the reduction property!\n");
                    exit(1);
                }
                /* Nothing was read: make the buffer an empty answer
                 * instead of leaving indeterminate bytes in it. */
                if (characters < 0)
                    memset(buffer, 0, bufsize);
                /* getline may have moved the buffer, so the pointer is
                 * only recorded after the call. */
                data->buffer = buffer;
                data->buffer_offset = 0;
            }
            printf("[AutoSA] Reduction property: %c\n", data->buffer[data->buffer_offset]);
            is_reduction = (data->buffer[data->buffer_offset] == 'y') ? 1 : 0;
            /* Keep the buffer while another answer follows; drop it
             * otherwise so that the next loop asks again. */
            if (data->buffer[data->buffer_offset + 1] == 'y' ||
                data->buffer[data->buffer_offset + 1] == 'n')
            {
                data->buffer_offset += 1;
            }
            else
            {
                free(data->buffer);
                data->buffer = NULL;
                data->buffer_offset = 0;
            }
        }
        else
        {
            is_parallel = isl_schedule_node_band_member_get_coincident(node, i);
        }

        /* Test if all the array references under the current loop
         * has only stride-0/1 access.
         */
        if (!(is_parallel || is_reduction))
            continue;

        /* Work on a duplicate: isolate member "i" in its own band and
         * sink it innermost, then analyze the accesses below it. */
        cur_node = node;
        node = isl_schedule_node_dup(cur_node);

        if (i > 0)
        {
            node = isl_schedule_node_band_split(node, i);
            node = isl_schedule_node_child(node, 0);
        }
        if (n_member - i - 1 > 0)
        {
            node = isl_schedule_node_band_split(node, 1);
        }

        /* Sink the band innermost. */
        node = isl_schedule_node_band_sink(node);
        score = 2 * is_parallel + 4 * is_reduction;
        printf("[AutoSA] -----------------------------------------------\n");
        printf("[AutoSA] Current band member position: %d\n", i);
        printf("[AutoSA] -----------------------------------------------\n");
        score_i = is_stride_coalesced(node, sa, &layout_transform);
        isl_schedule_node_free(node);
        node = cur_node;
        if (score_i < 0)
        {
            /* The array references are not coalesced. */
            continue;
        }

        score += score_i;
        printf("[AutoSA] -----------------------------------------------\n");
        printf("[AutoSA] The loop is legal to be vectorized with score: %f\n",
               score);
        if (layout_transform)
            printf("[AutoSA] Layout transformation is required to proceed.\n");
        printf("[AutoSA] -----------------------------------------------\n");
        node = isl_schedule_node_band_member_set_pe_opt(node, i, autosa_loop_simd);
        if (isl_schedule_node_band_member_get_space_time(node, i) == autosa_loop_space)
            data->has_space_candidate = 1;

        if (score >= data->best_score)
        {
            data->best_score = score;
            data->layout_trans = layout_transform;
        }
        data->n_loops = data->n_loops + 1;
        data->scores = (float *)realloc(data->scores, sizeof(float) * data->n_loops);
        data->scores[data->n_loops - 1] = score;
        data->legal = (int *)realloc(data->legal, sizeof(int) * data->n_loops);
        data->legal[data->n_loops - 1] = !layout_transform;
        if (!layout_transform)
            data->n_legal_loops++;

        /* Extract the loop upper bounds */
        int *ubs = extract_band_upper_bounds(node);
        data->ubs = (int *)realloc(data->ubs, sizeof(int) * data->n_loops);
        data->ubs[data->n_loops - 1] = ubs[i];
        free(ubs);
    }

    return node;
}

/* Callback for a schedule-tree traversal.
 * For a band node, test every descendant of its first child with
 * no_permutable_node. If the test holds for all of them (i.e. this is
 * the last band node), a "simd" mark is inserted right above the band.
 * The returned node is the one produced by isl_schedule_node_insert_mark,
 * or the input node when nothing is inserted.
 */
static __isl_give isl_schedule_node *add_simd_mark(
    __isl_take isl_schedule_node *node, void *user)
{
    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
        return node;

    /* Step into the subtree below the band, inspect it, and step back. */
    node = isl_schedule_node_child(node, 0);
    isl_bool is_last_band = isl_schedule_node_every_descendant(
        node, &no_permutable_node, NULL);
    node = isl_schedule_node_parent(node);

    if (!is_last_band)
        return node;

    isl_ctx *ctx = isl_schedule_node_get_ctx(node);
    isl_id *mark_id = isl_id_alloc(ctx, "simd", NULL);
    return isl_schedule_node_insert_mark(node, mark_id);
}

/* Callback for isl_union_set_every_set().
 * Find the statement whose tuple id names "set" and update the
 * simd_stride field of each of its array accesses that has at least
 * one index. The access relation is first re-expressed over the prefix
 * schedule stored in "user" (restricted to "set"). The array dimensions
 * are then scanned from the last to the first with access_is_stride_zero():
 * simd_stride becomes 0 as soon as one dimension tests true, and 1 if
 * none does.
 * Always returns isl_bool_true so that every set is visited.
 */
static isl_bool update_simd_acc_stmt(__isl_keep isl_set *set, void *user)
{
    struct stride_coalesced_data *data = (struct stride_coalesced_data *)user;
    isl_space *set_space = isl_set_get_space(set);
    isl_id *stmt_id = isl_space_get_tuple_id(set_space, isl_dim_set);
    struct autosa_stmt *stmt;
    struct autosa_stmt_access *first_access;
    isl_union_map *restricted;
    isl_map *sched;

    isl_space_free(set_space);
    stmt = find_stmt(data->kernel->prog, stmt_id);
    isl_id_free(stmt_id);
    first_access = stmt->accesses;

    /* Prefix schedule limited to the instances of this statement. */
    restricted = isl_union_map_intersect_domain(
        isl_union_map_copy(data->prefix),
        isl_union_set_from_set(isl_set_copy(set)));
    sched = isl_map_from_union_map(restricted);

    for (struct autosa_stmt_access *ref = first_access; ref; ref = ref->next)
    {
        isl_map *sched_to_array;
        int stride = 1;

        /* Accesses without any index are left untouched. */
        if (ref->n_index == 0)
            continue;

        sched_to_array = isl_map_apply_domain(isl_map_copy(ref->access),
                                              isl_map_copy(sched));

        for (int dim = ref->n_index - 1; dim >= 0; dim--)
        {
            if (access_is_stride_zero(sched_to_array, dim))
            {
                stride = 0;
                break;
            }
        }

        isl_map_free(sched_to_array);
        ref->simd_stride = stride;
    }

    isl_map_free(sched);
    return isl_bool_true;
}

/* Callback for isl_schedule_node_every_descendant().
 * Only leaf nodes do any work: the prefix schedule of the leaf is
 * published through data->prefix and update_simd_acc_stmt() is run on
 * every statement set of the leaf's domain.
 * Always returns isl_bool_true so that the traversal continues.
 */
static isl_bool update_simd_acc(__isl_keep isl_schedule_node *node, void *user)
{
    struct stride_coalesced_data *data = (struct stride_coalesced_data *)user;

    if (isl_schedule_node_get_type(node) == isl_schedule_node_leaf)
    {
        isl_union_set *stmts = isl_schedule_node_get_domain(node);

        data->prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
        isl_union_set_every_set(stmts, &update_simd_acc_stmt, data);

        isl_union_set_free(stmts);
        /* The prefix is only valid for the duration of the call above. */
        isl_union_map_free(data->prefix);
    }

    return isl_bool_true;
}

/* Tile the selected SIMD candidate loops of a band node.
 * Every band member tagged autosa_loop_simd consumes one entry of the
 * per-candidate arrays in "user" (scores, tile_size, legal), indexed by
 * data->loop_cnt.
 * A candidate is dropped (its pe_opt property is reset to default) when
 * - in "auto" mode, its score differs from the best score,
 * - in any other mode, its tiling factor is not positive,
 * - its tiling factor is one, or
 * - it requires a layout transformation.
 * Otherwise the loop is tiled, the point loop is turned into a time loop,
 * sunk innermost and marked with "simd", and the stride information of
 * the array accesses below it is refreshed.
 */
static __isl_give isl_schedule_node *autosa_simd_tile_loop(
    __isl_take isl_schedule_node *node, void *user)
{
    struct simd_vectorization_data *data = (struct simd_vectorization_data *)user;
    struct autosa_kernel *kernel = data->kernel;
    struct stride_coalesced_data stride_data;

    stride_data.kernel = kernel;

    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
        return node;

    for (int i = 0; i < isl_schedule_node_band_n_member(node); i++)
    {
        if (isl_schedule_node_band_member_get_pe_opt(node, i) != autosa_loop_simd)
            continue;

        const int cand = data->loop_cnt;
        bool skip;

        if (!strcmp(data->mode, "auto"))
            /* Only the loop with the highest score is tiled. */
            skip = data->scores[cand] != data->best_score;
        else
            /* Only loops with a positive tiling factor are tiled. */
            skip = data->tile_size[cand] <= 0;
        /* A factor of one is a no-op; an illegal loop needs a layout
         * transformation first. */
        skip = skip || data->tile_size[cand] == 1 || data->legal[cand] == 0;

        if (skip)
        {
            node = isl_schedule_node_band_member_set_pe_opt(node, i,
                                                            autosa_loop_default);
            data->loop_cnt++;
            continue;
        }

        const int tile_size = data->tile_size[cand];

        /* Vectorizing a space loop shrinks the matching array dimension. */
        if (isl_schedule_node_band_member_get_space_time(node, i) == autosa_loop_space)
        {
            /* The dimension index is the number of space loops before i. */
            int sa_pos = 0;
            for (int j = 0; j < i; j++)
            {
                if (isl_schedule_node_band_member_get_space_time(node, j) == autosa_loop_space)
                    sa_pos++;
            }
            kernel->sa_dim[sa_pos] /= tile_size;
            if (kernel->sa_dim[sa_pos] == 1)
                throw std::runtime_error("[AutoSA] Error: Array dimension as 1 is not supported!");
        }

        /* Tile the loop and clear the candidate tag on the tile loop. */
        node = autosa_node_band_tile_loop(node, tile_size, i);
        node = isl_schedule_node_band_member_set_pe_opt(node, i, autosa_loop_default);

        /* The point loop becomes a plain time loop. */
        node = isl_schedule_node_child(node, 0);
        node = isl_schedule_node_band_member_set_space_time(node, 0, autosa_loop_time);
        node = isl_schedule_node_band_member_set_pe_opt(node, 0, autosa_loop_default);

        if (kernel->scop->options->autosa->tuning_method == 1)
        {
            /* Register the tiling with the tuning program on the tile band,
             * then return to the point band. */
            node = isl_schedule_node_parent(node);
            node = kernel->tuning_program->tile(node, i, 1, "SIMD", {"power_of_two"}, 32/kernel->array[0].array->size);
            node = isl_schedule_node_child(node, 0);
        }

        if (kernel->options->autosa->isl_sink)
        {
            /* Sink with isl, then add the "simd" mark in a separate pass. */
            node = isl_schedule_node_band_sink(node);
            node = isl_schedule_node_map_descendant_bottom_up(node, &add_simd_mark, NULL);
        }
        else
        {
            /* Sink the point loop innermost and add the "simd" mark. */
            node = autosa_node_sink_to_mark(node, "simd");
        }

        /* Refresh the stride information of the accesses under the SIMD loop. */
        isl_schedule_node_every_descendant(node, &update_simd_acc, &stride_data);

        node = isl_schedule_node_parent(node);
        kernel->simd_w = tile_size;
        data->loop_cnt++;
        printf("[AutoSA] SIMD vectorization successfully applied.\n");
    }

    return node;
}

/* Load the SIMD information for the kernel.
 *
 * The file named by the "simd_info" option is parsed as JSON. The entry
 * "kernel<space_time_id>" is looked up and the first character of every
 * string in its "reduction" list is appended to the result, in order.
 *
 * Returns a NUL-terminated, zero-filled heap string that the caller owns
 * (release with free()), or NULL if no file is configured, memory is
 * exhausted, or the file holds no entry for this kernel.
 * The string may be empty when the "reduction" list is missing or empty.
 * The program is terminated if the file cannot be opened or read, or if
 * a "reduction" entry is not a string.
 */
static __isl_give char *load_simd_info(struct autosa_kernel *sa)
{
    /* Historical minimum size of the returned buffer. It is kept so that
     * readers indexing a little past a short list stay inside the
     * (zero-filled) allocation. */
    const size_t min_info_size = 100;
    const char *path = sa->options->autosa->simd_info;
    cJSON *simd_info, *kernel_entry, *reductions, *reduction;
    char kernel_name[32];
    char *buffer, *info;
    size_t info_size, info_len = 0;
    long length = -1;
    FILE *f;

    if (!path)
        return NULL;

    f = fopen(path, "rb");
    if (!f)
    {
        printf("[AutoSA] Error: Can't open SIMD information file: %s\n",
               path);
        exit(1);
    }

    /* Determine the file size; a negative ftell() result is an error and
     * must not be used as an allocation size. */
    if (fseek(f, 0, SEEK_END) == 0)
        length = ftell(f);
    if (length < 0 || fseek(f, 0, SEEK_SET) != 0)
    {
        fclose(f);
        printf("[AutoSA] Error: Can't read SIMD information file: %s\n",
               path);
        exit(1);
    }

    buffer = (char *)malloc((size_t)length + 1);
    if (buffer)
    {
        /* Terminate after the bytes actually read so that a short read
         * never exposes uninitialized memory to the parser. */
        size_t n_read = fread(buffer, 1, (size_t)length, f);
        buffer[n_read] = '\0';
    }
    fclose(f);
    if (!buffer)
        return NULL;

    simd_info = cJSON_Parse(buffer);
    free(buffer);

    snprintf(kernel_name, sizeof(kernel_name), "kernel%d", sa->space_time_id);
    kernel_entry = cJSON_GetObjectItemCaseSensitive(simd_info, kernel_name);
    if (!kernel_entry)
    {
        cJSON_Delete(simd_info);
        return NULL;
    }

    /* Size the result from the list length instead of assuming that at
     * most 99 entries are present. */
    reductions = cJSON_GetObjectItemCaseSensitive(kernel_entry, "reduction");
    info_size = (size_t)cJSON_GetArraySize(reductions) + 1;
    if (info_size < min_info_size)
        info_size = min_info_size;
    info = (char *)calloc(info_size, sizeof(char));
    if (!info)
    {
        cJSON_Delete(simd_info);
        return NULL;
    }

    cJSON_ArrayForEach(reduction, reductions)
    {
        if (!reduction->valuestring)
        {
            /* Skipping the entry would shift all later flags, so give up. */
            printf("[AutoSA] Error: Invalid reduction entry in SIMD information file: %s\n",
                   path);
            exit(1);
        }
        info[info_len++] = reduction->valuestring[0];
    }
    /* calloc() already provides the terminating NUL. */

    cJSON_Delete(simd_info);
    return info;
}

/* Apply SIMD vectorization. 
 * We go through all the loops, if there is any vectorizable loop 
 * (parallel or reduction loop with stride-0/1 access), such a loop will 
 * be identified as SIMD loop candidates. We will rank the loops by heuristics 
 * and pick up one loop with the highest score to be tiled. 
 * The point loop will be permuated as the innermost loops.
 * At last this loop with be unrolled by HLS tools.
 */
isl_stat sa_simd_vectorization_optimize(struct autosa_kernel *sa, char *mode)
{
    float *scores = NULL;
    int n_loops = 0;
    struct simd_vectorization_data data;
    data.best_score = 0;
    data.mode = mode;
    data.ubs = NULL;
    int *tile_size = NULL;

    printf("[AutoSA] Apply SIMD vectorization.\n");
    isl_schedule *schedule = sa->schedule;
    isl_schedule_node *node = isl_schedule_get_root(schedule);
    sa->simd_w = 1;

    /* Move down to the array marker */
    node = autosa_tree_move_down_to_array(node, sa->core);

    /* Detect all candidate loops */
    data.kernel = sa;
    data.scores = scores;
    data.legal = NULL;
    data.buffer = NULL;
    data.buffer_offset = 0;
    data.n_loops = n_loops;
    data.n_legal_loops = 0;
    data.has_space_candidate = 0;
    /* Load the SIMD information. */
    data.buffer = load_simd_info(sa);
    node = isl_schedule_node_map_descendant_bottom_up(
        node, &detect_simd_vectorization_loop, &data);

    if (data.n_loops == 0)
    {
        printf("[AutoSA] No candidate loops found!\n");
        isl_schedule_node_free(node);
        return isl_stat_ok;
    }

    /* Display the candidate loops. */
    isl_schedule_free(schedule);
    schedule = isl_schedule_node_get_schedule(node);
    if (sa->scop->options->autosa->verbose)
    {
        isl_printer *p = isl_printer_to_file(sa->ctx, stdout);
        p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
        p = isl_printer_print_schedule(p, schedule);
        printf("\n");
        isl_printer_free(p);
    }
    isl_schedule_free(schedule);
    
    if (data.n_legal_loops == 0) {
        printf("[AutoSA] No legal SIMD loop is fonud. SIMD vectorization is skipped.\n");
    }
    else {
        /* Select the candidate loop with the highest score.
         * Tile the candidate loop and permute the point loop innermost. 
         * A SIMD vectorization marker is added. 
         */
        if (sa->scop->options->autosa->tuning_method == 1) {
            /* Select one tiling factor in between (1, ub).
             * Avoid 1 as such a tiling factor will be skipped and the AST loop will
             * be degenerated.
             * Avoid ub as generating space dim with 1 is not supported.
             */
            tile_size = data.ubs;
            for (int i = 0; i < data.n_loops; i++) {
                if (data.scores[i] == data.best_score) {
                    std::vector<int> factors = get_factors(tile_size[i]);
                    if (factors.size() < 3) {
                        printf("[AutoSA] Error: Cannot find legal tiling factors for auto-tuning template!\n");
                        exit(1);
                    }
                    tile_size[i] = factors[1];
                } else {
                    tile_size[i] = 1;
                }
            }            
        } else {
            if (!strcmp(mode, "manual"))
            {
                tile_size = read_simd_tile_sizes(sa, data.n_loops);
                if (!tile_size)
                {
                    /* Dump out the number, score and upper bounds of simd loops 
                     * and exit the program. 
                     */
                    int *ubs = data.ubs;
                    FILE *fp;
                    char *content;
                    cJSON *tuning, *simd_json, *loops_json, *scores_json, *legal_json;
                    isl_printer *p_str;
                    char *tuning_path;

                    tuning = cJSON_CreateObject();
                    simd_json = cJSON_CreateObject();
                    cJSON_AddItemToObject(tuning, "simd", simd_json);
                    loops_json = cJSON_CreateArray();
                    cJSON_AddItemToObject(simd_json, "tilable_loops", loops_json);
                    for (int i = 0; i < data.n_loops; i++)
                    {
                        cJSON *loop = cJSON_CreateNumber(ubs[i]);
                        cJSON_AddItemToArray(loops_json, loop);
                    }
                    scores_json = cJSON_CreateArray();
                    cJSON_AddItemToObject(simd_json, "scores", scores_json);
                    for (int i = 0; i < data.n_loops; i++)
                    {
                        cJSON *loop = cJSON_CreateNumber(data.scores[i]);
                        cJSON_AddItemToArray(scores_json, loop);
                    }
                    legal_json = cJSON_CreateArray();
                    cJSON_AddItemToObject(simd_json, "legal", legal_json);
                    for (int i = 0; i < data.n_loops; i++)
                    {
                        cJSON *loop = cJSON_CreateNumber(data.legal[i]);
                        cJSON_AddItemToArray(legal_json, loop);
                    }
                    if (data.has_space_candidate == 0) {
                        loops_json = cJSON_CreateArray();
                        cJSON_AddItemToObject(simd_json, "sa_dims", loops_json);
                        for (int i = 0; i < sa->n_sa_dim; i++)
                        {
                            cJSON *loop = cJSON_CreateNumber(sa->sa_dim[i]);
                            cJSON_AddItemToArray(loops_json, loop);
                        }
                    }
                    p_str = isl_printer_to_str(sa->ctx);
                    p_str = isl_printer_print_str(p_str, sa->options->autosa->output_dir);
                    p_str = isl_printer_print_str(p_str, "/tuning.json");
                    tuning_path = isl_printer_get_str(p_str);
                    fp = fopen(tuning_path, "w");
                    content = cJSON_Print(tuning);
                    fprintf(fp, "%s", content);
                    cJSON_Delete(tuning);
                    free(tuning_path);
                    isl_printer_free(p_str);
                    exit(0);
                }
            }
            else
            {
                throw std::runtime_error("[AutoSA] Error: Auto SIMD vectorization is not supported.\n");
            }
            free(data.ubs);
        }

        /* Perform the simd vectorization. */
        data.loop_cnt = 0;
        data.tile_size = tile_size;
        node = isl_schedule_node_map_descendant_bottom_up(node,
                                                          &autosa_simd_tile_loop, &data);
    }
    
    free(data.legal);
    free(tile_size);
    /* Clean up the band pe_opt properties. */
    schedule = isl_schedule_node_get_schedule(node);
    isl_schedule_node_free(node);
    schedule = isl_schedule_map_schedule_node_bottom_up(
        schedule, &clear_pe_opt_prop, NULL);
    free(data.scores);
    sa->schedule = schedule;

    /* Update the tuning config, dump out the sa dimensions. */
    if (data.has_space_candidate)
    {
        cJSON *tuning, *loops_json;
        isl_printer *p_str;
        char *tuning_path;
        char *content;
        FILE *fp;

        tuning = cJSON_CreateObject();
        loops_json = cJSON_CreateArray();
        cJSON_AddItemToObject(tuning, "sa_dims", loops_json);
        for (int i = 0; i < sa->n_sa_dim; i++) {
            cJSON *loop = cJSON_CreateNumber(sa->sa_dim[i]);
            cJSON_AddItemToArray(loops_json, loop);
        }
        p_str = isl_printer_to_str(sa->ctx);
        p_str = isl_printer_print_str(p_str, sa->options->autosa->output_dir);
        p_str = isl_printer_print_str(p_str, "/tuning.json");
        tuning_path = isl_printer_get_str(p_str);
        fp = fopen(tuning_path, "w");
        content = cJSON_Print(tuning);
        fprintf(fp, "%s", content);
        free(content);
        cJSON_Delete(tuning);
        free(tuning_path);
        isl_printer_free(p_str);
    }

    /* Check if any of the space dimension is one, which is not supported by the current AutoSA. */
    for (int i = 0; i < sa->n_sa_dim; i++) {
        //std::cout << sa->n_sa_dim << std::endl;
        //std::cout << sa->sa_dim[i] << std::endl;
        if (sa->sa_dim[i] == 1) {            
            throw std::runtime_error("[AutoSA] Error: Array dimension as 1 is not supported!");
        }
    }

    return isl_stat_ok;
}

/* Run the PE optimization passes on kernel "sa", in this order:
 * - array partitioning (enable/mode in slots 0 and 1 of "pass_en"/"pass_mode")
 * - latency hiding (slot 2)
 * - SIMD vectorization (slot 3; skipped entirely when disabled)
 * When the dump_code option is set, the intermediate code is dumped after
 * each pass that ran.
 * Always returns isl_stat_ok.
 */
isl_stat compute_management(
    struct autosa_gen *gen,
    struct autosa_kernel *sa, bool pass_en[], char *pass_mode[])
{
    isl_union_set *stmt_domain;

    printf("[AutoSA] Apply compute management.\n");

    /* Preparation: initialize the loop types, set up the space_time
     * properties and extract the communication pairs. */
    sa_loop_init(sa);
    sa_space_time_loop_setup(sa);
    sa_io_update(sa);

    /* Note: a check that rejected non-parallel space loops unless
     * local_reduce is enabled used to live here; it is currently disabled. */

    /* Extract the tile sizes. */
    sa->sizes = extract_sizes_from_str(sa->ctx, sa->scop->options->autosa->sa_sizes);

    /* The core is the universe of the schedule domain. */
    stmt_domain = isl_schedule_get_domain(sa->schedule);
    sa->core = isl_union_set_universe(stmt_domain);

    /* Array partitioning. */
    sa_array_partitioning_optimize(sa, pass_en[0], pass_mode[0], pass_en[1], pass_mode[1]);
    if (gen->options->autosa->dump_code)
        dump_intermediate_code(gen, isl_schedule_copy(sa->schedule), "array_part");

    /* Latency hiding. */
    sa_latency_hiding_optimize(sa, pass_en[2], pass_mode[2]);
    if (gen->options->autosa->dump_code)
        dump_intermediate_code(gen, isl_schedule_copy(sa->schedule), "latency");

    /* SIMD vectorization. */
    if (!pass_en[3])
        return isl_stat_ok;

    sa_simd_vectorization_optimize(sa, pass_mode[3]);
    if (gen->options->autosa->dump_code)
        dump_intermediate_code(gen, isl_schedule_copy(sa->schedule), "simd");

    return isl_stat_ok;
}

/* Compute the context of the kernel inserted at "node": the range of the
 * prefix schedule relation at "node", i.e., the parameter values and
 * outer schedule dimensions for which some statement instance needs to
 * be executed, intersected with the parameter context of "prog".
 * If that range is empty, an empty set with one set dimension per outer
 * schedule dimension is used instead.
 * Returns NULL if emptiness cannot be determined.
 */
static __isl_give isl_set *extract_context(__isl_keep isl_schedule_node *node,
                                           struct autosa_prog *prog)
{
    isl_union_set *sched_range;
    isl_set *context;
    int is_empty;

    sched_range = isl_union_map_range(
        isl_schedule_node_get_prefix_schedule_relation(node));
    is_empty = isl_union_set_is_empty(sched_range);
    if (is_empty < 0)
    {
        isl_union_set_free(sched_range);
        return NULL;
    }

    if (!is_empty)
    {
        context = isl_set_from_union_set(sched_range);
    }
    else
    {
        /* Build an explicitly empty set of the right dimensionality from
         * the parameter space of the (empty) range. */
        isl_space *space = isl_union_set_get_space(sched_range);
        int n_outer;

        isl_union_set_free(sched_range);
        space = isl_space_set_from_params(space);
        n_outer = isl_schedule_node_get_schedule_depth(node);
        space = isl_space_add_dims(space, isl_dim_set, n_outer);
        context = isl_set_empty(space);
    }

    return isl_set_intersect_params(context, isl_set_copy(prog->context));
}

/* Return the set of outer array elements accessed by the statement
 * instances in "domain": the read and may-write accesses of "prog" are
 * restricted to "domain", and the elements they reach are mapped through
 * prog->to_outer.
 * The instances in "domain" are those that appear in the domains of the
 * access relations in "prog".
 */
static __isl_give isl_union_set *accessed_by_domain(
    __isl_take isl_union_set *domain, struct autosa_prog *prog)
{
    isl_union_map *accesses;
    isl_union_set *elements;

    accesses = isl_union_map_copy(prog->read);
    accesses = isl_union_map_union(accesses,
                                   isl_union_map_copy(prog->may_write));
    accesses = isl_union_map_intersect_domain(accesses, domain);

    elements = isl_union_map_range(accesses);
    return isl_union_set_apply(elements, isl_union_map_copy(prog->to_outer));
}

/* Compute the grid size as a list of the sizes in each dimension.
 *
 * For AutoSA the grid is not derived from a block filter. Instead, a set
 * is built over the parameters of "domain" with kernel->n_grid set
 * dimensions, and each of these dimensions is fixed to the value 0.
 * The size is then obtained from this extent with ppcg_size_from_extent()
 * and simplified with respect to the parameters of kernel->context.
 * The stated intent is a grid size of 1 in every dimension.
 *
 * Returns NULL if the grid set is invalid while a dimension is being fixed.
 */
static __isl_give isl_multi_pw_aff *extract_grid_size(
    struct autosa_kernel *kernel, __isl_take isl_union_set *domain)
{
    isl_set *grid;
    isl_set *param_ctx;
    isl_multi_pw_aff *size;

    grid = isl_set_from_params(isl_union_set_params(domain));
    grid = isl_set_add_dims(grid, isl_dim_set, kernel->n_grid);

    for (int dim = 0; dim < kernel->n_grid; ++dim)
    {
        isl_constraint *fix;

        if (!grid)
            return NULL;

        /* Equality with coefficient 1 on "dim" and constant 0, i.e.,
         * this dimension only takes the value 0. */
        fix = isl_constraint_alloc_equality(
            isl_local_space_from_space(isl_set_get_space(grid)));
        fix = isl_constraint_set_constant_si(fix, 0);
        fix = isl_constraint_set_coefficient_si(fix, isl_dim_set, dim, 1);
        grid = isl_set_add_constraint(grid, fix);
    }

    grid = isl_set_coalesce(grid);
    size = ppcg_size_from_extent(grid);
    param_ctx = isl_set_params(isl_set_copy(kernel->context));
    return isl_multi_pw_aff_gist(size, param_ctx);
}

/* Group the domain elements below "node" into a single space named
 * "kernel<kernel_id>".
 * A NULL "node" is passed through as NULL.
 */
static __isl_give isl_schedule_node *group_statements(
    __isl_take isl_schedule_node *node, int kernel_id)
{
    char name[20];
    isl_id *group_id;

    if (node == NULL)
        return NULL;

    snprintf(name, sizeof(name), "kernel%d", kernel_id);
    group_id = isl_id_alloc(isl_schedule_node_get_ctx(node), name, NULL);

    return isl_schedule_node_group(node, group_id);
}

/* Consume "pa" and return the zero function defined over the universe
 * of the domain space of "pa".
 */
static __isl_give isl_pw_aff *set_universally_zero(__isl_take isl_pw_aff *pa)
{
    isl_local_space *domain_ls;

    /* Capture the domain space before "pa" is released. */
    domain_ls = isl_local_space_from_space(
        isl_space_domain(isl_pw_aff_get_space(pa)));
    isl_pw_aff_free(pa);

    return isl_pw_aff_from_aff(isl_aff_zero_on_domain(domain_ls));
}

/* The sizes of the arrays on the host that have been computed by
 * extract_array_info may depend on the parameters.  Use the extra
 * constraints on the parameters that are valid at "host_domain"
 * to simplify these expressions and store the results in kernel->array.
 *
 * We only need these localized bounds for arrays that are accessed
 * by the current kernel.  If we have found at least one reference group
 * then the array is accessed by the kernel.
 *
 * The resulting sizes may be functions that are nowhere defined
 * in case the access function cannot possibly access anything inside
 * the kernel for some reason.  If so, they are replaced by the zero
 * function.  Since the access function cannot actually access anything,
 * there is no harm in printing the array sizes as zero.
 */
static void localize_bounds(struct autosa_kernel *kernel)
{
    int i, j;
    isl_set *context;
    isl_set *host_domain = kernel->host_domain;

    context = isl_set_copy(host_domain);
    context = isl_set_params(context);

    for (i = 0; i < kernel->n_array; ++i)
    {
        struct autosa_local_array_info *local = &kernel->array[i];
        isl_multi_pw_aff *bound;
        int n_index;

        if (local->n_pe_group == 0)
            continue;

        n_index = local->array->n_index;
        bound = isl_multi_pw_aff_copy(local->array->bound);

        for (j = 0; j < n_index; ++j)
        {
            isl_pw_aff *pwaff;
            int empty;

            pwaff = isl_multi_pw_aff_get_pw_aff(bound, j);
            pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
            empty = isl_pw_aff_is_empty(pwaff);
            if (empty < 0)
                pwaff = isl_pw_aff_free(pwaff);
            else if (empty)
                pwaff = set_universally_zero(pwaff);
            bound = isl_multi_pw_aff_set_pw_aff(bound, j, pwaff);
        }

        local->n_index = n_index;
        local->bound = bound;
    }
    isl_set_free(context);
}

/* Apply communication management to kernel "sa".
 * The work is delegated to sa_io_construct_optimize(), which covers
 * - data allocation: on-chip buffers inside the PEs,
 * - I/O construction: the I/O system that transfers the data,
 * - I/O optimization: I/O module clustering, L2 I/O buffering and
 *   data packing.
 * Afterwards the array bounds are localized with the parameter
 * constraints of the host domain.
 * Always returns isl_stat_ok.
 */
isl_stat comm_management(struct autosa_kernel *sa, struct autosa_gen *gen)
{
    printf("[AutoSA] Apply communication management.\n");

    /* Build and optimize the I/O system. */
    sa_io_construct_optimize(sa, gen);
    /* Simplify the array bounds using the host domain parameters. */
    localize_bounds(sa);

    return isl_stat_ok;
}

/* Fill in the metadata of "kernel" and annotate its schedule tree.
 *
 * Starting from the second-level child of the schedule root, this function
 * - inserts a "local" mark before the "array" mark,
 * - records the domain, context, contraction, expanded domain, accessed
 *   arrays, grid/block sizes (all fixed to 1) and host domain in "kernel",
 * - makes the ancestor host loops atomic and inserts a "kernel" mark that
 *   carries "kernel" as its user pointer,
 * - groups the statements when there is more than one statement set,
 * - splits the band below the "array" mark after its space members and
 *   inserts a "pe" mark below the space part,
 * - saves the prefix schedule at the "pe" mark as kernel->copy_schedule,
 * - deletes the "local" mark again, and
 * - replaces kernel->schedule by the schedule of the modified tree.
 *
 * gen->kernel is set to "kernel".
 * Returns "kernel", or NULL if the "local" mark could not be inserted.
 */
static struct autosa_kernel *process_kernel_meta_data(struct autosa_kernel *kernel, struct autosa_gen *gen)
{
    isl_schedule_node *node;
    isl_union_set *domain, *expanded;
    int single_statement;
    isl_union_pw_multi_aff *contraction;
    isl_union_map *host_schedule;
    isl_set *host_domain;
    isl_id *id;    
    int n_space_dim;

    /* Position the cursor two levels below the root of the schedule. */
    node = isl_schedule_get_root(kernel->schedule);
    node = isl_schedule_node_child(node, 0);
    node = isl_schedule_node_child(node, 0);

    /* Insert "local" mark before the "array" mark. */
    node = autosa_tree_insert_local_before_array(node);
    if (!node)
        return NULL;

    domain = isl_schedule_node_get_domain(node);
    single_statement = isl_union_set_n_set(domain) == 1;

    /* Prepare some metadata. */
    kernel->single_statement = single_statement;
    kernel->context = extract_context(node, gen->prog);
    contraction = isl_schedule_node_get_subtree_contraction(node);
    /* Keep a copy in the kernel; "contraction" itself is passed to the
     * preimage computation below. */
    kernel->contraction = isl_union_pw_multi_aff_copy(contraction);
    expanded = isl_union_set_copy(domain);
    expanded = isl_union_set_preimage_union_pw_multi_aff(expanded, contraction);
    /* Keep a copy in the kernel; "expanded" itself is passed to
     * accessed_by_domain(). */
    kernel->expanded_domain = isl_union_set_copy(expanded);
    kernel->arrays = accessed_by_domain(expanded, gen->prog);
    //kernel->id = gen->kernel_id++;
    /* For FPGA, we set grid_size and block_size as 1, i.e. only one thread block 
     * and one thread inside the thread block. */
    kernel->n_grid = 1;
    kernel->block_dim[0] = 1;
    kernel->n_block = 1;
    kernel->grid_dim[0] = 1;
    kernel->grid_size = extract_grid_size(kernel, isl_union_set_copy(domain));
    /* The host domain is the range of the prefix schedule at this node. */
    host_schedule = isl_schedule_node_get_prefix_schedule_union_map(node);
    host_domain = isl_set_from_union_set(isl_union_map_range(host_schedule));
    kernel->host_domain = host_domain;
    /* "domain" is stored in the kernel and is not freed here. */
    kernel->domain = domain;

    /* Make all the host loops atomic so that kernel is only called once. */
    node = autosa_atomic_ancestors(node);

    /* Insert the "kernel" mark. */
    id = isl_id_alloc(gen->ctx, "kernel", kernel);
    node = isl_schedule_node_insert_mark(node, id);
    gen->kernel = kernel;

    /* With several statement sets, group them under the kernel id. */
    if (!single_statement)
        node = group_statements(node, kernel->id);

    /* Insert the PE mark below the space band */
    node = autosa_tree_move_down_to_array(node, kernel->core);
    node = isl_schedule_node_child(node, 0);
    /* Count the band members tagged as space loops. */
    n_space_dim = 0;
    for (int i = 0; i < isl_schedule_node_band_n_member(node); i++)
    {
        if (isl_schedule_node_band_member_get_space_time(node, i) == autosa_loop_space)
        {
            n_space_dim++;
        }
    }
    /* Split the band at position n_space_dim when it has more members
     * than there are space loops.
     * NOTE(review): this presumes the space loops are the leading members
     * of the band -- confirm. */
    if (isl_schedule_node_band_n_member(node) > n_space_dim)
        node = isl_schedule_node_band_split(node, n_space_dim);
    node = isl_schedule_node_child(node, 0);
    id = isl_id_alloc(gen->ctx, "pe", NULL);
    node = isl_schedule_node_insert_mark(node, id);
    node = autosa_tree_move_up_to_kernel(node);

    /* Save a copy of copy_schedule. */
    node = autosa_tree_move_down_to_pe(node, kernel->core);
    kernel->copy_schedule_dim = isl_schedule_node_get_schedule_depth(node);
    kernel->copy_schedule =
        isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(node);
    /* Compose the copy schedule with the contraction (pullback). */
    contraction = isl_union_pw_multi_aff_copy(kernel->contraction);
    kernel->copy_schedule =
        isl_union_pw_multi_aff_pullback_union_pw_multi_aff(
            kernel->copy_schedule, contraction);
    node = autosa_tree_move_up_to_kernel(node);

    /* Delete the local node. */
    node = autosa_tree_move_down_to_local(node, kernel->core);
    node = isl_schedule_node_delete(node);

    node = autosa_tree_move_up_to_kernel(node);

    /* Replace the kernel schedule by the schedule of the modified tree. */
    kernel->schedule = isl_schedule_free(kernel->schedule);
    kernel->schedule = isl_schedule_node_get_schedule(node);
    isl_schedule_node_free(node);

    return kernel;
}

static struct autosa_kernel *optimize_single_array(struct autosa_kernel *kernel, struct autosa_gen *gen) 
{
    cJSON *array_part_json, *array_part_en_json, *array_part_mode_json;
    cJSON *array_part_L2_json, *array_part_L2_en_json, *array_part_L2_mode_json;
    cJSON *latency_json, *latency_en_json, *latency_mode_json;
    cJSON *simd_json, *simd_en_json, *simd_mode_json;
    /* Enable for array partitioning, L2 array partitioning, latency hiding, SIMD. */
    bool pe_opt_en[4];
    char *pe_opt_mode[4];    

    kernel->prog = gen->prog;
    kernel->options = gen->options;    

    /* Create local arrays. */
    kernel = autosa_kernel_create_local_arrays(kernel, gen->prog);
    assert(kernel != NULL);

    /* Update the sparse structures */
    if (gen->options->autosa->block_sparse) {
        autosa_kernel_extract_sparse_info(kernel, gen);
    }

    /* Apply PE optimization. */
    array_part_json = cJSON_GetObjectItemCaseSensitive(gen->tuning_config, "array_part");
    array_part_en_json = cJSON_GetObjectItemCaseSensitive(array_part_json, "enable");
    array_part_mode_json = cJSON_GetObjectItemCaseSensitive(array_part_json, "mode");

    array_part_L2_json = cJSON_GetObjectItemCaseSensitive(gen->tuning_config, "array_part_L2");
    array_part_L2_en_json = cJSON_GetObjectItemCaseSensitive(array_part_L2_json, "enable");
    array_part_L2_mode_json = cJSON_GetObjectItemCaseSensitive(array_part_L2_json, "mode");

    latency_json = cJSON_GetObjectItemCaseSensitive(gen->tuning_config, "latency");
    latency_en_json = cJSON_GetObjectItemCaseSensitive(latency_json, "enable");
    latency_mode_json = cJSON_GetObjectItemCaseSensitive(latency_json, "mode");

    simd_json = cJSON_GetObjectItemCaseSensitive(gen->tuning_config, "simd");
    simd_en_json = cJSON_GetObjectItemCaseSensitive(simd_json, "enable");
    simd_mode_json = cJSON_GetObjectItemCaseSensitive(simd_json, "mode");

    pe_opt_en[0] = array_part_en_json->valueint;
    pe_opt_en[1] = array_part_L2_en_json->valueint;
    pe_opt_en[2] = latency_en_json->valueint;
    pe_opt_en[3] = simd_en_json->valueint;

    pe_opt_mode[0] = array_part_mode_json->valuestring;
    pe_opt_mode[1] = array_part_L2_mode_json->valuestring;
    pe_opt_mode[2] = latency_mode_json->valuestring;
    pe_opt_mode[3] = simd_mode_json->valuestring;

    /* Compute Management */
    compute_management(gen, kernel, pe_opt_en, pe_opt_mode);
    /* Create the autosa_kernel object and attach to the schedule. */
    if (!kernel)    
        return NULL;    

    /* Process meta data */
    kernel = process_kernel_meta_data(kernel, gen);
    
    /* Communication Management */
    comm_management(kernel, gen);    

    return kernel;
}

/* Create an autosa_kernel that represents the domain instances that reach
 * "node" and insert a mark node pointing to the autosa_kernel before "node".
 *
 * Mark all outer band nodes as atomic to ensure each kernel is only scheduled once.
 * If the domain elements that reach "node" live in more than one space,
 * then group the domain elements into a single space, named kernelX,
 * with X the kernel sequence numbers.
 *
 * [Space-time transformation]
 * We will first perform space-time transformation to transform the design to
 * systolic array.
 * [PE optimization]
 * PE optimization is applied next including: array partitioning, latency hiding,
 * and SIMD vectorization.
 * For array partitioning, the mark "array" is added between the tile and point loops.
 * All the loops below the "array" mark will be mapped to FPGA device at once.
 * For latency hiding, SIMD vectorization, all the generated loops will be marked
 * "latency" and "SIMD".
 * [Communication management]
 * Then we perform comm opt. through: data allocation, I/O construction, and
 * I/O optimization.
 *
 * [Ignore below...]
 * The linear branch between the kernel node and "array" mark may also have a
 * "local" mark. If present, the mapping to local memory is computed at this point.
 * The "local" mark will be removed at the end of this function.
 *
 * Compute array reference groups for all arrays, set the local array bounds
 * based on the set of domain instances that reach the kernel node,
 * check the total amount of shared memory used and compute
 * all group tilings.
 *
 * We save a copy of the schedule that may influence the mappings to shared or private
 * memory in kernel->copy_schedule.
 *
 * We add copy statements to the schedule tree and create representations for
 * the local variables in the kernel.
 *
 * We keep a copy of the isl_id that points to the kernel to ensure
 * that the kernel does not get destroyed if the schedule node
 * is freed due to some error condition.
 *
 * The function terminates the program in the following cases:
 * - no systolic array could be generated (exit status 0),
 * - the space-time mode is missing from the tuning configuration
 *   (exit status 1),
 * - manual mode is selected without a kernel id; the number of candidates
 *   is then written to "<output_dir>/tuning.json" (exit status 0, or 1 if
 *   the file cannot be written).
 *
 * Return the kernel mark node of the optimized schedule,
 * or NULL if the optimization failed.
 */
static __isl_give isl_schedule_node *compute_and_comm_optimize(
    struct autosa_gen *gen, __isl_take isl_schedule_node *node)
{
    isl_size num_sa = 0;
    struct autosa_kernel **sa_candidates;
    struct autosa_kernel *kernel;
    isl_schedule *schedule;
    char *space_time_mode;
    cJSON *space_time_json, *space_time_mode_json;

    /* Set up the sched_pos property */
    node = sched_pos_setup(node);

    /* Generate systolic arrays using space-time mapping. */
    schedule = isl_schedule_node_get_schedule(node);
    isl_schedule_node_free(node);
    sa_candidates = sa_space_time_transform(schedule, gen->prog->scop, &num_sa);
    if (num_sa > 0)
        printf("[AutoSA] %d systolic arrays generated.\n", num_sa);
    else
    {
        printf("[AutoSA] No systolic array generated. Exit now.\n");
        exit(0);
    }

    space_time_json = cJSON_GetObjectItemCaseSensitive(gen->tuning_config, "space_time");
    space_time_mode_json = cJSON_GetObjectItemCaseSensitive(space_time_json, "mode");
    if (!space_time_mode_json || !space_time_mode_json->valuestring)
    {
        printf("[AutoSA] Error: Missing \"mode\" for \"space_time\" in the AutoSA configuration.\n");
        exit(1);
    }
    space_time_mode = space_time_mode_json->valuestring;

    if (!strcmp(space_time_mode, "auto"))
    {
        /* Space-time transformation is set in AUTO mode. We will pick up
         * one systolic array to proceed based on heuristics.
         */
        kernel = sa_candidates_smart_pick(sa_candidates, num_sa);
    }
    else
    {
        /* Space-time transformation is set in MANUAL mode. We will take the user
         * specification to select one systolic array to proceed.
         */
        isl_union_map *sizes = extract_sizes_from_str(gen->ctx,
                                                      gen->options->autosa->sa_sizes);
        int kernel_id = read_space_time_kernel_id(sizes);
        isl_union_map_free(sizes);
        if (kernel_id < 0)
        {
            /* User hasn't specified which systolic array to choose yet.
             * We will dump out the number of systolic array designs and
             * exit the program. */
            FILE *fp;
            char *content;
            isl_printer *p_str;
            char *tuning_path;
            cJSON *tuning, *n_sa_json;

            tuning = cJSON_CreateObject();
            space_time_json = cJSON_CreateObject();
            n_sa_json = cJSON_CreateNumber(num_sa);
            cJSON_AddItemToObject(space_time_json, "n_kernel", n_sa_json);
            cJSON_AddItemToObject(tuning, "space_time", space_time_json);
            content = cJSON_Print(tuning);
            cJSON_Delete(tuning);

            p_str = isl_printer_to_str(gen->ctx);
            p_str = isl_printer_print_str(p_str, gen->options->autosa->output_dir);
            p_str = isl_printer_print_str(p_str, "/tuning.json");
            tuning_path = isl_printer_get_str(p_str);
            isl_printer_free(p_str);

            /* Do not write through a NULL stream or print a NULL string. */
            fp = tuning_path ? fopen(tuning_path, "w") : NULL;
            if (!fp || !content)
            {
                printf("[AutoSA] Error: Unable to write the tuning information to %s\n",
                       tuning_path ? tuning_path : "(null)");
                exit(1);
            }
            fprintf(fp, "%s", content);
            fclose(fp);
            free(content);
            free(tuning_path);
            exit(0);
        }
        else
        {
            kernel = sa_candidates_manual_pick(sa_candidates, num_sa, kernel_id);
        }
    }

    /* Dump out the intermediate code if needed */
    if (gen->options->autosa->dump_code)
    {
        dump_intermediate_code(gen, isl_schedule_copy(kernel->schedule), "space_time");
    }

    /* Update the array information */
    TP_extract_array_info(gen, kernel);
    kernel = optimize_single_array(kernel, gen);
    if (!kernel)
        return NULL;

    /* Only record the tuning program once the kernel is known to be valid. */
    gen->tuning_progs.push_back(kernel->tuning_program);

    node = isl_schedule_get_root(kernel->schedule);
    node = autosa_tree_move_down_to_kernel(node);

    return node;
}

/* Return a read ("read" is 1) or write access relation for "group"
 * with those accesses removed that are only needed to communicate data
 * within the subtree of the schedule rooted at "node".
 * Furthermore, include the prefix schedule at "node".
 * That is, return a relation of the form
 *
 *	S -> [D -> A]
 *
 * with D the outer schedule dimensions at "node".
 *
 * "node" is only inspected, not consumed: the caller remains its owner
 * and keeps using it after this call.
 */
static __isl_give isl_union_map *anchored_non_local_accesses(
    struct autosa_kernel *kernel, struct autosa_array_ref_group *group,
    __isl_keep isl_schedule_node *node, int read)
{
    isl_union_map *access;
    isl_union_map *prefix;

    /* Prefix schedule at "node", expressed on the original statement
     * instances by pulling it back over the kernel contraction. */
    prefix = isl_schedule_node_get_prefix_schedule_relation(node);
    prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix,
                                                              isl_union_pw_multi_aff_copy(kernel->contraction));
    access = autosa_array_ref_group_access_relation(group, read, !read);
    access = remove_local_accesses_group(kernel, group, access, prefix,
                                         read);
    /* Prefix: S -> D
     * Access: S -> A
     * range_product: S -> [D -> A]
     */
    access = isl_union_map_range_product(prefix, access);

    return access;
}

/* Construct the identity mapping
 *
 *	read[D -> A] -> [D -> A]
 *
 * for the array reference group "group" when "read" is set, or
 *
 *	write[D -> A] -> [D -> A]
 *
 * when it is not.
 * D stands for the outer tile->depth dimensions of the kernel schedule
 * and A for the space of the accessed array.
 * The input tuple id carries "group" as its user pointer.
 */
static __isl_give isl_multi_aff *create_from_access(isl_ctx *ctx,
                                                    struct autosa_array_ref_group *group, int read)
{
    struct autosa_array_tile *group_tile_info = autosa_array_ref_group_tile(group);
    const char *tuple_name = read ? "read" : "write";
    isl_space *access_space;
    isl_id *tuple_id;

    /* Build D -> A, with D having tile->depth input dimensions. */
    access_space = isl_space_from_range(isl_space_copy(group->array->space));
    access_space = isl_space_add_dims(access_space, isl_dim_in, group_tile_info->depth);

    /* Turn it into [D -> A] -> [D -> A]. */
    access_space = isl_space_map_from_set(isl_space_wrap(access_space));

    /* Name the input tuple after the access direction. */
    tuple_id = isl_id_alloc(ctx, tuple_name, group);
    access_space = isl_space_set_tuple_id(access_space, isl_dim_in, tuple_id);

    return isl_multi_aff_identity(access_space);
}

/* Add copy statements to the schedule tree of "node"
 * for reading from global memory to local memory (if "read" is set) or
 * for writing back from local memory to global memory
 * (if "read" is not set) for the array reference group "group" that
 * is mapped to local memory.
 * On input, "node" points to the kernel node, and it is moved
 * back there on output.
 *
 * The copies are performed in the order of the corresponding local
 * memory tile.
 * The copy statement instances include a reference to the outer
 * tile->depth dimensions of the kernel schedule for ease of
 * combining them with the group tiling.
 *
 * If we are performing a read from global memory to local memory and
 * if the array involved is not a scalar, then we copy
 * the entire tile to local memory.  This may result in some extra
 * elements getting copied, but it should lead to simpler code
 * (which means that fewer registers may be needed) and less divergence.
 *
 * Otherwise, we only copy the elements that will be read or have been written
 * in the kernel.
 *
 * That is, the extra schedule is of the form
 *
 *	type[D -> A] -> T
 *
 * where D corresponds to the outer tile->depth dimensions of
 * the kernel schedule, A to the global array and T is the corresponding
 * local memory tile.
 *
 * The copying is inserted in the schedule tree through an extension
 * of the form
 *
 *	D -> type[D -> A]
 *
 * where the extra domain elements type[D -> A] are those accessed
 * by the group.  In the case of read from a non-scalar, this set
 * is replaced by the entire local memory tile.
 *
 * If the "unroll_copy_shared" option is set, then the AST generator
 * is instructed to unroll the copying code.
 *
 * The extension is inserted before the core computation in case of a read
 * and after the core computation in case of a write.
 *
 * If there are no non-local accesses, the tree is left unchanged.
 * NULL is returned if the emptiness test fails.
 */
static __isl_give isl_schedule_node *add_copies_group_local(
    struct autosa_kernel *kernel, struct autosa_array_ref_group *group,
    __isl_take isl_schedule_node *node, int read)
{
    struct autosa_array_tile *tile;
    isl_union_map *access;
    isl_union_set *domain;
    isl_multi_aff *ma;
    isl_multi_aff *from_access;
    isl_multi_pw_aff *mpa;
    isl_multi_union_pw_aff *mupa;
    isl_schedule_node *graft;
    int empty;

    tile = autosa_array_ref_group_tile(group);
    node = autosa_tree_move_down_to_depth(node, tile->depth, kernel->core);

    /* S -> [D -> A]
     * S: domain elements
     * D: prefix schedule dimensions
     * A: access
     */
    access = anchored_non_local_accesses(kernel, group, node, read);
    empty = isl_union_map_is_empty(access);
    if (empty < 0 || empty)
    {
        isl_union_map_free(access);
        if (empty < 0)
            return isl_schedule_node_free(node);
        return autosa_tree_move_up_to_kernel(node);
    }

    /* read[D -> A] -> [D -> A] */
    from_access = create_from_access(kernel->ctx, group, read);

    /* [D -> A] -> T */
    ma = isl_multi_aff_copy(tile->tiling);
    ma = isl_multi_aff_pullback_multi_aff(ma,
                                          isl_multi_aff_copy(from_access));
    mpa = isl_multi_pw_aff_from_multi_aff(ma);
    /* read[D -> A] -> T */
    mupa = isl_multi_union_pw_aff_from_multi_pw_aff(mpa);

    /* [D -> A] */
    domain = isl_union_map_range(access);

    if (read && !autosa_array_is_scalar(group->array))
    {
        /* Replace the accessed elements by the entire tile for each D. */
        isl_map *map;
        isl_set *set;
        set = isl_map_domain(isl_map_from_union_map(isl_union_set_unwrap(domain)));
        map = group_tile(group);
        map = isl_map_intersect_domain(map, set);
        domain = isl_union_set_from_set(isl_map_wrap(map));
    }

    /* read[D -> A] */
    domain = isl_union_set_preimage_multi_aff(domain, from_access);
    /* read[D -> A] -> D */
    access = isl_union_set_wrapped_domain_map(domain);
    /* D -> read[D -> A] */
    access = isl_union_map_reverse(access);
    access = isl_union_map_coalesce(access);
    graft = isl_schedule_node_from_extension(access);
    graft = isl_schedule_node_child(graft, 0);
    graft = isl_schedule_node_insert_partial_schedule(graft, mupa);
    if (kernel->options->unroll_copy_shared)
        graft = ppcg_set_schedule_node_type(graft, isl_ast_loop_unroll);

    /* Move back to the root of the graft before attaching it. */
    while (graft && isl_schedule_node_has_parent(graft))
        graft = isl_schedule_node_parent(graft);

    if (read)
    {
        node = isl_schedule_node_graft_before(node, graft);
    }
    else
    {
        node = isl_schedule_node_graft_after(node, graft);
    }

    node = autosa_tree_move_up_to_kernel(node);

    return node;
}

/* Add the copy statements for "group" to the schedule tree of "node",
 * provided that the group is mapped to local memory.
 * The statements read from global memory into local memory when "read"
 * is set and write local memory back to global memory otherwise.
 * Groups with any other access type leave the tree untouched.
 * "node" points to the kernel node on input and on output.
 */
static __isl_give isl_schedule_node *add_copies_group(
    struct autosa_kernel *kernel, struct autosa_array_ref_group *group,
    __isl_take isl_schedule_node *node, int read)
{
    /* Nothing to copy unless the group lives in local memory. */
    if (autosa_cpu_array_ref_group_type(group) != AUTOSA_ACCESS_LOCAL)
        return node;

    return add_copies_group_local(kernel, group, node, read);
}

/* Fill in the kernel variable "var" for the array reference group "group":
 * the array it refers to, the access type of the group, the printed name
 * of the group and, for each array index, the size of the tile bound.
 */
static void create_kernel_var(isl_ctx *ctx,
                              struct autosa_array_ref_group *group,
                              struct autosa_kernel_var *var)
{
    struct autosa_array_tile *group_tile_info;
    isl_printer *name_printer;
    int dim;

    var->array = group->array;
    var->type = autosa_array_ref_group_type(group);
    group_tile_info = autosa_array_ref_group_tile(group);

    /* The variable is named after the group. */
    name_printer = autosa_array_ref_group_print_name(group, isl_printer_to_str(ctx));
    var->name = isl_printer_get_str(name_printer);
    isl_printer_free(name_printer);

    /* One size entry per array dimension, taken from the tile bounds. */
    var->size = isl_vec_alloc(ctx, group->array->n_index);
    dim = 0;
    while (dim < group->array->n_index)
    {
        isl_val *extent = isl_val_copy(group_tile_info->bound[dim].size);

        var->size = isl_vec_set_element_val(var->size, dim, extent);
        ++dim;
    }
}

/* Create the kernel variables of "kernel": one entry in kernel->var for
 * every array reference group whose access type is not AUTOSA_ACCESS_GLOBAL.
 * kernel->n_var is set to the number of entries.
 * Return isl_stat_error if the array of variables cannot be allocated.
 */
static isl_stat create_kernel_vars(struct autosa_kernel *kernel)
{
    int n_var = 0;
    int next = 0;

    /* First pass: count the groups that need a variable. */
    for (int a = 0; a < kernel->n_array; ++a)
    {
        struct autosa_local_array_info *local = &kernel->array[a];

        for (int g = 0; g < local->n_group; ++g)
        {
            if (autosa_cpu_array_ref_group_type(local->groups[g]) != AUTOSA_ACCESS_GLOBAL)
                ++n_var;
        }
    }

    kernel->var = isl_calloc_array(kernel->ctx, struct autosa_kernel_var, n_var);
    if (!kernel->var)
        return isl_stat_error;
    kernel->n_var = n_var;

    /* Second pass: fill in the variables in the same traversal order. */
    for (int a = 0; a < kernel->n_array; ++a)
    {
        struct autosa_local_array_info *local = &kernel->array[a];

        for (int g = 0; g < local->n_group; ++g)
        {
            struct autosa_array_ref_group *ref_group = local->groups[g];

            if (autosa_cpu_array_ref_group_type(ref_group) != AUTOSA_ACCESS_GLOBAL)
            {
                create_kernel_var(kernel->ctx, ref_group, &kernel->var[next]);
                ++next;
            }
        }
    }

    return isl_stat_ok;
}

/* Visit every array reference group of every array of "kernel" and,
 * for the groups that are mapped to local memory, add the copy statements
 * to the schedule tree of "node": first the copy-in from global memory,
 * then the write-back to global memory.
 * "node" points to the kernel node on input and on output.
 * NULL is returned as soon as one of the insertions fails.
 */
static __isl_give isl_schedule_node *add_copies(struct autosa_kernel *kernel,
                                                __isl_take isl_schedule_node *node)
{
    for (int a = 0; a < kernel->n_array; ++a)
    {
        struct autosa_local_array_info *local = &kernel->array[a];

        for (int g = 0; g < local->n_group; ++g)
        {
            struct autosa_array_ref_group *ref_group = local->groups[g];

            /* read == 1 (copy-in) first, then read == 0 (write-back). */
            for (int read = 1; read >= 0; --read)
            {
                node = add_copies_group(kernel, ref_group, node, read);
                if (!node)
                    return NULL;
            }
        }
    }

    return node;
}

/* Add copy-in/out stmts for the default schedule.
 *
 * "node" is expected to point to the kernel mark node; the kernel is
 * recovered from the user pointer of the mark id.
 * After the copy statements have been added, the kernel variables are
 * created.  If that fails, the node is freed and NULL is returned.
 * If the kernel is not a single statement, the returned node is
 * the parent of the kernel mark node.
 *
 * "gen" is currently unused.
 */
static __isl_give isl_schedule_node *sa_add_copies(
    struct autosa_gen *gen, __isl_take isl_schedule_node *node)
{
    struct autosa_kernel *kernel;
    isl_id *id;
    int single_statement;

    id = isl_schedule_node_mark_get_id(node);
    kernel = (struct autosa_kernel *)isl_id_get_user(id);
    single_statement = kernel->single_statement;

    /* Add the copy statements. */
    node = add_copies(kernel, node);

    if (create_kernel_vars(kernel) < 0)
        node = isl_schedule_node_free(node);

    if (!single_statement)
        node = isl_schedule_node_parent(node);

    isl_id_free(id);

    return node;
}

/* Perform computation and communication management to update the
 * "schedule" for mapping to FPGA.
 *
 * Unlike PPCG, in AutoSA, only one SA kernel is created out of the
 * original program, which is guaranteed by the previous step.
 * We will insert a context node, create a autosa_kernel for the schedule tree
 * beneath. Nodes for copying arrays in and out of the FPGA device and for
 * initializing and clearing the device are added.
 *
 * The FPGA code is generated in a context where at least one statement
 * instance is executed. The corresponding guard is inserted around
 * the entire schedule.
 *
 * The tuning configuration is loaded from gen->options->autosa->config
 * into gen->tuning_config for the duration of this call and released
 * again before returning.
 * The program is terminated with exit status 1 if the configuration
 * cannot be loaded or if no kernel could be produced.
 *
 * The resulting schedule replaces gen->schedule (the previous one is freed)
 * and the same pointer is returned.
 */
__isl_give isl_schedule *sa_map_to_device(struct autosa_gen *gen,
                                          __isl_take isl_schedule *schedule)
{
    isl_schedule_node *node;
    isl_set *context;
    isl_set *guard;
    isl_union_set *domain;
    isl_union_map *prefix;
    isl_union_pw_multi_aff *contraction;
    struct autosa_prog *prog;
    struct autosa_kernel *kernel;
    isl_id *id;
    cJSON *tuning_config = NULL;

    /* Load the tuning configuration file */
    tuning_config = load_tuning_config(gen->options->autosa->config);
    if (!tuning_config)
    {
        isl_schedule_free(schedule);
        printf("[AutoSA] Error: AutoSA configuration file not found: %s\n",
               gen->options->autosa->config);
        exit(1);
    }
    gen->tuning_config = tuning_config;

    context = isl_set_copy(gen->prog->context);
    context = isl_set_from_params(context);
    schedule = isl_schedule_insert_context(schedule, context);

    prog = gen->prog;
    guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain));
    prog->context = isl_set_intersect(prog->context, isl_set_copy(guard));
    guard = isl_set_from_params(guard);

    node = isl_schedule_get_root(schedule);
    isl_schedule_free(schedule);
    node = isl_schedule_node_child(node, 0);
    node = isl_schedule_node_child(node, 0);
    domain = isl_schedule_node_get_domain(node);
    contraction = isl_schedule_node_get_subtree_contraction(node);
    domain = isl_union_set_preimage_union_pw_multi_aff(domain,
                                                       isl_union_pw_multi_aff_copy(contraction));
    prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
    prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix,
                                                              contraction);

    /* Perform compute and comm optimization. */
    node = compute_and_comm_optimize(gen, node);
    if (!node)
    {
        /* Without a kernel node the steps below would dereference
         * a NULL kernel, so stop here with a diagnostic. */
        printf("[AutoSA] Error: Failed to generate the systolic array kernel.\n");
        exit(1);
    }

    id = isl_schedule_node_mark_get_id(node);
    kernel = (struct autosa_kernel *)isl_id_get_user(id);
    isl_id_free(id);
    schedule = isl_schedule_node_get_schedule(node);
    /* Generate hw modules in the systolic array. */
    generate_hw_modules(schedule, gen, kernel);

    /* Add copy statements for the default schedule (used for correctness verification). */
    node = sa_add_copies(gen, node);

    /* Add copy-in/out statement for transferring data to/from the FPGA device. */
    node = sa_add_to_from_device(node, domain, prefix, gen->prog);
    node = isl_schedule_node_root(node);
    node = isl_schedule_node_child(node, 0);
    node = isl_schedule_node_child(node, 0);
    node = isl_schedule_node_insert_guard(node, guard);
    node = isl_schedule_node_child(node, 0);

    /* Add init/clear device statements. */
    node = sa_add_init_clear_device(node, kernel);

    /* Add drain merge nodes. */
    node = sa_add_drain_merge(node, gen);

    isl_schedule_free(gen->schedule);
    gen->schedule = isl_schedule_node_get_schedule(node);
    isl_schedule_node_free(node);

    /* Release the configuration and do not leave a dangling pointer behind. */
    cJSON_Delete(gen->tuning_config);
    gen->tuning_config = NULL;

    return gen->schedule;
}

/* Generate HLS code for "scop" and print it to "p".
 * Once an AST has been generated for the transformed scop as described
 * below, "gen->print" is called to print it in the desired output format
 * to "p".
 *
 * If the legality check reports that the program cannot be mapped to
 * a systolic array, CPU code is printed instead.  If the check itself
 * fails, the printer is freed.
 *
 * The declarations of the arrays that are visible outside of the scop
 * are printed outside of the code generated from the schedule,
 * because the generated code may involve a guard around the entire code.
 *
 * A schedule is obtained first (get_schedule), its outer bands are merged
 * as far as possible and the result is checked for legality.
 *
 * For a legal candidate program, a sequence of optimizations is performed,
 * including:
 * - Space-time Transformation
 * - PE Optimization
 *   - Array Partitioning
 *   - Latency Hiding
 *   - SIMD Vectorization
 * - Data Transfer Optimization
 *   - Data Allocation
 *   - I/O Construction
 *   - I/O Optimization
 *
 * After the array partitioning, we have a program with
 * K
 * |
 * T
 * |
 * P
 *
 * We add the kernel marker on top.
 * For each iteration of the T band and for each array, we compute
 * the array elements accessed by that iteration, construct a rectangular
 * box around it and shift it to the origin. The result is used
 * as the on-chip memory for the array.
 *
 * Copying statements are added to this schedule tree.
 * In practice, these are added in front of the P band, but some of them
 * may get hoisted up to higher levels.
 *
 * The entire AST is then generated from the single resulting schedule tree.
 * During the generation the subtrees at kernel nodes (K) are saved aside and
 * replaced by kernel calls. The result is printed as host code while the saved
 * subtrees are printed as device code.
 */
static __isl_give isl_printer *generate(__isl_take isl_printer *p,
                                        struct autosa_gen *gen, struct ppcg_scop *scop,
                                        struct ppcg_options *options)
{
    struct autosa_prog *prog;
    isl_schedule *schedule;
    isl_bool is_legal;

    if (!scop)
        return isl_printer_free(p);

    prog = autosa_prog_alloc(isl_printer_get_ctx(p), scop);
    if (!prog)
        return isl_printer_free(p);
    gen->prog = prog;

    /* Scheduling.
     * The current ISL scheduler is limited and sometimes can't find the
     * fully permutable loop band correctly.
     * As a temporary hack, the outer bands are merged as much as possible
     * in a second step.
     */
    schedule = merge_outer_bands(get_schedule(gen), gen);

    /* Legality check */
    is_legal = sa_legality_check(schedule, scop);
    if (is_legal < 0)
    {
        /* The check itself failed. */
        p = isl_printer_free(p);
        isl_schedule_free(schedule);
    }
    else if (!is_legal)
    {
        /* Not mappable to a systolic array: fall back to CPU code. */
        p = print_cpu(p, scop, options);
        isl_schedule_free(schedule);
    }
    else
    {
        /* If array contraction is enabled, disable isl sink. */
        if (gen->options->autosa->array_contraction)
            gen->options->autosa->isl_sink = 0;

        /* Optimization stages:
         * Computation Management -> Communication Management
         */
        gen->schedule = sa_map_to_device(gen, schedule);

        /* Generate the AST tree for the host and for every hardware module. */
        gen->tree = sa_generate_code(gen, gen->schedule);
        for (int m = 0; m < gen->n_hw_modules; m++)
        {
            struct autosa_hw_module *module = gen->hw_modules[m];

            if (module->is_filter == 1 && module->is_buffer == 1)
                sa_filter_buffer_io_module_generate_code(gen, module);
            else
                sa_module_generate_code(gen, module);
        }
        sa_top_module_generate_code(gen);
        for (int d = 0; d < gen->n_drain_merge_funcs; d++)
            sa_drain_merge_generate_code(gen, gen->drain_merge_funcs[d]);
        if (gen->options->autosa->host_serialize)
        {
            for (int m = 0; m < gen->n_hw_modules; m++)
            {
                if (!gen->hw_modules[m]->to_mem)
                    continue;
                sa_host_serialize_generate_code(gen, gen->hw_modules[m]);
            }
        }

        /* Extract loop structure for latency estimation */
        for (int m = 0; m < gen->n_hw_modules; m++)
            sa_extract_loop_info(gen, gen->hw_modules[m]);

        /* Extract the information for performance est in the auto tuner. */
        if (options->autosa->tuning_method == 1)
        {
            for (int m = 0; m < gen->n_hw_modules; m++)
            {
                TP_extract_loop_info(gen, gen->hw_modules[m]);
                TP_extract_resource_info(gen, gen->hw_modules[m]);
                TP_extract_module_attr(gen, gen->hw_modules[m]);
            }
        }

        /* Dump out the array information */
        sa_extract_array_info(gen->kernel);
        /* Extract design information for resource estimation */
        sa_extract_design_info(gen);

        /* Code generation */
        p = ppcg_print_exposed_declarations(p, prog->scop);
        p = gen->print(p, gen->prog, gen->tree, gen->hw_modules, gen->n_hw_modules,
                       gen->hw_top_module, gen->drain_merge_funcs, gen->n_drain_merge_funcs,
                       &gen->types, gen->print_user);

        /* Dump tuning information */
        if (options->autosa->tuning_method == 1)
        {
            std::string params_f = std::string(options->autosa->output_dir) + "/tuning";

            for (size_t t = 0; t < gen->tuning_progs.size(); t++)
                gen->tuning_progs[t]->dump(params_f);
        }

        /* Clean up */
        isl_ast_node_free(gen->tree);
        autosa_kernel_free(gen->kernel);
        for (int m = 0; m < gen->n_hw_modules; m++)
            autosa_hw_module_free(gen->hw_modules[m]);
        free(gen->hw_modules);
        autosa_hw_top_module_free(gen->hw_top_module);
        for (int d = 0; d < gen->n_drain_merge_funcs; d++)
            autosa_drain_merge_func_free(gen->drain_merge_funcs[d]);
        free(gen->drain_merge_funcs);
    }

    autosa_prog_free(prog);

    return p;
}

/* ppcg_transform callback that forwards to generate().
 * "user" is the autosa_gen object; its options are passed along.
 */
static __isl_give isl_printer *generate_wrap(__isl_take isl_printer *p,
                                             struct ppcg_scop *scop, void *user)
{
    struct autosa_gen *gen;

    gen = (struct autosa_gen *)user;
    return generate(p, gen, scop, gen->options);
}

/* Replace every scop in the file called "input" by the corresponding
 * HLS code and write the result to "out".
 *
 * "print" is the callback used to print the generated code; "user" is
 * passed to it as its last argument.
 * The value returned by ppcg_transform is passed on to the caller.
 */
int generate_sa(isl_ctx *ctx, const char *input, FILE *out,
                struct ppcg_options *options,
                __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
                                                 struct autosa_prog *prog, __isl_keep isl_ast_node *trees,
                                                 struct autosa_hw_module **modules, int n_module,
                                                 struct autosa_hw_top_module *module,
                                                 struct autosa_drain_merge_func **drain_merge_funcs, int n_drain_merge_funcs,
                                                 struct autosa_types *types, void *user),
                void *user)
{
    struct autosa_gen gen;
    int status;

    /* State handed in by the caller. */
    gen.ctx = ctx;
    gen.options = options;
    gen.print = print;
    gen.print_user = user;
    gen.sizes = extract_sizes_from_str(ctx, options->sizes);

    /* State filled in during code generation. */
    gen.kernel_id = 0;
    gen.kernel = NULL;
    gen.schedule = NULL;
    gen.tuning_config = NULL;
    gen.types.n = 0;
    gen.types.name = NULL;
    gen.n_hw_modules = 0;
    gen.hw_modules = NULL;
    gen.hw_top_module = NULL;
    gen.n_drain_merge_funcs = 0;
    gen.drain_merge_funcs = NULL;

    status = ppcg_transform(ctx, input, out, options, &generate_wrap, &gen);

    /* Release what is still owned by "gen". */
    isl_union_map_free(gen.sizes);
    for (int t = 0; t < gen.types.n; ++t)
    {
        free(gen.types.name[t]);
    }
    free(gen.types.name);

    return status;
}


================================================
FILE: src/autosa_trans.h
================================================
/* Defines functions for computation management in AutoSA, including:
 * - space-time transformation
 * - array partitioning
 * - latency hiding
 * - SIMD vectorization
 */

#ifndef _AUTOSA_TRANS_H
#define _AUTOSA_TRANS_H

#include <isl/constraint.h>

#include "autosa_common.h"

/* Internal structure for loop tiling in PE optimization.
 */
struct autosa_pe_opt_tile_data
{
    int n_tiled_loop;
    int n_touched_loop;
    int tile_len;
    int *tile_size;
    struct autosa_kernel *sa;
};

/* Transform the code in the file called "input" by replacing all scops by
 * the corresponding HLS code and write the result to "out".
 * "print" is the back-end printing callback; "user" is passed through to it.
 */
int generate_sa(isl_ctx *ctx, const char *input, FILE *out,
                struct ppcg_options *options,
                __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
                                                 struct autosa_prog *prog, __isl_keep isl_ast_node *tree,
                                                 struct autosa_hw_module **modules, int n_modules,
                                                 struct autosa_hw_top_module *top_module,
                                                 struct autosa_drain_merge_func **drain_merge_funcs, int n_drain_merge_funcs,
                                                 struct autosa_types *types, void *user),
                void *user);
/* Takes ownership of "schedule" and returns a schedule owned by the caller
 * (per the __isl_take / __isl_give annotations).
 */
__isl_give isl_schedule *sa_map_to_device(struct autosa_gen *gen,
                                          __isl_take isl_schedule *schedule);
/* "schedule" is only inspected (__isl_keep). */
isl_bool sa_legality_check(__isl_keep isl_schedule *schedule, struct ppcg_scop *scop);

/* Space-Time transformation
 *
 * The functions returning "struct autosa_kernel **" report the number of
 * returned candidates through "num_sa".
 * NOTE(review): ownership of the returned array is not visible here — confirm.
 */
struct autosa_kernel **sa_space_time_transform_at_dim_async(
    __isl_keep isl_schedule *schedule, struct ppcg_scop *scop,
    isl_size dim, isl_size *num_sa);
struct autosa_kernel **sa_space_time_transform_at_dim_sync(
    __isl_keep isl_schedule *schedule, struct ppcg_scop *scop,
    isl_size dim, isl_size *num_sa);
struct autosa_kernel **sa_space_time_transform_at_dim(
    __isl_keep isl_schedule *schedule, struct ppcg_scop *scop,
    isl_size dim, isl_size *num_sa);
/* NOTE(review): __isl_keep on the by-value "num_sa" carries no meaning. */
struct autosa_kernel *sa_candidates_smart_pick(
    struct autosa_kernel **sa_list, __isl_keep isl_size num_sa);
struct autosa_kernel *sa_candidates_manual_pick(
    struct autosa_kernel **sa_list, isl_size num_sa, int sa_id);
/* Unlike the *_at_dim variants, this one takes ownership of "schedule". */
struct autosa_kernel **sa_space_time_transform(
    __isl_take isl_schedule *schedule, struct ppcg_scop *scop, isl_size *num_sa);

/* PE Optimization
 *
 * "en" enables a pass and "mode" selects how it is driven.
 * NOTE(review): the accepted "mode" strings are not visible here — confirm.
 */
isl_stat sa_array_partitioning_optimize(
    struct autosa_kernel *sa, bool en, char *mode, bool L2_en, char *L2_mode);
isl_stat sa_latency_hiding_optimize(
    struct autosa_kernel *sa, bool en, char *mode);
isl_stat sa_simd_vectorization_optimize(
    struct autosa_kernel *sa, char *mode);
/* "pass_en" and "pass_mode" are per-pass arrays; their required length is
 * not visible from this header — TODO confirm against the definition.
 */
isl_stat compute_management(
    struct autosa_gen *gen,
    struct autosa_kernel *sa, bool pass_en[], char *pass_mode[]);

isl_stat sa_loop_init(struct autosa_kernel *sa);
isl_stat sa_space_time_loop_setup(struct autosa_kernel *sa);

/* "sa_dims" is an output array with room for "n_sa_dim" entries
 * (presumably — confirm against the definition).
 */
void extract_sa_dims_from_node(__isl_keep isl_schedule_node *node, int *sa_dims, int n_sa_dim);

#endif

================================================
FILE: src/autosa_tuning.cpp
================================================
/* This file defines all the functions used for AutoSA tuning.
 * When executed in the tuning mode, AutoSA will automatically optimize the program,
 * applying different permutation and tiling techniques.
 * The program transform history and program loop structure are recorded, which 
 * are later used by the auto-tuner.
 */
#include <iomanip>
#include <iostream>
#include <fstream>

#include "ppcg.h"
#include "autosa_tuning.h"
#include "autosa_schedule_tree.h"

/* Build the expression "this / divisor". Both operands become children of
 * the new "div" node.
 */
__isl_give TPExpr *TPExpr::div_by_param(__isl_take TPExpr *divisor) {
    return new TPExpr("div", this, divisor);
}

/* Wrap this expression in a "ceil" node and return the new node. */
__isl_give TPExpr *TPExpr::ceil() {
    return new TPExpr("ceil", this);
}

/* Return "this + expr".
 * A "NULL" expression acts as the empty sum: it is deleted and "expr" is
 * returned unchanged. Otherwise both operands become children of a new
 * "add" node.
 */
__isl_give TPExpr *TPExpr::add(__isl_take TPExpr *expr) {
    if (this->func != "NULL")
        return new TPExpr("add", this, expr);

    delete this;
    return expr;
}

/* Return "this * expr".
 * An operand that is "NULL" (left side only) or prints as "1" is dropped
 * and deleted, and the other operand is returned as is. Otherwise both
 * operands become children of a new "mul" node.
 */
__isl_give TPExpr *TPExpr::mul(__isl_take TPExpr *expr) {
    /* The left operand contributes nothing to the product. */
    if (this->func == "NULL" || this->to_str() == "1") {
        delete this;
        return expr;
    }
    /* The right operand contributes nothing to the product. */
    if (expr->to_str() == "1") {
        delete expr;
        return this;
    }
    return new TPExpr("mul", this, expr);
}

/* Return "this - expr".
 * - constant - constant is folded into a single constant literal;
 * - non-constant - 0 returns "this" unchanged;
 * - everything else yields a new "sub" node owning both operands.
 * Operands that are folded away are deleted.
 */
__isl_give TPExpr *TPExpr::subtract(__isl_take TPExpr *expr) {
    TPConst *lhs_cst = NULL;
    TPConst *rhs_cst = NULL;

    if (this->func == "literal")
        lhs_cst = dynamic_cast<TPConst *>(this->ops[0]);
    if (expr->func == "literal")
        rhs_cst = dynamic_cast<TPConst *>(expr->ops[0]);

    if (lhs_cst && rhs_cst) {
        int diff = lhs_cst->val;
        diff -= rhs_cst->val;
        delete this;
        delete expr;
        return new TPExpr("literal", new TPConst(diff));
    }
    if (!lhs_cst && rhs_cst && rhs_cst->val == 0) {
        delete expr;
        return this;
    }
    return new TPExpr("sub", this, expr);
}

/* Return "min(this, expr)".
 * Two constant literals are folded; a "NULL" left operand is replaced by
 * "expr"; textually identical operands collapse to "this". In every other
 * case a new "min" node owning both operands is created.
 */
__isl_give TPExpr *TPExpr::min(__isl_take TPExpr *expr) {
    TPConst *lhs_cst = NULL;
    if (this->func == "literal")
        lhs_cst = dynamic_cast<TPConst *>(this->ops[0]);

    if (lhs_cst) {
        TPConst *rhs_cst = NULL;
        if (expr->func == "literal")
            rhs_cst = dynamic_cast<TPConst *>(expr->ops[0]);
        if (rhs_cst) {
            int folded = std::min(lhs_cst->val, rhs_cst->val);
            delete this;
            delete expr;
            return new TPExpr("literal", new TPConst(folded));
        }
        /* Constant vs. non-constant: nothing to fold. */
        return new TPExpr("min", this, expr);
    }

    if (this->func == "NULL") {
        delete this;
        return expr;
    }
    if (this->to_str() == expr->to_str()) {
        delete expr;
        return this;
    }
    return new TPExpr("min", this, expr);
}

/* Return "max(this, expr)".
 * Two constant literals are folded; a "NULL" left operand is replaced by
 * "expr"; textually identical operands collapse to "this". In every other
 * case a new "max" node owning both operands is created.
 */
__isl_give TPExpr *TPExpr::max(__isl_take TPExpr *expr) {
    TPConst *lhs_cst = NULL;
    if (this->func == "literal")
        lhs_cst = dynamic_cast<TPConst *>(this->ops[0]);

    if (lhs_cst) {
        TPConst *rhs_cst = NULL;
        if (expr->func == "literal")
            rhs_cst = dynamic_cast<TPConst *>(expr->ops[0]);
        if (rhs_cst) {
            int folded = std::max(lhs_cst->val, rhs_cst->val);
            delete this;
            delete expr;
            return new TPExpr("literal", new TPConst(folded));
        }
        /* Constant vs. non-constant: nothing to fold. */
        return new TPExpr("max", this, expr);
    }

    if (this->func == "NULL") {
        delete this;
        return expr;
    }
    if (this->to_str() == expr->to_str()) {
        delete expr;
        return this;
    }
    return new TPExpr("max", this, expr);
}

/* Return a deep copy of this expression.
 * For a "literal" the single operand is copied through the dup() of its
 * concrete type (TPParameter or TPConst); an operand of any other type is
 * not copied. For all other node kinds every operand is duplicated.
 */
__isl_give TPExpr *TPExpr::dup() {
    TPExpr *copy = new TPExpr();
    copy->func = this->func;

    if (this->func != "literal") {
        for (auto child : this->ops)
            copy->ops.push_back(child->dup());
        return copy;
    }

    TPExpr *leaf = this->ops[0];
    if (TPParameter *param = dynamic_cast<TPParameter *>(leaf))
        copy->ops.push_back(param->dup());
    else if (TPConst *cst = dynamic_cast<TPConst *>(leaf))
        copy->ops.push_back(cst->dup());

    return copy;
}

/* Return a copy of this parameter.
 * Name, prefix, type, tune flag, attribute and tags are copied; the bound,
 * divisor and multiple expressions are duplicated so that the copy does not
 * share them with the original.
 */
__isl_give TPParameter *TPParameter::dup() {
    TPParameter *copy = new TPParameter();

    /* Scalar attributes. */
    copy->name = this->name;
    copy->name_prefix = this->name_prefix;
    copy->type = this->type;
    copy->tune = this->tune;
    copy->attr = this->attr;

    /* Expression lists are deep-copied. */
    for (const auto &bound : this->bounds)
        copy->bounds.push_back(std::shared_ptr<TPExpr>(bound->dup()));
    for (const auto &divisor : this->divisors)
        copy->divisors.push_back(std::shared_ptr<TPExpr>(divisor->dup()));
    for (const auto &multiple : this->multiples)
        copy->multiples.push_back(std::shared_ptr<TPExpr>(multiple->dup()));

    copy->tags.insert(this->tags.begin(), this->tags.end());

    return copy;
}

/* Return a copy of this constant carrying the same type and value. */
__isl_give TPConst *TPConst::dup() {
    TPConst *copy = new TPConst();
    copy->val = this->val;
    copy->type = this->type;
    return copy;
}

/* Fold the constant "cst" into the add/sub chain rooted at "expr", i.e.
 * rewrite "expr" in place so that it evaluates to (expr + cst).
 *
 * Only the right operand is followed: if it is itself an add/sub node the
 * search continues there, and if it is a constant literal that constant is
 * replaced by the folded value. When the current node is a subtraction the
 * right operand contributes negatively, so
 *     X - c + cst == X - (c - cst)
 *     X - (Y)  + cst == X - (Y - cst)
 * and the sign of "cst" is flipped before descending.
 *
 * Return true if the constant was absorbed, false if "expr" is left
 * untouched.
 */
bool propagate_cst(TPExpr *expr, int cst) {
    if (expr->func != "add" && expr->func != "sub")
        return false;

    const bool is_sub = (expr->func == "sub");
    TPExpr *rhs = expr->ops[1];

    if (rhs->func == "add" || rhs->func == "sub") {
        /* The right operand is subtracted as a whole under "sub". */
        return propagate_cst(rhs, is_sub ? -cst : cst);
    }

    if (rhs->func == "literal") {
        TPConst *old_cst = dynamic_cast<TPConst *>(rhs->ops[0]);
        if (old_cst) {
            int new_cst = is_sub ? old_cst->val - cst : old_cst->val + cst;
            delete rhs->ops[0];
            rhs->ops[0] = new TPConst(new_cst);
            return true;
        }
    }

    return false;
}

/* Simplify "expr" by folding constants.
 *
 * For "E + c" or "E - c" with a constant literal c, try to merge c into the
 * trailing constant of E (see propagate_cst()); on success the outer node is
 * dropped and replaced by a copy of the updated E. For an outer subtraction
 * the constant is negated first, since (E - c) == (E + (-c)).
 * Afterwards "X + 0" / "X - 0" is reduced to X, and finally every operand of
 * the result is simplified recursively.
 *
 * Takes ownership of "expr"; the returned expression may be a different
 * object.
 */
__isl_give TPExpr *const_propagation(__isl_take TPExpr *expr) {
    TPExpr *ret_expr = expr;
    if (ret_expr->func == "add" || ret_expr->func == "sub") {
        /* Check if const propagation is possible */
        if (ret_expr->ops[1]->func == "literal" && dynamic_cast<TPConst *>(ret_expr->ops[1]->ops[0])) {
            int delta = dynamic_cast<TPConst *>(ret_expr->ops[1]->ops[0])->val;
            if (ret_expr->func == "sub")
                delta = -delta;
            if (propagate_cst(ret_expr->ops[0], delta)) {
                TPExpr *new_expr = ret_expr->ops[0]->dup();
                delete ret_expr;
                ret_expr = new_expr;
            }
        }
        /* Check if there is any zero in the operands.
         * ret_expr is still an add/sub node here: propagate_cst() only
         * succeeds on such nodes.
         */
        if (ret_expr->ops[1]->func == "literal" && dynamic_cast<TPConst *>(ret_expr->ops[1]->ops[0])) {
            if (dynamic_cast<TPConst *>(ret_expr->ops[1]->ops[0])->val == 0) {
                TPExpr *new_expr = ret_expr->ops[0]->dup();
                delete ret_expr;
                ret_expr = new_expr;
            }
        }
    }
    for (size_t i = 0; i < ret_expr->ops.size(); i++) {
        ret_expr->ops[i] = ret_expr->ops[i]->simplify();
    }
    return ret_expr;
}

/* Merge like terms of the form "(a * x) + x" or "(a * x) - x" into
 * "(a + 1) * x" or "(a - 1) * x" respectively, where the two occurrences
 * of x are recognised by comparing their printed form. The result is then
 * passed through const_propagation().
 *
 * Takes ownership of "expr"; the returned expression may be a different
 * object.
 */
__isl_give TPExpr *combine_like_terms(__isl_take TPExpr *expr) {
    const bool is_add = (expr->func == "add");
    const bool is_sub = (expr->func == "sub");

    if ((is_add || is_sub) &&
        expr->ops[0]->func == "mul" &&
        expr->ops[0]->ops[1]->to_str() == expr->ops[1]->to_str()) {
        TPExpr *coeff = expr->ops[0]->ops[0]->dup();
        TPExpr *term = expr->ops[0]->ops[1]->dup();
        TPExpr *one = new TPExpr("literal", new TPConst(1));

        if (is_add)
            coeff = coeff->add(one);
        else
            coeff = coeff->subtract(one);

        TPExpr *merged = new TPExpr("mul", coeff, term);
        delete expr;
        expr = merged;
    }

    return const_propagation(expr);
}

/* Cancel a division that is immediately undone by a multiplication:
 * "(a / b) * b" is replaced by a copy of "a", where the two occurrences of
 * b are recognised by comparing their printed form. Any other expression
 * is returned unchanged.
 *
 * Takes ownership of "expr".
 */
__isl_give TPExpr *simplify_chain_ops(__isl_take TPExpr *expr) {
    if (expr->func != "mul")
        return expr;

    TPExpr *lhs = expr->ops[0];
    if (lhs->func != "div")
        return expr;
    if (lhs->ops[1]->to_str() != expr->ops[1]->to_str())
        return expr;

    TPExpr *numerator = lhs->ops[0]->dup();
    delete expr;
    return numerator;
}

/* Simplify the expression by running, in this order, constant propagation,
 * like-term combination and chain-op simplification. The returned
 * expression replaces "this", which may have been deleted along the way.
 */
__isl_give TPExpr *TPExpr::simplify() {
    TPExpr *folded = const_propagation(this);
    TPExpr *combined = combine_like_terms(folded);
    return simplify_chain_ops(combined);
}

/* Substitute every sub-expression whose printed form equals that of
 * "match" with a copy of "replace".
 * A matching node is deleted and the copy is returned in its place; a
 * non-matching node is updated in place (its operands are rewritten
 * recursively) and returned. An unknown node kind aborts the program.
 */
__isl_give TPExpr *TPExpr::replace(__isl_keep TPExpr *match, __isl_keep TPExpr *replace) {
    if (this->to_str() == match->to_str()) {
        /* Matched */
        delete this;
        return replace->dup();
    }

    const std::string &kind = this->func;

    /* Leaves have nothing to descend into. */
    if (kind == "literal" || kind == "NULL")
        return this;

    /* Unary nodes. */
    if (kind == "floor" || kind == "ceil") {
        this->ops[0] = this->ops[0]->replace(match, replace);
        return this;
    }

    /* Binary nodes. */
    if (kind == "div" || kind == "add" || kind == "mul" ||
        kind == "min" || kind == "max" || kind == "sub") {
        this->ops[0] = this->ops[0]->replace(match, replace);
        this->ops[1] = this->ops[1]->replace(match, replace);
        return this;
    }

    std::cout << "[AutoSA] Error: TPExpr::replace(): Unsupported TPExpr function type: " << this->func << std::endl;
    exit(1);
}

std::string TPExpr::to_str() {
    if (this->func == "literal") {
        TPExpr *op = this->ops[0];        
        if (dynamic_cast<TPParameter *>(op)) {            
            return ((TPParameter *)(op))->name;
        } else if (dynamic_cast<TPConst *>(op)) {            
            return std::to_string(((TPConst *)(op))->val);
        }
    } else if (this->func == "floor") {        
        std::string ret = "floor(";
        ret += this->ops[0]->to_str();
        ret += ")";
        return ret;
    } else if (this->func == "ceil") {
        std::string ret = "ceil(";
        ret += this->ops[0]->to_str();
        ret += ")";
        return ret;
    } else if (this->func == "div") {
        int single_op = 0;
        std::string l = this->ops[0]->to_str();        
        std::string r = this->ops[1]->to_str();
        if (r == "1")
            single_op = 1;            
        std::string ret = "";
        if (!single_op)
            ret += "(";
        ret += l;        
        if (r != "1") {
            ret += ("/" + r);
        }
        if (!single_op)
            ret += ")";        
        return ret;
    } else if (this->func == "add") {        
        std::string l = this->ops[0]->to_str();        
        std::string r = this->ops[1]->to_str();
        std::string ret = "(" + l + "+" + r + ")";
        return ret;
    } else if (this->func == "sub") {        
        std::string l = this->ops[0]->to_str();        
        std::string r = this->ops[1]->to_str();
        std::string ret = "(" + l + "-" + r + ")";
        return ret;
    } else if (this->func == "mul") {
        int single_op = 0;        
        std::string l = this->ops[0]->to_str();        
        std::string r = this->ops[1]->to_str();
        if (l == "1" || r == "1")
            single_op = 1;
        std::string ret = "";
        if (!single_op)
            ret += "(";
        if (l != "1")
            ret += l;
        if (l != "1" && r != "1")
            ret += "*";
        if (r != "1")
            ret += r;        
        if (!single_op)
            ret += ")";
        return ret;    
    } else if (this->func == "min") {        
        std::string l = this->ops[0]->to_str();        
        std::string r = this->ops[1]->to_str();
        std::string ret = "min(" + l + "," + r + ")";
        return ret;
    } else if (this->func == "max") {        
        std::string l = this->ops[0]->to_str();        
        std::string r = this->ops[1]->to_str();
        std::string ret = "max(" + l + "," + r + ")";
        return ret;
    } else if (this->func == "NULL") {
        return "";
    } else {
        std::cout << "[AutoSA] Error: TPExpr::to_str(): Unsupported TPExpr function type: " << this->func << std::endl;
        exit(1);
    }
    return "";
}

/* A parameter prints as its name. */
std::string TPParameter::to_str() {
    std::string text = this->name;
    return text;
}

/* Derive a bound of this expression as a new expression.
 *
 * "lbs" / "ubs" map parameter names to their lower / upper bound
 * expressions, "ignore" lists parameters that are treated as 0, and "max"
 * selects the direction: 1 computes the maximum (each bounded parameter is
 * replaced by "ub - 1"), any other value computes the minimum (each bounded
 * parameter is replaced by its "lb").
 *
 * A parameter that has no usable entry in the map for the requested
 * direction is kept symbolic. Only literals, "add" and "mul" are supported;
 * any other node kind aborts the program.
 *
 * The returned expression is owned by the caller; "this" and the bound
 * expressions in the maps are left untouched.
 */
__isl_give TPExpr *TPExpr::infer_bound(
    std::unordered_map<std::string, TPExpr *> lbs, 
    std::unordered_map<std::string, TPExpr *> ubs,
    std::unordered_set<std::string> ignore, int max)
{
    const bool want_max = (max == 1);

    if (this->func == "literal") {
        TPExpr *op = this->ops[0];
        TPParameter *param = dynamic_cast<TPParameter *>(op);
        if (param) {
            if (ignore.find(param->name) != ignore.end())
                return new TPExpr("literal", new TPConst(0));
            /* Look the name up in the map that is actually needed;
             * indexing with operator[] would insert (and dereference) a
             * null entry when the name is only present in the other map.
             */
            if (want_max) {
                auto it = ubs.find(param->name);
                if (it != ubs.end() && it->second)
                    return (it->second->dup())->subtract(new TPExpr("literal", new TPConst(1)));
            } else {
                auto it = lbs.find(param->name);
                if (it != lbs.end() && it->second)
                    return it->second->dup();
            }
            return this->dup();
        }
        if (dynamic_cast<TPConst *>(op))
            return this->dup();
        return NULL;
    }

    if (this->func == "add" || this->func == "mul") {
        const int dir = want_max ? 1 : 0;
        TPExpr *left = this->ops[0]->infer_bound(lbs, ubs, ignore, dir);
        TPExpr *right = this->ops[1]->infer_bound(lbs, ubs, ignore, dir);
        const bool left_zero = (left->to_str() == "0");
        const bool right_zero = (right->to_str() == "0");

        if (this->func == "mul") {
            /* Anything times zero is zero. */
            if (left_zero || right_zero) {
                delete left;
                delete right;
                return new TPExpr("literal", new TPConst(0));
            }
            return new TPExpr("mul", left, right);
        }

        /* add: drop zero operands. */
        if (left_zero && right_zero) {
            delete left;
            delete right;
            return new TPExpr("literal", new TPConst(0));
        }
        if (left_zero) {
            delete left;
            return right;
        }
        if (right_zero) {
            delete right;
            return left;
        }
        return new TPExpr("add", left, right);
    }

    /* floor, ceil, div and everything else are not supported. */
    std::cout << "[AutoSA] Error: TPExpr::infer_bound(): Unsupported TPExpr function type: " << this->func << std::endl;
    exit(1);
    return NULL;
}

/* Print the array reference as "name[idx0][idx1]...". */
std::string TPArrayRef::to_str() {
    std::string text = this->name;
    for (const auto &subscript : this->index) {
        text += "[";
        text += subscript->to_str();
        text += "]";
    }
    return text;
}

/* Callback for isl_schedule_node_map_descendant_bottom_up().
 *
 * For every member of a band node, create an upper-bound parameter and an
 * iterator running from 0 to that parameter, register both with the
 * TuningProgram passed in "user" and attach the iterator to the band
 * member. Non-band nodes are returned unchanged.
 *
 * The loop bounds are assumed to be independent and all loops are assumed
 * to start from zero for now.
 */
static __isl_give isl_schedule_node *extract_tuning_program_from_schedule(
    __isl_take isl_schedule_node *node, void *user)
{
    if (!node)
        return NULL;
    if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
        return node;

    TuningProgram *prog = (TuningProgram *)user;
    const int n_member = isl_schedule_node_band_n_member(node);

    for (int dim = 0; dim < n_member; dim++) {
        /* Upper-bound parameter: pre-assigned name if available,
         * otherwise "p<index>".
         */
        TPParameter *ub;
        if (prog->param_names.size() > 0) {
            ub = new TPParameter(prog->param_names[prog->params.size()], 0);
            prog->param_names_cnt[ub->name] = 1;
        } else {
            ub = new TPParameter("p" + std::to_string(prog->params.size()));
        }
        ub->tune = false;
        ub->attr = "loop_ub";
        ub->tags.insert("external");
        prog->params.push_back(ub);
        prog->param_map[ub->name] = ub;

        /* Iterator "c<index>" over [0, ub). */
        TPExpr *lower = new TPExpr("literal", new TPConst(0));
        TPExpr *upper = new TPExpr("literal", new TPParameter(ub));
        TPIterator *iter = new TPIterator(
            "c" + std::to_string(prog->iters.size()), lower, upper);

        /* Bind the iterator to the schedule dimension. */
        node = isl_schedule_node_band_member_set_iter(node, dim, (void *)iter);
        prog->iters.push_back(iter);
    }

    return node;
}

/* Initialize the tuning program from the schedule.
 * Every band dimension in the schedule is bound to an iterator variable so
 * that it can be kept track of; later transformations on the band
 * dimensions are recorded by the tuning program as well.
 *
 * TODO: Add a legality check. Currently all axes are required to be
 * independent of each other and the loop iterators should start from 0.
 *
 * Takes ownership of "schedule" and returns the annotated schedule.
 */
__isl_give isl_schedule *TuningProgram::init_from_schedule(__isl_take isl_schedule *schedule) {
    isl_schedule_node *tree = isl_schedule_get_root(schedule);

    /* Attach an iterator to each dimension of every band in the tree. */
    tree = isl_schedule_node_map_descendant_bottom_up(
        tree, &extract_tuning_program_from_schedule, this);

    /* Swap the input schedule for the one that owns the updated tree. */
    isl_schedule_free(schedule);
    isl_schedule *annotated = isl_schedule_node_get_schedule(tree);
    isl_schedule_node_free(tree);

    return annotated;
}

/* Load the customized parameter names.
 *
 * "path" names a JSON file that maps "kernel<id>" to a list of parameter
 * names; the names listed for this program's id are appended to
 * this->param_names. A NULL "path" is a no-op. If the file cannot be
 * opened, an error is reported and the program exits, instead of failing
 * later inside the JSON parser.
 */
void TuningProgram::load_param_names(char *path) {
    if (path == NULL)
        return;
    std::ifstream i(path);
    if (!i.is_open()) {
        std::cout << "[AutoSA] Error: TuningProgram::load_param_names(): Failed to open file: " << path << std::endl;
        exit(1);
    }
    json namings;
    i >> namings;
    std::string kernel_name = "kernel" + std::to_string(this->id);
    auto kernel_names = namings[kernel_name];
    for (std::string n : kernel_names) {
        this->param_names.push_back(n);
    }
}

/* Update the band iterators after tiling. "node" points to the tile band;
 * its first child is used as the point band.
 *
 * For every member i of the point band:
 *  - a new tunable parameter (the tiling factor) is created, bounded by
 *    [1, upper bound parameter of tile loop i];
 *  - the upper bound of tile loop i is divided by the new parameter;
 *  - a new point iterator running from 0 to the new parameter is created
 *    and attached to member i of the point band;
 *  - the recorded array indices are updated via update_tiled_arrays().
 *
 * "div" indicates if the tiling factor should be a divisor of the tiled
 * loop: if set, the tile loop bound is registered as a divisor constraint
 * and the new tile bound is an exact division, otherwise it is rounded up.
 * "step" is used as the prefix of the parameter attribute
 * ("<step>_tiling_factor").
 *
 * Takes ownership of "node" and returns the tile band again.
 */
__isl_give isl_schedule_node *TuningProgram::tile(__isl_take isl_schedule_node *node, int div, std::string step)
{    
    isl_schedule_node *tile_node = node;
    isl_schedule_node *point_node = isl_schedule_node_child(isl_schedule_node_copy(node), 0);
    int n = isl_schedule_node_band_n_member(point_node);
    for (int i = 0; i < n; i++) {                
        /* We assume all the loops start from zero for now. */
        TPIterator *tile_iter = (TPIterator *)isl_schedule_node_band_member_get_iter(tile_node, i);
        /* NOTE(review): assumes the tile loop's upper bound is still a plain
         * "literal" whose operand is a TPParameter, i.e. that this loop has
         * not been tiled before — confirm with the callers.
         */
        TPParameter *tile_ub = (TPParameter *)(tile_iter->ub->ops[0]);
        /* Check if the parameter name is customized. 
         * If so, follow the same naming fashion: reuse the prefix of the
         * tile loop's parameter together with a per-prefix running counter.
         */
        TPParameter *point_ub;
        if (this->param_names.size() > 0) {
            point_ub = new TPParameter(tile_ub->name_prefix, this->param_names_cnt[tile_ub->name_prefix]);
            this->param_names_cnt[tile_ub->name_prefix] += 1;
        } else {
            point_ub = new TPParameter("p" + std::to_string(this->params.size()));
        }
        point_ub->tune = true;
        /* bounds[0] is the lower bound (1), bounds[1] the upper bound
         * (the parameter bounding the tiled loop).
         */
        point_ub->bounds.push_back(std::make_shared<TPExpr>("literal", new TPConst(1)));        
        /* Record on the registered tile parameter which parameter splits it. */
        this->param_map[tile_ub->to_str()]->split_by = point_ub;
        point_ub->bounds.push_back(std::make_shared<TPExpr>("literal", new TPParameter(tile_ub)));        
        if (div) {
            point_ub->divisors.push_back(std::make_shared<TPExpr>("literal", new TPParameter(tile_ub)));
        }
        point_ub->attr = step + "_tiling_factor";
        this->params.push_back(point_ub);
        this->param_map[point_ub->name] = point_ub;
                
        // Update the loop bound: ceil(ub / factor) unless the factor is
        // required to divide the bound.
        if (div == 0)
            tile_iter->ub = (tile_iter->ub->div_by_param(new TPExpr("literal", new TPParameter(point_ub))))->ceil();
        else
            tile_iter->ub = tile_iter->ub->div_by_param(new TPExpr("literal", new TPParameter(point_ub)));

        // Point loop: iterates over [0, tiling factor)
        TPIterator *point_iter = new TPIterator(
            "c" + std::to_string(this->iters.size()), 
            new TPExpr("literal", new TPConst(0)), 
            new TPExpr("literal", new TPParameter(point_ub)));
        // Carry over the space/time property of the point band member.
        if (isl_schedule_node_band_member_get_space_time(point_node, i) == autosa_loop_space) {
            point_iter->space_time = "space";
        } else {
            point_iter->space_time = "time";        
        }
        point_node = isl_schedule_node_band_member_set_iter(point_node, i, (void *)point_iter);
        this->iters.push_back(point_iter);

        // Update the array indices
        this->update_tiled_arrays(tile_iter, point_iter, point_ub);
    }

    /* The updated tree is reached through point_node; the original handle
     * is no longer needed.
     */
    isl_schedule_node_free(tile_node);
    node = isl_schedule_node_parent(point_node);    

    return node;
}

/* Update the band iterators after tiling a single dimension. "node" points
 * to the tile band, of which dimension "pos" was tiled. The point band (the
 * first child of "node") contains a single loop.
 *
 * A new tunable parameter (the tiling factor) bounded by
 * [1, upper bound parameter of the tiled loop] is created; for the "SIMD"
 * step the upper bound is additionally capped by "bound". The tags in
 * "tags" are attached to the new parameter. "div" indicates if the tiling
 * factor should be a divisor of the tiled loop, and "step" is used as the
 * prefix of the parameter attribute ("<step>_tiling_factor").
 *
 * Takes ownership of "node" and returns the tile band again.
 */
__isl_give isl_schedule_node *TuningProgram::tile(
    __isl_take isl_schedule_node *node, int pos, int div, std::string step, std::unordered_set<std::string> tags, int bound)
{    
    isl_schedule_node *tile_node = node;
    isl_schedule_node *point_node = isl_schedule_node_child(isl_schedule_node_copy(node), 0);    
    TPIterator *tile_iter = (TPIterator *)isl_schedule_node_band_member_get_iter(tile_node, pos);
    /* NOTE(review): assumes the tile loop's upper bound is still a plain
     * "literal" whose operand is a TPParameter — confirm with the callers.
     */
    TPParameter *tile_ub = (TPParameter *)(tile_iter->ub->ops[0]);
    /* Name the new parameter: customized prefix plus running counter if
     * custom names are in use, "p<index>" otherwise.
     */
    TPParameter *point_ub;
    if (this->param_names.size() > 0) {
        point_ub = new TPParameter(tile_ub->name_prefix, this->param_names_cnt[tile_ub->name_prefix]);
        this->param_names_cnt[tile_ub->name_prefix] += 1;
    } else {
        point_ub = new TPParameter("p" + std::to_string(this->params.size()));
    }
    point_ub->tune = true;
    /* bounds[0] is the lower bound (1), bounds[1] the upper bound. */
    point_ub->bounds.push_back(std::make_shared<TPExpr>("literal", new TPConst(1)));    
    /* Record on the registered tile parameter which parameter splits it. */
    this->param_map[tile_ub->to_str()]->split_by = point_ub;
    point_ub->bounds.push_back(std::make_shared<TPExpr>("literal", new TPParameter(tile_ub)));    
    if (step == "SIMD") {
        /* Cap the upper bound: min(tile_ub, bound). */
        point_ub->bounds[1] = std::shared_ptr<TPExpr>(point_ub->bounds[1]->dup()->min(new TPExpr("literal", new TPConst(bound))));
    }

    point_ub->attr = step + "_tiling_factor";
    for (auto tag : tags) {
        point_ub->tags.insert(tag);
    }

    if (div) 
        point_ub->divisors.push_back(std::make_shared<TPExpr>("literal", new TPParameter(tile_ub)));
    this->params.push_back(point_ub);
    this->param_map[point_ub->name] = point_ub;
            
    // Update the loop bound: ceil(ub / factor) unless the factor is
    // required to divide the bound.
    if (div == 0)
        tile_iter->ub = (tile_iter->ub->div_by_param(new TPExpr("literal", new TPParameter(point_ub))))->ceil();
    else
        tile_iter->ub = tile_iter->ub->div_by_param(new TPExpr("literal", new TPParameter(point_ub)));

    // Point loop: iterates over [0, tiling factor)
    TPIterator *point_iter = new TPIterator(
        "c" + std::to_string(this->iters.size()), 
        new TPExpr("literal", new TPConst(0)), 
        new TPExpr("literal", new TPParameter(point_ub)));    
    // The point band has a single member, hence index 0 rather than "pos".
    if (isl_schedule_node_band_member_get_space_time(point_node, 0) == autosa_loop_space) {
        point_iter->space_time = "space";
    } else {
        point_iter->space_time = "time";        
    }        
    point_node = isl_schedule_node_band_member_set_iter(point_node, 0, (void *)point_iter);
    this->iters.push_back(point_iter);    

    /* The updated tree is reached through point_node; the original handle
     * is no longer needed.
     */
    isl_schedule_node_free(tile_node);
    node = isl_schedule_node_parent(point_node);

    // Update the array indices
    this->update_tiled_arrays(tile_iter, point_iter, point_ub);

    return node;
}

/* Dump out the tuning program information to a JSON file. 
 */
void TuningProgram::dump(std::string dir)
{
    json j;
    // params
    json j_params;
    for (int i = 0; i < this->params.size(); i++) {
        json j_param;
        TPParameter *param = this->params[i];
        j_param["name"] = param->name;       
        if (param->split_by)  {
            j_param["split_by"] = param->split_by->to_str();
        }
        for (auto d : param->divisors) {
            j_param["divisors"].push_back(d->to_str());
        }        
        for (auto m : param->multiples) {
            j_param["multiples"].push_back(m->to_str());
        }
        j_param["tunable"] = param->tune;
        j_param["attr"] = param->attr;    
        if (param->bounds.size() > 0)
            j_param["bounds"] = {param->bounds[0]->to_str(), param->bounds[1]->to_str()};        
        for (auto tag : param->tags) {
            j_param["tags"].push_back(tag);
        }
        j_params.push_back(j_param);
    }
    j["params"] = j_params;

    // loop struct - latency    
    for (auto x: this->module_loop_info) {
        //std::cout << x.first << std::endl;
        j["latency"][x.first] = *x.second;
    }
    
    // design stats - resource
    for (auto x: this->module_memory_info) {
        j["memory"][x.first] = *x.second;
    }
    for (auto x: this->module_compute_info) {        
        j["compute"][x.first] = *x.second;
    }
    for (auto x: this->module_io_info) {
        j["io"][x.first] = *x.second;
    }

    for (auto x: this->module_attr) {
        j["attr"][x.first] = *x.second;
    }

    std::string file_name = dir + "/kernel" + std::to_string(this->id);
    if (this->id2 >= 0) {
        file_name += "_";        
        file_name += std::to_string(this->id2);
    }
    std::ofstream o(file_name + ".json");
    o << std::setw(4) << j << std::endl;
    o.close();    

    return;
}

/* Callback for isl_schedule_node_map_descendant_bottom_up().
 *
 * Break a band node into single-member bands and, for each member that has
 * a TPIterator attached, insert a mark node named "iter_info" directly
 * below that single-member band. The mark's user pointer is the TPIterator.
 * Non-band nodes are returned unchanged.
 *
 * On return "node" points to the outermost of the resulting bands.
 */
static __isl_give isl_schedule_node *modify_tuning_schedule(
    __isl_take isl_schedule_node *node, void *user)
{
    if (!node)
        return NULL;

    /* NOTE(review): "program" is not used in this function. */
    TuningProgram *program = (TuningProgram *)user;
    isl_ctx *ctx = isl_schedule_node_get_ctx(node);    

    if (isl_schedule_node_get_type(node) == isl_schedule_node_band) {
        int n = isl_schedule_node_band_n_member(node);
        /* Peel off the innermost member first: after splitting at "i" the
         * child band holds exactly member i.
         */
        for (int i = n - 1; i >= 0; i--) {
            if (i > 0) {
                node = isl_schedule_node_band_split(node, i);
                node = isl_schedule_node_child(node, 0);
            }
            /* "node" is now a single-member band. */
            TPIterator *iter = (TPIterator *)isl_schedule_node_band_member_get_iter(node, 0);
            if (iter) {
                isl_id *id = isl_id_alloc(ctx, "iter_info", iter);
                /* Insert it under the current band node. */
                node = isl_schedule_node_child(node, 0);
                node = isl_schedule_node_insert_mark(node, id);
                node = isl_schedule_node_parent(node); // band node
            }
            if (i > 0) {
                /* Back to the outer band holding the remaining members. */
                node = isl_schedule_node_parent(node);
            }
        }
    }

    return node;
}

/* Generate the schedule used for performance estimation.
 *
 * The input schedule is duplicated and consumed.  In the duplicate, every
 * band is broken into single-member bands and an "iter_info" mark holding
 * the loop iterator information is inserted below each band that has such
 * information attached (see modify_tuning_schedule).
 */
__isl_give isl_schedule *TuningProgram::generate_tuning_schedule(__isl_take isl_schedule *schedule) {
    isl_schedule *working_copy = isl_schedule_dup(schedule);
    isl_schedule_free(schedule);

    isl_schedule_node *cursor = isl_schedule_get_root(working_copy);
    cursor = isl_schedule_node_map_descendant_bottom_up(
        cursor, &modify_tuning_schedule, this);
    isl_schedule_free(working_copy);

    /* The rewritten tree is only reachable through the node. */
    isl_schedule *tuning_schedule = isl_schedule_node_get_schedule(cursor);
    isl_schedule_node_free(cursor);

    return tuning_schedule;
}

/* Print the expression of the user node "node" in C format and return it
 * as the JSON object {"user_expr": "<expression>"}.
 * If the expression cannot be printed, "user_expr" is the empty string.
 */
std::shared_ptr<json> extract_isl_ast_node_user(__isl_keep isl_ast_node *node)
{
    std::shared_ptr<json> info = std::make_shared<json>();

    isl_ctx *ctx = isl_ast_node_get_ctx(node);
    isl_ast_expr *expr = isl_ast_node_user_get_expr(node);
    isl_printer *p_str = isl_printer_to_str(ctx);
    p_str = isl_printer_set_output_format(p_str, ISL_FORMAT_C);
    p_str = isl_printer_print_ast_expr(p_str, expr);
    char *user_expr = isl_printer_get_str(p_str);
    isl_printer_free(p_str);
    isl_ast_expr_free(expr);

    /* isl_printer_get_str may return NULL; constructing a std::string from
     * a null pointer is undefined behavior.
     */
    (*info)["user_expr"] = std::string(user_expr ? user_expr : "");
    free(user_expr);

    return info;
}

/* State threaded through the recursive calls of extract_loop_info(). */
struct extract_loop_info_data {
    /* Set to 1 when a for node is visited and cleared when any other kind
     * of node is handled.  An "iter_info" mark is only turned into a loop
     * entry while this is 1.
     */
    int after_for;
};

std::shared_ptr<json> extract_loop_info(__isl_keep isl_ast_node *node, void *user)
{    
    std::shared_ptr<json> j_info;
    enum isl_ast_node_type type;    
    isl_ctx *ctx = isl_ast_node_get_ctx(node);
    type = isl_ast_node_get_type(node);
    struct extract_loop_info_data *data = (struct extract_loop_info_data *)user;

    switch(type) {
        case isl_ast_node_for:
        {         
            data->after_for = 1;            
            isl_ast_node *child;
            child = isl_ast_node_for_get_body(node);
            std::shared_ptr<json> j_child = extract_loop_info(child, user);
            isl_ast_node_free(child);
            j_info = j_child;            

            break;
        }
        case isl_ast_node_block:
        {
            data->after_for = 0;
            /* Extract the block information and insert it into the loop struc. */
            j_info = std::make_shared<json>();
            *j_info = {{"type", "block"}, {"child", {}}};
            isl_ast_node_list *child_list = isl_ast_node_block_get_children(node);
            int n_child = isl_ast_node_list_n_ast_node(child_list);
            for (int i = 0; i < n_child; i++) {
                isl_ast_node *child = isl_ast_node_list_get_ast_node(child_list, i);
                std::shared_ptr<json> j_child = extract_loop_info(child, user);
                isl_ast_node_free(child);
                (*j_info)["child"].push_back(*j_child);
            }
            isl_ast_node_list_free(child_list);            
            break;
        }
        case isl_ast_node_user:
        {
            data->after_for = 0;
            /* Print nothing. */
            j_info = std::make_shared<json>();
            std::shared_ptr<json> j_user = extract_isl_ast_node_user(node);
            *j_info = {{"type", "user"}, {"child", *j_user}};            
            break;
        }
        case isl_ast_node_if: 
        {
            data->after_for = 0;
            j_info = std::make_shared<json>();
            *j_info = {{"type", "if"}, {"child", {}}};
            isl_ast_node *then_child, *else_child;
            then_child = isl_ast_node_if_get_then_node(node);
            std::shared_ptr<json> j_then = extract_loop_info(then_child, user);
            isl_ast_node_free(then_child);
            (*j_info)["child"].push_back(*j_then);

            else_child = isl_ast_node_if_get_else_node(node);
            if (else_child) {
                std::shared_ptr<json> j_else = extract_loop_info(else_child, user);
                isl_ast_node_free(else_child);
                (*j_info)["child"].push_back(*j_else);
            }            
            break;
        }
        case isl_ast_node_mark: 
        {            
            isl_id *id = isl_ast_node_mark_get_id(node);                        
            TPIterator *iter = NULL;
            if (!strcmp(isl_id_get_name(id), "iter_info")) {
                if (data->after_for == 1) {
                    /* For loop */                
                    isl_ast_node *child = isl_ast_node_mark_get_node(node);
                    data->after_for = 0;
                    std::shared_ptr<json> j_child = extract_loop_info(child, user);
                    isl_ast_node_free(child);
                    iter = (TPIterator *)isl_id_get_user(id);
                    if (iter) {
                        j_info = std::make_shared<json>();
                        *j_info = {{"type", "for"}, {"iterator", iter->name}};
                        (*j_info)["bounds"].push_back(iter->lb->to_str());                
                        (*j_info)["bounds"].push_back(iter->ub->to_str());
                        (*j_info)["child"] = *j_child;
                    } else {
                        j_info = j_child;
                    }  
                } else {
                    /* Skip this one */
                    isl_ast_node *child = isl_ast_node_mark_get_node(node);
                    std::shared_ptr<json> j_child = extract_loop_info(child, user);
                    isl_ast_node_free(child);
                    j_info = j_child;
                }                             
            } else if (!strcmp(isl_id_get_name(id), "tuning_array_tile")) {
                data->after_for = 0;
                /* Print the array information */
                TPArrayTile *tile = (TPArrayTile *)isl_id_get_user(id);
                j_info = std::make_shared<json>();
                *j_info = {{"type", "array_tile"}, {"data_pack_factor", tile->data_pack_factor_inter->name}};
                std::string size = "";
                int is_first = 1;
                for (auto s : tile->sizes) {
                    if (!is_first)
                        size += "*";
                    size += s->to_str();
                    is_first = 0;
                }
                (*j_info)["size"] = size;
                (*j_info)["ele_size"] = tile->ele_size;
                (*j_info)["last_dim"] = tile->sizes[tile->sizes.size() - 1]->to_str();
            } else {
                std::string mark_content(isl_id_get_name(id));
                j_info = std::make_shared<json>();
                *j_info = {{"type", "mark"}, {"content", mark_content}};
                isl_ast_node *child = isl_ast_node_mark_get_node(node);
                data->after_for = 0;
                std::shared_ptr<json> j_child = extract_loop_info(child, user);
                isl_ast_node_free(child);                
                (*j_info)["child"] = *j_child;
            }
            isl_id_free(id);                        

            break;
        }
        default:
        {
            data->after_for = 0;
            break;
        }
    }

    return j_info;
}

/* Extract the loop structure from the "ast", used for latency estimation.
 * TODO: Extract the hw information for resource estimation. 
 */
void TuningProgram::extract_module_loop_info(std::string name, std::vector<isl_ast_node *> &ast) 
{
    if (ast.size() == 0)
        return;
            
    if (ast.size() == 1) {
        std::shared_ptr<json> j_loop;    
        struct extract_loop_info_data data = {0};
        j_loop = extract_loop_info(ast[0], &data);
        this->module_loop_info[name] = j_loop;
    } else if (ast.size() == 3) {        
        // outer module
        std::shared_ptr<json> j_loop1;
        struct extract_loop_info_data data = {0};
        j_loop1 = extract_loop_info(ast[0], &data);
        this->module_loop_info[name] = j_loop1;
        // intra module
        std::shared_ptr<json> j_loop2;
        data.after_for = 0;
        j_loop2 = extract_loop_info(ast[1], &data);
        this->module_loop_info[name + "_intra"] = j_loop2;
        // inter module
        std::shared_ptr<json> j_loop3;
        data.after_for = 0;
        j_loop3 = extract_loop_info(ast[2], &data);
        this->module_loop_info[name + "_inter"] = j_loop3;
    }

    return;
}

void TuningProgram::extract_module_attr(
    std::string name, int double_buffer, int in, int io, int to_dram, int serialize, int to_pe, int filter) {
    std::shared_ptr<json> j = std::make_shared<json>();    
    (*j)["double_buffer"] = double_buffer;
    (*j)["in"] = in;
    (*j)["io"] = io;
    (*j)["to_dram"] = to_dram;
    (*j)["serialize"] = serialize;
    (*j)["to_pe"] = to_pe;
    (*j)["filter"] = filter;

    this->module_attr[name] = j;

    return;
}

/* State shared by build_dim_iter_map() and extract_dim_expr(). */
struct build_dim_iter_map_data {
    /* The array reference that is being looked up; only copies of it are consumed. */
    isl_map *ref;
    /* "ref" with the prefix schedule relation of the matching node applied to
     * its domain.  Set by build_dim_iter_map() once a node matches.
     */
    isl_map *new_ref;
    /* Maps a schedule dimension to the TPIterator of the band member at that dimension. */
    std::unordered_map<int, TPIterator *> dim_iter_map;  
    /* The index expression that extract_dim_expr() adds terms to. */
    TPExpr *dim_expr;
    /* Set to 1 once a matching node has been found; later nodes are skipped. */
    int done;
};

/* Callback for isl_schedule_node_map_descendant_bottom_up.
 * "user" points to a struct build_dim_iter_map_data.
 *
 * Test if the domain of "node" is non-empty and a strict subset of the domain
 * of data->ref.  For the first node where this holds:
 *  - data->new_ref is set to data->ref with the prefix schedule relation of
 *    "node" applied to its domain,
 *  - the schedule tree is climbed from "node" towards the root and, for each
 *    band member that has a TPIterator attached, the mapping
 *    "schedule dimension -> TPIterator" is recorded in data->dim_iter_map,
 *  - data->done is set so that all later nodes are skipped.
 * "node" is always returned unchanged.
 *
 * isl_bool results are compared against isl_bool_true/isl_bool_false
 * explicitly: isl_bool_error is non-zero, so treating it as a plain truth
 * value would take an error for a match and would keep the climbing loop
 * running forever on a NULL node.
 */
__isl_give isl_schedule_node *build_dim_iter_map(__isl_take isl_schedule_node *node, void *user)
{
    struct build_dim_iter_map_data *data = (struct build_dim_iter_map_data *)user;
    if (data->done)
        return node;

    isl_union_set *domain = isl_schedule_node_get_domain(node);
    isl_union_set *ref_domain = isl_union_set_from_set(isl_map_domain(isl_map_copy(data->ref)));
    const bool matches =
        isl_union_set_is_empty(domain) == isl_bool_false &&
        isl_union_set_is_strict_subset(domain, ref_domain) == isl_bool_true;
    isl_union_set_free(domain);
    isl_union_set_free(ref_domain);
    if (!matches)
        return node;

    isl_union_map *prefix = isl_schedule_node_get_prefix_schedule_relation(node);
    data->new_ref = isl_map_from_union_map(isl_union_map_apply_domain(
        isl_union_map_from_map(isl_map_copy(data->ref)), prefix));
    data->done = 1;

    isl_schedule_node *cursor = isl_schedule_node_copy(node);
    while (isl_schedule_node_has_parent(cursor) == isl_bool_true) {
        if (isl_schedule_node_get_type(cursor) == isl_schedule_node_band) {
            /* The number of schedule dimensions above this band is the
             * position of its first member.
             */
            isl_set *prefix_range = isl_set_from_union_set(
                isl_union_map_range(isl_schedule_node_get_prefix_schedule_relation(cursor)));
            const int first_dim = isl_set_dim(prefix_range, isl_dim_set);
            isl_set_free(prefix_range);

            const int n_member = isl_schedule_node_band_n_member(cursor);
            for (int i = 0; i < n_member; i++) {
                TPIterator *iter = (TPIterator *)isl_schedule_node_band_member_get_iter(cursor, i);
                if (iter)
                    data->dim_iter_map[first_dim + i] = iter;
            }
        }
        cursor = isl_schedule_node_parent(cursor);
    }
    isl_schedule_node_free(cursor);

    return node;
}

/* Callback for isl_map_foreach_basic_map.
 * "user" points to a struct build_dim_iter_map_data.
 *
 * The equalities of "bmap" are read as a matrix with the columns ordered as
 * input dims, parameters, constant, divs, output dims.  The basic map is
 * asserted to have neither parameters nor divs.
 * For every equality in which the coefficient of the first output dim is 1,
 * each non-zero input coefficient whose dimension is present in
 * data->dim_iter_map contributes the term "(-coefficient) * iterator", and a
 * non-zero constant contributes "-constant"; all terms are added to
 * data->dim_expr.
 * Always returns isl_stat_ok.
 */
isl_stat extract_dim_expr(__isl_take isl_basic_map *bmap, void *user) 
{
    struct build_dim_iter_map_data *data = (struct build_dim_iter_map_data *)user;
    isl_mat *eq_mat = isl_basic_map_equalities_matrix(
        bmap, isl_dim_in, isl_dim_param, isl_dim_cst, isl_dim_div, isl_dim_out
    );
    assert(isl_basic_map_dim(bmap, isl_dim_param) == 0);
    assert(isl_basic_map_dim(bmap, isl_dim_div) == 0);

    const int n_in = isl_basic_map_dim(bmap, isl_dim_in);
    const int n_cst = isl_basic_map_dim(bmap, isl_dim_cst);
    /* The output column follows the input, parameter, constant and div columns. */
    const int out_col = n_in + isl_basic_map_dim(bmap, isl_dim_param)
                        + n_cst + isl_basic_map_dim(bmap, isl_dim_div);
    const int n_row = isl_mat_rows(eq_mat);

    for (int row = 0; row < n_row; row++) {
        isl_val *out_coef = isl_mat_get_element_val(eq_mat, row, out_col);
        const int out_coef_i = isl_val_get_num_si(out_coef);
        isl_val_free(out_coef);
        if (out_coef_i != 1)
            continue;

        /* Terms in the loop iterators. */
        for (int col = 0; col < n_in; col++) {
            isl_val *coef = isl_mat_get_element_val(eq_mat, row, col);
            const int coef_i = isl_val_get_num_si(coef);
            isl_val_free(coef);
            if (coef_i == 0)
                continue;

            auto entry = data->dim_iter_map.find(col);
            if (entry == data->dim_iter_map.end())
                continue;

            TPIterator *iter = entry->second;
            TPExpr *factor = new TPExpr("literal", new TPConst(coef_i * (-1)));
            TPExpr *variable = new TPExpr("literal", new TPParameter(iter->name));
            TPExpr *term = new TPExpr("mul", factor, variable);
            data->dim_expr = data->dim_expr->add(term);
        }

        /* Constant term. */
        for (int k = 0; k < n_cst; k++) {
            isl_val *cst = isl_mat_get_element_val(eq_mat, row, n_in + k);
            const int cst_i = isl_val_get_num_si(cst);
            isl_val_free(cst);
            if (cst_i != 0)
                data->dim_expr = data->dim_expr->add(new TPExpr("literal", new TPConst(cst_i * (-1))));
        }
    }

    isl_mat_free(eq_mat);
    isl_basic_map_free(bmap);

    return isl_stat_ok;
}

/* Build the TPArrayRef called "name" for the access relation "ref", with one
 * index expression per output dimension of "ref", written in terms of the
 * loop iterators attached to the bands of "schedule".
 *
 * If no node of "schedule" matches "ref" (see build_dim_iter_map), every
 * index is an empty TPExpr.
 */
std::shared_ptr<TPArrayRef> TuningProgram::build_array_ref(
    std::string name, __isl_keep isl_map *ref, __isl_keep isl_schedule *schedule)
{
    // Step 1: Build the mapping between the sched dims to the loop iterators
    // i0 -> c0
    // i1 -> c1
    // i2 -> c2
    struct build_dim_iter_map_data data;
    data.ref = ref;
    /* new_ref is only assigned when a schedule node matches.  Start from NULL
     * so that the isl calls below fail cleanly instead of reading an
     * indeterminate pointer.
     */
    data.new_ref = NULL;
    data.dim_expr = NULL;
    data.done = 0;
    isl_schedule_node *root = isl_schedule_get_root(schedule);
    root = isl_schedule_node_map_descendant_bottom_up(root, &build_dim_iter_map, &data);
    isl_schedule_node_free(root);

    // Step 2: Parse the access map to build the array reference
    // [i0, i1, i2, 1] -> A[i0, i2];
    // class array_ref
    // {
    //   std::string name; // A
    //   std::vector<TPExpr *> index; // [i0, i2]
    // }
    auto tp_ref = std::make_shared<TPArrayRef>();
    tp_ref->name = name;
    int dim = isl_map_dim(ref, isl_dim_out);
    for (int i = 0; i < dim; i++) {
        // Project out all the other output dims, keeping only dim "i"
        isl_map *ref_dim = isl_map_project_out(isl_map_copy(data.new_ref), isl_dim_out, 0, i);
        ref_dim = isl_map_project_out(ref_dim, isl_dim_out, 1, dim - i - 1);
        data.dim_expr = new TPExpr();
        isl_map_foreach_basic_map(ref_dim, &extract_dim_expr, &data);
        isl_map_free(ref_dim);
        tp_ref->index.push_back(data.dim_expr);
    }
    isl_map_free(data.new_ref);

    return tp_ref;
}

/* Update the array indices after tiling.
 * In every index expression of every array reference, the parameter named
 * after "tile_iter" is replaced by the expression
 * tile_iter * tile_factor + point_iter
 */
void TuningProgram::update_tiled_arrays(TPIterator *tile_iter, TPIterator *point_iter, TPParameter *tile_factor)
{
    for (size_t a = 0; a < this->arrays.size(); a++) {
        TPArray *array = this->arrays[a];
        for (const std::shared_ptr<TPArrayRef> &array_ref : array->refs) {
            for (TPExpr *&index_expr : array_ref->index) {
                /* The pattern and its substitute are rebuilt for each index
                 * and released once the replacement is done.
                 */
                TPExpr *pattern = new TPExpr("literal", new TPParameter(tile_iter->name));

                TPExpr *tile_term = new TPExpr("literal", new TPParameter(tile_iter->name));
                TPExpr *scaled = tile_term->mul(new TPExpr("literal", new TPParameter(tile_factor)));
                TPExpr *substitute = scaled->add(new TPExpr("literal", new TPParameter(point_iter->name)));

                index_expr = index_expr->replace(pattern, substitute);

                delete pattern;
                delete substitute;
            }
        }
    }
}

std::vector<TPExpr *> TuningProgram::infer_tiled_array_bound_at_dim(int dim, std::vector<std::shared_ptr<TPArrayRef>> refs, std::vector<TPIterator *> fixed_iters)
{
    TPExpr *lb = new TPExpr();
    TPExpr *ub = new TPExpr();
    std::unordered_map<std::string, TPExpr *> iter_ubs;
    for (auto iter : this->iters) {        
        iter_ubs[iter->name] = iter->ub;
    }
    std::unordered_map<std::string, TPExpr *> iter_lbs;
    for (auto iter : this->iters) {        
        iter_lbs[iter->name] = iter->lb;
    }
    std::unordered_set<std::string> ignore_iters;
    for (auto iter : fixed_iters) {        
        ignore_iters.insert(iter->name);
    }
    for (auto ref : refs) {
        TPExpr *index = ref->index[dim];        
        TPExpr *local_lb = index->infer_bound(iter_lbs, iter_ubs, ignore_iters, 0);        
        TPExpr *local_ub = index->infer_bound(iter_lbs, iter_ubs, ignore_iters, 1);
        lb = lb->min(local_lb);
        ub = ub->max(local_ub);
    }    
    TPExpr *size = (ub->subtract(lb->dup()))->add(new TPExpr("literal", new TPConst(1)));    
    size = size->simplify();
    std::vector<TPExpr *> ret = {lb, size};

    return ret;
}

/* Given the fixed iters, infer the maximal bounds of the tiled array given
 * the refs, and store them in "tile".
 *
 * For every array dimension, the lower bound and the size computed by
 * infer_tiled_array_bound_at_dim() replace tile->lbs and tile->sizes.
 * The number of dimensions is taken from the first reference; if "refs" is
 * empty there is nothing to infer from and "tile" is returned unchanged.
 * Returns "tile".
 *
 * NOTE(review): expressions already stored in tile->lbs/tile->sizes are
 * overwritten without being deleted - confirm callers always pass a tile
 * whose bounds are still empty.
 */
TPArrayTile *TuningProgram::infer_tiled_array_bounds(TPArrayTile *tile, std::vector<std::shared_ptr<TPArrayRef>> refs, std::vector<TPIterator *> fixed_iters)
{
    if (refs.empty())
        return tile;

    std::vector<TPExpr *> lbs;
    std::vector<TPExpr *> sizes;
    int dim = refs[0]->index.size();
    for (int i = 0; i < dim; i++) {
        std::vector<TPExpr *> ret = this->infer_tiled_array_bound_at_dim(i, refs, fixed_iters);
        lbs.push_back(ret[0]);
        sizes.push_back(ret[1]);
    }

    tile->lbs = lbs;
    tile->sizes = sizes;

    return tile;
}

std::shared_ptr<TPExpr> TPArrayTile::compute_size() {
    TPExpr *size = new TPExpr();
    for (auto s : this->sizes) {
        size = size->mul(s->dup());
    }
    return std::shared_ptr<TPExpr>(size);
}

std::shared_ptr<TPExpr> TPIterator::compute_size() {
    TPExpr *size = this->ub->dup();
    size = size->subtract(this->lb->dup());    
    return std::shared_ptr<TPExpr>(size);
}

/* State of the AST traversal performed by mul_space_dim(). */
struct mul_space_dim_data {    
    /* Running product of the sizes of the space loops found so far. */
    TPExpr *num;
    /* 1 if the previously visited node was a for node, 0 otherwise. */
    int after_for;
};

/* Callback for isl_ast_node_foreach_descendant_top_down.
 * "user" points to a struct mul_space_dim_data.
 *
 * When an "iter_info" mark is visited right after a for node and its
 * TPIterator is a "space" iterator, data->num is multiplied by a copy of the
 * size of that iterator.
 * Always returns isl_bool_true, so the traversal continues into the children.
 */
isl_bool mul_space_dim(__isl_keep isl_ast_node *node, void *user) {
    struct mul_space_dim_data *state = (struct mul_space_dim_data *)user;

    switch (isl_ast_node_get_type(node)) {
        case isl_ast_node_for:
            state->after_for = 1;
            break;
        case isl_ast_node_mark:
        {
            isl_id *mark = isl_ast_node_mark_get_id(node);
            if (!strcmp(isl_id_get_name(mark), "iter_info") && state->after_for) {
                TPIterator *iter = (TPIterator *)isl_id_get_user(mark);
                if (iter && iter->space_time == "space") {
                    std::shared_ptr<TPExpr> extent = iter->compute_size();
                    state->num = state->num->mul(extent->dup());
                }
            }
            isl_id_free(mark);
            state->after_for = 0;
            break;
        }
        default:
            state->after_for = 0;
            break;
    }

    return isl_bool_true;
}

/* Return the number of module instances described by "tree": the constant 1
 * multiplied by the size of every space loop found in the AST
 * (see mul_space_dim).
 */
std::shared_ptr<TPExpr> TuningProgram::extract_module_num(isl_ast_node *tree)
{
    struct mul_space_dim_data walk;
    walk.after_for = 0;
    walk.num = new TPExpr("literal", new TPConst(1));

    isl_ast_node_foreach_descendant_top_down(tree, &mul_space_dim, &walk);

    std::shared_ptr<TPExpr> module_num(walk.num);
    return module_num;
}

/* State of the AST traversals performed by extract_space_dim() and
 * extract_space_dim_io().
 */
struct extract_space_dim_data {    
    /* Sizes of the loops collected so far, in visiting order. */
    std::vector<std::shared_ptr<TPExpr>> dims;
    /* 1 if the previously visited node was a for node (only maintained by extract_space_dim()). */
    int after_for;
    /* Set to 1 once an "array" mark has been visited (only used by extract_space_dim_io()). */
    int after_array;
    /* The traversal stops at the mark "io_L<io_level>" (only used by extract_space_dim_io()). */
    int io_level;
};

/* Callback for isl_ast_node_foreach_descendant_top_down.
 * "user" points to a struct extract_space_dim_data.
 *
 * When an "iter_info" mark is visited right after a for node and its
 * TPIterator is a "space" iterator, a copy of the size of that iterator is
 * appended to data->dims.
 * Always returns isl_bool_true, so the traversal continues into the children.
 */
isl_bool extract_space_dim(__isl_keep isl_ast_node *node, void *user) {
    struct extract_space_dim_data *state = (struct extract_space_dim_data *)user;

    switch (isl_ast_node_get_type(node)) {
        case isl_ast_node_for:
            state->after_for = 1;
            break;
        case isl_ast_node_mark:
        {
            isl_id *mark = isl_ast_node_mark_get_id(node);
            if (!strcmp(isl_id_get_name(mark), "iter_info") && state->after_for) {
                TPIterator *iter = (TPIterator *)isl_id_get_user(mark);
                if (iter && iter->space_time == "space") {
                    std::shared_ptr<TPExpr> extent = iter->compute_size();
                    state->dims.push_back(std::shared_ptr<TPExpr>(extent->dup()));
                }
            }
            isl_id_free(mark);
            state->after_for = 0;
            break;
        }
        default:
            state->after_for = 0;
            break;
    }

    return isl_bool_true;
}

/* Return the sizes of the space loops found in "tree", in the order in which
 * the top-down traversal visits them (see extract_space_dim).
 */
std::vector<std::shared_ptr<TPExpr>> TuningProgram::extract_module_dims(isl_ast_node *tree)
{
    /* extract_space_dim only looks at "after_for" and "dims". */
    struct extract_space_dim_data collector;
    collector.after_for = 0;

    isl_ast_node_foreach_descendant_top_down(tree, &extract_space_dim, &collector);

    return collector.dims;
}

/* Callback for isl_ast_node_foreach_descendant_top_down.
 * "user" points to a struct extract_space_dim_data.
 *
 * Only mark nodes are of interest:
 *  - "iter_info": a copy of the size of the attached TPIterator is appended
 *    to data->dims if the iterator is a "space" iterator or if an "array"
 *    mark has been visited before,
 *  - "io_L<io_level>": the traversal does not descend below this mark,
 *  - "array": data->after_array is set.
 */
isl_bool extract_space_dim_io(__isl_keep isl_ast_node *node, void *user) {
    struct extract_space_dim_data *state = (struct extract_space_dim_data *)user;

    if (isl_ast_node_get_type(node) != isl_ast_node_mark)
        return isl_bool_true;

    isl_id *mark = isl_ast_node_mark_get_id(node);
    const char *mark_name = isl_id_get_name(mark);
    const std::string stop_mark = "io_L" + std::to_string(state->io_level);
    isl_bool descend = isl_bool_true;

    if (!strcmp(mark_name, "iter_info")) {
        TPIterator *iter = (TPIterator *)isl_id_get_user(mark);
        if (iter && (state->after_array || iter->space_time == "space")) {
            std::shared_ptr<TPExpr> extent = iter->compute_size();
            state->dims.push_back(std::shared_ptr<TPExpr>(extent->dup()));
        }
    } else if (!strcmp(mark_name, stop_mark.c_str())) {
        /* Stop at the io_mark "io_level". */
        descend = isl_bool_false;
    } else if (!strcmp(mark_name, "array")) {
        state->after_array = 1;
    }

    isl_id_free(mark);
    return descend;
}

/* Return the loop sizes collected from "tree" by extract_space_dim_io(),
 * where the traversal does not descend below the mark "io_L<io_level>".
 */
std::vector<std::shared_ptr<TPExpr>> TuningProgram::extract_module_dims_io(isl_ast_node *tree, int io_level)
{
    struct extract_space_dim_data collector;
    collector.io_level = io_level;
    collector.after_array = 0;
    collector.after_for = 0;

    isl_ast_node_foreach_descendant_top_down(tree, &extract_space_dim_io, &collector);

    return collector.dims;
}

void TuningProgram::extract_module_memory_info(std::string name, int double_buffer, TPArrayTile *tile, 
    std::vector<isl_ast_node *> &asts)
{
    auto j_memory = std::make_shared<json>();
    // Extract number of modules, double buffer, ele_type, ele_size, buffer_size, data_pack_factor
    (*j_memory)["double_buffer"] = double_buffer;
    (*j_memory)["array"] = tile->name;
    (*j_memory)["ele_type"] = tile->type;
    (*j_memory)["ele_size"] = tile->ele_size;    
    (*j_memory)["buf_size"] = tile->compute_size()->to_str();
    if (tile->data_pack_factor_inter)
        (*j_memory)["data_pack_factor_inter"] = tile->data_pack_factor_inter->to_str();
    if (tile->data_pack_factor_intra)
        (*j_memory)["data_pack_factor_intra"] = tile->data_pack_factor_intra->to_str();
    TPExpr *num = new TPExpr("literal", new TPConst(1));
    for (isl_ast_node *ast : asts) {
        num = num->mul(this->extract_module_num(ast).get()->dup());
    }
    (*j_memory)["num"] = num->to_str();
    delete num;
    this->module_memory_info[name] = j_memory;
}

void TuningProgram::extract_module_compute_info(std::string name, std::string arr_type, isl_ast_node *tree)
{
    auto j_compute = std::make_shared<json>();
    // Extract number of modules, unroll factor, array type
    for (auto p : this->params) {
        if (p->attr == "SIMD_tiling_factor")
            (*j_compute)["unroll_factor"] = p->name;
    }
    (*j_compute)["ele_type"] = arr_type;
    std::shared_ptr<TPExpr> num = this->extract_module_num(tree);    
    (*j_compute)["num"] = num->to_str();
    std::vector<std::shared_ptr<TPExpr>> dims = this->extract_module_dims(tree);
    for (auto dim : dims)
        (*j_compute)["dims"].push_back(dim->to_str());
    
    this->module_compute_info[name] = j_compute;
}

void TuningProgram::extract_module_io_info(std::string name, int io_level, std::vector<isl_ast_node *> &asts)
{
    auto j_io = std::make_shared<json>();
    // Extract dims of io modules
    for (isl_ast_node *ast : asts) {
        std::vector<std::shared_ptr<TPExpr>> dims = this->extract_module_dims_io(ast, io_level);
        for (auto dim : dims)
            (*j_io)["dims"].push_back(dim->to_str());
    }
    if ((*j_io)["dims"].size() == 0) {
        TPExpr *num = new TPExpr("literal", new TPConst(1));
        (*j_io)["dims"].push_back(num->to_str());
        delete num;
    }


    this->module_io_info[name] = j_io;
}

================================================
FILE: src/autosa_tuning.h
================================================
#ifndef _AUTOSA_TUNING_H
#define _AUTOSA_TUNING_H

#include <isl/schedule.h>
#include <isl/schedule_node.h>
#include <isl/constraint.h>

#include <string>
#include <vector>
#include <unordered_map>
#include <unordered_set>

#include "json.hpp"
#include "autosa_utils.h"

using json = nlohmann::json;

#if defined(__cplusplus)
extern "C" {
#endif    

//class TPTransformHistory {
//    public:
//        TPTransformHistory(){}
//};

//class TPStatement {
//    public:         
//};

/* A node of a symbolic expression tree.
 * "func" names the operation and "ops" holds its operands.  The operands are
 * owned by the node: they are deleted together with it.
 */
class TPExpr {
    public:
        /* An empty expression; its func is "NULL". */
        TPExpr() : func("NULL") {}
        /* A unary expression "f(op)"; takes ownership of "op". */
        TPExpr(std::string f, TPExpr *op) : func(f) {
            ops.push_back(op);
        }
        /* A binary expression "f(op1, op2)"; takes ownership of both operands. */
        TPExpr(std::string f, TPExpr *op1, TPExpr *op2) : func(f) {
            ops.push_back(op1);
            ops.push_back(op2);
        }

        /* Arithmetic */
        TPExpr *div_by_param(TPExpr *divisor);
        TPExpr *ceil();
        TPExpr *add(TPExpr *expr);        
        TPExpr *mul(TPExpr *expr);
        TPExpr *subtract(TPExpr *expr); // TODO
        TPExpr *min(TPExpr *expr);
        TPExpr *max(TPExpr *expr);

        /* Analysis and rewriting */
        TPExpr *infer_bound(
            std::unordered_map<std::string, TPExpr *> lbs, 
            std::unordered_map<std::string, TPExpr *> ubs,
            std::unordered_set<std::string> ignore, int max);
        TPExpr *simplify();
        TPExpr *replace(TPExpr *match, TPExpr *replace);
        TPExpr *dup();
        virtual std::string to_str();
        
        std::string func; // [floor, ceil, div, literal, mul, null, min, max, sub, add]
        std::vector<TPExpr *> ops;        
        
        /* Releases all operands. */
        virtual ~TPExpr() {
            for (TPExpr *op : ops) {
                delete op;
            }
        }
};

class TPIterator {
    public:
        TPIterator(){}
        TPIterator(std::string n, TPExpr *l, TPExpr *u) {
            name = n;
            lb = l;
            ub = u;
        }
        std::shared_ptr<TPExpr> compute_size();
        std::string name;
        TPExpr *lb;
        TPExpr *ub;     
        std::string space_time;
        ~TPIterator() {
            delete lb;
            delete ub;
        }
};

/* Tunable parameters by the tuner. */
class TPParameter: public TPExpr {
    public:
        /* "tune" and "split_by" are read by TuningProgram::dump() for every
         * parameter, so they must not be left indeterminate.
         */
        TPParameter() : tune(false), split_by(NULL) {}
        TPParameter(std::string n) {
            name = n;
            type = "param";        
            tune = false;
            split_by = NULL;
        }
        /* The name is "n_prefix" if "cnt" is zero and "n_prefix_t<cnt>" otherwise. */
        TPParameter(std::string n_prefix, int cnt) {
            if (cnt == 0) {
                name = n_prefix;
            } else {
                /* Tiling factors. */
                name = n_prefix + "_t" + std::to_string(cnt);
            }
            name_prefix = n_prefix;
            type = "param";        
            tune = false;
            split_by = NULL;
        }
        /* Copies the name, name prefix, type, tune flag, attribute and
         * split_by pointer of "p".  The bounds, divisors, multiples and tags
         * are not copied.
         */
        TPParameter(TPParameter *p) {
            name = p->name;
            name_prefix = p->name_prefix;
            type = p->type;            
            tune = p->tune;
            attr = p->attr;                        
            split_by = p->split_by;
        }     
        TPParameter *dup();
        std::string to_str();

        std::string name;
        std::string name_prefix;
        std::string type;        
        std::vector<std::shared_ptr<TPExpr>> bounds;        
        bool tune;
        /* The parameter is divisors of the following exps. */
        std::vector <std::shared_ptr<TPExpr>> divisors; 
        /* The parameter is multiples of the following exps. */
        std::vector <std::shared_ptr<TPExpr>> multiples;    
        /* Not owned: the destructor does not delete it. */
        TPParameter *split_by;
        /* Other constraint tags for this parameters. 
         * "power_of_two", this parameter should be a power of 2.
         * "auto_infer", this parameter will be auto-inferred by other parameters.
         * "external", this parameter will be provided externally.
         */
        std::unordered_set<std::string> tags;
        std::string attr;
        /* bounds, divisors and multiples are held through shared pointers and
         * release themselves.
         */
        virtual ~TPParameter() {}
};

/* An integer constant. */
class TPConst: public TPExpr {
    public:
        /* Give "val" a defined value so that a default-constructed constant
         * can be read safely.
         */
        TPConst() : val(0) {}
        TPConst(int v) {
            type = "const";
            val = v;
        }
        TPConst *dup();

        std::string type;
        int val;
};

class TPArrayRef {
    public:
        TPArrayRef(){}
        TPArrayRef(std::string n, std::vector<TPExpr *> idx) {
            name = n;
            for (auto i : idx) {
                index.push_back(i);
            }
        }
        std::string name;
        std::vector<TPExpr *> index;
        std::string to_str();
        ~TPArrayRef() {
            for (auto i : index) {
                delete i;
            }
        }
};

/* An array together with all the references to it. */
class TPArray {
    public:
        TPArray() {}
        TPArray(std::string n) : name(n) {}
        std::string name;
        std::vector<std::shared_ptr<TPArrayRef>> refs;
        /* The references are held through shared pointers; there is nothing
         * to release by hand.
         */
        ~TPArray() {}
};

class TPArrayTile {
    public:
        TPArrayTile(){data_pack_factor_inter = NULL; data_pack_factor_intra = NULL;}
        std::string name;
        std::string type;
        int ele_size; 
        std::vector<TPExpr *> lbs;
        std::vector<TPExpr *> sizes;
        TPParameter *data_pack_factor_inter;
        std::shared_ptr<TPExpr> data_pack_factor_intra;
        std::shared_ptr<TPExpr> compute_size();
        ~TPArrayTile() {
            for (auto lb : lbs) {
                delete lb;
            }
            for (auto size : sizes) {
                delete size;
            }
        }
};

/* Container for the iterators, parameters, arrays and per-module JSON
 * records of one tuning program.
 * The objects stored in "iters", "params" and "arrays" are deleted
 * together with the program.
 */
class TuningProgram {
    public:
        /* Only "id2" gets a default value (-1); "id" is left unset. */
        TuningProgram() : id2(-1) {}
        /* Initialize the tuning program from an ISL schedule */
        __isl_give isl_schedule *init_from_schedule(__isl_take isl_schedule *schedule);
        __isl_give isl_schedule_node *tile(__isl_take isl_schedule_node *node, int div, std::string step);
        __isl_give isl_schedule_node *tile(
            __isl_take isl_schedule_node *node, int pos, int div, std::string step, std::unordered_set<std::string> tags, int bound);
        void dump(std::string dir);
        __isl_give isl_schedule *generate_tuning_schedule(__isl_take isl_schedule *schedule);
        __isl_give isl_schedule *generate_io_tuning_schedule(__isl_take isl_schedule *schedule, int io_level);
        void extract_module_loop_info(std::string name, std::vector<isl_ast_node *> &tree);
        std::shared_ptr<TPExpr> extract_module_num(isl_ast_node *tree);
        //std::shared_ptr<TPExpr> extract_io_module_num(isl_ast_node *tree, int io_level);
        std::vector<std::shared_ptr<TPExpr>> extract_module_dims(isl_ast_node *tree);
        std::vector<std::shared_ptr<TPExpr>> extract_module_dims_io(isl_ast_node *tree, int io_level);
        void extract_module_memory_info(std::string name, int double_buffer, TPArrayTile *tile, std::vector<isl_ast_node *> &tree);
        void extract_module_compute_info(std::string name, std::string arr_type, isl_ast_node *tree);
        void extract_module_io_info(std::string name, int io_level, std::vector<isl_ast_node *> &tree);
        void extract_module_attr(std::string name, int double_buffer, int in, int io, int to_dram, int serialize, int to_pe, int filter);
        std::shared_ptr<TPArrayRef> build_array_ref(std::string name, __isl_keep isl_map *ref, __isl_keep isl_schedule *);
        void update_tiled_arrays(TPIterator *tile_iter, TPIterator *point_iter, TPParameter *tile_factor);
        TPArrayTile *infer_tiled_array_bounds(TPArrayTile *tile, std::vector<std::shared_ptr<TPArrayRef>> refs, std::vector<TPIterator *> fixed_iters);
        std::vector<TPExpr *> infer_tiled_array_bound_at_dim(int dim, std::vector<std::shared_ptr<TPArrayRef>> refs, std::vector<TPIterator *> fixed_iters);
        TPExpr *infer_array_index_lb(TPExpr *, std::vector<TPIterator *> fixed_iters);
        TPExpr *infer_array_index_ub(TPExpr *, std::vector<TPIterator *> fixed_iters);
        void load_param_names(char *path);

        std::vector<TPIterator *> iters;
        std::vector<TPParameter *> params;
        std::vector<TPArray *> arrays;
        // Maps the parameter name to the point in "params"
        std::unordered_map<std::string, TPParameter *> param_map;
        // kernel id to the tuning program
        int id;
        // second-level id for loop permutation
        int id2;
        std::unordered_map<std::string, std::shared_ptr<json>> module_loop_info;
        std::unordered_map<std::string, std::shared_ptr<json>> module_memory_info;
        std::unordered_map<std::string, std::shared_ptr<json>> module_compute_info;
        std::unordered_map<std::string, std::shared_ptr<json>> module_io_info;
        std::unordered_map<std::string, std::shared_ptr<json>> module_attr;
        std::vector<std::string> param_names;
        std::unordered_map<std::string, int> param_names_cnt;

        /* Release the iterators, parameters and arrays stored in the program. */
        ~TuningProgram() {
            for (TPIterator *it : iters)
                delete it;
            for (TPParameter *param : params)
                delete param;
            for (TPArray *arr : arrays)
                delete arr;
        }

        // Future use
        //std::unordered_set<TPStatement *> stmts;
        //std::vector<TPTransformHistory *> transform_history;
        //std::unordered_map<TPIterator *, TPIterator *> iter_map;
        //std::unordered_map<TPStatement *, TPStatement *> stmt_map;
};

#if defined(__cplusplus)
}
#endif  

#endif

================================================
FILE: src/autosa_utils.cpp
================================================
#include <assert.h>
#include <string.h>
#include <ctype.h>
#include <stdexcept>
#include <limits>
#include <cmath>

#include <isl/space.h>
#include <barvinok/isl.h>

#include "autosa_utils.h"

/* Parse "str" as an isl union map in "ctx".
 * A NULL string yields NULL.
 */
__isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str)
{
  return str ? isl_union_map_read_from_str(ctx, str) : NULL;
}

/* Callback: append the basic maps of "el" to the basic map list that
 * "user" points to. The first call simply adopts the list of "el".
 */
static isl_stat concat_basic_map(__isl_take isl_map *el, void *user)
{
  isl_basic_map_list **acc = (isl_basic_map_list **)user;
  isl_basic_map_list *pieces = isl_map_get_basic_map_list(el);

  isl_map_free(el);
  *acc = *acc ? isl_basic_map_list_concat(*acc, pieces) : pieces;

  return isl_stat_ok;
}

/* Collect the basic maps of every map in "umap" into a single list.
 * The result stays NULL if the callback is never invoked.
 */
__isl_give isl_basic_map_list *isl_union_map_get_basic_map_list(
    __isl_keep isl_union_map *umap)
{
  isl_basic_map_list *result = NULL;
  isl_map_list *maps = isl_union_map_get_map_list(umap);

  isl_map_list_foreach(maps, &concat_basic_map, &result);
  isl_map_list_free(maps);

  return result;
}

/* Callback: add the number of basic maps in "el" to the counter
 * that "user" points to.
 */
static isl_stat acc_n_basic_map(__isl_take isl_map *el, void *user)
{
  isl_size *total = (isl_size *)user;
  isl_basic_map_list *pieces = isl_map_get_basic_map_list(el);

  *total += isl_basic_map_list_n_basic_map(pieces);

  isl_basic_map_list_free(pieces);
  isl_map_free(el);
  return isl_stat_ok;
}

/* Count the basic maps over all maps of the union map "umap".
 */
isl_size isl_union_map_n_basic_map(__isl_keep isl_union_map *umap)
{
  isl_size total = 0;
  isl_map_list *maps = isl_union_map_get_map_list(umap);

  isl_map_list_foreach(maps, &acc_n_basic_map, &total);
  isl_map_list_free(maps);

  return total;
}

/* Turn "map" into its single basic map.
 * "map" is consumed; it must consist of exactly one basic map (asserted).
 * A NULL input yields NULL.
 */
__isl_give isl_basic_map *isl_basic_map_from_map(__isl_take isl_map *map)
{
  isl_basic_map_list *pieces;
  isl_basic_map *only;

  if (!map)
    return NULL;

  assert(isl_map_n_basic_map(map) == 1);
  pieces = isl_map_get_basic_map_list(map);
  isl_map_free(map);

  only = isl_basic_map_list_get_basic_map(pieces, 0);
  isl_basic_map_list_free(pieces);
  return only;
}

/* Return the part of the domain of "mupa" on which every element
 * of "mupa" is nonnegative.
 *
 * Without any elements, the entire domain is returned.
 * "mupa" is consumed.
 */
__isl_give isl_union_set *isl_multi_union_pw_aff_nonneg_union_set(
    __isl_take isl_multi_union_pw_aff *mupa)
{
  isl_union_set *result;
  isl_size n_el = isl_multi_union_pw_aff_dim(mupa, isl_dim_set);

  if (n_el < 0)
    mupa = isl_multi_union_pw_aff_free(mupa);
  if (!mupa)
    return NULL;
  if (n_el == 0)
    return isl_multi_union_pw_aff_domain(mupa);

  /* Start from the first element and intersect with the remaining ones. */
  result = isl_union_pw_aff_nonneg_union_set(
      isl_multi_union_pw_aff_get_union_pw_aff(mupa, 0));
  for (int pos = 1; pos < n_el; ++pos)
  {
    isl_union_pw_aff *el = isl_multi_union_pw_aff_get_union_pw_aff(mupa, pos);
    isl_union_set *el_nonneg = isl_union_pw_aff_nonneg_union_set(el);

    result = isl_union_set_intersect(result, el_nonneg);
  }

  isl_multi_union_pw_aff_free(mupa);
  return result;
}

/* Callback: add the subset of the domain of "pa" where "pa" is
 * nonnegative to the union set that "user" points to.
 * Reports an error when the accumulated union set has become NULL.
 */
static isl_stat nonneg_union_set(__isl_take isl_pw_aff *pa, void *user)
{
  isl_union_set **acc = (isl_union_set **)user;
  isl_set *nonneg = isl_pw_aff_nonneg_set(pa);

  *acc = isl_union_set_add_set(*acc, nonneg);
  if (!*acc)
    return isl_stat_error;
  return isl_stat_ok;
}

/* Return the part of the domain of "upa" on which "upa" is nonnegative.
 * "upa" is consumed; NULL is returned if visiting the pieces fails.
 */
__isl_give isl_union_set *isl_union_pw_aff_nonneg_union_set(
    __isl_take isl_union_pw_aff *upa)
{
  isl_union_set *result = isl_union_set_empty(isl_union_pw_aff_get_space(upa));
  isl_stat status = isl_union_pw_aff_foreach_pw_aff(upa, &nonneg_union_set, &result);

  if (status < 0)
    result = isl_union_set_free(result);
  isl_union_pw_aff_free(upa);

  return result;
}

/* Return a union set containing those elements in the domains
 * of the elements of "mupa" where they are all non zero.
 *
 * If there are no elements, then simply return the entire domain.
 * "mupa" is consumed.
 */
__isl_give isl_union_set *isl_multi_union_pw_aff_non_zero_union_set(
    __isl_take isl_multi_union_pw_aff *mupa)
{
  int i;
  isl_size n;
  isl_union_pw_aff *upa;
  isl_union_set *non_zero;

  n = isl_multi_union_pw_aff_dim(mupa, isl_dim_set);
  if (n < 0)
    mupa = isl_multi_union_pw_aff_free(mupa);
  if (!mupa)
    return NULL;

  if (n == 0)
    return isl_multi_union_pw_aff_domain(mupa);

  upa = isl_multi_union_pw_aff_get_union_pw_aff(mupa, 0);
  non_zero = isl_union_pw_aff_non_zero_union_set(upa);

  for (i = 1; i < n; ++i)
  {
    isl_union_set *non_zero_i;

    upa = isl_multi_union_pw_aff_get_union_pw_aff(mupa, i);
    /* Every element has to be tested for "non zero", like the first one;
     * testing for "nonnegative" here would admit zeros.
     */
    non_zero_i = isl_union_pw_aff_non_zero_union_set(upa);

    non_zero = isl_union_set_intersect(non_zero, non_zero_i);
  }

  isl_multi_union_pw_aff_free(mupa);
  return non_zero;
}

/* Callback: add the subset of the domain of "pa" where "pa" is
 * non zero to the union set that "user" points to.
 * Reports an error when the accumulated union set has become NULL.
 */
static isl_stat non_zero_union_set(__isl_take isl_pw_aff *pa, void *user)
{
  isl_union_set **acc = (isl_union_set **)user;
  isl_set *non_zero = isl_pw_aff_non_zero_set(pa);

  *acc = isl_union_set_add_set(*acc, non_zero);
  if (!*acc)
    return isl_stat_error;
  return isl_stat_ok;
}

/* Return the part of the domain of "upa" on which "upa" is non zero.
 * "upa" is consumed; NULL is returned if visiting the pieces fails.
 */
__isl_give isl_union_set *isl_union_pw_aff_non_zero_union_set(
    __isl_take isl_union_pw_aff *upa)
{
  isl_union_set *result = isl_union_set_empty(isl_union_pw_aff_get_space(upa));
  isl_stat status = isl_union_pw_aff_foreach_pw_aff(upa, &non_zero_union_set, &result);

  if (status < 0)
    result = isl_union_set_free(result);
  isl_union_pw_aff_free(upa);

  return result;
}

/* Print the isl_mat "mat" to "fp".
 */
void print_mat(FILE *fp, __isl_keep isl_mat *mat)
{
  isl_printer *printer = isl_printer_to_file(isl_mat_get_ctx(mat), fp);
  for (int i = 0; i < isl_mat_rows(mat); i++)
  {
    for (int j = 0; j < isl_mat_cols(mat); j++)
    {
      isl_printer_print_val(printer, isl_mat_get_element_val(mat, i, j));
      fprintf(fp, " ");
    }
    fprintf(fp, "\n");
  }
  isl_printer_free(printer);
}

/* Compare "vec1" and "vec2" element by element.
 * Return 0 if they have the same size and equal elements, 1 otherwise.
 */
int isl_vec_cmp(__isl_keep isl_vec *vec1, __isl_keep isl_vec *vec2)
{
  const isl_size len = isl_vec_size(vec1);

  if (len != isl_vec_size(vec2))
    return 1;

  for (int pos = 0; pos < len; pos++)
    if (isl_vec_cmp_element(vec1, vec2, pos) != 0)
      return 1;

  return 0;
}

/* Build the string "<a>_<b>" with an isl string printer and return it.
 */
char *concat(isl_ctx *ctx, const char *a, const char *b)
{
  isl_printer *printer = isl_printer_to_str(ctx);

  printer = isl_printer_print_str(printer, a);
  printer = isl_printer_print_str(printer, "_");
  printer = isl_printer_print_str(printer, b);

  char *joined = isl_printer_get_str(printer);
  isl_printer_free(printer);
  return joined;
}

/* Check whether "vec" has no element that isl reports as being non zero.
 */
bool isl_vec_is_zero(__isl_keep isl_vec *vec)
{
  const int len = isl_vec_size(vec);

  for (int pos = 0; pos < len; pos++)
  {
    isl_val *el = isl_vec_get_element_val(vec, pos);
    isl_bool el_is_zero = isl_val_is_zero(el);

    isl_val_free(el);
    if (!el_is_zero)
      return false;
  }
  return true;
}

/* Compare the tail of "s" with "suffix".
 * Return 0 if "s" ends with "suffix", 1 if "s" is shorter than "suffix",
 * and the non-zero result of strncmp on the tail otherwise.
 */
int suffixcmp(const char *s, const char *suffix)
{
  size_t s_len = strlen(s);
  size_t suffix_len = strlen(suffix);

  /* Compare the lengths as size_t; the difference of two size_t values
   * must not be squeezed through an int.
   */
  if (s_len < suffix_len)
    return 1;

  return strncmp(s + (s_len - suffix_len), suffix, suffix_len);
}

/* Extend "set" with one parameter per output dimension of "size",
 * named after the identifiers in "ids", and constrain each new
 * parameter p[i] to
 *
 *	{ : 0 <= p[i] < size[i] }
 *
 * The upper bound is replaced by its simple hull, so the result may be
 * an overapproximation.
 */
__isl_give isl_set *add_bounded_parameters_dynamic(
    __isl_take isl_set *set, __isl_keep isl_multi_pw_aff *size,
    __isl_keep isl_id_list *ids)
{
  const int n_new = isl_multi_pw_aff_dim(size, isl_dim_out);
  const unsigned first = isl_set_dim(set, isl_dim_param);

  /* Append the new parameters and name them. */
  set = isl_set_add_dims(set, isl_dim_param, n_new);
  for (int k = 0; k < n_new; ++k)
  {
    isl_id *id = isl_id_list_get_id(ids, k);
    set = isl_set_set_dim_id(set, isl_dim_param, first + k, id);
  }

  isl_space *param_space = isl_space_params(isl_set_get_space(set));
  isl_local_space *ls = isl_local_space_from_space(param_space);

  for (int k = 0; k < n_new; ++k)
  {
    isl_pw_aff *param = isl_pw_aff_var_on_domain(isl_local_space_copy(ls),
                                                 isl_dim_param, first + k);

    /* p[k] < size[k] */
    isl_pw_aff *size_k = isl_multi_pw_aff_get_pw_aff(size, k);
    isl_set *upper = isl_pw_aff_lt_set(isl_pw_aff_copy(param), size_k);
    upper = isl_set_from_basic_set(isl_set_simple_hull(upper));
    set = isl_set_intersect_params(set, upper);

    /* p[k] >= 0 */
    isl_pw_aff *zero = isl_pw_aff_zero_on_domain(isl_local_space_copy(ls));
    isl_set *lower = isl_pw_aff_ge_set(param, zero);
    set = isl_set_intersect_params(set, lower);
  }
  isl_local_space_free(ls);

  return set;
}

/* Convert "to_convert" to an integer by printing it in C format and
 * parsing the resulting string.
 *
 * Throws std::runtime_error if the expression cannot be printed or if
 * the printed form contains anything other than decimal digits.
 */
long int convert_pwqpoly_to_int(__isl_keep isl_pw_qpolynomial *to_convert)
{
  isl_ctx *ctx = isl_pw_qpolynomial_get_ctx(to_convert);
  isl_printer *p;
  char *str;

  p = isl_printer_to_str(ctx);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  p = isl_printer_print_pw_qpolynomial(p, to_convert);
  str = isl_printer_get_str(p);
  isl_printer_free(p);

  if (!str)
    throw std::runtime_error("[AutoSA] Error: Failed to print the pw_qpolynomial.\n");

  /* Check if the string only contains the digits */
  for (const char *c = str; *c != '\0'; c++)
  {
    /* <ctype.h> functions require a value representable as unsigned char. */
    if (!isdigit((unsigned char)*c)) {
      /* Release the string before leaving through the exception. */
      free(str);
      throw std::runtime_error("[AutoSA] Error: The pw_qpolynomial contains non-digits.\n");
    }
  }

  long int ret = atol(str);
  free(str);

  return ret;
}

/* Print "vec" with an isl string printer and return the resulting string.
 */
char *isl_vec_to_str(__isl_keep isl_vec *vec)
{
  isl_printer *printer = isl_printer_to_str(isl_vec_get_ctx(vec));

  printer = isl_printer_print_vec(printer, vec);
  char *text = isl_printer_get_str(printer);
  isl_printer_free(printer);

  return text;
}

/* Return the numerator of "val" as a long and free "val".
 * The denominator is asserted to be one.
 */
long isl_val_get_num(__isl_take isl_val *val)
{
  isl_val *den = isl_val_get_den_val(val);
  assert(isl_val_is_one(den));
  isl_val_free(den);

  long num = isl_val_get_num_si(val);
  isl_val_free(val);
  return num;
}

/* Callback for one piece of a piecewise affine expression.
 * A constant piece lowers the running minimum in "user" to its value
 * if that is smaller; a non-constant piece sets the running minimum
 * to the smallest long.
 */
static isl_stat find_pa_min(__isl_take isl_set *set, __isl_take isl_aff *aff, void *user)
{
  long &running = *(long *)user;

  if (!isl_aff_is_cst(aff))
    running = std::numeric_limits<long>::min();
  else
    running = std::min(running, isl_val_get_num(isl_aff_get_constant_val(aff)));

  isl_aff_free(aff);
  isl_set_free(set);
  return isl_stat_ok;
}

/* Fold the pieces of isl_set_dim_min(set, dim) with find_pa_min,
 * starting from the largest long, and return the result.
 */
long compute_set_min(__isl_keep isl_set *set, int dim)
{
  long result = std::numeric_limits<long>::max();
  isl_pw_aff *dim_min = isl_set_dim_min(isl_set_copy(set), dim);

  isl_pw_aff_foreach_piece(dim_min, &find_pa_min, &result);
  isl_pw_aff_free(dim_min);

  return result;
}

/* Callback for one piece of a piecewise affine expression.
 * A constant piece raises the running maximum in "user" to its value
 * if that is larger; a non-constant piece sets the running maximum
 * to the largest long.
 */
static isl_stat find_pa_max(__isl_take isl_set *set, __isl_take isl_aff *aff, void *user)
{
  long &running = *(long *)user;

  if (!isl_aff_is_cst(aff))
    running = std::numeric_limits<long>::max();
  else
    running = std::max(running, isl_val_get_num(isl_aff_get_constant_val(aff)));

  isl_aff_free(aff);
  isl_set_free(set);
  return isl_stat_ok;
}

/* Fold the pieces of isl_set_dim_max(set, dim) with find_pa_max,
 * starting from the smallest long, and return the result.
 */
long compute_set_max(__isl_keep isl_set *set, int dim)
{
  long result = std::numeric_limits<long>::min();
  isl_pw_aff *dim_max = isl_set_dim_max(isl_set_copy(set), dim);

  isl_pw_aff_foreach_piece(dim_max, &find_pa_max, &result);
  isl_pw_aff_free(dim_max);

  return result;
}

/* Return the positive divisors of "x" in ascending order.
 * An empty list is returned for x <= 0.
 */
std::vector<int> get_factors(int x) {
  std::vector<int> factors;
  std::vector<int> large_factors;

  /* Walk the divisors up to sqrt(x); each one pairs with the divisor x / i.
   * The product is computed in a wider type so it cannot overflow.
   */
  for (int i = 1; (long long)i * i <= x; i++) {
    if (x % i != 0)
      continue;
    factors.push_back(i);
    /* Do not list the square root of a perfect square twice. */
    if (i != x / i)
      large_factors.push_back(x / i);
  }
  /* The paired divisors were collected in descending order. */
  for (int i = (int)large_factors.size() - 1; i >= 0; i--) {
    factors.push_back(large_factors[i]);
  }
  return factors;
}

================================================
FILE: src/autosa_utils.h
================================================
#ifndef _AUTOSA_UTILS_H
#define _AUTOSA_UTILS_H

#include <isl/ast.h>
#include <isl/id.h>
#include <isl/id_to_ast_expr.h>
#include <isl/polynomial.h>

#include <pet.h>

#include <vector>

#include "ppcg.h"
#include "ppcg_options.h"

#if defined(__cplusplus)
extern "C" {
#endif    

/* Parse "str" as an isl union map; a NULL string yields NULL. */
__isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str);

/* Helpers working on the basic maps of a (union) map:
 * collect them into one list, count them, or extract the single
 * basic map of a map (asserted to contain exactly one).
 */
__isl_give isl_basic_map_list *isl_union_map_get_basic_map_list(
    __isl_keep isl_union_map *umap);
isl_size isl_union_map_n_basic_map(__isl_keep isl_union_map *umap);
__isl_give isl_basic_map *isl_basic_map_from_map(__isl_take isl_map *map);

/* Sign-based subsets of the domain of (multi) union piecewise affine
 * expressions; the argument is consumed. See the definitions in
 * autosa_utils.cpp for the exact semantics.
 */
__isl_give isl_union_set *isl_multi_union_pw_aff_nonneg_union_set(
    __isl_take isl_multi_union_pw_aff *mupa);
__isl_give isl_union_set *isl_union_pw_aff_nonneg_union_set(
    __isl_take isl_union_pw_aff *upa);
__isl_give isl_union_set *isl_multi_union_pw_aff_non_zero_union_set(
    __isl_take isl_multi_union_pw_aff *mupa);
__isl_give isl_union_set *isl_union_pw_aff_non_zero_union_set(
    __isl_take isl_union_pw_aff *upa);

/* Print "mat" to "fp", one row per line. */
void print_mat(FILE *fp, __isl_keep isl_mat *mat);
/* Return 0 if the two vectors are equal, 1 otherwise. */
int isl_vec_cmp(__isl_keep isl_vec *vec1, __isl_keep isl_vec *vec2);
/* Construct the string "<a>_<b>". */
char *concat(isl_ctx *ctx, const char *a, const char *b);
/* Check whether no element of "vec" is reported as non zero. */
bool isl_vec_is_zero(__isl_keep isl_vec *vec);
/* Return 0 if "s" ends with "suffix", a non-zero value otherwise. */
int suffixcmp(const char *s, const char *suffix);

/* Add one parameter per output dimension of "size", named after "ids",
 * and bound each by 0 <= p[i] < size[i] (or an overapproximation).
 */
__isl_give isl_set *add_bounded_parameters_dynamic(
    __isl_take isl_set *set, __isl_keep isl_multi_pw_aff *size,
    __isl_keep isl_id_list *ids);

/* Convert the printed form of "to_convert" to an integer.
 * Throws std::runtime_error if the printed form contains non-digits.
 */
long int convert_pwqpoly_to_int(__isl_keep isl_pw_qpolynomial *to_convert);

/* Get strings */
char *isl_vec_to_str(__isl_keep isl_vec *vec);

/* Return the numerator of "val" (denominator asserted to be one); "val" is freed. */
long isl_val_get_num(__isl_take isl_val *val);
/* Minimum/maximum over the constant pieces of dimension "dim" of "set";
 * a non-constant piece yields the smallest/largest long.
 */
long compute_set_min(__isl_keep isl_set *set, int dim);
long compute_set_max(__isl_keep isl_set *set, int dim);

/* Get the factors of the number x. */
std::vector<int> get_factors(int x);

#if defined(__cplusplus)
}
#endif

#endif

================================================
FILE: src/autosa_xilinx_hls_c.cpp
================================================
#include <isl/ctx.h>

#include "autosa_xilinx_hls_c.h"
#include "autosa_common.h"
#include "autosa_comm.h"
#include "autosa_print.h"
#include "autosa_trans.h"
#include "autosa_codegen.h"
#include "autosa_utils.h"

#include <set>

/* Bundle of pointers grouped under one struct.
 * NOTE(review): the name suggests it is passed as "user" data while
 * printing host code; its users are outside this chunk - confirm.
 */
struct print_host_user_data
{
  struct hls_info *hls;
  struct autosa_prog *prog;
  struct autosa_hw_top_module *top;
};

/* Bundle of pointers describing one hardware module plus an iterator prefix.
 * NOTE(review): the name suggests it is passed as "user" data while
 * printing a hardware module; its users are outside this chunk - confirm.
 */
struct print_hw_module_data
{
  struct hls_info *hls;
  struct autosa_prog *prog;
  struct autosa_hw_module *module;
  /* Used for double buffer codegen. Modify the printed iterator prefix. */
  const char *iterator_prefix;
};

/* Print the includes for Xilinx OpenCL host.
 *
 * Besides the includes, the following fixed text is written to "fp":
 * the OpenCL version macros, the OCL_CHECK error-checking macro,
 * the global "xclbin_file_name", the "aligned_allocator" template and
 * the helper functions "import_binary_file" and "get_devices".
 * Everything is emitted verbatim; nothing depends on the program
 * being compiled.
 */
static void print_xilinx_host_header(FILE *fp)
{
  /* Standard C++ headers used by the generated host code. */
  fprintf(fp, "#include <iostream>\n");
  fprintf(fp, "#include <vector>\n");
  fprintf(fp, "#include <fstream>\n\n");

  /* OpenCL C++ binding configuration; must precede the cl2.hpp include. */
  fprintf(fp, "#define CL_HPP_CL_1_2_DEFAULT_BUILD\n");
  fprintf(fp, "#define CL_HPP_TARGET_OPENCL_VERSION 120\n");
  fprintf(fp, "#define CL_HPP_MINIMUM_OPENCL_VERSION 120\n");
  fprintf(fp, "#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1\n");
  fprintf(fp, "#define CL_USE_DEPRECATED_OPENCL_1_2_APIS\n\n");

  fprintf(fp, "#include <CL/cl2.hpp>\n");
  fprintf(fp, "#include <CL/cl_ext_xilinx.h>\n\n");

  /* OCL_CHECK macro: run "call" and exit with a message on failure.
   * "%%" is the printf escape for a literal '%' in the generated text.
   */
  fprintf(fp, "#define OCL_CHECK(error,call)                                       \\\n");
  fprintf(fp, "    call;                                                           \\\n");
  fprintf(fp, "    if (error != CL_SUCCESS) {                                      \\\n");
  fprintf(fp, "      printf(\"%%s:%%d Error calling \" #call \", error code is: %%d\\n\",  \\\n");
  fprintf(fp, "              __FILE__,__LINE__, error);                            \\\n");
  fprintf(fp, "      exit(EXIT_FAILURE);                                           \\\n");
  fprintf(fp, "    }\n\n");

  fprintf(fp, "std::string xclbin_file_name;\n\n");

  /* Allocator that returns memory aligned to 4096 bytes. */
  fprintf(fp, "template <typename T>\n");
  fprintf(fp, "struct aligned_allocator\n");
  fprintf(fp, "{\n");
  fprintf(fp, "  using value_type = T;\n");
  fprintf(fp, "  T* allocate(std::size_t num)\n");
  fprintf(fp, "  {\n");
  fprintf(fp, "    void* ptr = nullptr;\n");
  fprintf(fp, "    if (posix_memalign(&ptr,4096,num*sizeof(T)))\n");
  fprintf(fp, "      throw std::bad_alloc();\n");
  fprintf(fp, "    return reinterpret_cast<T*>(ptr);\n");
  fprintf(fp, "  }\n");
  fprintf(fp, "  void deallocate(T* p, std::size_t num)\n");
  fprintf(fp, "  {\n");
  fprintf(fp, "    free(p);\n");
  fprintf(fp, "  }\n");
  fprintf(fp, "};\n\n");

  /* import_binary_file(): read the file named by xclbin_file_name into memory. */
  fprintf(fp, "cl::Program::Binaries import_binary_file()\n");
  fprintf(fp, "{\n");
  fprintf(fp, "    std::cout << \"\\n Loading: \"<< xclbin_file_name.c_str() << \"\\n\";\n");
  fprintf(fp, "    std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary);\n");
  fprintf(fp, "    bin_file.seekg (0, bin_file.end);\n");
  fprintf(fp, "    unsigned nb = bin_file.tellg();\n");
  fprintf(fp, "    bin_file.seekg (0, bin_file.beg);\n");
  fprintf(fp, "    char *buf = new char [nb];\n");
  fprintf(fp, "    bin_file.read(buf, nb);\n");
  fprintf(fp, "\n");
  fprintf(fp, "    cl::Program::Binaries bins;\n");
  fprintf(fp, "    bins.push_back({buf,nb});\n");
  fprintf(fp, "    return bins;\n");
  fprintf(fp, "}\n\n");

  /* get_devices(): find the platform named "Xilinx" and list its accelerator devices. */
  fprintf(fp, "std::vector<cl::Device> get_devices() {\n");
  fprintf(fp, "    size_t i;\n");
  fprintf(fp, "    cl_int err;\n");
  fprintf(fp, "    std::vector<cl::Platform> platforms;\n");
  fprintf(fp, "    OCL_CHECK(err, err = cl::Platform::get(&platforms));\n");
  fprintf(fp, "    cl::Platform platform;\n");
  fprintf(fp, "    for (i  = 0 ; i < platforms.size(); i++){\n");
  fprintf(fp, "        platform = platforms[i];\n");
  fprintf(fp, "        OCL_CHECK(err, std::string platformName = platform.getInfo<CL_PLATFORM_NAME>(&err));\n");
  fprintf(fp, "        if (platformName == \"Xilinx\"){\n");
  fprintf(fp, "            std::cout << \"\\nFound Platform\" << std::endl;\n");
  fprintf(fp, "            std::cout << \"\\nPlatform Name: \" << platformName.c_str() << std::endl;\n");
  fprintf(fp, "            break;\n");
  fprintf(fp, "        }\n");
  fprintf(fp, "    }\n");
  fprintf(fp, "    if (i == platforms.size()) {\n");
  fprintf(fp, "        std::cout << \"Error: Failed to find Xilinx platform\" << std::endl;\n");
  fprintf(fp, "        exit(EXIT_FAILURE);\n");
  fprintf(fp, "    }\n");
  fprintf(fp, "    //Getting ACCELERATOR Devices and selecting 1st such device\n");
  fprintf(fp, "    std::vector<cl::Device> devices;\n");
  fprintf(fp, "    OCL_CHECK(err, err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices));\n");
  fprintf(fp, "    return devices;\n");
  fprintf(fp, "}\n\n");
}

/* Open "path" for writing.
 * Print an error message and terminate the program if that fails.
 */
static FILE *hls_fopen_write_or_exit(const char *path)
{
  FILE *fp = fopen(path, "w");

  if (!fp)
  {
    printf("[AutoSA] Error: Can't open the file: %s\n", path);
    exit(1);
  }
  return fp;
}

/* Open the host .cpp file and the kernel .h and .cpp files for writing.
 * Add the necessary includes.
 *
 * All files are created in "<output_dir>/src/" and are named after the
 * base name of "input" followed by a fixed suffix. The host header is
 * only created for the OpenCL host (info->hls unset) and the HCL
 * declaration file only if info->hcl is set.
 * The program is terminated if any of the files cannot be opened.
 */
static void hls_open_files(struct hls_info *info, const char *input)
{
  /* Longest suffix that is appended to the base name below. */
  static const char longest_suffix[] = "_kernel_modules.cpp";
  char name[PATH_MAX];
  char dir[PATH_MAX];
  int len, len_dir;
  isl_printer *p_str;
  char *file_path;

  p_str = isl_printer_to_str(info->ctx);
  p_str = isl_printer_print_str(p_str, info->output_dir);
  p_str = isl_printer_print_str(p_str, "/src/");
  file_path = isl_printer_get_str(p_str);
  isl_printer_free(p_str);
  len = ppcg_extract_base_name(name, input);
  len_dir = strlen(file_path);

  /* Make sure that every "<dir><base><suffix>" fits in the buffers
   * before anything is copied into them.
   */
  if ((size_t)len_dir + (size_t)len + strlen(longest_suffix) >= (size_t)PATH_MAX)
  {
    printf("[AutoSA] Error: The output path is too long: %s\n", file_path);
    exit(1);
  }

  /* Add the prefix */
  snprintf(dir, sizeof(dir), "%s", file_path);

  strcpy(name + len, "_host.cpp");
  strcpy(dir + len_dir, name);
  info->host_c = hls_fopen_write_or_exit(dir);

  if (!info->hls)
  {
    /* OpenCL host */
    strcpy(name + len, "_host.hpp");
    strcpy(dir + len_dir, name);
    info->host_h = hls_fopen_write_or_exit(dir);
    print_xilinx_host_header(info->host_h);
    fprintf(info->host_c, "#include \"%s\"\n", name);
  }

  strcpy(name + len, "_kernel_modules.cpp");
  strcpy(dir + len_dir, name);
  info->kernel_c = hls_fopen_write_or_exit(dir);

  strcpy(name + len, "_kernel.h");
  strcpy(dir + len_dir, name);
  info->kernel_h = hls_fopen_write_or_exit(dir);

  /* "name" holds the kernel header name at this point. */
  fprintf(info->host_c, "#include <assert.h>\n");
  fprintf(info->host_c, "#include <stdio.h>\n");
  if (info->hls)
    fprintf(info->host_c, "#include \"%s\"\n\n", name);

  if (info->hls && !info->hcl)
    fprintf(info->kernel_c, "#include \"%s\"\n", name);

  if (info->hcl) {
    strcpy(name + len, "_hcl_decl.h");
    strcpy(dir + len_dir, name);
    info->hcl_decl = hls_fopen_write_or_exit(dir);
  }

  strcpy(name + len, "_top_gen.cpp");
  strcpy(dir + len_dir, name);
  info->top_gen_c = hls_fopen_write_or_exit(dir);

  strcpy(name + len, "_top_gen.h");
  strcpy(dir + len_dir, name);
  info->top_gen_h = hls_fopen_write_or_exit(dir);

  /* "name" holds the top generator header name at this point. */
  fprintf(info->top_gen_c, "#include <isl/printer.h>\n");
  fprintf(info->top_gen_c, "#include \"%s\"\n", name);
    
  fprintf(info->kernel_h, "#include <ap_int.h>\n");
  fprintf(info->kernel_h, "#include <hls_stream.h>\n");
  fprintf(info->kernel_h, "\n");  

  fprintf(info->kernel_h, "#define min(x,y) ((x < y) ? x : y)\n");
  fprintf(info->kernel_h, "#define max(x,y) ((x > y) ? x : y)\n");
  fprintf(info->kernel_h, "\n");  

  free(file_path);
}

/* Close all output files.
 * Afterwards, create the empty file "<output_dir>/src/completed".
 * A failure to create that file is reported but is not fatal.
 */
static void hls_close_files(struct hls_info *info)
{
  isl_printer *p_str;
  char *complete;
  FILE *f;

  fclose(info->kernel_c);
  fclose(info->kernel_h);
  fclose(info->host_c);
  if (!info->hls)
  {
    fclose(info->host_h);
  }
  fclose(info->top_gen_c);
  fclose(info->top_gen_h);
  if (info->hcl)
    fclose(info->hcl_decl);

  p_str = isl_printer_to_str(info->ctx);
  p_str = isl_printer_print_str(p_str, info->output_dir);
  p_str = isl_printer_print_str(p_str, "/src/completed");
  complete = isl_printer_get_str(p_str);
  isl_printer_free(p_str);
  f = fopen(complete, "w");
  /* Passing a NULL stream to fclose is undefined behavior. */
  if (f)
    fclose(f);
  else
    printf("[AutoSA] Error: Can't open the file: %s\n", complete);
  free(complete);
}

/* Insert "factor" into the list "factors" of length "*n_factor",
 * which is kept in ascending order, unless it is already present.
 * "*n_factor" is updated and the (possibly reallocated) list is returned.
 * The program is terminated if the list cannot be enlarged.
 */
static int *insert_data_pack_factor(int *factors, int *n_factor, int factor)
{
  int pos = 0;

  /* Locate the first entry that is not smaller than "factor". */
  while (pos < *n_factor && factors[pos] < factor)
    pos++;
  if (pos < *n_factor && factors[pos] == factor)
    return factors;

  int *grown = (int *)realloc(factors, sizeof(int) * (*n_factor + 1));
  if (!grown)
  {
    printf("[AutoSA] Error: Out of memory while collecting the data pack factors.\n");
    exit(1);
  }

  /* Shift the larger entries up by one and drop "factor" into the gap. */
  for (int j = *n_factor; j > pos; j--)
    grown[j] = grown[j - 1];
  grown[pos] = factor;
  *n_factor = *n_factor + 1;

  return grown;
}

/* Extract the data pack factors for each I/O buffer allocated for the current
 * I/O group.
 * Only insert the data pack factor that is not found in the current list
 * "data_pack_factors".
 * The list is in ascending order.
 *
 * The default factor of the group (group->n_lane) is only considered
 * if it is greater than one. The (possibly reallocated) list is returned
 * and "*n_factor" is updated to its new length.
 */
static int *extract_data_pack_factors(int *data_pack_factors,
                                      int *n_factor, struct autosa_array_ref_group *group)
{
  /* Test if the group default packing factor needs to be inserted */
  if (group->n_lane > 1)
    data_pack_factors = insert_data_pack_factor(data_pack_factors, n_factor,
                                                group->n_lane);

  for (int i = 0; i < group->n_io_buffer; i++)
  {
    struct autosa_io_buffer *buf = group->io_buffers[i];

    data_pack_factors = insert_data_pack_factor(data_pack_factors, n_factor,
                                                buf->n_lane);
  }

  return data_pack_factors;
}

/* Print the data types required by the program to the kernel header.
 *
 * For every array, "<name>_t1" is defined as an alias of the element type.
 * The data pack factors of the array are then collected from its default
 * factor, its I/O groups and its drain group, and a packed type
 * "<name>_t<f>" is defined as ap_uint of the matching width for each
 * factor f greater than one.
 * Sparse arrays additionally get the packed types scaled by
 * kernel->n_nzero (types wider than 1024 bits are skipped with a warning)
 * and one "<name>_s_t<f>" struct per factor with a data field "d" and an
 * index field "i".
 */
static isl_stat print_data_types_xilinx(
    struct autosa_hw_top_module *top, struct hls_info *hls)
{
  struct autosa_kernel *kernel = top->kernel;
  isl_printer *p = isl_printer_to_file(kernel->ctx, hls->kernel_h);

  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  p = print_str_new_line(p, "/* Data Type */");

  /* Print the primitive data type. */
  for (int a = 0; a < kernel->n_array; a++) {
    struct autosa_local_array_info *arr = &kernel->array[a];

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "typedef ");
    p = isl_printer_print_str(p, arr->array->type);
    p = isl_printer_print_str(p, " ");
    p = isl_printer_print_str(p, arr->array->name);
    p = isl_printer_print_str(p, "_t1;");
    p = isl_printer_end_line(p);
  }

  for (int a = 0; a < kernel->n_array; a++)
  {
    struct autosa_local_array_info *arr = &kernel->array[a];
    /* Width of a single array element in bits. */
    const int elem_bits = arr->array->size * 8;
    int n_factor = 1;
    int *factors = (int *)malloc(sizeof(int));

    /* The list starts with the default data pack factor of the array. */
    factors[0] = arr->n_lane;

    /* Factors of the I/O groups, followed by those of the drain group. */
    for (int g = 0; g < arr->n_io_group; g++)
      factors = extract_data_pack_factors(factors, &n_factor, arr->io_groups[g]);
    if (arr->drain_group)
      factors = extract_data_pack_factors(factors, &n_factor, arr->drain_group);

    if (!arr->is_sparse) {
      /* Dense array: one packed type per factor other than one. */
      for (int n = 0; n < n_factor; n++)
      {
        if (factors[n] == 1)
          continue;

        int width = elem_bits * factors[n];
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "typedef ap_uint<");
        p = isl_printer_print_int(p, width);
        p = isl_printer_print_str(p, "> ");
        p = isl_printer_print_str(p, arr->array->name);
        p = isl_printer_print_str(p, "_t");
        p = isl_printer_print_int(p, factors[n]);
        p = isl_printer_print_str(p, ";");
        p = isl_printer_end_line(p);
      }
    } else {
      /* Sparse array: both the plain and the n_nzero-scaled factors are
       * needed; the set removes duplicates and orders them.
       */
      std::set<int> lanes;
      for (int n = 0; n < n_factor; n++) {
        lanes.insert(factors[n] * kernel->n_nzero);
        lanes.insert(factors[n]);
      }
      for (int f : lanes) {
        int width = elem_bits * f;

        if (width > 1024) {
          printf("[AutoSA] Warning: The data width %d is greater than 1024-bit. The type definition is not generated.\n", width);
          continue;
        }
        if (f <= 1)
          continue;

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "typedef ap_uint<");
        p = isl_printer_print_int(p, width);
        p = isl_printer_print_str(p, "> ");
        p = isl_printer_print_str(p, arr->array->name);
        p = isl_printer_print_str(p, "_t");
        p = isl_printer_print_int(p, f);
        p = isl_printer_print_str(p, ";");
        p = isl_printer_end_line(p);
      }

      /* One struct per factor, pairing the packed data with its index. */
      for (int n = 0; n < n_factor; n++) {
        const int factor = factors[n];
        const int packed = factor * kernel->n_nzero;
        /* A single element needs no packed types. */
        const bool scalar = (factor == 1 && kernel->n_nzero == 1);

        if (packed * elem_bits > 1024)
          continue;

        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "typedef struct ");
        p = isl_printer_print_str(p, arr->array->name);
        p = isl_printer_print_str(p, "_s_t");
        p = isl_printer_print_int(p, factor);
        p = isl_printer_print_str(p, " {");
        p = isl_printer_end_line(p);

        p = isl_printer_indent(p, 2);

        /* Data field. */
        p = isl_printer_start_line(p);
        if (scalar) {
          p = isl_printer_print_str(p, arr->array->type);
        } else {
          p = isl_printer_print_str(p, arr->array->name);
          p = isl_printer_print_str(p, "_t");
          p = isl_printer_print_int(p, packed);
        }
        p = isl_printer_print_str(p, " d;");
        p = isl_printer_end_line(p);

        /* Index field. */
        p = isl_printer_start_line(p);
        if (scalar) {
          p = isl_printer_print_str(p, "unsigned char");
        } else {
          p = isl_printer_print_str(p, "ap_uint<");
          p = isl_printer_print_int(p, 8 * factor);
          p = isl_printer_print_str(p, ">");
        }
        p = isl_printer_print_str(p, " i;");
        p = isl_printer_end_line(p);

        p = isl_printer_indent(p, -2);
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "} ");
        p = isl_printer_print_str(p, arr->array->name);
        p = isl_printer_print_str(p, "_s_t");
        p = isl_printer_print_int(p, factor);
        p = isl_printer_print_str(p, ";");
        p = isl_printer_end_line(p);
      }
    }
    free(factors);
  }
  p = print_str_new_line(p, "/* Data Type */");
  p = isl_printer_end_line(p);
  isl_printer_free(p);

  return isl_stat_ok;
}

/* Print the fixed preamble of the generated Xilinx OpenCL host program.
 *
 * The emitted text first checks that exactly one command line argument
 * (the XCLBIN file) was given, and then selects the first device, creates
 * the context and command queue, imports the binary and creates the
 * program and the "kernel0" kernel object.
 * Each group of lines is followed by an extra end-of-line.
 */
static __isl_give isl_printer *find_device_xilinx(__isl_take isl_printer *p)
{
  /* Body of the "if (argc != 2)" usage check. */
  static const char *const usage_body[] = {
    "std::cout << \"Usage: \" << argv[0] << \" <XCLBIN File>\" << std::endl;",
    "return EXIT_FAILURE;",
  };
  /* Device discovery and program/kernel creation. */
  static const char *const setup_lines[] = {
    "cl_int err;",
    "std::vector<cl::Device> devices = get_devices();",
    "cl::Device device = devices[0];",
    "std::string device_name = device.getInfo<CL_DEVICE_NAME>();",
    "std::cout << \"Found Device=\" << device_name.c_str() << std::endl;",
    "// Creating Context and Command Queue for selected device",
    "cl::Context context(device);",
    "cl::CommandQueue q(context, device);",
    "// Import XCLBIN",
    "xclbin_file_name = argv[1];",
    "cl::Program::Binaries kernel_bins = import_binary_file();",
    "// Create Program and Kernel",
    "//devices.erase(devices.begin());",
    "devices.resize(1);",
    "cl::Program program(context, devices, kernel_bins);",
    "cl::Kernel krnl(program, \"kernel0\");",
  };
  const int n_usage = (int)(sizeof(usage_body) / sizeof(usage_body[0]));
  const int n_setup = (int)(sizeof(setup_lines) / sizeof(setup_lines[0]));
  int k;

  p = print_str_new_line(p, "if (argc != 2) {");
  p = isl_printer_indent(p, 2);
  for (k = 0; k < n_usage; k++)
    p = print_str_new_line(p, usage_body[k]);
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = isl_printer_end_line(p);

  for (k = 0; k < n_setup; k++)
    p = print_str_new_line(p, setup_lines[k]);
  p = isl_printer_end_line(p);

  return p;
}

/* Print the type "std::vector<T, aligned_allocator<T>>" with T equal to
 * "type".  No trailing space is printed.
 */
static __isl_give isl_printer *xilinx_ocl_print_aligned_vector(
    __isl_take isl_printer *p, const char *type)
{
  p = isl_printer_print_str(p, "std::vector<");
  p = isl_printer_print_str(p, type);
  p = isl_printer_print_str(p, ", aligned_allocator<");
  p = isl_printer_print_str(p, type);
  p = isl_printer_print_str(p, ">>");
  return p;
}

/* Print "for (int i = 0; i < n_ports; i++) {" and increase the indentation. */
static __isl_give isl_printer *xilinx_ocl_open_port_loop(
    __isl_take isl_printer *p, int n_ports)
{
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int i = 0; i < ");
  p = isl_printer_print_int(p, n_ports);
  p = isl_printer_print_str(p, "; i++) {");
  p = isl_printer_end_line(p);
  return isl_printer_indent(p, 2);
}

/* Decrease the indentation and print the "}" closing a per-port loop. */
static __isl_give isl_printer *xilinx_ocl_close_port_loop(
    __isl_take isl_printer *p)
{
  p = isl_printer_indent(p, -2);
  return print_str_new_line(p, "}");
}

/* Print the number of elements of a host buffer of "local_array".
 * If "serialized" is set, this is the serialize bound (divided by the
 * effective compression ratio for sparse arrays); otherwise it is the
 * data size of the array.
 */
static __isl_give isl_printer *xilinx_ocl_print_elem_count(
    __isl_take isl_printer *p, struct autosa_local_array_info *local_array,
    int serialized)
{
  if (!serialized)
    return autosa_array_info_print_data_size(p, local_array->array);

  p = isl_printer_print_pw_qpolynomial(p, local_array->serialize_bound);
  if (local_array->is_sparse) {
    p = isl_printer_print_str(p, " / ");
    p = isl_printer_print_double(p, (double)local_array->eff_compress_ratio);
  }
  return p;
}

/* Print the declaration of the host buffer "dev_<name><suffix>".
 * For an array with more than one memory port, a vector of buffers is
 * declared and filled in a loop over the ports; otherwise a single buffer
 * is declared.  "serialized" selects the element count, see
 * xilinx_ocl_print_elem_count.
 */
static __isl_give isl_printer *xilinx_ocl_declare_host_buffer(
    __isl_take isl_printer *p, struct autosa_local_array_info *local_array,
    const char *suffix, int serialized)
{
  const char *type = local_array->array->type;
  const char *name = local_array->array->name;

  if (local_array->n_mem_ports <= 1) {
    /* Single host buffer. */
    p = isl_printer_start_line(p);
    p = xilinx_ocl_print_aligned_vector(p, type);
    p = isl_printer_print_str(p, " dev_");
    p = isl_printer_print_str(p, name);
    p = isl_printer_print_str(p, suffix);
    p = isl_printer_print_str(p, "(");
    p = xilinx_ocl_print_elem_count(p, local_array, serialized);
    p = isl_printer_print_str(p, ");");
    p = isl_printer_end_line(p);
    return p;
  }

  /* One host buffer per memory port, collected in a vector. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "std::vector<");
  p = xilinx_ocl_print_aligned_vector(p, type);
  p = isl_printer_print_str(p, "> dev_");
  p = isl_printer_print_str(p, name);
  p = isl_printer_print_str(p, suffix);
  p = isl_printer_print_str(p, ";");
  p = isl_printer_end_line(p);

  p = xilinx_ocl_open_port_loop(p, local_array->n_mem_ports);

  p = isl_printer_start_line(p);
  p = xilinx_ocl_print_aligned_vector(p, type);
  p = isl_printer_print_str(p, " dev_");
  p = isl_printer_print_str(p, name);
  p = isl_printer_print_str(p, "_tmp(");
  p = xilinx_ocl_print_elem_count(p, local_array, serialized);
  p = isl_printer_print_str(p, ");");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "dev_");
  p = isl_printer_print_str(p, name);
  p = isl_printer_print_str(p, suffix);
  p = isl_printer_print_str(p, ".push_back(dev_");
  p = isl_printer_print_str(p, name);
  p = isl_printer_print_str(p, "_tmp);");
  p = isl_printer_end_line(p);

  return xilinx_ocl_close_port_loop(p);
}

/* Print the std::copy statement(s) that fill the (unserialized) host
 * buffer(s) of "local_array" from the user array.  The source is
 * "<name>_s" for sparse arrays.  For arrays with more than one memory
 * port, the copy is wrapped in a loop over the ports.
 */
static __isl_give isl_printer *xilinx_ocl_print_host_init(
    __isl_take isl_printer *p, struct autosa_local_array_info *local_array)
{
  const char *type = local_array->array->type;
  const char *name = local_array->array->name;
  const int multi_port = local_array->n_mem_ports > 1;
  int k;

  if (multi_port)
    p = xilinx_ocl_open_port_loop(p, local_array->n_mem_ports);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "std::copy(");
  /* The begin and end pointers share the same cast expression. */
  for (k = 0; k < 2; k++) {
    if (k == 1)
      p = isl_printer_print_str(p, ", ");
    p = isl_printer_print_str(p, "reinterpret_cast<");
    p = isl_printer_print_str(p, type);
    p = isl_printer_print_str(p, " *>(");
    p = isl_printer_print_str(p, name);
    if (local_array->is_sparse)
      p = isl_printer_print_str(p, "_s");
    p = isl_printer_print_str(p, ")");
  }
  p = isl_printer_print_str(p, " + ");
  p = autosa_array_info_print_data_size(p, local_array->array);
  p = isl_printer_print_str(p, ", dev_");
  p = isl_printer_print_str(p, name);
  if (local_array->host_serialize)
    p = isl_printer_print_str(p, "_unserialized");
  if (multi_port)
    p = isl_printer_print_str(p, "[i]");
  p = isl_printer_print_str(p, ".begin());");
  p = isl_printer_end_line(p);

  if (multi_port)
    p = xilinx_ocl_close_port_loop(p);

  return p;
}

/* Print the call to "host_serialize_<name>" for the array accessed by the
 * first I/O group of "module".  The call is wrapped in a loop over the
 * memory ports if the array has more than one port and is copied in.
 */
static __isl_give isl_printer *xilinx_ocl_print_serialize_call(
    __isl_take isl_printer *p, struct autosa_kernel *kernel,
    struct autosa_hw_module *module)
{
  struct autosa_array_ref_group *group = module->io_groups[0];
  struct autosa_local_array_info *local_array = group->local_array;
  const int per_port = local_array->n_mem_ports > 1 &&
                       local_array->array->copy_in;

  if (per_port)
    p = xilinx_ocl_open_port_loop(p, local_array->n_mem_ports);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "host_serialize_");
  p = isl_printer_print_str(p, local_array->array->name);
  p = isl_printer_print_str(p, "(");
  /* TODO: add hbm support later. */
  p = print_host_serialize_arguments(p, kernel, group, module, 0, 0);
  p = isl_printer_print_str(p, ");");
  p = isl_printer_end_line(p);

  if (per_port)
    p = xilinx_ocl_close_port_loop(p);

  return p;
}

/* Print the loop that creates one cl::Buffer per memory port of
 * "local_array" on top of the corresponding host buffer and appends it
 * to "buffer_<name>".
 */
static __isl_give isl_printer *xilinx_ocl_print_device_buffers(
    __isl_take isl_printer *p, struct autosa_local_array_info *local_array)
{
  const char *name = local_array->array->name;
  const char *access;
  /* Keep the indents as int: negating the size_t returned by strlen()
   * would wrap around before being converted to the int expected by
   * isl_printer_indent.
   */
  const int check_indent = (int)strlen("OCL_CHECK(");
  const int ctor_indent = (int)(strlen("cl::Buffer buffer_") + strlen(name) +
                                strlen("_tmp") + 1);

  if (local_array->array->copy_in && local_array->array->copy_out)
    access = "CL_MEM_READ_WRITE";
  else if (local_array->array->copy_in)
    access = "CL_MEM_READ_ONLY";
  else if (local_array->array->copy_out)
    access = "CL_MEM_WRITE_ONLY";
  else
    /* Neither copied in nor out: still emit a valid flag expression. */
    access = "CL_MEM_READ_WRITE";

  p = xilinx_ocl_open_port_loop(p, local_array->n_mem_ports);

  p = print_str_new_line(p, "OCL_CHECK(err,");
  p = isl_printer_indent(p, check_indent);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "cl::Buffer buffer_");
  p = isl_printer_print_str(p, name);
  p = isl_printer_print_str(p, "_tmp(context,");
  p = isl_printer_end_line(p);

  /* Align the remaining constructor arguments with the first one. */
  p = isl_printer_indent(p, ctor_indent);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "CL_MEM_USE_HOST_PTR | ");
  p = isl_printer_print_str(p, access);
  p = isl_printer_print_str(p, ",");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  if (local_array->host_serialize)
    p = autosa_array_info_print_serialize_size(p, local_array->array);
  else
    p = autosa_array_info_print_size(p, local_array->array);
  p = isl_printer_print_str(p, ",");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "dev_");
  p = isl_printer_print_str(p, name);
  if (local_array->n_mem_ports > 1)
    p = isl_printer_print_str(p, "[i]");
  p = isl_printer_print_str(p, ".data(),");
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "&err));");
  p = isl_printer_indent(p, -ctor_indent);
  p = isl_printer_indent(p, -check_indent);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "buffer_");
  p = isl_printer_print_str(p, name);
  p = isl_printer_print_str(p, ".push_back(std::move(buffer_");
  p = isl_printer_print_str(p, name);
  p = isl_printer_print_str(p, "_tmp));");
  p = isl_printer_end_line(p);

  return xilinx_ocl_close_port_loop(p);
}

/* Print the part of the generated OpenCL host code that sets up the
 * buffers of "kernel":
 *  - declare the host buffers "dev_<name>" (plus "dev_<name>_unserialized"
 *    for arrays with host serialization),
 *  - copy the input arrays into the host buffers,
 *  - call the host serialization function for every input module of "top"
 *    that has a serialize tree,
 *  - create the cl::Buffer objects "buffer_<name>", one per memory port,
 *  - declare the time points used for profiling.
 * Arrays that do not require device allocation are skipped.
 * "prog" is not used.
 */
static __isl_give isl_printer *declare_and_allocate_device_arrays_xilinx(
    __isl_take isl_printer *p, struct autosa_prog *prog,
    struct autosa_kernel *kernel, struct autosa_hw_top_module *top)
{
  int i;

  p = print_str_new_line(p, "// Allocate memory in host memory");
  for (i = 0; i < kernel->n_array; i++) {
    struct autosa_local_array_info *local_array = &kernel->array[i];

    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    if (local_array->host_serialize) {
      /* The original data goes to the "_unserialized" buffer; the buffer
       * without suffix holds the serialized data.
       */
      p = xilinx_ocl_declare_host_buffer(p, local_array, "_unserialized", 0);
      p = xilinx_ocl_declare_host_buffer(p, local_array, "", 1);
    } else {
      p = xilinx_ocl_declare_host_buffer(p, local_array, "", 0);
    }
  }
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "// Initialize host buffers");
  for (i = 0; i < kernel->n_array; i++) {
    struct autosa_local_array_info *local_array = &kernel->array[i];

    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;
    if (!local_array->array->copy_in)
      continue;

    p = xilinx_ocl_print_host_init(p, local_array);
  }

  /* Perform data serialization if needed. */
  for (i = 0; i < top->n_hw_modules; i++) {
    struct autosa_hw_module *module = top->hw_modules[i];

    if (module->serialize_tree && module->in)
      p = xilinx_ocl_print_serialize_call(p, kernel, module);
  }
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "// Allocate buffers in device memory");
  p = print_str_new_line(p, "// Buffers are allocated using CL_MEM_USE_HOST_PTR for efficient memory and");
  p = print_str_new_line(p, "// device-to-host communication");
  for (i = 0; i < kernel->n_array; i++) {
    struct autosa_local_array_info *local_array = &kernel->array[i];

    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "std::vector<cl::Buffer> buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
  }

  for (i = 0; i < kernel->n_array; i++) {
    struct autosa_local_array_info *local_array = &kernel->array[i];

    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    p = xilinx_ocl_print_device_buffers(p, local_array);
  }
  p = isl_printer_end_line(p);

  /* Insert profiling information. */
  p = print_str_new_line(p, "auto host_begin = std::chrono::high_resolution_clock::now();");
  p = print_str_new_line(p, "auto fpga_begin = std::chrono::high_resolution_clock::now();");
  p = print_str_new_line(p, "auto fpga_end = std::chrono::high_resolution_clock::now();");
  p = isl_printer_end_line(p);

  return p;
}

/* Print "for (int i = 0; i < n_ports; i++) {" and increase the indentation. */
static __isl_give isl_printer *xilinx_hls_open_port_loop(
    __isl_take isl_printer *p, int n_ports)
{
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int i = 0; i < ");
  p = isl_printer_print_int(p, n_ports);
  p = isl_printer_print_str(p, "; i++) {");
  p = isl_printer_end_line(p);
  return isl_printer_indent(p, 2);
}

/* Decrease the indentation and print the "}" closing a per-port loop. */
static __isl_give isl_printer *xilinx_hls_close_port_loop(
    __isl_take isl_printer *p)
{
  p = isl_printer_indent(p, -2);
  return print_str_new_line(p, "}");
}

/* Print the line
 *
 *   T *dev_<name><suffix> = (T *)malloc(<count> * sizeof(T));
 *
 * where <count> is the serialize bound (divided by the effective
 * compression ratio for sparse arrays) if "serialized" is set and
 * the data size of the array otherwise.
 */
static __isl_give isl_printer *xilinx_hls_print_host_malloc(
    __isl_take isl_printer *p, struct autosa_local_array_info *local_array,
    const char *suffix, int serialized)
{
  const char *type = local_array->array->type;

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, type);
  p = isl_printer_print_str(p, " *dev_");
  p = isl_printer_print_str(p, local_array->array->name);
  p = isl_printer_print_str(p, suffix);
  p = isl_printer_print_str(p, " = (");
  p = isl_printer_print_str(p, type);
  p = isl_printer_print_str(p, " *)malloc(");
  if (serialized) {
    p = isl_printer_print_pw_qpolynomial(p, local_array->serialize_bound);
    if (local_array->is_sparse) {
      p = isl_printer_print_str(p, " / ");
      p = isl_printer_print_double(p, (double)local_array->eff_compress_ratio);
    }
  } else {
    p = autosa_array_info_print_data_size(p, local_array->array);
  }
  p = isl_printer_print_str(p, " * sizeof(");
  p = isl_printer_print_str(p, type);
  p = isl_printer_print_str(p, "));");
  p = isl_printer_end_line(p);

  return p;
}

/* Print the declaration and allocation of the host buffer
 * "dev_<name><suffix>".  An array with more than one memory port gets
 * a vector of pointers that is filled in a loop over the ports;
 * otherwise a single pointer is declared.
 */
static __isl_give isl_printer *xilinx_hls_declare_host_buffer(
    __isl_take isl_printer *p, struct autosa_local_array_info *local_array,
    const char *suffix, int serialized)
{
  const char *name = local_array->array->name;

  /* Single host buffer. */
  if (local_array->n_mem_ports <= 1)
    return xilinx_hls_print_host_malloc(p, local_array, suffix, serialized);

  /* Multiple host buffers. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "std::vector<");
  p = isl_printer_print_str(p, local_array->array->type);
  p = isl_printer_print_str(p, " *> dev_");
  p = isl_printer_print_str(p, name);
  p = isl_printer_print_str(p, suffix);
  p = isl_printer_print_str(p, ";");
  p = isl_printer_end_line(p);

  p = xilinx_hls_open_port_loop(p, local_array->n_mem_ports);
  p = xilinx_hls_print_host_malloc(p, local_array, "_tmp", serialized);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "dev_");
  p = isl_printer_print_str(p, name);
  p = isl_printer_print_str(p, suffix);
  p = isl_printer_print_str(p, ".push_back(dev_");
  p = isl_printer_print_str(p, name);
  p = isl_printer_print_str(p, "_tmp);");
  p = isl_printer_end_line(p);

  return xilinx_hls_close_port_loop(p);
}

/* Print the memcpy statement(s) that fill the (unserialized) host
 * buffer(s) of "local_array" from the user array ("<name>_s" for sparse
 * arrays), looping over the memory ports if there is more than one.
 */
static __isl_give isl_printer *xilinx_hls_print_host_init(
    __isl_take isl_printer *p, struct autosa_local_array_info *local_array)
{
  const int multi_port = local_array->n_mem_ports > 1;

  if (multi_port)
    p = xilinx_hls_open_port_loop(p, local_array->n_mem_ports);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "memcpy(dev_");
  p = isl_printer_print_str(p, local_array->array->name);
  if (local_array->host_serialize)
    p = isl_printer_print_str(p, "_unserialized");
  if (multi_port)
    p = isl_printer_print_str(p, "[i]");
  p = isl_printer_print_str(p, ", ");
  p = isl_printer_print_str(p, local_array->array->name);
  if (local_array->is_sparse)
    p = isl_printer_print_str(p, "_s");
  p = isl_printer_print_str(p, ", ");
  p = autosa_array_info_print_data_size(p, local_array->array);
  p = isl_printer_print_str(p, " * sizeof(");
  p = isl_printer_print_str(p, local_array->array->type);
  p = isl_printer_print_str(p, "));");
  p = isl_printer_end_line(p);

  if (multi_port)
    p = xilinx_hls_close_port_loop(p);

  return p;
}

/* Print the call to "host_serialize_<name>" for the array accessed by the
 * first I/O group of the input module "module", looping over the memory
 * ports if the array has more than one port and is copied in.
 */
static __isl_give isl_printer *xilinx_hls_print_serialize_call(
    __isl_take isl_printer *p, struct autosa_kernel *kernel,
    struct autosa_hw_module *module)
{
  struct autosa_array_ref_group *group = module->io_groups[0];
  struct autosa_local_array_info *local_array = group->local_array;
  const int per_port = local_array->n_mem_ports > 1 &&
                       local_array->array->copy_in;

  if (per_port)
    p = xilinx_hls_open_port_loop(p, local_array->n_mem_ports);

  p = isl_printer_start_line(p);
  /* Only input modules reach this point, so the serialize variant is used. */
  p = isl_printer_print_str(p, "host_serialize_");
  p = isl_printer_print_str(p, local_array->array->name);
  p = isl_printer_print_str(p, "(");
  /* TODO: add hbm support later. */
  p = print_host_serialize_arguments(p, kernel, group, module, 0, 0);
  p = isl_printer_print_str(p, ");");
  p = isl_printer_end_line(p);

  if (per_port)
    p = xilinx_hls_close_port_loop(p);

  return p;
}

/* Print the part of the generated HLS (CPU) host code that sets up the
 * buffers of "kernel" using plain malloc:
 *  - declare and allocate the host buffers "dev_<name>" (plus
 *    "dev_<name>_unserialized" for arrays with host serialization),
 *  - memcpy the input arrays into the host buffers,
 *  - call the host serialization function for every input module of "top"
 *    that has a serialize tree,
 *  - declare the vectors "buffer_<name>" (and, with the axi_stream option,
 *    the streams "fifo_<name>") and allocate one buffer per memory port.
 * Arrays that do not require device allocation are skipped.
 * With the axi_stream option, an array with more than one memory port
 * is reported as an error and the process exits.
 */
static __isl_give isl_printer *declare_and_allocate_cpu_arrays_xilinx(
    __isl_take isl_printer *p, struct autosa_prog *prog,
    struct autosa_kernel *kernel, struct autosa_hw_top_module *top)
{
  int i;

  p = print_str_new_line(p, "// Allocate memory in host memory");
  for (i = 0; i < kernel->n_array; i++) {
    struct autosa_local_array_info *local_array = &kernel->array[i];

    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    if (!local_array->host_serialize) {
      p = xilinx_hls_declare_host_buffer(p, local_array, "", 0);
      continue;
    }
    /* Original data first, then the additional serialize buffer. */
    p = xilinx_hls_declare_host_buffer(p, local_array, "_unserialized", 0);
    p = xilinx_hls_declare_host_buffer(p, local_array, "", 1);
  }
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "// Initialize host buffers");
  for (i = 0; i < kernel->n_array; i++) {
    struct autosa_local_array_info *local_array = &kernel->array[i];

    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;
    if (local_array->array->copy_in)
      p = xilinx_hls_print_host_init(p, local_array);
  }

  /* Perform data serialization if needed. */
  for (i = 0; i < top->n_hw_modules; i++) {
    struct autosa_hw_module *module = top->hw_modules[i];

    if (!module->serialize_tree || !module->in)
      continue;
    p = xilinx_hls_print_serialize_call(p, kernel, module);
  }
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "// Allocate buffers in device memory");
  for (i = 0; i < kernel->n_array; i++) {
    struct autosa_local_array_info *local_array = &kernel->array[i];

    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "std::vector<");
    p = autosa_print_array_type(p, local_array->array);
    p = isl_printer_print_str(p, " *> buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);

    if (!prog->scop->options->autosa->axi_stream)
      continue;

    if (local_array->n_mem_ports > 1) {
      printf("[AutoSA] Error: Can't generate AXI Stream interface for array with more than one memory port: %s\n", local_array->array->name);
      exit(1);
    }
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "hls::stream<");
    p = autosa_print_array_type(p, local_array->array);
    p = isl_printer_print_str(p, "> fifo_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
  }

  for (i = 0; i < kernel->n_array; i++) {
    struct autosa_local_array_info *local_array = &kernel->array[i];

    if (!autosa_array_requires_device_allocation(local_array->array))
      continue;

    p = xilinx_hls_open_port_loop(p, local_array->n_mem_ports);

    p = isl_printer_start_line(p);
    p = autosa_print_array_type(p, local_array->array);
    p = isl_printer_print_str(p, " *buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, "_tmp = (");
    p = autosa_print_array_type(p, local_array->array);
    p = isl_printer_print_str(p, " *)malloc(");
    if (local_array->host_serialize)
      p = autosa_array_info_print_serialize_size(p, local_array->array);
    else
      p = autosa_array_info_print_size(p, local_array->array);
    p = isl_printer_print_str(p, ");");
    p = isl_printer_end_line(p);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, ".push_back(buffer_");
    p = isl_printer_print_str(p, local_array->array->name);
    p = isl_printer_print_str(p, "_tmp);");
    p = isl_printer_end_line(p);

    p = xilinx_hls_close_port_loop(p);
  }
  p = isl_printer_end_line(p);

  return p;
}

/* Print the initialization part of the generated host code.
 *
 * The local declarations of "prog" are printed first.
 * If "hls" is set, the malloc-based CPU buffers are declared and allocated.
 * Otherwise, the device discovery preamble is printed, followed by
 * the declaration and allocation of the device buffers.
 */
static __isl_give isl_printer *init_device_xilinx(__isl_take isl_printer *p,
                                                  struct autosa_prog *prog, 
                                                  struct autosa_kernel *kernel, 
                                                  int hls,
                                                  struct autosa_hw_top_module *top)
{
  p = autosa_print_local_declarations(p, prog);

  if (hls)
    return declare_and_allocate_cpu_arrays_xilinx(p, prog, kernel, top);

  p = find_device_xilinx(p);
  return declare_and_allocate_device_arrays_xilinx(p, prog, kernel, top);
}

/* Print the opening line of a generated "for (int i = 0; i < n_ports; i++) {"
 * loop and increase the indentation of "p" for the loop body.
 */
static __isl_give isl_printer *free_cpu_arrays_open_port_loop_xilinx(
    __isl_take isl_printer *p, int n_ports)
{
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int i = 0; i < ");
  p = isl_printer_print_int(p, n_ports);
  p = isl_printer_print_str(p, "; i++) {");
  p = isl_printer_end_line(p);
  return isl_printer_indent(p, 2);
}

/* Print one line of the form "free(<target><name><tail>".
 * "tail" carries any index expression together with the closing ");".
 */
static __isl_give isl_printer *free_cpu_arrays_print_free_xilinx(
    __isl_take isl_printer *p, const char *target, const char *name,
    const char *tail)
{
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "free(");
  p = isl_printer_print_str(p, target);
  p = isl_printer_print_str(p, name);
  p = isl_printer_print_str(p, tail);
  return isl_printer_end_line(p);
}

/* Print the host-side cleanup code for the arrays of "kernel" that
 * require device allocation.
 *
 * Two passes are made over the kernel arrays.  The first pass emits, for
 * each array, a loop over its memory ports that frees "buffer_<name>[i]".
 * The second pass frees "dev_<name>" and, when host serialization is
 * enabled for the array, "dev_<name>_unserialized".  These are freed inside
 * a per-port loop (indexed by "[i]") only if the array has more than one
 * memory port.
 * "prog" is not used.
 */
static __isl_give isl_printer *autosa_free_cpu_arrays_xilinx(
    __isl_take isl_printer *p, struct autosa_prog *prog, struct autosa_kernel *kernel)
{
  int k;

  p = print_str_new_line(p, "// Clean up resources");

  /* Pass 1: the per-port "buffer_" copies. */
  for (k = 0; k < kernel->n_array; ++k) {
    struct autosa_local_array_info *info = &kernel->array[k];

    if (!autosa_array_requires_device_allocation(info->array))
      continue;

    p = free_cpu_arrays_open_port_loop_xilinx(p, info->n_mem_ports);
    p = free_cpu_arrays_print_free_xilinx(p, "buffer_", info->array->name,
                                          "[i]);");
    p = isl_printer_indent(p, -2);
    p = print_str_new_line(p, "}");
  }

  /* Pass 2: the "dev_" copies, indexed only for multi-port arrays. */
  for (k = 0; k < kernel->n_array; ++k) {
    struct autosa_local_array_info *info = &kernel->array[k];
    int per_port;

    if (!autosa_array_requires_device_allocation(info->array))
      continue;

    per_port = info->n_mem_ports > 1;
    if (per_port)
      p = free_cpu_arrays_open_port_loop_xilinx(p, info->n_mem_ports);

    p = free_cpu_arrays_print_free_xilinx(p, "dev_", info->array->name,
                                          per_port ? "[i]);" : ");");
    if (info->host_serialize)
      p = free_cpu_arrays_print_free_xilinx(
          p, "dev_", info->array->name,
          per_port ? "_unserialized[i]);" : "_unserialized);");

    if (per_port) {
      p = isl_printer_indent(p, -2);
      p = print_str_new_line(p, "}");
    }
  }

  return p;
}

/* Print the name of the "dev_" copy of "array" that holds the final data:
 * "dev_<name>", followed by "_unserialized" if the array is serialized on
 * the host and by "[0]" if the array has more than one memory port.
 */
static __isl_give isl_printer *clear_device_print_dev_ref_xilinx(
    __isl_take isl_printer *p, struct autosa_array_info *array)
{
  p = isl_printer_print_str(p, "dev_");
  p = isl_printer_print_str(p, array->name);
  if (array->local_array->host_serialize)
    p = isl_printer_print_str(p, "_unserialized");
  if (array->local_array->n_mem_ports > 1)
    p = isl_printer_print_str(p, "[0]");
  return p;
}

/* Print the host-side epilogue that follows the execution of the
 * transformed code.
 *
 * The following pieces are emitted, in this order:
 * - when "hls" is zero, a "q.finish()" followed by code that reports the
 *   FPGA and host times (the FPGA duration is divided by 10 in the
 *   generated code);
 * - a "host_deserialize_<array>" call for every module of "top" that has a
 *   serialize tree and is not an input module;
 * - for every array of "prog" that requires device allocation and is
 *   copied out, a copy from its "dev_" buffer back into the original
 *   array: a memcpy when "hls" is set, a std::copy otherwise;
 * - when "hls" is set, the cleanup code printed by
 *   autosa_free_cpu_arrays_xilinx.
 */
static __isl_give isl_printer *clear_device_xilinx(__isl_take isl_printer *p,
                                                   struct autosa_prog *prog,
                                                   struct autosa_kernel *kernel,
                                                   int hls,
                                                   struct autosa_hw_top_module *top)
{
  /* A NULL entry stands for a bare isl_printer_end_line call. */
  static const char *const profile_lines[] = {
      "q.finish();",
      "auto host_end = std::chrono::high_resolution_clock::now();",
      NULL,
      "// Calculate time",
      "std::chrono::duration<double> fpga_duration = fpga_end - fpga_begin;",
      "std::cout << \"FPGA Time: \" << fpga_duration.count() / 10 << \" s\" << std::endl;",
      "std::chrono::duration<double> host_duration = host_end - host_begin;",
      "std::cout << \"Host Time: \" << host_duration.count() << \" s\" << std::endl;",
      NULL,
  };
  const int n_profile_lines = sizeof(profile_lines) / sizeof(profile_lines[0]);
  int k;

  if (!hls) {
    /* Profiling results */
    for (k = 0; k < n_profile_lines; ++k) {
      if (profile_lines[k])
        p = print_str_new_line(p, profile_lines[k]);
      else
        p = isl_printer_end_line(p);
    }
  }

  /* Deserialize the buffer data if necessary. */
  for (k = 0; k < top->n_hw_modules; ++k) {
    struct autosa_hw_module *module = top->hw_modules[k];
    struct autosa_array_ref_group *group;

    if (!module->serialize_tree || module->in)
      continue;

    group = module->io_groups[0];
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "host_deserialize_");
    p = isl_printer_print_str(p, group->local_array->array->name);
    p = isl_printer_print_str(p, "(");
    /* TODO: add hbm support later. */
    p = print_host_serialize_arguments(p, top->kernel, group, module, 0, 0);
    p = isl_printer_print_str(p, ");");
    p = isl_printer_end_line(p);
  }

  /* Restore buffer */
  p = print_str_new_line(p, "// Restore data from host buffers");
  for (k = 0; k < prog->n_array; ++k) {
    struct autosa_array_info *array = &prog->array[k];

    if (!autosa_array_requires_device_allocation(array) || !array->copy_out)
      continue;

    p = isl_printer_start_line(p);
    if (hls) {
      /* memcpy(<name>, <dev ref>, <size>); */
      p = isl_printer_print_str(p, "memcpy(");
      p = isl_printer_print_str(p, array->name);
      p = isl_printer_print_str(p, ", ");
      p = clear_device_print_dev_ref_xilinx(p, array);
      p = isl_printer_print_str(p, ", ");
      p = autosa_array_info_print_size(p, array);
      p = isl_printer_print_str(p, ");");
    } else {
      /* std::copy(<dev ref>.begin(), <dev ref>.end(),
       *           reinterpret_cast<<type> *>(<name>));
       */
      p = isl_printer_print_str(p, "std::copy(");
      p = clear_device_print_dev_ref_xilinx(p, array);
      p = isl_printer_print_str(p, ".begin(), ");
      p = clear_device_print_dev_ref_xilinx(p, array);
      p = isl_printer_print_str(p, ".end(), reinterpret_cast<");
      p = isl_printer_print_str(p, array->type);
      p = isl_printer_print_str(p, " *>(");
      p = isl_printer_print_str(p, array->name);
      p = isl_printer_print_str(p, "));");
    }
    p = isl_printer_end_line(p);
  }

  if (hls) {
    p = isl_printer_end_line(p);
    p = autosa_free_cpu_arrays_xilinx(p, prog, kernel);
  }

  return p;
}

/* Print the host code that merges the drained results of the array
 * reference group of "func".
 *
 * The generated code is a loop over "idx", running from the first memory
 * port of the group up to (but excluding) the first port plus the number
 * of ports of the group.  The loop body is a single call to
 * "<group prefix>_drain_merge" with the arguments printed by
 * print_drain_merge_arguments.
 * "prog" is not used.
 */
static __isl_give isl_printer *drain_merge_xilinx(
    __isl_take isl_printer *p, struct autosa_prog *prog,
    struct autosa_drain_merge_func *func,
    int hls)
{
  struct autosa_array_ref_group *ref_group = func->group;
  const int first_port = ref_group->mem_port_id;
  const int port_end = ref_group->mem_port_id + ref_group->n_mem_ports;

  p = print_str_new_line(p, "// Merge results");

  /* Loop header. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int idx = ");
  p = isl_printer_print_int(p, first_port);
  p = isl_printer_print_str(p, "; idx < ");
  p = isl_printer_print_int(p, port_end);
  p = isl_printer_print_str(p, "; idx++) {");
  p = isl_printer_end_line(p);

  /* Loop body: the merge call. */
  p = isl_printer_indent(p, 2);
  p = isl_printer_start_line(p);
  p = autosa_array_ref_group_print_prefix(ref_group, p);
  p = isl_printer_print_str(p, "_drain_merge(");
  p = print_drain_merge_arguments(p, func->kernel, ref_group, func, 0, hls);
  p = isl_printer_print_str(p, ");");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, -2);

  p = print_str_new_line(p, "}");
  return isl_printer_end_line(p);
}

/* Print code to "p" for copying "array" from the host to the device
 * in its entirety.
 *
 * The generated code is a loop over the memory ports of the array.
 * When "hls" is zero, each iteration enqueues a migration of
 * "buffer_<name>[i]" wrapped in OCL_CHECK.
 * Otherwise each iteration copies the "dev_" copy of the array into
 * "buffer_<name>[i]" with memcpy, using the serialized size if the array is
 * serialized on the host.  If the AXI Stream option is set, the buffer
 * contents are additionally written element by element into
 * "fifo_<name>"; this requires host serialization and the program exits
 * with an error message otherwise.
 */
static __isl_give isl_printer *copy_array_to_device_xilinx(
    __isl_take isl_printer *p, struct autosa_prog *prog,
    struct autosa_array_info *array, int hls)
{
  struct autosa_local_array_info *local_array = array->local_array;

  /* Open the loop over the memory ports. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int i = 0; i < ");
  p = isl_printer_print_int(p, local_array->n_mem_ports);
  p = isl_printer_print_str(p, "; i++) {");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, 2);

  if (hls) {
    /* memcpy(buffer_<name>[i], dev_<name>[[i]], <size>); */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "memcpy(buffer_");
    p = isl_printer_print_str(p, array->name);
    p = isl_printer_print_str(p, "[i], dev_");
    p = isl_printer_print_str(p, array->name);
    if (local_array->n_mem_ports > 1)
      p = isl_printer_print_str(p, "[i]");
    p = isl_printer_print_str(p, ", ");
    p = local_array->host_serialize
            ? autosa_array_info_print_serialize_size(p, array)
            : autosa_array_info_print_size(p, array);
    p = isl_printer_print_str(p, ");");
    p = isl_printer_end_line(p);

    if (prog->scop->options->autosa->axi_stream) {
      /* Feed the buffer into the FIFO, one element per write. */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "for (int j = 0; j < ");
      if (!local_array->host_serialize) {
        printf("[AutoSA] Error: Can't generate AXI Stream interface for array: %s without serialization\n", array->name);
        exit(1);
      }
      p = autosa_array_info_print_serialize_data_size(p, array);
      p = isl_printer_print_str(p, " / ");
      p = isl_printer_print_int(p, array->n_lane);
      p = isl_printer_print_str(p, "; j++) {");
      p = isl_printer_end_line(p);

      p = isl_printer_indent(p, 2);
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "fifo_");
      p = isl_printer_print_str(p, array->name);
      p = isl_printer_print_str(p, ".write(buffer_");
      p = isl_printer_print_str(p, array->name);
      p = isl_printer_print_str(p, "[i][j]);");
      p = isl_printer_end_line(p);
      p = isl_printer_indent(p, -2);

      p = print_str_new_line(p, "}");
    }
  } else {
    /* The enqueue call is indented to line up inside "OCL_CHECK(". */
    const int pad = strlen("OCL_CHECK(");

    p = print_str_new_line(p, "OCL_CHECK(err,");
    p = isl_printer_indent(p, pad);
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "err = q.enqueueMigrateMemObjects({buffer_");
    p = isl_printer_print_str(p, array->name);
    p = isl_printer_print_str(p, "[i]}, 0));");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, -pad);
  }

  /* Close the loop over the memory ports. */
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  return isl_printer_end_line(p);
}

/* Print code to "p" for copying "array" back from the device to the host
 * in its entirety.
 *
 * When "hls" is zero, the generated code is a loop that enqueues a
 * migration of "buffer_<name>[i]" to the host (CL_MIGRATE_MEM_OBJECT_HOST)
 * wrapped in OCL_CHECK.
 * Otherwise the generated code is a loop over the memory ports of the
 * array.  If the AXI Stream option is set, "buffer_<name>[i]" is first
 * filled element by element from "fifo_<name>"; this requires host
 * serialization and the program exits with an error message otherwise.
 * The buffer is then copied into the "dev_" copy of the array with memcpy,
 * using the serialized size if the array is serialized on the host.
 */
static __isl_give isl_printer *copy_array_from_device_xilinx(
    __isl_take isl_printer *p, struct autosa_prog *prog,
    struct autosa_array_info *array, int hls)
{
  struct autosa_local_array_info *local_array = array->local_array;

  if (!hls) {
    /* The enqueue call is indented to line up inside "OCL_CHECK(". */
    const int pad = strlen("OCL_CHECK(");

    /* NOTE(review): this loop is bounded by n_io_group_refs, whereas the
     * other loops over "buffer_<name>" in this file are bounded by
     * n_mem_ports - confirm that this is intended.
     */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "for (int i = 0; i < ");
    p = isl_printer_print_int(p, local_array->n_io_group_refs);
    p = isl_printer_print_str(p, "; i++) {");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, 2);

    p = print_str_new_line(p, "OCL_CHECK(err,");
    p = isl_printer_indent(p, pad);
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "err = q.enqueueMigrateMemObjects({buffer_");
    p = isl_printer_print_str(p, array->name);
    p = isl_printer_print_str(p, "[i]}, CL_MIGRATE_MEM_OBJECT_HOST));");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, -pad);

    p = isl_printer_indent(p, -2);
    return print_str_new_line(p, "}");
  }

  /* HLS flow: open the loop over the memory ports. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "for (int i = 0; i < ");
  p = isl_printer_print_int(p, local_array->n_mem_ports);
  p = isl_printer_print_str(p, "; i++) {");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, 2);

  if (prog->scop->options->autosa->axi_stream) {
    /* Drain the FIFO into the buffer, one element per read. */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "for (int j = 0; j < ");
    if (!local_array->host_serialize) {
      printf("[AutoSA] Error: Can't generate AXI Stream interface for array: %s without serialization\n", array->name);
      exit(1);
    }
    p = autosa_array_info_print_serialize_data_size(p, array);
    p = isl_printer_print_str(p, " / ");
    p = isl_printer_print_int(p, array->n_lane);
    p = isl_printer_print_str(p, "; j++) {");
    p = isl_printer_end_line(p);

    p = isl_printer_indent(p, 2);
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "buffer_");
    p = isl_printer_print_str(p, array->name);
    p = isl_printer_print_str(p, "[i][j] = fifo_");
    p = isl_printer_print_str(p, array->name);
    p = isl_printer_print_str(p, ".read();");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, -2);

    p = print_str_new_line(p, "}");
  }

  /* memcpy(dev_<name>[[i]], buffer_<name>[i], <size>); */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "memcpy(dev_");
  p = isl_printer_print_str(p, array->name);
  if (local_array->n_mem_ports > 1)
    p = isl_printer_print_str(p, "[i]");
  p = isl_printer_print_str(p, ", buffer_");
  p = isl_printer_print_str(p, array->name);
  p = isl_printer_print_str(p, "[i], ");
  p = local_array->host_serialize
          ? autosa_array_info_print_serialize_size(p, array)
          : autosa_array_info_print_size(p, array);
  p = isl_printer_print_str(p, ");");
  p = isl_printer_end_line(p);

  /* Close the loop over the memory ports. */
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  return isl_printer_end_line(p);
}

/* Print a statement for copying an array to or from the device,
 * for merging drained results, or for initializing or clearing the device.
 *
 * The statement identifier of a copying node is called
 * "to_device_<array name>" or "from_device_<array name>" and
 * its user pointer points to the autosa_array_info of the array
 * that needs to be copied.
 * The node for initializing the device is called "init_device" and
 * the node for clearing the device is called "clear_device"; in both
 * cases the user pointer is interpreted as the autosa_kernel.
 * The node for merging drained results is called "drain_merge" and
 * its user pointer is interpreted as the autosa_drain_merge_func.
 *
 * Extract the identifier from "node" and dispatch to init_device_xilinx,
 * clear_device_xilinx, drain_merge_xilinx, copy_array_to_device_xilinx or
 * copy_array_from_device_xilinx.
 *
 * The identifier is classified before the local references to it are
 * released, and the name is checked for NULL before it is compared.
 * If the identifier has no name, or if a copying node has no associated
 * array, the printer is freed and NULL is returned.
 */
static __isl_give isl_printer *print_device_node_xilinx(__isl_take isl_printer *p,
                                                        __isl_keep isl_ast_node *node, 
                                                        struct autosa_prog *prog, 
                                                        int hls,
                                                        struct autosa_hw_top_module *top)
{
  isl_ast_expr *expr, *arg;
  isl_id *id;
  const char *name;
  void *user;
  int is_init = 0, is_clear = 0, is_drain_merge = 0, is_to_device = 0;
  int has_name;

  expr = isl_ast_node_user_get_expr(node);
  arg = isl_ast_expr_get_op_arg(expr, 0);
  id = isl_ast_expr_get_id(arg);
  name = isl_id_get_name(id);
  user = isl_id_get_user(id);

  /* Inspect the name while we still hold a reference to the id. */
  has_name = name != NULL;
  if (has_name)
  {
    is_init = !strcmp(name, "init_device");
    is_clear = !strcmp(name, "clear_device");
    is_drain_merge = !strcmp(name, "drain_merge");
    is_to_device = !prefixcmp(name, "to_device");
  }

  isl_id_free(id);
  isl_ast_expr_free(arg);
  isl_ast_expr_free(expr);

  if (!has_name)
    return isl_printer_free(p);
  if (is_init)
    return init_device_xilinx(p, prog, (struct autosa_kernel *)user, hls, top);
  if (is_clear)
    return clear_device_xilinx(p, prog, (struct autosa_kernel *)user, hls, top);
  if (is_drain_merge)
    return drain_merge_xilinx(p, prog, (struct autosa_drain_merge_func *)user, hls);

  /* Every remaining node is a copy node and needs an array. */
  if (!user)
    return isl_printer_free(p);
  if (is_to_device)
    return copy_array_to_device_xilinx(p, prog, (struct autosa_array_info *)user, hls);

  return copy_array_from_device_xilinx(p, prog, (struct autosa_array_info *)user, hls);
}

/* Print the start of a "krnl.setArg" statement for argument "n_arg":
 * a new line containing "OCL_CHECK(err, err = krnl.setArg(<n_arg>, ".
 * The caller prints the argument value and the closing "));".
 */
static __isl_give isl_printer *set_kernel_arguments_open_xilinx(
    __isl_take isl_printer *p, int n_arg)
{
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "OCL_CHECK(err, err = krnl.setArg(");
  p = isl_printer_print_int(p, n_arg);
  return isl_printer_print_str(p, ", ");
}

/* Print the OpenCL host statements that set the kernel arguments.
 * The arguments are numbered consecutively in the following order:
 * - arrays: for each kernel array that requires an argument, either the
 *   scalar itself or, for each I/O group reference j of the array,
 *   "buffer_<name>[<port>]", where <port> is entry (2 * j + 1) of
 *   group_ref_mem_port_map;
 * - parameters: the parameters of the space of kernel->arrays;
 * - host iterators: the set dimensions of kernel->space.
 *
 * If autosa_kernel_requires_array_argument reports an error (a negative
 * value) or if the number of dimensions of either space cannot be
 * determined, the printer is freed and NULL is returned.
 */
static __isl_give isl_printer *print_set_kernel_arguments_xilinx(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_kernel *kernel)
{
  int n_arg = 0;
  int n_param, n_iter;
  isl_space *space;

  /* array */
  for (int i = 0; i < kernel->n_array; ++i)
  {
    struct autosa_local_array_info *local_array = &kernel->array[i];
    int required;

    required = autosa_kernel_requires_array_argument(kernel, i);
    if (required < 0)
      return isl_printer_free(p);
    if (!required)
      continue;

    if (autosa_array_is_scalar(local_array->array))
    {
      /* Scalar: passed by value under its own name. */
      p = set_kernel_arguments_open_xilinx(p, n_arg);
      p = isl_printer_print_str(p, local_array->array->name);
      p = isl_printer_print_str(p, "));");
      p = isl_printer_end_line(p);
      n_arg++;
      continue;
    }

    for (int j = 0; j < local_array->n_io_group_refs; j++)
    {
      p = set_kernel_arguments_open_xilinx(p, n_arg);
      p = isl_printer_print_str(p, "buffer_");
      p = isl_printer_print_str(p, local_array->array->name);
      p = isl_printer_print_str(p, "[");
      p = isl_printer_print_int(p, local_array->group_ref_mem_port_map.at(j * 2 + 1));
      p = isl_printer_print_str(p, "]));");
      p = isl_printer_end_line(p);
      n_arg++;
    }
  }

  /* param */
  space = isl_union_set_get_space(kernel->arrays);
  /* A signed count keeps an error result from turning into a huge bound. */
  n_param = isl_space_dim(space, isl_dim_param);
  if (n_param < 0)
  {
    isl_space_free(space);
    return isl_printer_free(p);
  }
  for (int i = 0; i < n_param; ++i)
  {
    const char *name;
    name = isl_space_get_dim_name(space, isl_dim_param, i);

    p = set_kernel_arguments_open_xilinx(p, n_arg);
    p = isl_printer_print_str(p, name);
    p = isl_printer_print_str(p, "));");
    p = isl_printer_end_line(p);
    n_arg++;
  }
  isl_space_free(space);

  /* host iterator */
  n_iter = isl_space_dim(kernel->space, isl_dim_set);
  if (n_iter < 0)
    return isl_printer_free(p);
  for (int i = 0; i < n_iter; ++i)
  {
    const char *name;
    name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);

    p = set_kernel_arguments_open_xilinx(p, n_arg);
    p = isl_printer_print_str(p, name);
    p = isl_printer_print_str(p, "));");
    p = isl_printer_end_line(p);
    n_arg++;
  }

  return p;
}

/* Print the header of the given kernel to both gen->hls.kernel_h
 * and gen->hls.kernel_c.
 */
static void print_kernel_headers_xilinx(struct autosa_prog *prog,
                                        struct autosa_kernel *kernel, struct hls_info *hls)
{
  isl_printer *p;

  p = isl_printer_to_file(prog->ctx, hls->kernel_h);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  if (!hls->hls)
  {
    p = print_str_new_line(p, "extern \"C\" {");
  }
  p = print_kernel_header(p, prog, kernel, hls, 1);
  p = isl_printer_print_str(p, ";");
  p = isl_printer_end_line(p);
  if (!hls->hls)
  {
    p = print_str_new_line(p, "}");
  }

  isl_printer_free(p);

  if (hls->hcl) {
    /* Print the kernel declaration to a seperate file. */
    p = isl_printer_to_file(prog->ctx, hls->hcl_decl);
    p = isl_printer_set_output_format(p, ISL_FORMAT_C);
    p = print_kernel_header(p, prog, kernel, hls, 0);
    p = isl_printer_end_line(p);
    isl_printer_free(p);
  }  
}

/* Print the user statement of the host code to "p".
 *
 * The host code may contain original user statements, kernel launches,
 * and statements handled by print_device_node_xilinx.
 * The original user statements and the kernel launches have
 * an associated annotation, while the other statements do not.
 * The annotation on the user statements is called "user".
 *
 * In case of a kernel launch, print a block of statements that launches
 * the kernel.
 * For the OpenCL host (hls->hls is zero), the block sets the kernel
 * arguments, enqueues the kernel once as a warm-up and then enqueues it
 * 10 times between the "fpga_begin" and "fpga_end" time stamps.
 * For the HLS host, the block contains a direct call to "autosa_func"
 * (if the hcl option is set) or "kernel0".
 * In both cases, the top kernel header is printed afterwards through
 * print_kernel_headers_xilinx.
 */
static __isl_give isl_printer *print_host_user_xilinx(__isl_take isl_printer *p,
                                                      __isl_take isl_ast_print_options *print_options,
                                                      __isl_keep isl_ast_node *node, void *user)
{
  /* Lines following the kernel argument setup of the OpenCL host.
   * A NULL entry stands for a bare isl_printer_end_line call.
   */
  static const char *const opencl_launch_lines[] = {
      "q.finish();",
      NULL,
      "// Warm up",
      "OCL_CHECK(err, err = q.enqueueTask(krnl));",
      "q.finish();",
      NULL,
      "fpga_begin = std::chrono::high_resolution_clock::now();",
      NULL,
      "// Launch the kernel",
      "for (int i = 0; i < 10; i++)",
      "  OCL_CHECK(err, err = q.enqueueTask(krnl));",
      NULL,
      "q.finish();",
      "fpga_end = std::chrono::high_resolution_clock::now();",
  };
  const int n_opencl_lines =
      sizeof(opencl_launch_lines) / sizeof(opencl_launch_lines[0]);
  struct print_host_user_data *data = (struct print_host_user_data *)user;
  struct hls_info *hls = data->hls;
  struct autosa_hw_top_module *top = data->top;
  struct autosa_kernel *kernel;
  isl_id *annotation;
  void *payload;
  int is_user;

  isl_ast_print_options_free(print_options);

  annotation = isl_ast_node_get_annotation(node);
  if (!annotation)
    return print_device_node_xilinx(p, node, data->prog, hls->hls, top);

  /* The payload is a kernel statement for "user" annotations and
   * the kernel itself otherwise.
   */
  is_user = !strcmp(isl_id_get_name(annotation), "user");
  payload = isl_id_get_user(annotation);
  isl_id_free(annotation);

  if (is_user)
    return autosa_kernel_print_domain(p, (struct autosa_kernel_stmt *)payload);

  kernel = (struct autosa_kernel *)payload;

  p = ppcg_start_block(p);
  if (hls->hls) {
    /* Print HLS host. */
    p = print_str_new_line(p, "// Launch the kernel");
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(
        p, data->prog->scop->options->autosa->hcl ? "autosa_func" : "kernel0");
    p = isl_printer_print_str(p, "(");
    p = print_kernel_arguments(p, data->prog, kernel, 0, hls);
    p = isl_printer_print_str(p, ");");
    p = isl_printer_end_line(p);

    p = ppcg_end_block(p);
  } else {
    /* Print OpenCL host. */
    p = print_set_kernel_arguments_xilinx(p, data->prog, kernel);
    for (int k = 0; k < n_opencl_lines; ++k) {
      if (opencl_launch_lines[k])
        p = print_str_new_line(p, opencl_launch_lines[k]);
      else
        p = isl_printer_end_line(p);
    }

    p = ppcg_end_block(p);
    p = isl_printer_end_line(p);
  }

  /* Print the top kernel header. */
  print_kernel_headers_xilinx(data->prog, kernel, data->hls);

  return p;
}

/* Print the header of the given module, without a trailing newline.
 *
 * If the module has instance identifiers and the use_cplusplus_template
 * option is set, the header is preceded by a line of the form
 * "template<int p0, int p1, ...>" with one parameter per identifier.
 *
 * The function name is the module name, followed by "_intra_trans" if
 * "inter" is 0 or "_inter_trans" if "inter" is 1, and by "_boundary" if
 * "boundary" is set.  The argument list is printed by
 * print_module_arguments.
 */
static __isl_give isl_printer *print_module_header_xilinx(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_hw_module *module,
    int inter, int boundary)
{
  const int n_inst = isl_id_list_n_id(module->inst_ids);

  if (n_inst > 0 && prog->scop->options->autosa->use_cplusplus_template) {
    /* Print the index template */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "template<");
    for (int k = 0; k < n_inst; ++k) {
      if (k > 0)
        p = isl_printer_print_str(p, ", ");
      p = isl_printer_print_str(p, "int p");
      p = isl_printer_print_int(p, k);
    }
    p = isl_printer_print_str(p, ">");
    p = isl_printer_end_line(p);
  }

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "void ");
  p = isl_printer_print_str(p, module->name);

  switch (inter) {
  case 0:
    p = isl_printer_print_str(p, "_intra_trans");
    break;
  case 1:
    p = isl_printer_print_str(p, "_inter_trans");
    break;
  default:
    break;
  }
  if (boundary)
    p = isl_printer_print_str(p, "_boundary");

  p = isl_printer_print_str(p, "(");
  p = print_module_arguments(p, prog, module->kernel, module, 1, XILINX_HW, inter, -1, boundary, 0);
  return isl_printer_print_str(p, ")");
}

/* Print the header of the given module to both hls->kernel_h
 * and hls->kernel_c.
 * In kernel_h the header is terminated by ";" and a newline, making it
 * a declaration.  In kernel_c the header is left unterminated.
 *
 * If "inter" is -1, this is a normal module call.
 * If "inter" is 0, this is an intra_trans module call.
 * If "inter" is 1, this is an inter_trans module call.
 *
 * Always returns isl_stat_ok.
 */
static isl_stat print_module_headers_xilinx(
    struct autosa_prog *prog, struct autosa_hw_module *module,
    struct hls_info *hls, int inter, int boundary)
{
  isl_printer *decl;
  isl_printer *defn;

  /* Declaration in the header file. */
  decl = isl_printer_to_file(prog->ctx, hls->kernel_h);
  decl = isl_printer_set_output_format(decl, ISL_FORMAT_C);
  decl = print_module_header_xilinx(decl, prog, module, inter, boundary);
  decl = isl_printer_print_str(decl, ";");
  decl = isl_printer_end_line(decl);
  isl_printer_free(decl);

  /* Start of the definition in the source file. */
  defn = isl_printer_to_file(prog->ctx, hls->kernel_c);
  defn = isl_printer_set_output_format(defn, ISL_FORMAT_C);
  defn = print_module_header_xilinx(defn, prog, module, inter, boundary);
  isl_printer_free(defn);

  return isl_stat_ok;
}

/* Print a packed element type name of the form
 * "<array name><infix><n_lane>", e.g. with infix "_t" or "_s_t".
 */
static __isl_give isl_printer *module_var_print_packed_type_xilinx(
    __isl_take isl_printer *p, struct autosa_kernel_var *var,
    const char *infix)
{
  p = isl_printer_print_str(p, var->array->name);
  p = isl_printer_print_str(p, infix);
  return isl_printer_print_int(p, var->n_lane);
}

/* Print "<var name><suffix>", where "suffix" may be NULL for no suffix. */
static __isl_give isl_printer *module_var_print_name_xilinx(
    __isl_take isl_printer *p, struct autosa_kernel_var *var,
    const char *suffix)
{
  p = isl_printer_print_str(p, var->name);
  if (suffix)
    p = isl_printer_print_str(p, suffix);
  return p;
}

/* Finish the declaration of one buffer of "var" after its element type
 * has been printed on the current line, and print the associated pragmas.
 *
 * "suffix" ("_ping", "_pong" or NULL) is appended to the variable name.
 * "use_memory" is the memory type: 0: FF 1: LUTRAM 2: BRAM 3: URAM.
 *
 * The following is printed:
 * - " <name><suffix>[d0][d1]...;" to complete the declaration;
 * - a cyclic ARRAY_PARTITION pragma on the last dimension if a memory is
 *   used and n_part is different from 1, or a complete partition pragma
 *   if "use_memory" is 0;
 * - if a memory is used, a RESOURCE pragma selecting a one-port core when
 *   "module" is an I/O module with equal inter and intra data pack
 *   factors and a two-port core otherwise, followed by a DATA_PACK pragma
 *   if the array is sparse.
 */
static __isl_give isl_printer *module_var_print_buffer_tail_xilinx(
    __isl_take isl_printer *p, struct autosa_kernel_var *var,
    struct autosa_hw_module *module, int use_memory, const char *suffix)
{
  const int n_dim = isl_vec_size(var->size);
  const char *core;
  int d;

  /* Name, dimensions and terminating semicolon. */
  p = isl_printer_print_str(p, " ");
  p = module_var_print_name_xilinx(p, var, suffix);
  for (d = 0; d < n_dim; ++d) {
    isl_val *extent = isl_vec_get_element_val(var->size, d);

    p = isl_printer_print_str(p, "[");
    p = isl_printer_print_val(p, extent);
    isl_val_free(extent);
    p = isl_printer_print_str(p, "]");
  }
  p = isl_printer_print_str(p, ";");
  p = isl_printer_end_line(p);

  /* Array partitioning. */
  if (use_memory && var->n_part != 1) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "#pragma HLS ARRAY_PARTITION variable=");
    p = module_var_print_name_xilinx(p, var, suffix);
    p = isl_printer_print_str(p, " dim=");
    p = isl_printer_print_int(p, isl_vec_size(var->size));
    p = isl_printer_print_str(p, " factor=");
    p = isl_printer_print_int(p, var->n_part);
    p = isl_printer_print_str(p, " cyclic");
    p = isl_printer_end_line(p);
  } else if (use_memory == 0) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "#pragma HLS ARRAY_PARTITION variable=");
    p = module_var_print_name_xilinx(p, var, suffix);
    p = isl_printer_print_str(p, " dim=0 complete");
    p = isl_printer_end_line(p);
  }

  if (!use_memory)
    return p;

  /* Memory resource binding. */
  if (module->type == IO_MODULE &&
      module->data_pack_inter == module->data_pack_intra) {
    if (use_memory == 1)
      core = " core=RAM_1P_LUTRAM";
    else if (use_memory == 2)
      core = " core=RAM_1P_BRAM";
    else
      core = " core=RAM_1P_URAM";
  } else {
    if (use_memory == 1)
      core = " core=RAM_2P_LUTRAM";
    else if (use_memory == 2)
      core = " core=RAM_2P_BRAM";
    else
      core = " core=RAM_2P_URAM";
  }
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "#pragma HLS RESOURCE variable=");
  p = module_var_print_name_xilinx(p, var, suffix);
  p = isl_printer_print_str(p, core);
  p = isl_printer_end_line(p);

  if (var->array->local_array->is_sparse) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "#pragma HLS DATA_PACK variable=");
    p = module_var_print_name_xilinx(p, var, suffix);
    p = isl_printer_end_line(p);
  }

  return p;
}

/* Print out variable declarations on Xilinx platforms.
 * The local variable can be mapped to different memory resources:
 * FF, LUTRAM, BRAM, URAM, as determined by extract_memory_type.
 *
 * Without double buffering a single buffer named after the variable is
 * declared.  With double buffering, a "_ping" and a "_pong" buffer are
 * declared, each followed by its own set of pragmas.
 *
 * NOTE(review): the element type of the first buffer and that of the pong
 * buffer are selected by slightly different rules (see below) - confirm
 * that this asymmetry is intended.
 */
static __isl_give isl_printer *print_module_var_xilinx(
    __isl_take isl_printer *p,
    struct autosa_kernel_var *var, int double_buffer,
    struct autosa_hw_module *module)
{
  const int use_memory =
      extract_memory_type(module, var, module->options->autosa->uram);
  const int is_sparse = var->array->local_array->is_sparse;

  /* First (or only) buffer: the sparse type is used outside PE modules,
   * the packed "<name>_t<n_lane>" type in every other case.
   */
  p = isl_printer_start_line(p);
  if (is_sparse && module->type != PE_MODULE)
    p = module_var_print_packed_type_xilinx(p, var, "_s_t");
  else
    p = module_var_print_packed_type_xilinx(p, var, "_t");
  p = module_var_print_buffer_tail_xilinx(p, var, module, use_memory,
                                          double_buffer ? "_ping" : NULL);

  if (!double_buffer)
    return p;

  /* Print pong buffer: the sparse type is used for any sparse array and
   * the plain array type is used when there is a single lane.
   */
  p = isl_printer_start_line(p);
  if (is_sparse)
    p = module_var_print_packed_type_xilinx(p, var, "_s_t");
  else if (var->n_lane == 1)
    p = isl_printer_print_str(p, var->array->type);
  else
    p = module_var_print_packed_type_xilinx(p, var, "_t");

  return module_var_print_buffer_tail_xilinx(p, var, module, use_memory,
                                             "_pong");
}

/* Print the local declarations of "module".
 *
 * Declarations are only printed when "inter" is -1; for any other value
 * the printer is returned untouched.
 *
 * With "inter" equal to -1, every entry of module->var is printed through
 * print_module_var_xilinx().  If the module is double buffered, this is
 * followed by the "arb", "inter_trans_en" and "intra_trans_en" flags
 * (the latter two initialized depending on module->in) and by one
 * "<name>, <name>_prev" iterator declaration for each set dimension of
 * module->intra_space (if module->in is set) or module->inter_space
 * (otherwise).
 */
static __isl_give isl_printer *print_module_vars_xilinx(__isl_take isl_printer *p,
                                                        struct autosa_hw_module *module, int inter)
{
  if (inter != -1)
    return p;

  for (int v = 0; v < module->n_var; ++v)
    p = print_module_var_xilinx(p, &module->var[v], module->double_buffer, module);

  if (!module->double_buffer)
    return p;

  const char *iter_type = isl_options_get_ast_iterator_type(module->kernel->ctx);
  const char *inter_en_decl =
      module->in ? "bool inter_trans_en = 1;" : "bool inter_trans_en = 0;";
  const char *intra_en_decl =
      module->in ? "bool intra_trans_en = 0;" : "bool intra_trans_en = 1;";

  /* Double-buffer control flags. */
  p = isl_printer_end_line(
      isl_printer_print_str(isl_printer_start_line(p), "bool arb = 0;"));
  p = isl_printer_end_line(
      isl_printer_print_str(isl_printer_start_line(p), inter_en_decl));
  p = isl_printer_end_line(
      isl_printer_print_str(isl_printer_start_line(p), intra_en_decl));

  /* Iterators: each one is declared together with its "_prev" twin. */
  isl_space *iter_space = module->in ? module->intra_space : module->inter_space;
  int n_iter = isl_space_dim(iter_space, isl_dim_set);
  for (int d = 0; d < n_iter; d++)
  {
    const char *iter_name = isl_space_get_dim_name(iter_space, isl_dim_set, d);

    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, iter_type);
    p = isl_printer_print_str(p, " ");
    p = isl_printer_print_str(p, iter_name);
    p = isl_printer_print_str(p, ", ");
    p = isl_printer_print_str(p, iter_name);
    p = isl_printer_print_str(p, "_prev");
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
  }

  return p;
}

//static __isl_give isl_printer *print_module_stmt(__isl_take isl_printer *p,
//                                                 __isl_take isl_ast_print_options *print_options,
//                                                 __isl_keep isl_ast_node *node, void *user)
//{
//  isl_id *id;
//  struct autosa_kernel_stmt *stmt;
//  struct print_hw_module_data *hw_data = (struct print_hw_module_data *)(user);
//  struct autosa_hw_module *module = hw_data->module;
//
//  id = isl_ast_node_get_annotation(node);
//  stmt = (struct autosa_kernel_stmt *)isl_id_get_user(id);
//  isl_id_free(id);
//
//  isl_ast_print_options_free(print_options);
//
//  switch (stmt->type)
//  {
//    //    case POLYSA_KERNEL_STMT_COPY:
//    //      return autosa_kernel_print_copy(p, stmt);
//    //    case POLYSA_KERNEL_STMT_SYNC:
//    //      return print_sync(p, stmt);
//  case AUTOSA_KERNEL_STMT_DOMAIN:
//    return autosa_kernel_print_domain(p, stmt);
//  case AUTOSA_KERNEL_STMT_IO:
//    return autosa_kernel_print_io(p, stmt, hw_data->hls);
//  case AUTOSA_KERNEL_STMT_IO_TRANSFER:
//    return autosa_kernel_print_io_transfer(p, stmt, hw_data->hls, 
//              module->options->autosa->double_buffer_style == 0?
//                hw_data->iterator_prefix : NULL);
//  case AUTOSA_KERNEL_STMT_IO_DRAM:
//    return autosa_kernel_print_io_dram(p, stmt, hw_data->hls);
//  case AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTER_TRANS:
//    return autosa_kernel_print_inter_trans(p, stmt, hw_data->hls);
//  case AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTRA_TRANS:
//    return autosa_kernel_print_intra_trans(p, stmt, hw_data->hls);
//  case AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTER_INTRA:
//    return autosa_kernel_print_inter_intra(p, stmt, hw_data->hls);
//  case AUTOSA_KERNEL_STMT_IO_MODULE_CALL_INTRA_INTER:
//    return autosa_kernel_print_intra_inter(p, stmt, hw_data->hls);
//  case AUTOSA_KERNEL_STMT_IO_MODULE_CALL_STATE_HANDLE:
//    return autosa_kernel_print_state_handle(p, stmt, hw_data->hls);
//  case AUTOSA_KERNEL_STMT_DRAIN_MERGE:
//    return autosa_kernel_print_drain_merge(p, stmt, hw_data->hls);
//  case AUTOSA_KERNEL_STMT_HOST_SERIALIZE:
//    return autosa_kernel_print_host_serialize(p, stmt, hw_data->hls);
//  }
//
//  return p;
//}

/* Print the for loop "node", preceded by a line holding
 * "#pragma HLS PIPELINE II=1".
 */
static __isl_give isl_printer *print_for_with_pipeline(
    __isl_keep isl_ast_node *node, __isl_take isl_printer *p,
    __isl_take isl_ast_print_options *print_options)
{
  p = isl_printer_end_line(
      isl_printer_print_str(isl_printer_start_line(p), "#pragma HLS PIPELINE II=1"));

  return isl_ast_node_for_print(node, p, print_options);
}

/* Print the for loop "node", preceded by a line holding
 * "#pragma HLS UNROLL".
 */
static __isl_give isl_printer *print_for_with_unroll(
    __isl_keep isl_ast_node *node, __isl_take isl_printer *p,
    __isl_take isl_ast_print_options *print_options)
{
  p = isl_printer_end_line(
      isl_printer_print_str(isl_printer_start_line(p), "#pragma HLS UNROLL"));

  return isl_ast_node_for_print(node, p, print_options);
}

/* Callback for printing a for node.
 *
 * The annotation of "node", if any, carries a
 * struct autosa_ast_node_userinfo.  If that info marks the loop as
 * pipelined, the loop is printed with a PIPELINE pragma; otherwise, if it
 * marks the loop as unrolled, the loop is printed with an UNROLL pragma.
 * In all other cases the loop is printed without any pragma.
 * The pipeline mark takes precedence over the unroll mark.
 *
 * "user" is not used.
 */
static __isl_give isl_printer *print_for_xilinx(__isl_take isl_printer *p,
                                                __isl_take isl_ast_print_options *print_options,
                                                __isl_keep isl_ast_node *node, void *user)
{
  struct autosa_ast_node_userinfo *info = NULL;
  isl_id *annotation = isl_ast_node_get_annotation(node);

  if (annotation)
    info = (struct autosa_ast_node_userinfo *)isl_id_get_user(annotation);

  if (info && info->is_pipeline)
    p = print_for_with_pipeline(node, p, print_options);
  else if (info && info->is_unroll)
    p = print_for_with_unroll(node, p, print_options);
  else
    p = isl_ast_node_for_print(node, p, print_options);

  /* Release our reference only after the loop has been printed. */
  isl_id_free(annotation);

  return p;
}

///* This function simply skips all for loops to print. */
//static __isl_give isl_printer *print_for_skip(__isl_take isl_printer *p,
//                                              __isl_take isl_ast_print_options *print_options,
//                                              __isl_keep isl_ast_node *node, void *user)
//{
//  return p;
//}

/* Print the definition of the intra_trans function of "module".
 *
 * Nothing is printed if the module has no intra_tree.
 *
 * The function header is emitted by print_module_headers_xilinx(), while
 * the opening brace, the INLINE pragma and the closing brace are written
 * directly to hls->kernel_c.  For the Xilinx target, a double-buffered
 * module is marked "INLINE OFF" since inlining it might cause deadlocks;
 * any other module is marked "INLINE" to reduce the overheads.
 *
 * The body consists of the variable declarations, an early return on
 * "!intra_trans_en" for double-buffered modules, the initialization of
 * every variable that has init_required set (needed for local reduce)
 * and finally the AST stored in module->intra_tree.
 */
static __isl_give isl_printer *autosa_print_intra_trans_module(
    __isl_take isl_printer *p,
    struct autosa_hw_module *module, struct autosa_prog *prog,
    struct hls_info *hls, int boundary)
{
  if (!module->intra_tree)
    return p;

  isl_ctx *ctx = isl_printer_get_ctx(p);
  struct print_hw_module_data hw_data = {hls, prog, module, NULL};
  isl_ast_print_options *print_options;

  p = isl_printer_end_line(
      isl_printer_print_str(isl_printer_start_line(p), "/* Module Definition */"));

  /* Function header, opening brace and inlining directive. */
  print_module_headers_xilinx(prog, module, hls, 0, boundary);
  fprintf(hls->kernel_c, " {\n");
  if (hls->target == XILINX_HW)
    fputs(module->double_buffer ? "#pragma HLS INLINE OFF\n" : "#pragma HLS INLINE\n",
          hls->kernel_c);

  /* Declarations. */
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  if (!prog->scop->options->autosa->use_cplusplus_template)
    p = print_module_iterators(p, hls->kernel_c, module);
  p = print_module_vars_xilinx(p, module, 0);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  if (module->double_buffer)
  {
    p = isl_printer_end_line(
        isl_printer_print_str(isl_printer_start_line(p), "if (!intra_trans_en) return;"));
    p = isl_printer_end_line(p);
  }

  /* For local reduce, print the buffer initialization. */
  for (int v = 0; v < module->n_var; v++) {
    if (!module->var[v].init_required)
      continue;
    p = autosa_print_var_initialization(p, &module->var[v], hls->target);
  }
  p = isl_printer_end_line(p);

  /* Body. */
  print_options = isl_ast_print_options_set_print_user(
      isl_ast_print_options_alloc(ctx), &print_module_stmt, &hw_data);
  if (hls->target == XILINX_HW)
    print_options = isl_ast_print_options_set_print_for(
        print_options, &print_for_xilinx, &hw_data);

  p = isl_ast_node_print(module->intra_tree, p, print_options);
  p = isl_printer_indent(p, -2);

  fprintf(hls->kernel_c, "}\n");
  p = isl_printer_end_line(
      isl_printer_print_str(isl_printer_start_line(p), "/* Module Definition */"));

  return isl_printer_end_line(p);
}

/* Print the definition of the inter_trans function of "module".
 *
 * The AST that is printed is module->boundary_inter_tree if "boundary"
 * is set and module->inter_tree otherwise.  Nothing is printed if the
 * selected tree does not exist.
 *
 * The opening brace, the INLINE pragma and the closing brace are written
 * directly to hls->kernel_c.  The header, the pragma (INLINE OFF for
 * double-buffered modules, INLINE otherwise), the module variables and
 * the for-loop callback are only emitted for the Xilinx target.
 * Double-buffered modules start with an early return on
 * "!inter_trans_en".
 */
static __isl_give isl_printer *autosa_print_inter_trans_module(
    __isl_take isl_printer *p,
    struct autosa_hw_module *module, struct autosa_prog *prog,
    struct hls_info *hls, int boundary)
{
  isl_ast_node *tree = boundary ? module->boundary_inter_tree : module->inter_tree;

  if (!tree)
    return p;

  const int is_xilinx = (hls->target == XILINX_HW);
  isl_ctx *ctx = isl_printer_get_ctx(p);
  struct print_hw_module_data hw_data = {hls, prog, module, NULL};
  isl_ast_print_options *print_options;

  p = isl_printer_end_line(
      isl_printer_print_str(isl_printer_start_line(p), "/* Module Definition */"));

  /* Function header, opening brace and inlining directive. */
  if (is_xilinx)
    print_module_headers_xilinx(prog, module, hls, 1, boundary);
  fprintf(hls->kernel_c, " {\n");
  if (is_xilinx)
    fputs(module->double_buffer ? "#pragma HLS INLINE OFF\n" : "#pragma HLS INLINE\n",
          hls->kernel_c);

  /* Declarations. */
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  if (!prog->scop->options->autosa->use_cplusplus_template)
    p = print_module_iterators(p, hls->kernel_c, module);
  if (is_xilinx)
    p = print_module_vars_xilinx(p, module, 1);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  if (module->double_buffer)
  {
    p = isl_printer_end_line(
        isl_printer_print_str(isl_printer_start_line(p), "if (!inter_trans_en) return;"));
    p = isl_printer_end_line(p);
  }

  /* Body. */
  print_options = isl_ast_print_options_set_print_user(
      isl_ast_print_options_alloc(ctx), &print_module_stmt, &hw_data);
  if (is_xilinx)
    print_options = isl_ast_print_options_set_print_for(
        print_options, &print_for_xilinx, &hw_data);

  p = isl_ast_node_print(tree, p, print_options);
  p = isl_printer_indent(p, -2);

  fprintf(hls->kernel_c, "}\n");
  p = isl_printer_end_line(
      isl_printer_print_str(isl_printer_start_line(p), "/* Module Definition */"));

  return isl_printer_end_line(p);
}

///* Print the drained data merge functions. 
// */
//static isl_stat print_drain_merge_funcs(
//    struct autosa_kernel *kernel,
//    struct autosa_drain_merge_func **funcs, int n_funcs,
//    struct hls_info *hls)
//{
//  isl_printer *p;
//  isl_ctx *ctx;
//
//  if (n_funcs == 0)
//    return isl_stat_ok;
//
//  ctx = kernel->ctx;
//  if (!hls->hls)
//    p = isl_printer_to_file(kernel->ctx, hls->host_h);
//  else
//    p = isl_printer_to_file(kernel->ctx, hls->kernel_h);
//  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
//  for (int i = 0; i < n_funcs; i++)
//  {
//    struct autosa_array_ref_group *group = funcs[i]->group;
//    isl_ast_print_options *print_options;
//    struct print_hw_module_data hw_data = {hls, NULL, NULL, NULL};
//
//    p = print_str_new_line(p, "/* Helper Function */");
//    p = isl_printer_start_line(p);
//    if (hls->hls)
//      p = isl_printer_print_str(p, "inline ");
//    p = isl_printer_print_str(p, "void ");
//    p = autosa_array_ref_group_print_prefix(group, p);
//    p = isl_printer_print_str(p, "_drain_merge(");
//    p = print_drain_merge_arguments(p, kernel, group, funcs[i], 1, hls->hls);
//    p = isl_printer_print_str(p, "){");
//    p = isl_printer_end_line(p);
//    p = isl_printer_indent(p, 2);
//
//    p = print_str_new_line(p, "/* Variable Declaration */");
//    if (!hls->hls)
//      print_func_iterators(hls->host_h, funcs[i]);
//    else
//      print_func_iterators(hls->kernel_h, funcs[i]);
//    p = print_str_new_line(p, "/* Variable Declaration */");
//    p = isl_printer_end_line(p);
//
//    print_options = isl_ast_print_options_alloc(ctx);
//    print_options = isl_ast_print_options_set_print_user(print_options,
//                                                         &print_module_stmt, &hw_data);
//    p = isl_ast_node_print(funcs[i]->device_tree, p, print_options);
//
//    p = isl_printer_indent(p, -2);
//    p = print_str_new_line(p, "}");
//    p = print_str_new_line(p, "/* Helper Function */");
//  }
//  p = isl_printer_end_line(p);
//  isl_printer_free(p);
//
//  return isl_stat_ok;
//}

/* Print "open", followed by the "n" comma-separated entries
 * "<prefix>0", "<prefix>1", ..., and a closing ">".
 */
static __isl_give isl_printer *print_core_header_param_list(
    __isl_take isl_printer *p, const char *open, const char *prefix, int n)
{
  p = isl_printer_print_str(p, open);
  for (int k = 0; k < n; k++) {
    if (k != 0)
      p = isl_printer_print_str(p, ", ");
    p = isl_printer_print_str(p, prefix);
    p = isl_printer_print_int(p, k);
  }

  return isl_printer_print_str(p, ">");
}

/* Print the header of the core function of "module", either as a
 * declaration ("types" set) or as a call ("types" not set).
 *
 * The function name is the module name followed by "_intra_trans"
 * ("inter" equal to 0) or "_inter_trans" ("inter" equal to 1), by
 * "_boundary" if "boundary" is set and by "_serialize" if "serialize"
 * is set.
 *
 * If the module has instance ids and C++ templates are enabled, a
 * declaration is preceded by a "template<int p0, ...>" line, while a
 * call gets the matching "<p0, ...>" argument list appended to the name.
 * For a call, the arguments are printed on a new line with an extra
 * indentation of 2 that is undone again before returning.
 */
static __isl_give isl_printer *print_module_core_header_xilinx(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_hw_module *module,
    int inter, int boundary, int serialize, int types)
{
  const int n_inst = isl_id_list_n_id(module->inst_ids);
  const int templated =
      n_inst > 0 && prog->scop->options->autosa->use_cplusplus_template;

  if (types && templated) {
    /* Print the template */
    p = isl_printer_start_line(p);
    p = print_core_header_param_list(p, "template<", "int p", n_inst);
    p = isl_printer_end_line(p);
  }

  p = isl_printer_start_line(p);
  if (types)
    p = isl_printer_print_str(p, "void ");
  p = isl_printer_print_str(p, module->name);
  switch (inter) {
  case 0:
    p = isl_printer_print_str(p, "_intra_trans");
    break;
  case 1:
    p = isl_printer_print_str(p, "_inter_trans");
    break;
  default:
    break;
  }
  if (boundary)
    p = isl_printer_print_str(p, "_boundary");
  if (serialize)
    p = isl_printer_print_str(p, "_serialize");
  if (!types && templated)
    p = print_core_header_param_list(p, "<", "p", n_inst);

  p = isl_printer_print_str(p, "(");
  if (!types) {
    /* Call arguments go on their own, further indented line. */
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, 2);
    p = isl_printer_start_line(p);
  }
  p = print_module_arguments(p, prog, module->kernel, module, types,
                             XILINX_HW, inter, -1, boundary, serialize);
  p = isl_printer_print_str(p, ")");

  return types ? p : isl_printer_indent(p, -2);
}

/* Print the core function header of "module" by forwarding to
 * print_module_core_header_xilinx().
 *
 * "hls" is accepted but not used.
 */
static __isl_give isl_printer *print_module_core_headers_xilinx(
    __isl_take isl_printer *p, struct autosa_prog *prog,
    struct autosa_hw_module *module, struct hls_info *hls,
    int inter, int boundary, int serialize, int types)
{
  return print_module_core_header_xilinx(p, prog, module, inter, boundary,
                                         serialize, types);
}

/* Print the declaration header of the wrapper function of "module".
 *
 * The function name is the module name followed by "_intra_trans"
 * ("inter" equal to 0) or "_inter_trans" ("inter" equal to 1), by
 * "_boundary" if "boundary" is set and finally by "_wrapper".
 * If the module has instance ids and C++ templates are enabled, the
 * header is preceded by a "template<int p0, ...>" line.
 * The arguments are printed together with their types.
 */
static __isl_give isl_printer *print_module_wrapper_header_xilinx(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_hw_module *module,
    int inter, int boundary)
{
  const int n_inst = isl_id_list_n_id(module->inst_ids);

  if (n_inst > 0 && prog->scop->options->autosa->use_cplusplus_template) {
    int k = 0;

    p = isl_printer_print_str(isl_printer_start_line(p), "template<");
    while (k < n_inst) {
      if (k != 0)
        p = isl_printer_print_str(p, ", ");
      p = isl_printer_print_str(p, "int p");
      p = isl_printer_print_int(p, k);
      ++k;
    }
    p = isl_printer_end_line(isl_printer_print_str(p, ">"));
  }

  p = isl_printer_print_str(isl_printer_start_line(p), "void ");
  p = isl_printer_print_str(p, module->name);
  switch (inter) {
  case 0:
    p = isl_printer_print_str(p, "_intra_trans");
    break;
  case 1:
    p = isl_printer_print_str(p, "_inter_trans");
    break;
  default:
    break;
  }
  if (boundary)
    p = isl_printer_print_str(p, "_boundary");
  p = isl_printer_print_str(p, "_wrapper");
  p = isl_printer_print_str(p, "(");
  p = print_module_arguments(p, prog, module->kernel, module, 1,
                             XILINX_HW, inter, -1, boundary, 0);

  return isl_printer_print_str(p, ")");
}

/* Print the wrapper function header of "module" to both output files.
 *
 * hls->kernel_h receives the header terminated by ";" (a prototype),
 * hls->kernel_c receives the bare header.  A fresh C-format printer is
 * created and freed for each of the two files, in that order.
 *
 * Always returns isl_stat_ok.
 */
static isl_stat print_module_wrapper_headers_xilinx(
    struct autosa_prog *prog, struct autosa_hw_module *module,
    struct hls_info *hls, int inter, int boundary)
{
  FILE *outputs[2] = {hls->kernel_h, hls->kernel_c};

  for (int k = 0; k < 2; k++) {
    isl_printer *p;

    p = isl_printer_set_output_format(
        isl_printer_to_file(prog->ctx, outputs[k]), ISL_FORMAT_C);
    p = print_module_wrapper_header_xilinx(p, prog, module, inter, boundary);
    /* Only the prototype in the header file is terminated. */
    if (k == 0)
      p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
    isl_printer_free(p);
  }

  return isl_stat_ok;
}

/* Print the body for a module that connects to the DRAM with serialized data. 
 */
//static __isl_give isl_printer *print_module_serialize_body(
//    __isl_take isl_printer *p, struct autosa_hw_module *module)
//{
//  isl_pw_qpolynomial *total_bound_pwq = module->io_groups[0]->array->local_array->serialize_bound;
//  long int total_bound = -1;  
//  int ele_size = module->io_groups[0]->array->size; // bytes
//  total_bound = convert_pwqpoly_to_int(total_bound_pwq);
//  int data_pack_in = module->data_pack_serialize;
//  int data_pack_out = module->data_pack_inter;  
//
//  if (data_pack_in == data_pack_out) {    
//    if (module->in) {
//      p = isl_printer_start_line(p);
//      p = isl_printer_print_str(p, "for (int i = 0; i < ");
//      p = isl_printer_print_int(p, total_bound / data_pack_out);
//      p = isl_printer_print_str(p, "; i++) {");
//      p = isl_printer_end_line(p);
//    
//      p = print_str_new_line(p, "#pragma HLS PIPELINE II=1");
//      p = isl_printer_indent(p, 2);
//      p = isl_printer_start_line(p);
//      p = autosa_print_array_type_with_lane(p, module->io_groups[0]->array, data_pack_in);      
//      p = isl_printer_print_str(p, " fifo_data;");
//      p = isl_printer_end_line(p);
//
//      p = isl_printer_start_line(p);
//      p = isl_printer_print_str(p, "fifo_data = ");
//      p = isl_printer_print_str(p, module->io_groups[0]->array->name);
//      p = isl_printer_print_str(p, "[i];");
//      p = isl_printer_end_line(p);
//
//      p = isl_printer_start_line(p);
//      p = autosa_array_ref_group_print_fifo_name(module->io_groups[0], p);
//      p = isl_printer_print_str(p, "_local_out.write(fifo_data);");      
//      p = isl_printer_end_line(p);
//
//      p = isl_printer_indent(p, -2);
//      p = print_str_new_line(p, "}");
//    } else {
//      p = isl_printer_start_line(p);
//      p = isl_printer_print_str(p, "for (int i = 0; i < ");
//      p = isl_printer_print_int(p, total_bound / data_pack_out);
//      p = isl_printer_print_str(p, "; i++) {");
//      p = isl_printer_end_line(p);
//    
//      p = print_str_new_line(p, "#pragma HLS PIPELINE II=1");
//      p = isl_printer_indent(p, 2);
//      p = isl_printer_start_line(p);
//      p = autosa_print_array_type_with_lane(p, module->io_groups[0]->array, data_pack_out);      
//      p = isl_printer_print_str(p, " fifo_data;");
//      p = isl_printer_end_line(p);
//
//      p = isl_printer_start_line(p);
//      p = isl_printer_print_str(p, "fifo_data = ");
//      p = autosa_array_ref_group_print_fifo_name(module->io_groups[0], p);
//      p = isl_printer_print_str(p, "_local_in.read();");
//      //p = isl_printer_print_str(p, "fifo_data = fifo_");
//      //p = isl_printer_print_str(p, module->io_groups[0]->array->name);
//      //if (module->type == DRAIN_MODULE)      
//        //p = isl_printer_print_str(p, "_drain");
//      //p = isl_printer_print_str(p, "_local_in.read();");
//      p = isl_printer_end_line(p);
//
//      p = isl_printer_start_line(p);
//      p = isl_printer_print_str(p, module->io_groups[0]->array->name);
//      p = isl_printer_print_str(p, "[i] = fifo_data;");
//      p = isl_printer_end_line(p);
//
//      p = isl_printer_indent(p, -2);
//      p = print_str_new_line(p, "}");
//    }
//  } else {    
//    if (module->in) {
//      /* [type] fifo_data; */
//      p = isl_printer_start_line(p);
//      p = autosa_print_array_type_with_lane(p, module->io_groups[0]->array, data_pack_out);
//      p = isl_printer_print_str(p, " fifo_data;");
//      p = isl_printer_end_line(p);
//
//      /* [type2] mem_data; */
//      p = isl_printer_start_line(p);
//      p = autosa_print_array_type_with_lane(p, module->io_groups[0]->array, data_pack_in);
//      p = isl_printer_print_str(p, " mem_data;");
//      p = isl_printer_end_line(p);
//      
//      p = isl_printer_start_line(p);
//      if (data_pack_out == 1) {
//        /* union {unsigned int ui; [type] ut;} u; */
//        p = isl_printer_print_str(p, "union {unsigned int ui; ");
//        p = isl_printer_print_str(p, module->io_groups[0]->array->type);
//        p = isl_printer_print_str(p, " ut;} u;");        
//      }
//      p = isl_printer_end_line(p);
//
//      p = isl_printer_start_line(p);
//      p = isl_printer_print_str(p, "for (int i = 0; i < ");
//      p = isl_printer_print_int(p, total_bound / data_pack_in);
//      p = isl_printer_print_str(p, "; i++) {");
//      p = isl_printer_end_line(p);
//    
//      p = print_str_new_line(p, "#pragma HLS PIPELINE II=1");
//      p = isl_printer_indent(p, 2);
//
//      /* mem_data = array[]; */
//      p = isl_printer_start_line(p);
//      p = isl_printer_print_str(p, "mem_data = ");
//      p = isl_printer_print_str(p, module->io_groups[0]->array->name);
//      p = isl_printer_print_str(p, "[i];");
//      p = isl_printer_end_line(p);
//
//      p = isl_printer_start_line(p);
//      p = isl_printer_print_str(p, "for (int p = 0; p < ");
//      p = isl_printer_print_int(p, data_pack_in / data_pack_out);
//      p = isl_printer_print_str(p, "; p++) {");
//      p = isl_printer_end_line(p);
//      p = isl_printer_indent(p, 2);
//
//      /* fifo_data = mem_data(..,..); */
//      p = isl_printer_start_line(p);
//      if (data_pack_out == 1) {
//        p = isl_printer_print_str(p, "u.ui = (unsigned int)mem_data(");
//        p = isl_printer_print_int(p, ele_size * data_pack_out * 8 - 1);
//        p = isl_printer_print_str(p, ", 0);");
//        p = isl_printer_end_line(p);
//
//        p = print_str_new_line(p, "fifo_data = u.ut;");
//      } else {
//        p = isl_printer_print_str(p, "fifo_data = mem_data(");
//        p = isl_printer_print_int(p, ele_size * data_pack_out * 8 - 1);
//        p = isl_printer_print_str(p, ", 0);");
//      }
//      p = isl_printer_end_line(p);
//
//      /* mem_data = mem_data >> .. */
//      p = isl_printer_start_line(p);
//      p = isl_printer_print_str(p, "mem_data = mem_data >> ");
//      p = isl_printer_print_int(p, ele_size * data_pack_out * 8);
//      p = isl_printer_print_str(p, ";");
//      p = isl_printer_end_line(p);
//
//      p = isl_printer_start_line(p);
//      p = autosa_array_ref_group_print_fifo_name(module->io_groups[0], p);
//      p = isl_printer_print_str(p, "_local_out.write(fifo_data);");
//      //p = isl_printer_print_str(p, "fifo_");
//      //p = isl_printer_print_str(p, module->io_groups[0]->array->name);
//      //p = isl_printer_print_str(p, "_local_out.write(fifo_data);");
//      p = isl_printer_end_line(p);
//
//      p = isl_printer_indent(p, -2);
//      p = print_str_new_line(p, "}");
//
//      p = isl_printer_indent(p, -2);
//      p = print_str_new_line(p, "}");
//    } else {
//      p = isl_printer_start_line(p);
//      p = isl_printer_print_str(p, "for (int i = 0; i < ");
//      p = isl_printer_print_int(p, total_bound / data_pack_in);
//      p = isl_printer_print_str(p, "; i++) {");
//      p = isl_printer_end_line(p);
//    
//      p = print_str_new_line(p, "#pragma HLS PIPELINE II=1");
//      p = isl_printer_indent(p, 2);
//
//      /* [type] fifo_data; */
//      p = isl_printer_start_line(p);
//      p = autosa_print_array_type_with_lane(p, module->io_groups[0]->array, data_pack_out);      
//      p = isl_printer_print_str(p, " fifo_data;");
//      p = isl_printer_end_line(p);      
//
//      /* [type2] mem_data; */
//      p = isl_printer_start_line(p);
//      p = autosa_print_array_type_with_lane(p, module->io_groups[0]->array, data_pack_in);      
//      p = isl_printer_print_str(p, " mem_data;");
//      p = isl_printer_end_line(p);      
//
//      if (data_pack_out == 1) {
//        /* union {unsigned int ui; [type] ut;} u; */
//        p = isl_printer_start_line(p);
//        p = isl_printer_print_str(p, "union {unsigned int ui; ");
//        p = isl_printer_print_str(p, module->io_groups[0]->array->type);
//        p = isl_printer_print_str(p, " ut;} u;");        
//        p = isl_printer_end_line(p);
//      }
//
//      p = isl_printer_start_line(p);
//      if (data_pack_out == 1) {
//        p = isl_printer_print_str(p, "ap_uint<");
//        p = isl_printer_print_int(p, module->io_groups[0]->array->size * 8);
//        p = isl_printer_print_str(p, ">");
//      } else {
//        p = autosa_print_array_type_with_lane(p, module->io_groups[0]->array, data_pack_out);
//      }      
//      p = isl_printer_print_str(p, " mem_data_split[");
//      p = isl_printer_print_int(p, data_pack_in / data_pack_out);
//      p = isl_printer_print_str(p, "];");
//      p = isl_printer_end_line(p);
//
//      p = print_str_new_line(p, "#pragma HLS ARRAY_PARTITION variable=mem_data_split complete");
//
//      p = isl_printer_start_line(p);
//      p = isl_printer_print_str(p, "for (int p = 0; p < ");
//      p = isl_printer_print_int(p, data_pack_in / data_pack_out);
//      p = isl_printer_print_str(p, "; p++) {");
//      p = isl_printer_end_line(p);
//      p = isl_printer_indent(p, 2);
//
//      p = isl_printer_start_line(p);
//      p = isl_printer_print_str(p, "fifo_data = ");
//      p = autosa_array_ref_group_print_fifo_name(module->io_groups[0], p);
//      p = isl_printer_print_str(p, "_local_in.read();");
//      //p = isl_printer_print_str(p, "fifo_data = fifo_");
//      //p = isl_printer_print_str(p, module->io_groups[0]->array->name);
//      //if (module->type == DRAIN_MODULE)      
//        //p = isl_printer_print_str(p, "_drain");
//      //p = isl_printer_print_str(p, "_local_in.read();");
//      p = isl_printer_end_line(p);
//
//      if (data_pack_out == 1) {
//        p = print_str_new_line(p, "u.ut = fifo_data;");
//
//        p = isl_printer_start_line(p);
//        p = isl_printer_print_str(p, "mem_data_split[n] = ap_uint<");
//        p = isl_printer_print_int(p, module->io_groups[0]->array->size * 8);
//        p = isl_printer_print_str(p, ">(u.ui);");
//        p = isl_printer_end_line(p);
//      } else {
//        p = print_str_new_line(p, "mem_data_split[p] = fifo_data;");
//      }
//      
//      p = isl_printer_indent(p, -2);
//      p = print_str_new_line(p, "}");
//
//      p = isl_printer_start_line(p);
//      p = isl_printer_print_str(p, "mem_data = (");
//      for (int i = data_pack_in / data_pack_out - 1; i >= 0; i--) {
//        if (i < data_pack_in / data_pack_out - 1)
//          p = isl_printer_print_str(p, ", ");
//        p = isl_printer_print_str(p, "mem_data_split[");
//        p = isl_printer_print_int(p, i);
//        p = isl_printer_print_str(p, "]");
//      }
//      p = isl_printer_print_str(p, ");");
//      p = isl_printer_end_line(p);
//
//      p = isl_printer_start_line(p);
//      p = isl_printer_print_str(p, module->io_groups[0]->array->name);
//      p = isl_printer_print_str(p, "[i] = mem_data;");
//      p = isl_printer_end_line(p);
//
//      p = isl_printer_indent(p, -2);
//      p = print_str_new_line(p, "}");
//    }
//  }
//
//  return p;
//}

/* Print the serialization module that connects the external memory to the
 * top-level I/O module.
 *
 * For the Xilinx target, the function header is printed through
 * print_module_core_headers_xilinx() with "serialize" and "types" set.
 * The opening brace, the "INLINE OFF" pragma and the closing brace are
 * written directly to hls->kernel_c.
 * The body holds the module iterators (unless C++ templates are enabled)
 * followed by the code produced by print_module_serialize_body().
 *
 * "boundary" is only forwarded to the header printer.
 */
static __isl_give isl_printer *autosa_print_serialize_module(
  __isl_take isl_printer *p,
  struct autosa_hw_module *module, struct autosa_prog *prog,
  struct hls_info *hls, int boundary)
{
  /* Print core. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  if (hls->target == XILINX_HW)
    p = print_module_core_headers_xilinx(p, prog, module, hls, -1, boundary, 1, 1); // TODO
  fprintf(hls->kernel_c, " {\n");
  fprintf(hls->kernel_c, "#pragma HLS INLINE OFF\n");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  if (!prog->scop->options->autosa->use_cplusplus_template) {
    p = print_module_iterators(p, hls->kernel_c, module);
  }
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  /* The body is generated directly; no AST is printed here, so no
   * isl_ast_print_options are needed.
   */
  p = print_module_serialize_body(p, module, hls);
  p = isl_printer_indent(p, -2);
  fprintf(hls->kernel_c, "}\n");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = isl_printer_end_line(p);
  return p;
}

/* Print the default (non-dummy) hardware module.
 *
 * The module body is generated from module->device_tree, or from
 * module->boundary_tree when "boundary" is set; if the selected tree is
 * missing, nothing is printed and "p" is returned untouched.
 * For PE modules and for level-1 I/O modules a wrapper function that only
 * calls the core function is printed as well (Xilinx target only), to speed
 * up the HLS synthesis. For the rest of the modules, wrapper is disabled.
 * If module->to_mem is set and the host_serialize option is on, the
 * serialize module is printed last.
 *
 * Braces and pragmas are written with fprintf to hls->kernel_c while the
 * rest goes through "p", so the order of the statements matters.
 */
static __isl_give isl_printer *autosa_print_default_module(
  __isl_take isl_printer *p,
  struct autosa_hw_module *module, struct autosa_prog *prog,
  struct hls_info *hls, int boundary)
{
  /* Nothing to do without an AST for the requested variant. */
  if (!(boundary ? module->boundary_tree : module->device_tree))
    return p;

  /* Wrapper is generated for PE modules and for L1 I/O modules. */
  const bool use_wrapper = (module->type == PE_MODULE) || (module->level == 1);
  struct print_hw_module_data stmt_ctx = {hls, prog, module, NULL};
  isl_ast_print_options *opts;

  /* ---- Core function ---- */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = print_module_core_headers_xilinx(p, prog, module, hls, -1, boundary, 0, 1);
  fprintf(hls->kernel_c, " {\n");
  /* Only the boundary variant of a wrapped module is inlined. */
  if (boundary && use_wrapper)
    fprintf(hls->kernel_c, "#pragma HLS INLINE\n");
  else
    fprintf(hls->kernel_c, "#pragma HLS INLINE OFF\n");

  /* Variable declarations. */
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  if (!prog->scop->options->autosa->use_cplusplus_template) {
    p = print_module_iterators(p, hls->kernel_c, module);
  }
  if (prog->scop->options->autosa->block_sparse) {
    /* One "fifo_data_<array>" variable per external-array I/O group. */
    for (int g = 0; g < module->n_io_group; g++) {
      struct autosa_array_ref_group *io_group = module->io_groups[g];

      if (io_group->local_array->array_type != AUTOSA_EXT_ARRAY)
        continue;

      int lanes = get_io_group_n_lane(module, NULL, io_group);
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, io_group->array->name);
      p = isl_printer_print_str(p,
                                io_group->local_array->is_sparse ? "_s_t" : "_t");
      p = isl_printer_print_int(p, lanes);
      p = isl_printer_print_str(p, " fifo_data_");
      p = isl_printer_print_str(p, io_group->array->name);
      p = isl_printer_print_str(p, ";");
      p = isl_printer_end_line(p);
    }
  }
  p = print_module_vars_xilinx(p, module, -1);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  /* Output-side credit: a token is written before the body. */
  if (module->credit && !module->in && hls->target == XILINX_HW) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "credit.write(1);");
    p = isl_printer_end_line(p);
  }

  /* Body, printed from the AST with the module-specific callbacks. */
  opts = isl_ast_print_options_alloc(isl_printer_get_ctx(p));
  opts = isl_ast_print_options_set_print_user(opts, &print_module_stmt, &stmt_ctx);
  if (hls->target == XILINX_HW)
    opts = isl_ast_print_options_set_print_for(opts, &print_for_xilinx, &stmt_ctx);

  p = isl_ast_node_print(boundary ? module->boundary_tree : module->device_tree,
                         p, opts);

  /* Input-side credit: a token is read after the body. */
  if (module->credit && module->in && hls->target == XILINX_HW) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "int token = credit.read();");
    p = isl_printer_end_line(p);
  }

  p = isl_printer_indent(p, -2);

  fprintf(hls->kernel_c, "}\n");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = isl_printer_end_line(p);

  /* ---- Wrapper function ---- */
  if (use_wrapper && hls->target == XILINX_HW) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "/* Module Definition */");
    p = isl_printer_end_line(p);

    print_module_wrapper_headers_xilinx(prog, module, hls, -1, boundary);

    fprintf(hls->kernel_c, " {\n");
    p = isl_printer_indent(p, 2);

    /* The wrapper body is a single call to the core function. */
    p = print_module_core_headers_xilinx(p, prog, module, hls, -1, boundary, 0, 0);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);

    p = isl_printer_indent(p, -2);
    fprintf(hls->kernel_c, "}\n");
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "/* Module Definition */");
    p = isl_printer_end_line(p);

    p = isl_printer_end_line(p);
  }

  /* ---- Optional serialize module ---- */
  if (module->to_mem && module->options->autosa->host_serialize)
    p = autosa_print_serialize_module(p, module, prog, hls, boundary);

  return p;
}

/* Print "<name>(<arguments>)" for the core function of a PE dummy module,
 * preceded by "void " when "types" is non-zero.
 *
 * <name> is the array name, followed by
 *   - "_<nr>" for an I/O group when the array has more than one I/O group,
 *   - "_drain" for a drain group,
 * then "_PE_dummy" and "_in" or "_out" depending on module->in.
 * "types" is also handed on to print_pe_dummy_module_arguments().
 * No end-of-line is printed.
 */
static __isl_give isl_printer *print_pe_dummy_module_core_header_xilinx(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_pe_dummy_module *module, int types)
{
  struct autosa_array_ref_group *io_group = module->io_group;

  p = isl_printer_start_line(p);
  if (types)
    p = isl_printer_print_str(p, "void ");

  /* Function name. */
  p = isl_printer_print_str(p, io_group->array->name);
  if (io_group->group_type == AUTOSA_IO_GROUP &&
      io_group->local_array->n_io_group > 1)
  {
    p = isl_printer_print_str(p, "_");
    p = isl_printer_print_int(p, io_group->nr);
  }
  else if (io_group->group_type == AUTOSA_DRAIN_GROUP)
  {
    p = isl_printer_print_str(p, "_");
    p = isl_printer_print_str(p, "drain");
  }
  p = isl_printer_print_str(p, "_PE_dummy");
  if (module->in)
    p = isl_printer_print_str(p, "_in");
  else
    p = isl_printer_print_str(p, "_out");

  /* Argument list. */
  p = isl_printer_print_str(p, "(");
  p = print_pe_dummy_module_arguments(p, prog, module->module->kernel,
                                      module, types, XILINX_HW);
  return isl_printer_print_str(p, ")");
}

/* Print the core header of a PE dummy module by forwarding to
 * print_pe_dummy_module_core_header_xilinx(). "hls" is not used here.
 */
static __isl_give isl_printer *print_pe_dummy_module_core_headers_xilinx(
    __isl_take isl_printer *p, struct autosa_prog *prog,
    struct autosa_pe_dummy_module *module, struct hls_info *hls, int types)
{
  return print_pe_dummy_module_core_header_xilinx(p, prog, module, types);
}

/* Print "void <name>_wrapper(<typed arguments>)" for a PE dummy module.
 *
 * <name> is built exactly as for the core function: the array name,
 * followed by "_<nr>" (I/O group of an array with several I/O groups) or
 * "_drain" (drain group), then "_PE_dummy" and "_in" or "_out".
 * No end-of-line is printed.
 */
static __isl_give isl_printer *print_pe_dummy_module_wrapper_header_xilinx(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_pe_dummy_module *module)
{
  struct autosa_array_ref_group *io_group = module->io_group;

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "void ");

  /* Function name. */
  p = isl_printer_print_str(p, io_group->array->name);
  if (io_group->group_type == AUTOSA_IO_GROUP &&
      io_group->local_array->n_io_group > 1)
  {
    p = isl_printer_print_str(p, "_");
    p = isl_printer_print_int(p, io_group->nr);
  }
  else if (io_group->group_type == AUTOSA_DRAIN_GROUP)
  {
    p = isl_printer_print_str(p, "_");
    p = isl_printer_print_str(p, "drain");
  }
  p = isl_printer_print_str(p, "_PE_dummy");
  if (module->in)
    p = isl_printer_print_str(p, "_in");
  else
    p = isl_printer_print_str(p, "_out");
  p = isl_printer_print_str(p, "_wrapper");

  /* Argument list, always with types. */
  p = isl_printer_print_str(p, "(");
  p = print_pe_dummy_module_arguments(p, prog, module->module->kernel,
                                      module, 1, XILINX_HW);
  return isl_printer_print_str(p, ")");
}

/* Write the wrapper header of a PE dummy module to both output files:
 * as a prototype (terminated by ";") to hls->kernel_h and as the first line
 * of the definition to hls->kernel_c.
 * Each file gets its own short-lived printer. Always returns isl_stat_ok.
 */
static isl_stat print_pe_dummy_module_wrapper_headers_xilinx(
    struct autosa_prog *prog, struct autosa_pe_dummy_module *module,
    struct hls_info *hls)
{
  /* Prototype in the header file. */
  isl_printer *decl = isl_printer_to_file(prog->ctx, hls->kernel_h);
  decl = isl_printer_set_output_format(decl, ISL_FORMAT_C);
  decl = print_pe_dummy_module_wrapper_header_xilinx(decl, prog, module);
  decl = isl_printer_print_str(decl, ";");
  decl = isl_printer_end_line(decl);
  isl_printer_free(decl);

  /* Start of the definition in the source file. */
  isl_printer *defn = isl_printer_to_file(prog->ctx, hls->kernel_c);
  defn = isl_printer_set_output_format(defn, ISL_FORMAT_C);
  defn = print_pe_dummy_module_wrapper_header_xilinx(defn, prog, module);
  defn = isl_printer_end_line(defn);
  isl_printer_free(defn);

  return isl_stat_ok;
}

/* Print the definition of a PE dummy module.
 *
 * The core function is bracketed by two "Module Definition" marker comments
 * and its body is generated from pe_dummy_module->device_tree. Statement and
 * loop printing is delegated to print_module_stmt and (Xilinx target)
 * print_for_xilinx. Module iterators are declared unless the
 * use_cplusplus_template option is set.
 *
 * "boundary" is not used by this function.
 */
static __isl_give isl_printer *autosa_print_default_pe_dummy_module(
    __isl_take isl_printer *p,
    struct autosa_pe_dummy_module *pe_dummy_module,
    struct autosa_prog *prog, struct hls_info *hls, int boundary)
{
  /* For dummy modules the wrapper is disabled by default due to its
   * relatively high overhead; the wrapper path below is kept for reference.
   */
  const bool emit_wrapper = 0;
  struct autosa_hw_module *hw_module = pe_dummy_module->module;
  struct print_hw_module_data stmt_ctx = {hls, prog, hw_module};
  isl_ast_print_options *opts;

  /* ---- Core function ---- */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  if (hls->target == XILINX_HW)
    p = print_pe_dummy_module_core_headers_xilinx(p, prog,
                                                  pe_dummy_module, hls, 1);

  fprintf(hls->kernel_c, " {\n");
  if (emit_wrapper)
    fprintf(hls->kernel_c, "#pragma HLS INLINE\n");

  /* Variable declarations. */
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "/* Variable Declaration */");
  if (!prog->scop->options->autosa->use_cplusplus_template) {
    p = print_module_iterators(p, hls->kernel_c, hw_module);
  }
  p = print_str_new_line(p, "/* Variable Declaration */");

  p = isl_printer_end_line(p);

  /* Body. */
  opts = isl_ast_print_options_alloc(isl_printer_get_ctx(p));
  opts = isl_ast_print_options_set_print_user(opts, &print_module_stmt, &stmt_ctx);
  if (hls->target == XILINX_HW)
    opts = isl_ast_print_options_set_print_for(opts, &print_for_xilinx, &stmt_ctx);

  p = isl_ast_node_print(pe_dummy_module->device_tree, p, opts);

  p = isl_printer_indent(p, -2);

  fprintf(hls->kernel_c, "}\n");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  p = isl_printer_end_line(p);

  /* ---- Wrapper function (currently never emitted) ---- */
  if (emit_wrapper && hls->target == XILINX_HW)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "/* Module Definition */");
    p = isl_printer_end_line(p);

    print_pe_dummy_module_wrapper_headers_xilinx(prog, pe_dummy_module, hls);

    fprintf(hls->kernel_c, " {\n");
    p = isl_printer_indent(p, 2);
    /* The wrapper body is a single call to the core function. */
    p = print_pe_dummy_module_core_headers_xilinx(p, prog, pe_dummy_module, hls, 0);
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
    p = isl_printer_indent(p, -2);
    fprintf(hls->kernel_c, "}\n");
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "/* Module Definition */");
    p = isl_printer_end_line(p);

    p = isl_printer_end_line(p);
  }

  return p;
}

/* Scratch data shared by the callbacks that turn the for loops of a double
 * buffer module into the counter logic of a single while loop.
 *
 * The "outer", "inter" and "intra" groups hold the same kind of information
 * for the three ASTs that are traversed; "inter" selects the group that the
 * callbacks currently fill in.
 * All char * entries are heap strings obtained from isl_printer_get_str();
 * they are released with free() by the functions that print them.
 */
struct print_db_module_while_data {
  int inter; // -1: outer 0: intra 1: inter  
  /* Set to 1 by count_module_for_alt once an if node has been visited. */
  int under_if; 
  /* Set to 1 by count_module_for_alt once a user node has been visited. */
  int reach_user;

  /* String printer used by extract_module_for to format the counter lines. */
  isl_printer *p_for;
  /* String printer that is allocated and freed together with p_for. */
  isl_printer *p_user;
  /* Outer */
  /* Counter update lines (increment, wrap test, reset), innermost loop first. */
  std::vector<char *> outer_for_logic;  
  /* Iterator name, lower bound and upper bound of each loop, in visit order. */
  std::vector<char *> outer_iterator_name;
  std::vector<char *> outer_iterator_lb;
  std::vector<char *> outer_iterator_ub;
  /* Number of for nodes counted. */
  int outer_for_level;
  /* Inter */
  std::vector<char *> inter_for_logic;  
  std::vector<char *> inter_iterator_name;
  std::vector<char *> inter_iterator_lb;
  std::vector<char *> inter_iterator_ub;  
  int inter_for_level;
  /* Intra */
  std::vector<char *> intra_for_logic;  
  std::vector<char *> intra_iterator_name;
  std::vector<char *> intra_iterator_lb;
  std::vector<char *> intra_iterator_ub;
  int intra_for_level;
};

/* Print one "int <name> = <lb>;" declaration per recorded loop iterator,
 * each followed by a comment that carries the upper bound.
 *
 * The three vectors are indexed in parallel (one entry per loop).
 * Every string is released with free() once it has been printed and its
 * slot is reset to NULL, so that no dangling pointer is left behind in the
 * vectors.
 */
static __isl_give isl_printer *print_db_while_iterator_decls(
  __isl_take isl_printer *p, std::vector<char *> &names,
  std::vector<char *> &lbs, std::vector<char *> &ubs)
{
  for (size_t i = 0; i < names.size(); i++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "int ");
    p = isl_printer_print_str(p, names[i]);
    p = isl_printer_print_str(p, " = ");
    p = isl_printer_print_str(p, lbs[i]);
    p = isl_printer_print_str(p, "; ");
    p = isl_printer_print_str(p, "/* UB: ");
    p = isl_printer_print_str(p, ubs[i]);
    p = isl_printer_print_str(p, " */");
    p = isl_printer_end_line(p);

    free(names[i]);
    names[i] = NULL;
    free(lbs[i]);
    lbs[i] = NULL;
    free(ubs[i]);
    ubs[i] = NULL;
  }

  return p;
}

/* Print the variable declarations of a double buffer module that is
 * generated as a single while loop:
 *   - the module iterators (unless the use_cplusplus_template option is set),
 *   - the local buffers, each with an extra leading dimension of size 2,
 *   - the state variables (arb, inter/intra_trans_en, inter/intra_done),
 *     whose initial values depend on module->in,
 *   - one int counter per loop recorded in "data" (outer, inter, intra),
 *   - the last_run flag.
 *
 * The iterator name/lb/ub strings stored in "data" are consumed: they are
 * freed here and must not be used afterwards.
 */
static __isl_give isl_printer *print_double_buffer_module_vars_while(
  __isl_take isl_printer *p, struct autosa_hw_module *module, 
  struct hls_info *hls,
  struct print_db_module_while_data *data)
{
  /* Inst ids */
  if (!module->options->autosa->use_cplusplus_template) {
    p = print_module_iterators(p, hls->kernel_c, module);
  }

  /* Local buffer */
  for (int i = 0; i < module->n_var; i++) {
    struct autosa_kernel_var *var = &module->var[i];

    p = isl_printer_start_line(p);
    /* Element type: the scalar type, or the packed "<array>_t<n_lane>". */
    if (var->n_lane == 1) {
      p = isl_printer_print_str(p, var->array->type);
    } else {
      p = isl_printer_print_str(p, var->array->name);
      p = isl_printer_print_str(p, "_t");
      p = isl_printer_print_int(p, var->n_lane);
    }
    p = isl_printer_print_str(p, " ");
    p = isl_printer_print_str(p, var->name);
    /* Extra leading dimension for the two halves of the double buffer. */
    p = isl_printer_print_str(p, "[2]");
    for (int j = 0; j < isl_vec_size(var->size); j++) {
      isl_val *v;

      p = isl_printer_print_str(p, "[");
      v = isl_vec_get_element_val(var->size, j);
      p = isl_printer_print_val(p, v);
      isl_val_free(v);
      p = isl_printer_print_str(p, "]");
    }
    p = isl_printer_print_str(p, ";");
    p = isl_printer_end_line(p);
  }

  /* State handle variables */
  p = print_str_new_line(p, "bool arb = 0;");
  p = print_str_new_line(p, module->in? "bool inter_trans_en = 1;" : "bool inter_trans_en = 0;");
  p = print_str_new_line(p, module->in? "bool intra_trans_en = 0;" : "bool intra_trans_en = 1;");
  p = print_str_new_line(p, module->in? "bool inter_done = 0;" : "bool inter_done = 1;");
  p = print_str_new_line(p, module->in? "bool intra_done = 1;" : "bool intra_done = 0;");

  /* Iterators */
  p = print_db_while_iterator_decls(p, data->outer_iterator_name,
                                    data->outer_iterator_lb,
                                    data->outer_iterator_ub);
  p = print_db_while_iterator_decls(p, data->inter_iterator_name,
                                    data->inter_iterator_lb,
                                    data->inter_iterator_ub);
  p = print_db_while_iterator_decls(p, data->intra_iterator_name,
                                    data->intra_iterator_lb,
                                    data->intra_iterator_ub);

  p = print_str_new_line(p, "bool last_run = false;");

  return p;
}

/* print_for callback that only counts loops.
 * The counter selected by data->inter (-1: outer, 0: intra, 1: inter) is
 * incremented for the visited for node. The loop itself is not printed;
 * only its body is handed back to the AST printer so that nested loops are
 * visited as well.
 */
static __isl_give isl_printer *count_module_for(__isl_take isl_printer *p,
                                                __isl_take isl_ast_print_options *print_options,
                                                __isl_keep isl_ast_node *node, void *user)
{
  struct print_db_module_while_data *counters =
      (struct print_db_module_while_data *)user;

  switch (counters->inter) {
  case -1:
    counters->outer_for_level++;
    break;
  case 0:
    counters->intra_for_level++;
    break;
  case 1:
    counters->inter_for_level++;
    break;
  default:
    break;
  }

  isl_ast_node *loop_body = isl_ast_node_for_get_body(node);
  p = isl_ast_node_print(loop_body, p, print_options);
  isl_ast_node_free(loop_body);

  return p;
}

/* Alternative loop counter, written as a top-down descendant visitor.
 * Currently only used for the inter_trans module.
 *
 * Because if branches may be present, only the loops of one branch are
 * counted: once an if node has been seen, for nodes are counted only until
 * the first user node is reached. The two branches are assumed to have the
 * same loop depth.
 * Always returns isl_bool_true, so the traversal continues into the
 * children of every node.
 */
static isl_bool count_module_for_alt(__isl_keep isl_ast_node *node, void *user) {
  struct print_db_module_while_data *state =
      (struct print_db_module_while_data *)user;

  switch (isl_ast_node_get_type(node)) {
  case isl_ast_node_if:
    state->under_if = 1;
    break;
  case isl_ast_node_for:
    if (state->under_if == 0 ||
        (state->under_if == 1 && state->reach_user == 0)) {
      state->inter_for_level++;
    }
    break;
  case isl_ast_node_user:
    state->reach_user = 1;
    break;
  default:
    break;
  }

  return isl_bool_true;
}

/* Extract the loop information. 
 *
 * print_for callback: "user" points to a struct print_db_module_while_data.
 * For the visited for node, the iterator name, the lower bound (init
 * expression) and the upper bound (second argument of the loop condition)
 * are stored as strings in the vectors selected by data->inter
 * (-1: outer, 0: intra, 1: inter).
 * In addition three text lines implementing the loop as a counter are
 * generated with data->p_for
 *   <it>++;
 *   if (<it> == <ub> + 1) {
 *       <it> = <init>;
 * and inserted at the FRONT of the matching *_for_logic vector, so that the
 * lines of inner loops end up before those of the enclosing loops.
 * The for statement itself is not printed to "p"; only the loop body is
 * handed back to the AST printer.
 *
 * The indentation of data->p_for is lowered by 4 on entry, raised by 4 for
 * the reset line and lowered by 4 again at the end, i.e. every call leaves
 * it 4 columns further left for the next (inner) loop.
 *
 * NOTE(review): the wrap test compares against "<ub> + 1", which presumably
 * relies on the loop condition having the form "<it> <= <ub>" - confirm.
 */
static __isl_give isl_printer *extract_module_for(__isl_take isl_printer *p,
                                                  __isl_take isl_ast_print_options *print_options,
                                                  __isl_keep isl_ast_node *node, void *user)
{
  struct print_db_module_while_data *data = (struct print_db_module_while_data *)user;
  isl_ast_expr *iterator, *init, *cond, *ub;  
  const char *iterator_suffix;
  isl_printer *p_local, *p_str;  
  char *text;
  std::vector<char *> text_lines;
  isl_ast_node *body;

//  if (data->inter == -1)
//    iterator_suffix = "outer_";
//  else if (data->inter == 0)
//    iterator_suffix = "intra_";
//  else
//    iterator_suffix = "inter_";
  p_local = data->p_for;  

  /* Extract the lower bound and upper bound. */
  iterator = isl_ast_node_for_get_iterator(node);
  init = isl_ast_node_for_get_init(node);
  cond = isl_ast_node_for_get_cond(node);
  ub = isl_ast_expr_op_get_arg(cond, 1);

  /* Temporary string printer; it is flushed between the three strings so
   * that each isl_printer_get_str() call yields one expression only. */
  p_str = isl_printer_to_str(isl_ast_node_get_ctx(node));
  p_str = isl_printer_set_output_format(p_str, ISL_FORMAT_C);
  //p_str = isl_printer_print_str(p_str, iterator_suffix);
  /* Iterator name. */
  p_str = isl_printer_print_ast_expr(p_str, iterator);
  if (data->inter == -1)
    data->outer_iterator_name.push_back(isl_printer_get_str(p_str));
  else if (data->inter == 0)
    data->intra_iterator_name.push_back(isl_printer_get_str(p_str));
  else if (data->inter == 1)
    data->inter_iterator_name.push_back(isl_printer_get_str(p_str));
  isl_printer_flush(p_str);

  /* Upper bound. */
  p_str = isl_printer_print_ast_expr(p_str, ub);
  if (data->inter == -1)
    data->outer_iterator_ub.push_back(isl_printer_get_str(p_str));
  else if (data->inter == 0)
    data->intra_iterator_ub.push_back(isl_printer_get_str(p_str));
  else if (data->inter == 1)
    data->inter_iterator_ub.push_back(isl_printer_get_str(p_str));
  isl_printer_flush(p_str);

  /* Lower bound. */
  p_str = isl_printer_print_ast_expr(p_str, init);
  if (data->inter == -1)
    data->outer_iterator_lb.push_back(isl_printer_get_str(p_str));
  else if (data->inter == 0)
    data->intra_iterator_lb.push_back(isl_printer_get_str(p_str));
  else if (data->inter == 1)
    data->inter_iterator_lb.push_back(isl_printer_get_str(p_str));
  isl_printer_free(p_str);

  p_local = isl_printer_indent(p_local, -4);

  /* Line 1: "<it>++;" */
  p_local = isl_printer_start_line(p_local);  
  //p_local = isl_printer_print_str(p_local, iterator_suffix);  
  p_local = isl_printer_print_ast_expr(p_local, iterator);
  p_local = isl_printer_print_str(p_local, "++;");
  p_local = isl_printer_end_line(p_local);
  text = isl_printer_get_str(p_local);
  text_lines.push_back(text);
  p_local = isl_printer_flush(p_local);

  /* Line 2: "if (<it> == <ub> + 1) {" */
  p_local = isl_printer_start_line(p_local);
  p_local = isl_printer_print_str(p_local, "if (");
  //p_local = isl_printer_print_str(p_local, iterator_suffix);  
  p_local = isl_printer_print_ast_expr(p_local, iterator);
  p_local = isl_printer_print_str(p_local, " == "); 
  p_local = isl_printer_print_ast_expr(p_local, ub);
  p_local = isl_printer_print_str(p_local, " + 1) {"); 
  p_local = isl_printer_end_line(p_local);
  text = isl_printer_get_str(p_local);
  text_lines.push_back(text);
  p_local = isl_printer_flush(p_local);

  /* Line 3 (inside the if): "<it> = <init>;" */
  p_local = isl_printer_indent(p_local, 4);
  p_local = isl_printer_start_line(p_local);  
  //p_local = isl_printer_print_str(p_local, iterator_suffix);
  p_local = isl_printer_print_ast_expr(p_local, iterator);
  p_local = isl_printer_print_str(p_local, " = ");
  p_local = isl_printer_print_ast_expr(p_local, init);
  p_local = isl_printer_print_str(p_local, ";");
  p_local = isl_printer_end_line(p_local);
  text = isl_printer_get_str(p_local);
  text_lines.push_back(text);
  p_local = isl_printer_flush(p_local);

  /* Prepend, so that inner loops come before the enclosing ones. */
  if (data->inter == -1)
    data->outer_for_logic.insert(data->outer_for_logic.begin(), text_lines.begin(), text_lines.end());
  else if (data->inter == 0)
    data->intra_for_logic.insert(data->intra_for_logic.begin(), text_lines.begin(), text_lines.end());
  else if (data->inter == 1)
    data->inter_for_logic.insert(data->inter_for_logic.begin(), text_lines.begin(), text_lines.end());

  isl_ast_expr_free(iterator);
  isl_ast_expr_free(init);
  isl_ast_expr_free(cond);
  isl_ast_expr_free(ub);

  p_local = isl_printer_indent(p_local, -4);

  /* Continue with the loop body; the for statement itself is dropped. */
  body = isl_ast_node_for_get_body(node);
  p = isl_ast_node_print(body, p, print_options);
  isl_ast_node_free(body);

  return p;
}    

static void extract_double_buffer_module_while_data(
  struct autosa_hw_module *module, int boundary, 
  struct print_db_module_while_data *data)
{
  isl_ast_print_options *print_options;
  isl_ctx *ctx = module->kernel->ctx;
  isl_printer *p_for, *p_user, *p;
  const char *for_logic, *user_logic;

  /* Outer module */
  data->inter = -1;
  p = isl_printer_to_str(ctx);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  p_for = isl_printer_to_str(ctx);
  p_for = isl_printer_set_output_format(p_for, ISL_FORMAT_C);
  p_user = isl_printer_to_str(ctx);
  p_user = isl_printer_set_output_format(p_user, ISL_FORMAT_C);
  data->p_for = p_for;
  data->p_user = p_user;
  data->outer_for_level = 0;

  /* Count the for level first. */
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &count_module_for, data);
  if (!boundary)
    p = isl_ast_node_print(module->device_tree, p, print_options);
  else
    p = isl_ast_node_print(module->boundary_tree, p, print_options);  

  /* Extract the for and user logic. */
  data->p_for = isl_printer_indent(data->p_for, 4 * data->outer_for_level);
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &extract_module_for, data);
  if (!boundary)
    p = isl_ast_node_print(module->device_tree, p, print_options);
  else
    p = isl_ast_node_print(module->boundary_tree, p, print_options);
  isl_printer_free(p);  
  isl_printer_free(data->p_for);
  isl_printer_free(data->p_user);

  /* Intra module */
  data->inter = 0;
  p = isl_printer_to_str(ctx);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  p_for = isl_printer_to_str(ctx);
  p_for = isl_printer_set_output_format(p_for, ISL_FORMAT_C);
  p_user = isl_printer_to_str(ctx);
  p_user = isl_printer_set_output_format(p_user, ISL_FORMAT_C);
  data->p_for = p_for;
  data->p_user = p_user;
  data->intra_for_level = 0;

  /* Count the for level first. */
  print_options = isl_ast_print_options_alloc(ctx);  
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &count_module_for, data);
  p = isl_ast_node_print(module->intra_tree, p, print_options);  

  /* Extract the for logic. */
  data->p_for = isl_printer_indent(data->p_for, 4 * data->intra_for_level);
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &extract_module_for, data);  
  p = isl_ast_node_print(module->intra_tree, p, print_options);  
  isl_printer_free(p);  
  isl_printer_free(data->p_for);
  isl_printer_free(data->p_user);

  /* Inter module */
  data->inter = 1;
  data->under_if = 0;
  data->reach_user = 0;
  p = isl_printer_to_str(ctx);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);
  p_for = isl_printer_to_str(ctx);
  p_for = isl_printer_set_output_format(p_for, ISL_FORMAT_C);
  p_user = isl_printer_to_str(ctx);
  p_user = isl_printer_set_output_format(p_user, ISL_FORMAT_C);
  data->p_for = p_for;
  data->p_user = p_user;
  data->inter_for_level = 0;

  /* Count the for level first. */
  if (!boundary) {
    isl_ast_node_foreach_descendant_top_down(module->inter_tree, &count_module_for_alt, data);
  } else {        
    isl_ast_node_foreach_descendant_top_down(module->boundary_inter_tree, &count_module_for_alt, data);
  }  

  /* Extract the for logic. */
  data->p_for = isl_printer_indent(data->p_for, 4 * data->inter_for_level);
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_for(print_options,
                                                      &extract_module_for, data);
  if (!boundary)
    p = isl_ast_node_print(module->inter_tree, p, print_options);
  else
    p = isl_ast_node_print(module->boundary_inter_tree, p, print_options);
  isl_printer_free(p);  
  isl_printer_free(data->p_for);
  isl_printer_free(data->p_user);
}

/* print_for callback that drops the for statement: nothing is printed for
 * the loop itself, only its body is passed on to the AST printer.
 * "user" is not used.
 */
static __isl_give isl_printer *print_null_for(__isl_take isl_printer *p,
                                              __isl_take isl_ast_print_options *print_options,
                                              __isl_keep isl_ast_node *node, void *user)
{
  isl_ast_node *loop_body = isl_ast_node_for_get_body(node);

  p = isl_ast_node_print(loop_body, p, print_options);
  isl_ast_node_free(loop_body);

  return p;
}

/* Print the statements of the inter_trans part of a double buffer module.
 *
 * The AST is module->inter_tree, or module->boundary_inter_tree when
 * "boundary" is non-zero. For statements are suppressed (print_null_for),
 * user statements are printed by print_module_stmt, which receives "inter_c"
 * as fourth field of its data. An end-of-line is printed after the AST.
 */
static __isl_give isl_printer *autosa_print_inter_trans_module_double_buffer(
  __isl_take isl_printer *p,
  struct autosa_hw_module *module, struct autosa_prog *prog,
  struct hls_info *hls, int boundary)
{
  struct print_hw_module_data stmt_ctx = {hls, prog, module, "inter_c"};
  isl_ast_print_options *opts;

  opts = isl_ast_print_options_alloc(isl_printer_get_ctx(p));
  opts = isl_ast_print_options_set_print_user(opts, &print_module_stmt, &stmt_ctx);
  opts = isl_ast_print_options_set_print_for(opts, &print_null_for, &stmt_ctx);

  if (boundary == 0)
    p = isl_ast_node_print(module->inter_tree, p, opts);
  else
    p = isl_ast_node_print(module->boundary_inter_tree, p, opts);

  return isl_printer_end_line(p);
}

/* Print the statements of the intra_trans part of a double buffer module.
 *
 * The AST is module->intra_tree ("boundary" is not used). For statements are
 * suppressed (print_null_for), user statements are printed by
 * print_module_stmt, which receives "intra_c" as fourth field of its data.
 * An end-of-line is printed after the AST.
 */
static __isl_give isl_printer *autosa_print_intra_trans_module_double_buffer(
  __isl_take isl_printer *p,
  struct autosa_hw_module *module, struct autosa_prog *prog,
  struct hls_info *hls, int boundary)
{
  struct print_hw_module_data stmt_ctx = {hls, prog, module, "intra_c"};
  isl_ast_print_options *opts;

  opts = isl_ast_print_options_alloc(isl_printer_get_ctx(p));
  opts = isl_ast_print_options_set_print_user(opts, &print_module_stmt, &stmt_ctx);
  opts = isl_ast_print_options_set_print_for(opts, &print_null_for, &stmt_ctx);

  p = isl_ast_node_print(module->intra_tree, p, opts);

  return isl_printer_end_line(p);
}

/* Print the double buffer module using while loops instead of for loops.
 * First, we will change the buffer to 
 * local_buffer[2][...][...].
 * 
 * Specifically, when handling a code structure:
 * [outer for loops]
 * for ...
 *   for ...
 * [outer for loops]
 * { 
 *   if (arb) {
 *     ld(local_buffer_ping, ld_en);
 *     st(local_buffer_pong, st_en);
 *   } else {
 *     ld(local_buffer_pong, ld_en);
 *     st(local_buffer_ping, st_en);
 *   }
 *   [state handle logic]
 *   arb = !arb;
 *   [state handle logic]
 * }
 * [last batch]
 * if (arb) {
 *   st(local_buffer_pong, st_en);
 * } else {
 *   st(local_buffer_ping, st_en);
 * }
 * [last batch]
 * We will convert it to a new code structure:
 * while (1) {
 *   if (ld_en) {
 *     [inlined logic]
 *     ld(local_buffer[arb][...]);
 *     [inlined logic]
 *   } 
 *   if (st_en) {
 *     [inlined logic]
 *     st(local_buffer[!arb][...]);
 *     [inlined logic]
 *   }
 *   [state handle logic]
 *   arb = !arb;
 *   ld_en = 1;
 *   st_en = 1;
 *   [state handle logic]
 *   [outer for loops]
 *   outer_iter0++;
 *   if (outer_iter0 == ...) {
 *     outer_iter0 = 0;
 *     [last batch]
 *     ld_en = 0;
 *     [last batch]
 *   }
 *   [outer for loops]
 * }
 * 
 * Note that this only works if each for loop structure is a perfectly 
 * nested loop so that we could convert to a while loop.
 *
 * In the generated code the enable flags are called inter_trans_en and
 * intra_trans_en, and completion is tracked with inter_done / intra_done.
 *
 * Nothing is printed (and "p" is returned unchanged) when the AST selected
 * by "boundary" (module->boundary_tree or module->device_tree) is missing.
 * If module->to_mem is set and the host_serialize option is on, the
 * serialize module is printed after the module itself.
 *
 * NOTE(review): the counter lines produced by extract_module_for are
 * indented by 4 columns per loop level, whereas the closing braces printed
 * here step back by 2 columns per level, so the printer indentation does not
 * return to its previous value after each section. This looks like it only
 * affects the layout of the generated code - confirm.
 */
static __isl_give isl_printer *print_double_buffer_module_while(
  __isl_take isl_printer *p, struct autosa_hw_module *module,
  struct autosa_prog *prog, struct hls_info *hls, int boundary)
{
  if (!boundary) {
    if (!module->device_tree)
      return p;    
  } else {
    if (!module->boundary_tree)
      return p;
  }

  struct print_db_module_while_data print_data;

  /* Extract the code snippets. */
  extract_double_buffer_module_while_data(module, boundary, &print_data);

  /* Print header */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  print_module_headers_xilinx(prog, module, hls, -1, boundary);
  p = print_str_new_line(p, "{");
  p = isl_printer_indent(p, 2);

  /* Print variables */
  /* Note: this call also frees the iterator name/bound strings in print_data. */
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = print_double_buffer_module_vars_while(p, module, hls, &print_data);
  p = print_str_new_line(p, "/* Variable Declaration */");
  p = isl_printer_end_line(p);

  /* Print content */
  p = print_str_new_line(p, "while (1) {");
  p = print_str_new_line(p, "#pragma HLS PIPELINE II=1");
  p = isl_printer_indent(p, 2);
  
  /* Print inter_trans */
  p = print_str_new_line(p, "if (inter_trans_en) {");
  p = isl_printer_indent(p, 2);
  /* Print the module logic */
  p = autosa_print_inter_trans_module_double_buffer(p, module, prog, hls, boundary);
  /* Print the loop counter */  
  /* Each stored line already carries its own indentation and line break;
   * the string is released right after it has been printed. */
  for (int i = 0; i < print_data.inter_for_logic.size(); i++) {    
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, print_data.inter_for_logic[i]);
    free(print_data.inter_for_logic[i]);
  }
  /* Innermost position: all inter loops have wrapped around. */
  p = isl_printer_indent(p, 4 * print_data.inter_for_level);
  p = print_str_new_line(p, "inter_done = 1;");
  p = print_str_new_line(p, "inter_trans_en = 0;");
  /* Close the "if (... == ub + 1) {" opened for every loop level. */
  for (int i = 0; i < print_data.inter_for_level; i++) {
    p = isl_printer_indent(p, -2);
    p = print_str_new_line(p, "}");
  }
  
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  /* Print intra_trans */
  p = print_str_new_line(p, "if (intra_trans_en) {");
  p = isl_printer_indent(p, 2);
  /* Print the module logic */
  p = autosa_print_intra_trans_module_double_buffer(p, module, prog, hls, boundary);
  /* Print the loop counter */
  for (int i = 0; i < print_data.intra_for_logic.size(); i++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, print_data.intra_for_logic[i]);
    free(print_data.intra_for_logic[i]);
  }
  /* Innermost position: all intra loops have wrapped around. */
  p = isl_printer_indent(p, 4 * print_data.intra_for_level);
  p = print_str_new_line(p, "intra_done = 1;");
  p = print_str_new_line(p, "intra_trans_en = 0;");
  for (int i = 0; i < print_data.intra_for_level; i++) {
    p = isl_printer_indent(p, -2);
    p = print_str_new_line(p, "}");
  }

  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  /* Print state_handle */
  /* Runs once both transfers of the current round are done: leave the loop
   * after the last round, otherwise re-arm both transfers, swap the buffers
   * and advance the outer loop counters. */
  p = print_str_new_line(p, "if (inter_done && intra_done) {");
  p = isl_printer_indent(p, 2);
  p = print_str_new_line(p, "if (last_run) break;");
  p = print_str_new_line(p, "intra_trans_en = 1;");
  p = print_str_new_line(p, "inter_trans_en = 1;");
  p = print_str_new_line(p, "intra_done = 0;");
  p = print_str_new_line(p, "inter_done = 0;");
  p = print_str_new_line(p, "arb = !arb;");
  /* Print the loop counter */
  for (int i = 0; i < print_data.outer_for_logic.size(); i++) {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, print_data.outer_for_logic[i]);
    free(print_data.outer_for_logic[i]);
  }
  /* All outer loops have wrapped around: schedule the last round, in which
   * only one of the two transfers (selected by module->in) stays enabled. */
  p = isl_printer_indent(p, 4 * print_data.outer_for_level);
  p = print_str_new_line(p, module->in? "inter_trans_en = 0;" : "intra_trans_en = 0;");
  p = print_str_new_line(p, module->in? "inter_done = 1;" : "intra_done = 1;");
  p = print_str_new_line(p, "last_run = true;");
  for (int i = 0; i < print_data.outer_for_level; i++) {
    p = isl_printer_indent(p, -2);
    p = print_str_new_line(p, "}");
  }

  /* Close the state handle block. */
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  /* Close the while loop. */
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");

  /* Close the function. */
  p = isl_printer_indent(p, -2);
  p = print_str_new_line(p, "}");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "/* Module Definition */");
  p = isl_printer_end_line(p);

  /* If the module serialization is enabled, we will print out an extra module
   * for serializing the data. */
  if (module->to_mem && module->options->autosa->host_serialize) {
    p = autosa_print_serialize_module(p, module, prog, hls, boundary);
  }

  return p;
}

/* Print the host code for the AST "tree" to "p" and print the definitions
 * of the hardware modules in "modules" to the kernel file "hls->kernel_c".
 *
 * Before the host AST is printed, the helpers for the data pack types, the
 * sparse macros (only when the block_sparse option is set), the drain merge
 * functions and the host serialization functions are called.
 *
 * For each module, either the while-style double buffer module is printed
 * (double_buffer set and double_buffer_style equal to 0), or the default
 * module together with its intra/inter transfer functions (filter+buffer
 * modules only), its boundary variants and its PE dummy modules.
 *
 * Returns the host printer "p". The printer created for the kernel file
 * is freed before returning.
 */
static __isl_give isl_printer *autosa_print_host_code(__isl_take isl_printer *p,
                                                      struct autosa_prog *prog, __isl_keep isl_ast_node *tree,
                                                      struct autosa_hw_module **modules, int n_modules,
                                                      struct autosa_hw_top_module *top,
                                                      struct autosa_drain_merge_func **drain_merge_funcs, int n_drain_merge_funcs,
                                                      struct hls_info *hls)
{
  isl_ast_print_options *print_options;
  isl_ctx *ctx = isl_ast_node_get_ctx(tree);
  struct print_host_user_data data = {hls, prog, top};
  isl_printer *p_module;

  /* Print the data pack types in the program. */
  print_data_types_xilinx(top, hls);

  /* Print the macros for sparse data structure */
  if (prog->scop->options->autosa->block_sparse) {
    print_sparse_macros(top->kernel, hls);
  }

  /* Print the helper functions in the program. */
  print_drain_merge_funcs(top->kernel, drain_merge_funcs, n_drain_merge_funcs, hls);

  /* Print the host data serialization function. */
  print_host_serialize_funcs(top->kernel, modules, n_modules, hls); // TODO

  /* Print the default AST. */
  print_options = isl_ast_print_options_alloc(ctx);
  print_options = isl_ast_print_options_set_print_user(print_options,
                                                       &print_host_user_xilinx, &data);

  /* Print the macros definitions in the program. */
  p = autosa_print_macros(p, tree);
  p = isl_ast_node_print(tree, p, print_options);

  /* Print the hw module ASTs. */
  p_module = isl_printer_to_file(ctx, hls->kernel_c);
  p_module = isl_printer_set_output_format(p_module, ISL_FORMAT_C);

  for (int i = 0; i < n_modules; i++)
  {
    struct autosa_hw_module *module = modules[i];

    if (module->double_buffer && module->options->autosa->double_buffer_style == 0)
    {
      /* While-style double buffering replaces the default module body. */
      p_module = print_double_buffer_module_while(p_module, module, prog, hls, 0);
      if (module->boundary)
        p_module = print_double_buffer_module_while(p_module, module, prog, hls, 1);
      continue;
    }

    if (module->is_filter && module->is_buffer)
    {
      /* Print out the definitions for inter_trans and intra_trans function calls. */
      /* Intra transfer function */
      p_module = autosa_print_intra_trans_module(p_module, module, prog, hls, 0);

      /* Inter transfer function */
      p_module = autosa_print_inter_trans_module(p_module, module, prog, hls, 0);
      if (module->boundary)
        p_module = autosa_print_inter_trans_module(p_module, module, prog, hls, 1);
    }

    p_module = autosa_print_default_module(p_module, module, prog, hls, 0);

    /* Print out the definitions for boundary trans function calls. */
    if (module->boundary)
      p_module = autosa_print_default_module(p_module, module, prog, hls, 1);

    /* Print out the definitions for pe dummy function calls. */
    for (int j = 0; j < module->n_pe_dummy_modules; j++)
    {
      p_module = autosa_print_default_pe_dummy_module(
          p_module, module->pe_dummy_modules[j], prog, hls, 0);
    }
  }
  isl_printer_free(p_module);

  return p;
}

/* Print the port name of array "name": the bare name when "idx" is
 * negative, "<name>_<idx>" otherwise.
 */
static __isl_give isl_printer *print_top_gen_port_name(
    __isl_take isl_printer *p, const char *name, int idx)
{
  p = isl_printer_print_str(p, name);
  if (idx >= 0)
  {
    p = isl_printer_print_str(p, "_");
    p = isl_printer_print_int(p, idx);
  }
  return p;
}

/* Print the three generator statements that emit the memory interface
 * pragma of one port: an "axis" pragma when "axi_stream" is set,
 * an "m_axi" pragma otherwise.
 */
static __isl_give isl_printer *print_top_gen_mem_pragma(
    __isl_take isl_printer *p, const char *name, int idx, int axi_stream)
{
  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = isl_printer_start_line(p);
  if (axi_stream)
  {
    p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"#pragma HLS INTERFACE axis port=fifo_");
    p = print_top_gen_port_name(p, name, idx);
    p = isl_printer_print_str(p, " bundle=gmem_");
  }
  else
  {
    p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"#pragma HLS INTERFACE m_axi port=");
    p = print_top_gen_port_name(p, name, idx);
    p = isl_printer_print_str(p, " offset=slave bundle=gmem_");
  }
  p = print_top_gen_port_name(p, name, idx);
  p = isl_printer_print_str(p, "\");");
  p = isl_printer_end_line(p);
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  return p;
}

/* Print the three generator statements that emit the "s_axilite" control
 * pragma of one port.
 */
static __isl_give isl_printer *print_top_gen_s_axilite_pragma(
    __isl_take isl_printer *p, const char *name, int idx)
{
  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"#pragma HLS INTERFACE s_axilite port=");
  p = print_top_gen_port_name(p, name, idx);
  p = isl_printer_print_str(p, " bundle=control\");");
  p = isl_printer_end_line(p);
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  return p;
}

/* Declare the AXI interface for each global pointer.
 *
 * The function prints generator code, i.e. statements of the form
 * "p = isl_printer_print_str(p, "#pragma ...");", in this order:
 *  - one memory pragma per non-scalar array argument (one per I/O group
 *    reference when the array has more than one),
 *  - unless the axi_stream option is set, one s_axilite pragma per array
 *    argument (scalars included),
 *  - one s_axilite pragma per parameter of "kernel->arrays",
 *  - one s_axilite pragma per set dimension of "kernel->space",
 *  - the s_axilite pragma for the return port.
 */
static __isl_give isl_printer *print_top_module_interface_xilinx(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_kernel *kernel)
{
  int n;
  int n_param;
  isl_space *space;
  int axi_stream = prog->scop->options->autosa->axi_stream ? 1 : 0;

  for (int i = 0; i < kernel->n_array; ++i)
  {
    struct autosa_local_array_info *local_array = &kernel->array[i];

    if (!autosa_kernel_requires_array_argument(kernel, i) ||
        autosa_array_is_scalar(local_array->array))
      continue;

    if (local_array->n_io_group_refs > 1)
    {
      for (int j = 0; j < local_array->n_io_group_refs; j++)
        p = print_top_gen_mem_pragma(p, local_array->array->name, j, axi_stream);
    }
    else
    {
      p = print_top_gen_mem_pragma(p, local_array->array->name, -1, axi_stream);
    }
  }

  if (!axi_stream)
  {
    for (int i = 0; i < kernel->n_array; ++i)
    {
      struct autosa_local_array_info *local_array = &kernel->array[i];

      if (!autosa_kernel_requires_array_argument(kernel, i))
        continue;

      if (local_array->n_io_group_refs > 1)
      {
        for (int j = 0; j < local_array->n_io_group_refs; j++)
          p = print_top_gen_s_axilite_pragma(p, local_array->array->name, j);
      }
      else
      {
        p = print_top_gen_s_axilite_pragma(p, local_array->array->name, -1);
      }
    }
  }

  /* The count is kept signed so that a negative (error) result from
   * isl_space_dim, in isl versions that report errors that way, yields
   * no iterations instead of a huge unsigned bound.
   */
  space = isl_union_set_get_space(kernel->arrays);
  n_param = isl_space_dim(space, isl_dim_param);
  for (int i = 0; i < n_param; i++)
  {
    const char *name = isl_space_get_dim_name(space, isl_dim_param, i);
    p = print_top_gen_s_axilite_pragma(p, name, -1);
  }
  isl_space_free(space);

  n = isl_space_dim(kernel->space, isl_dim_set);
  for (int i = 0; i < n; i++)
  {
    const char *name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
    p = print_top_gen_s_axilite_pragma(p, name, -1);
  }

  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = print_str_new_line(p, "p = isl_printer_print_str(p, \"#pragma HLS INTERFACE s_axilite port=return bundle=control\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  return p;
}

/* Print the generator statements that emit the opening of the top function:
 * an optional extern "C" opener (when "hls->hls" is not set), the function
 * signature with the kernel arguments, the opening brace, the interface
 * pragmas (skipped when the hcl option is set) and the DATAFLOW pragma.
 */
static __isl_give isl_printer *print_top_module_headers_xilinx(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, struct autosa_hw_top_module *top, struct hls_info *hls)
{
  struct autosa_kernel *kernel = top->kernel;

  /* Open the extern "C" block. */
  if (!hls->hls)
  {
    p = print_str_new_line(p, "p = isl_printer_start_line(p);");
    p = print_str_new_line(p, "p = isl_printer_print_str(p, \"extern \\\"C\\\" {\");");
    p = print_str_new_line(p, "p = isl_printer_end_line(p);");
  }

  /* Function signature. */
  p = print_str_new_line(p, "p = isl_printer_start_line(p);");

  p = isl_printer_start_line(p);
  if (!prog->scop->options->autosa->hcl)
  {
    /* The kernel id is fixed to 0 here rather than taken from the kernel. */
    p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"void kernel");
    p = isl_printer_print_int(p, 0);
  }
  else
  {
    p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"void autosa_func");
  }
  p = isl_printer_print_str(p, "(");
  p = print_kernel_arguments(p, prog, kernel, 1, hls);
  p = isl_printer_print_str(p, ")\");");
  p = isl_printer_end_line(p);

  /* Opening brace of the function body. */
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");
  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = print_str_new_line(p, "p = isl_printer_print_str(p, \"{\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  /* Interface pragmas, followed by an empty line. */
  if (!prog->scop->options->autosa->hcl)
  {
    p = print_top_module_interface_xilinx(p, prog, kernel);
    p = print_str_new_line(p, "p = isl_printer_end_line(p);");
  }

  /* Dataflow pragma, followed by an empty line. */
  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = print_str_new_line(p, "p = isl_printer_print_str(p, \"#pragma HLS DATAFLOW\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");

  return p;
}

/* Return a copy of the part of "fifo_decl_name" that precedes the first
 * '.', or of the whole string if it contains no '.'.
 * The string is obtained from isl_printer_get_str; the caller in this
 * file releases it with free().
 */
static char *extract_fifo_name_from_fifo_decl_name(isl_ctx *ctx, char *fifo_decl_name)
{
  isl_printer *printer = isl_printer_to_str(ctx);
  char *result = NULL;

  for (const char *cur = fifo_decl_name; *cur != '\0' && *cur != '.'; cur++)
  {
    /* Print one character at a time as a NUL-terminated string. */
    char one_char[2];
    one_char[0] = *cur;
    one_char[1] = '\0';
    printer = isl_printer_print_str(printer, one_char);
  }

  result = isl_printer_get_str(printer);
  isl_printer_free(printer);

  return result;
}

/* Return a copy of the part of "fifo_decl_name" that follows the first
 * '.', i.e. the width field of a "<name>.<width>" declaration name.
 * If the name contains no '.', an empty string is returned.
 * The string is obtained from isl_printer_get_str; the caller in this
 * file releases it with free().
 */
static char *extract_fifo_width_from_fifo_decl_name(isl_ctx *ctx, char *fifo_decl_name)
{
  int loc = 0;
  isl_printer *p_str = isl_printer_to_str(ctx);
  char *width = NULL;

  /* Advance to the first '.' or to the end of the string. */
  while (fifo_decl_name[loc] != '\0' && fifo_decl_name[loc] != '.')
    loc++;

  /* Step over the separator only if one was found; stepping
   * unconditionally would move past the terminating NUL and read
   * outside the string when the name has no '.'.
   */
  if (fifo_decl_name[loc] == '.')
    loc++;

  p_str = isl_printer_print_str(p_str, fifo_decl_name + loc);

  width = isl_printer_get_str(p_str);
  isl_printer_free(p_str);

  return width;
}

/* Print-user callback for the FIFO declaration trees of the top module.
 * The statement attached to "node" as annotation is printed with
 * autosa_kernel_print_fifo_decl if it is a FIFO declaration statement;
 * statements of any other type print nothing.
 * "user" points to a struct print_hw_module_data.
 */
static __isl_give isl_printer *print_top_module_fifo_stmt(__isl_take isl_printer *p,
                                                          __isl_take isl_ast_print_options *print_options,
                                                          __isl_keep isl_ast_node *node, void *user)
{
  struct print_hw_module_data *data = (struct print_hw_module_data *)(user);
  isl_id *annotation = isl_ast_node_get_annotation(node);
  struct autosa_kernel_stmt *stmt =
      (struct autosa_kernel_stmt *)isl_id_get_user(annotation);

  isl_id_free(annotation);
  isl_ast_print_options_free(print_options);

  if (stmt->type != AUTOSA_KERNEL_STMT_FIFO_DECL)
    return p;

  return autosa_kernel_print_fifo_decl(p, stmt, data->prog, data->hls);
}

/* Print-user callback for the module call trees of the top module.
 * The statement attached to "node" as annotation is printed with
 * autosa_kernel_print_module_call if it is a module call statement;
 * statements of any other type print nothing.
 * "user" points to a struct print_hw_module_data.
 */
static __isl_give isl_printer *print_top_module_call_stmt(
  __isl_take isl_printer *p,
  __isl_take isl_ast_print_options *print_options,
  __isl_keep isl_ast_node *node, void *user)
{
  struct print_hw_module_data *data = (struct print_hw_module_data *)(user);
  isl_id *annotation = isl_ast_node_get_annotation(node);
  struct autosa_kernel_stmt *stmt =
      (struct autosa_kernel_stmt *)isl_id_get_user(annotation);

  isl_id_free(annotation);
  isl_ast_print_options_free(print_options);

  if (stmt->type != AUTOSA_KERNEL_STMT_MODULE_CALL)
    return p;

  return autosa_kernel_print_module_call(p, stmt, data->prog, data->hls->target);
}

/* This function prints the code that prints out the top function that 
 * calls the hardware modules and declares the fifos.
 */
static void print_top_gen_host_code(
    struct autosa_prog *prog, __isl_keep isl_ast_node *node,
    struct autosa_hw_top_module *top, struct hls_info *hls)
{
  isl_ast_print_options *print_options;
  isl_ctx *ctx = isl_ast_node_get_ctx(node);
  isl_printer *p;
  int fifo_depth = prog->scop->options->autosa->fifo_depth;
  struct print_hw_module_data hw_data = {hls, prog, NULL};

  /* Print the top module ASTs. */
  p = isl_printer_to_file(ctx, hls->top_gen_c);
  p = isl_printer_set_output_format(p, ISL_FORMAT_C);

  print_top_gen_headers(prog, top, hls);
  fprintf(hls->top_gen_c, " {\n");
  p = isl_printer_indent(p, 2);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "FILE *fd = fopen(\"");
  p = isl_printer_print_str(p, hls->output_dir);
  p = isl_printer_print_str(p, "/resource_est/design_info.dat\", \"w\");");
  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "int fifo_cnt;");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "isl_ctx *ctx = isl_ctx_alloc();");
  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "isl_printer *p = isl_printer_to_file(ctx, f);");
  p = isl_printer_end_line(p);
  p = isl_printer_end_line(p);

  if (hls->target == XILINX_HW)
    p = print_top_module_headers_xilinx(p, prog, top, hls);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_indent(p, 2);");
  p = isl_printer_end_line(p);

  /* Print FIFO declarations */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_start_line(p);");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* FIFO Declaration */\");");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_end_line(p);");
  p = isl_printer_end_line(p);
  p = isl_printer_end_line(p);

  /* Print the serialize fifos if existing. */
  for (int i = 0; i < top->n_hw_modules; i++) {
    struct autosa_hw_module *module = top->hw_modules[i];
    struct autosa_array_ref_group *group = module->io_groups[0];
    if (module->is_serialized) {
      /* Generate fifo decl counter. */
      char *fifo_name;
      int fifo_w;  // bytes
      fifo_w = module->data_pack_inter * group->array->size;
      isl_printer *p_str;
      p_str = isl_printer_to_str(ctx);
      p_str = autosa_array_ref_group_print_fifo_name(group, p_str);
      p_str = isl_printer_print_str(p_str, "_");
      p_str = isl_printer_print_str(p_str, module->name);
      p_str = isl_printer_print_str(p_str, "_serialize");
      fifo_name = isl_printer_get_str(p_str);
      isl_printer_free(p_str);

      p = print_str_new_line(p, "fifo_cnt = 1;");
      p = print_str_new_line(p, "p = isl_printer_start_line(p);");
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* ");
      p = isl_printer_print_str(p, module->name);
      p = isl_printer_print_str(p, "_serialize fifo */ ");      
      p = print_fifo_type_xilinx(p, group, module->data_pack_inter);
      p = isl_printer_print_str(p, " ");
      p = isl_printer_print_str(p, fifo_name);      
      p = isl_printer_print_str(p, ";\");");
      p = isl_printer_end_line(p);
      p = print_str_new_line(p, "p = isl_printer_end_line(p);");

      /* Resource pragma */
      p = print_str_new_line(p, "p = isl_printer_start_line(p);");
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"#pragma HLS STREAM variable=");
      p = isl_printer_print_str(p, fifo_name);
      p = isl_printer_print_str(p, "\");");
      p = isl_printer_end_line(p);
      //p = print_str_new_line(p, "p = isl_printer_print_str(p, \" depth=2\");");
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \" depth=");
      p = isl_printer_print_int(p, fifo_depth);
      p = isl_printer_print_str(p, "\");");
      p = isl_printer_end_line(p);

      p = print_str_new_line(p, "p = isl_printer_end_line(p);");

      if (group->local_array->is_sparse) {
        p = print_str_new_line(p, "p = isl_printer_start_line(p);");
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"#pragma HLS DATA_PACK variable=");
        p = isl_printer_print_str(p, fifo_name);
        p = isl_printer_print_str(p, "\");");
        p = isl_printer_end_line(p);
        p = print_str_new_line(p, "p = isl_printer_end_line(p);");
      }

      /* fifo:fifo_name:fifo_cnt:fifo_width */
      p = isl_printer_start_line(p);
      p = isl_printer_print_str(p, "fprintf(fd, \"fifo:");
      p = isl_printer_print_str(p, fifo_name);
      p = isl_printer_print_str(p, ":\%d:");
      p = isl_printer_print_int(p, fifo_w);
      p = isl_printer_print_str(p, "\\n\", fifo_cnt);");
      p = isl_printer_end_line(p);

      p = isl_printer_end_line(p);      
      free(fifo_name);
    }
  }

  for (int i = 0; i < top->n_fifo_decls; i++) {
    /* Generate fifo decl counter. */
    char *fifo_decl_name = top->fifo_decl_names[i];
    char *fifo_name = extract_fifo_name_from_fifo_decl_name(ctx, fifo_decl_name);
    char *fifo_w = extract_fifo_width_from_fifo_decl_name(ctx, fifo_decl_name);
    p = print_str_new_line(p, "fifo_cnt = 0;");

    /* Print AST */
    print_options = isl_ast_print_options_alloc(ctx);
    print_options = isl_ast_print_options_set_print_user(print_options,
                                                         &print_top_module_fifo_stmt, &hw_data);

    p = isl_ast_node_print(top->fifo_decl_wrapped_trees[i],
                           p, print_options);

    /* fifo:fifo_name:fifo_cnt:fifo_width */
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "fprintf(fd, \"fifo:");
    p = isl_printer_print_str(p, fifo_name);
    p = isl_printer_print_str(p, ":\%d:");
    p = isl_printer_print_str(p, fifo_w);
    p = isl_printer_print_str(p, "\\n\", fifo_cnt);");
    p = isl_printer_end_line(p);

    p = isl_printer_end_line(p);

    free(fifo_name);
    free(fifo_w);
  }

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_start_line(p);");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_print_str(p, \"/* FIFO Declaration */\");");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_end_line(p);");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_end_line(p);");
  p = isl_printer_end_line(p);

  int n_module_names = 0;
  char **module_names = NULL;
  for (int i = 0; i < top->n_hw_modules; i++)
  {
    /* Generate module call counter. */
    struct autosa_hw_module *module = top->hw_modules[i];
    char *module_name;

    if (module->is_filter && module->is_buffer)
    {
      module_name = concat(ctx, module->name, "intra_trans");

      n_module_names++;
      module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
      module_names[n_module_names - 1] = module_name;

      module_name = concat(ctx, module->name, "inter_trans");

      n_module_names++;
      module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
      module_names[n_module_names - 1] = module_name;

      if (module->boundary)
      {
        module_name = concat(ctx, module->name, "inter_trans_boundary");

        n_module_names++;
        module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
        module_names[n_module_names - 1] = module_name;
      }
    }

    module_name = strdup(module->name);

    n_module_names++;
    module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
    module_names[n_module_names - 1] = module_name;

    if (module->boundary)
    {
      module_name = concat(ctx, module->name, "boundary");

      n_module_names++;
      module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
      module_names[n_module_names - 1] = module_name;
    }

    if (module->n_pe_dummy_modules > 0)
    {
      for (int j = 0; j < module->n_pe_dummy_modules; j++)
      {
        struct autosa_pe_dummy_module *dummy_module = module->pe_dummy_modules[j];
        struct autosa_array_ref_group *group = dummy_module->io_group;
        isl_printer *p_str = isl_printer_to_str(ctx);
        p_str = autosa_array_ref_group_print_prefix(group, p_str);
        p_str = isl_printer_print_str(p_str, "_PE_dummy");
        p_str = isl_printer_print_str(p_str, dummy_module->in? "_in" : "_out");
        module_name = isl_printer_get_str(p_str);
        isl_printer_free(p_str);

        n_module_names++;
        module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
        module_names[n_module_names - 1] = module_name;
      }
    }

    if (module->is_serialized) { 
      if (module->boundary)      
        module_name = concat(ctx, module->name, "boundary_serialize");
      else
        module_name = concat(ctx, module->name, "serialize");
      
      n_module_names++;
      module_names = (char **)realloc(module_names, n_module_names * sizeof(char *));
      module_names[n_module_names - 1] = module_name;
    }
  }
  for (int i = 0; i < n_module_names; i++)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "int ");
    p = isl_printer_print_str(p, module_names[i]);
    p = isl_printer_print_str(p, "_cnt = 0;");
    p = isl_printer_end_line(p);
  }

  /* Print module calls. */
  for (int i = 0; i < top->n_module_calls; i++)
  {
    /* Print AST */
    print_options = isl_ast_print_options_alloc(ctx);
    print_options = isl_ast_print_options_set_print_user(print_options,
                                                         &print_top_module_call_stmt, &hw_data);    

    p = isl_ast_node_print(top->module_call_wrapped_trees[i],
                           p, print_options);
  }

  /* module:module_name:module_cnt. */
  for (int i = 0; i < n_module_names; i++)
  {
    p = isl_printer_start_line(p);
    p = isl_printer_print_str(p, "fprintf(fd, \"module:");
    p = isl_printer_print_str(p, module_names[i]);
    p = isl_printer_print_str(p, ":\%d\\n\", ");
    p = isl_printer_print_str(p, module_names[i]);
    p = isl_printer_print_str(p, "_cnt);");
    p = isl_printer_end_line(p);
  }
  p = isl_printer_end_line(p);

  for (int i = 0; i < n_module_names; i++)
  {
    free(module_names[i]);
  }
  free(module_names);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "p = isl_printer_indent(p, -2);");
  p = isl_printer_end_line(p);

  p = print_str_new_line(p, "p = isl_printer_start_line(p);");
  p = print_str_new_line(p, "p = isl_printer_print_str(p, \"}\");");
  p = print_str_new_line(p, "p = isl_printer_end_line(p);");
  if (hls->target == XILINX_HW)
  {
    if (!hls->hls)
    {
      p = print_str_new_line(p, "p = isl_printer_start_line(p);");
      p = print_str_new_line(p, "p = isl_printer_print_str(p, \"}\");");
      p = print_str_new_line(p, "p = isl_printer_end_line(p);");
    }
  }

  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "fclose(fd);");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "isl_printer_free(p);");
  p = isl_printer_end_line(p);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "isl_ctx_free(ctx);");
  p = isl_printer_end_line(p);
  p = isl_printer_indent(p, -2);
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "}");
  p = isl_printer_end_line(p);
  p = isl_printer_end_line(p);

  /* For internal testing only. */
  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "int main()");
  p = isl_printer_end_line(p);

  p = ppcg_start_block(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "FILE *f = fopen(\"");
  p = isl_printer_print_str(p, hls->output_dir);
  p = isl_printer_print_str(p, "/src/top.cpp\", \"w\");");
  p = isl_printer_end_line(p);

  p = isl_printer_start_line(p);
  p = isl_printer_print_str(p, "top_generate(f);");
  p = isl_printer_end_line(p);

  p = ppcg_end_block(p);
  p = isl_printer_free(p);

  return;
}

/* Given an autosa_prog "prog" and the corresponding transformed AST
 * "tree", print the entire OpenCL/HLS code to "p".
 * "types" collects the types for which a definition has already been
 * printed; they are printed to the kernel file first.
 * "user" points to the struct hls_info describing the output files.
 */
static __isl_give isl_printer *print_hw(
    __isl_take isl_printer *p,
    struct autosa_prog *prog, __isl_keep isl_ast_node *tree,
    struct autosa_hw_module **modules, int n_modules,
    struct autosa_hw_top_module *top_module,
    struct autosa_drain_merge_func **drain_merge_funcs, int n_drain_merge_funcs,
    struct autosa_types *types, void *user)
{
  struct hls_info *hls = (struct hls_info *)user;
  isl_printer *type_printer =
      isl_printer_to_file(isl_printer_get_ctx(p), hls->kernel_c);

  /* Type definitions go to the kernel file through a temporary printer. */
  type_printer = isl_printer_set_output_format(type_printer, ISL_FORMAT_C);
  type_printer = autosa_print_types(type_printer, types, prog);
  isl_printer_free(type_printer);

  /* Print OpenCL host and kernel function. */
  p = autosa_print_host_code(p, prog, tree, modules, n_modules, top_module,
                             drain_merge_funcs, n_drain_merge_funcs, hls);

  /* Print separate top module code generation function. */
  print_top_gen_host_code(prog, tree, top_module, hls);

  return p;
}

/* Generate systolic arrays on Xilinx FPGAs.
 *
 * The output files are opened with hls_open_files, the code is generated
 * by generate_sa with print_hw as printing callback, and the files are
 * closed again. The result of generate_sa is returned.
 */
int generate_autosa_xilinx_hls_c(isl_ctx *ctx, struct ppcg_options *options,
                                 const char *input)
{
  struct hls_info hls;
  int status;

  hls.ctx = ctx;
  hls.target = XILINX_HW;
  hls.hls = options->autosa->hls;
  hls.hcl = options->autosa->hcl;
  hls.output_dir = options->autosa->output_dir;

  hls_open_files(&hls, input);
  status = generate_sa(ctx, input, hls.host_c, options, &print_hw, &hls);
  hls_close_files(&hls);

  return status;
}


================================================
FILE: src/autosa_xilinx_hls_c.h
================================================
#ifndef _AUTOSA_XILINX_HLS_C_H
#define _AUTOSA_XILINX_HLS_C_H

#include <pet.h>
#include "ppcg_options.h"
#include "ppcg.h"

#ifdef __cplusplus
extern "C"
{
#endif

/* Entry point of the Xilinx HLS C back end.
 * "ctx" is the isl context, "options" the option set and "input" the
 * path of the input source file. Returns an int status code.
 */
int generate_autosa_xilinx_hls_c(
    isl_ctx *ctx,
    struct ppcg_options *options,
    const char *input);

#ifdef __cplusplus
}
#endif

#endif

================================================
FILE: src/configure.ac
================================================
# Package identification and automake setup.
AC_INIT([autosa], [0.02], [jiewang@cs.ucla.edu])
AC_CONFIG_AUX_DIR([build])
AC_CONFIG_MACRO_DIR([m4])
AM_INIT_AUTOMAKE([foreign subdir-objects])
m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])])

# Required tools: C and C++ compilers, libtool and pkg-config.
AC_PROG_CC
AC_PROG_CXX
AC_PROG_LIBTOOL
PKG_PROG_PKG_CONFIG

# AX_CHECK_OPENMP
# AX_CHECK_OPENCL
# if test $HAVE_OPENCL = yes; then
# 	extra_tests="$extra_tests opencl_test.sh"
# fi

# isl: selected through $with_isl, one of build, bundled or system,
# with bundled as the default.
AX_SUBMODULE(isl,build|bundled|system,bundled)
AM_CONDITIONAL(BUNDLED_ISL, test $with_isl = bundled)
AM_CONDITIONAL(BUILD_ISL, test $with_isl = build)

AC_SUBST(ISL_CFLAGS)
AC_SUBST(ISL_LIBS)
AC_SUBST(ISL_SRCDIR)
AC_SUBST(ISL_BUILDDIR)
case "$with_isl" in
bundled)
	# Use the isl copy in the isl subdirectory; extra arguments for the
	# sub-configures are accumulated in ppcg_configure_args.
	ISL_CFLAGS="-I\$(top_srcdir)/isl/include -I\$(top_builddir)/isl/include"
	ISL_CFLAGS="$ISL_CFLAGS"
  ISL_SRCDIR="$srcdir/isl"
  ISL_BUILDDIR=isl
	ppcg_configure_args="$ppcg_configure_args --with-isl-builddir=../isl"
	ppcg_configure_args="$ppcg_configure_args --with-isl=build"
	#ppcg_configure_args="$ppcg_configure_args --with-clang=system"
	ppcg_configure_args="$ppcg_configure_args --with-clang=no"
  PACKAGE_CFLAGS_ISL='-I${prefix}/include'
	;;
build)
	# Use an existing isl build tree; its absolute build directory is
	# obtained by running that tree's config.status.
  ISL_SRCDIR="$isl_srcdir"
	ISL_BUILDDIR=`echo @abs_builddir@ | $with_isl_builddir/config.status --file=-`
	ISL_CFLAGS="-I$isl_srcdir/include -I$ISL_BUILDDIR/include"
	ISL_CFLAGS="$ISL_CFLAGS"
	ISL_LIBS="$with_isl_builddir/libisl.la"
  PACKAGE_CFLAGS_ISL='-I${prefix}/include'
	;;
system)
	# Use an installed isl located through pkg-config.
	PKG_CHECK_MODULES([ISL], [isl])
  PACKAGE_CFLAGS_ISL="$ISL_CFLAGS"
  ;;
esac
# True whenever one of the cases above set a build directory.
AM_CONDITIONAL(HAVE_ISL_BUILDDIR, test "x$ISL_BUILDDIR" != "x")

# barvinok: selected through $with_barvinok, with bundled as the default.
AX_SUBMODULE(barvinok,bundled|system,bundled)
AM_CONDITIONAL(BUNDLED_BARVINOK, test $with_barvinok = bundled)
AM_CONDITIONAL(BUILD_BARVINOK, test $with_barvinok = build)

AC_SUBST(BARVINOK_CFLAGS)
AC_SUBST(BARVINOK_LIBS)
AC_SUBST(BARVINOK_SRCDIR)
AC_SUBST(BARVINOK_BUILDDIR)
case "$with_barvinok" in
bundled)
  # Use the barvinok copy in the barvinok subdirectory.
  BARVINOK_CFLAGS="$BARVINOK_CFLAGS -I\$(top_srcdir)/barvinok -I\$(top_builddir)/barvinok"
  BARVINOK_SRCDIR="$srcdir/barvinok"
  BARVINOK_BUILDDIR=barvinok
  ;;
build)
  # Use an existing barvinok build tree.
  # NOTE(review): the option list above does not offer the value build,
  # so this branch may be unreachable; confirm before relying on it.
  BARVINOK_SRCDIR="$barvinok_srcdir"
  BARVINOK_BUILDDIR=`echo @abs_builddir@ | $with_barvinok_builddir/config.status --file=-`
  BARVINOK_CFLAGS="-I$barvinok_srcdir/ -I$BARVINOK_BUILDDIR/"
  BARVINOK_LIBS="$with_barvinok_builddir/libbarvinok.la"
  ;;
system)
  # Use an installed barvinok located through pkg-config.
  PKG_CHECK_MODULES([BARVINOK], [barvinok])
  PACKAGE_CFLAGS_BARVINOK="$BARVINOK_CFLAGS"
  ;;
esac
AM_CONDITIONAL(HAVE_BARVINOK_BUILDDIR, test "x$BARVINOK_BUILDDIR" != "x")

# pet: selected through $with_pet, with bundled as the default.
AX_SUBMODULE(pet,bundled|system,bundled)
AM_CONDITIONAL(BUNDLED_PET, test $with_pet = bundled)
AM_CONDITIONAL(BUILD_PET, test $with_pet = build)

AC_SUBST(PET_CFLAGS)
AC_SUBST(PET_LIBS)
AC_SUBST(PET_BUILDDIR)
case "$with_pet" in
bundled)
	# Headers come from the pet subdirectory of the source tree.
	PET_CFLAGS="$PET_CFLAGS -I\$(top_srcdir)/pet/include"
	;;
build)
  # NOTE(review): the option list above does not offer the value build,
  # so this branch may be unreachable; confirm before relying on it.
  PET_BUILDDIR=`echo @abs_builddir@ | $with_pet_builddir/config.status --file=-`
  PET_CFLAGS="-I$pet_srcdir/include"
  ;;
system)
	# Use an installed pet located through pkg-config.
	PKG_CHECK_MODULES([PET], [pet])
  PACKAGE_CFLAGS_PET="$PET_CFLAGS"
	;;
esac

# AC_SUBST(POLYBENCH_DIR)
# AC_SUBST(extra_tests)
# AC_ARG_WITH([polybench],
# 	[AS_HELP_STRING([--with-polybench=DIR], [PolyBench location])],
# 	[
# 	if test -f "$with_polybench/utilities/benchmark_list"; then
# 		POLYBENCH_DIR=$with_polybench
# 		extra_tests="$extra_tests polybench_test.sh"
# 	fi
# 	])

# AX_DETECT_GIT_HEAD

# Files to generate, and the bundled subprojects to configure recursively.
AC_CONFIG_FILES(Makefile)
# AC_CONFIG_FILES([polybench_test.sh], [chmod +x polybench_test.sh])
# AC_CONFIG_FILES([opencl_test.sh], [chmod +x opencl_test.sh])
if test $with_isl = bundled; then
	AC_CONFIG_SUBDIRS(isl)
fi
if test $with_barvinok = bundled; then
  AC_CONFIG_SUBDIRS(barvinok)
fi
if test $with_pet = bundled; then
	AC_CONFIG_SUBDIRS(pet)
fi
AC_CONFIG_COMMANDS_POST([
	dnl pass on arguments to subdir configures, but don't
	dnl add them to config.status
	ac_configure_args="$ac_configure_args $ppcg_configure_args"
])
AC_OUTPUT


================================================
FILE: src/cpu.c
================================================
/*
 * Copyright 2012 INRIA Paris-Rocquencourt
 * Copyright 2012 Ecole Normale Superieure
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Tobias Grosser, INRIA Paris-Rocquencourt,
 * Domaine de Voluceau, Rocquenqourt, B.P. 105,
 * 78153 Le Chesnay Cedex France
 * and Sven Verdoolaege,
 * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
 */

#include <limits.h>
#include <stdio.h>
#include <string.h>

#include <isl/aff.h>
#include <isl/ctx.h>
#include <isl/flow.h>
#include <isl/map.h>
#include <isl/ast_build.h>
#include <isl/schedule.h>
#include <isl/schedule_node.h>
#include <pet.h>

#include "ppcg.h"
#include "ppcg_options.h"
#include "cpu.h"
#include "print.h"
#include "schedule.h"
#include "util.h"

/* Representation of a statement inside a generated AST.
 *
 * "stmt" refers to the original statement.
 * "ref2expr" maps the reference identifier of each access in
 * the statement to an AST expression that should be printed
 * at the place of the access.
 *
 * An object of this type is attached as annotation to a user node
 * by at_each_domain and is released through ppcg_stmt_free,
 * which frees "ref2expr" but leaves "stmt" untouched.
 */
struct ppcg_stmt {
	/* The original pet statement; not freed by ppcg_stmt_free. */
	struct pet_stmt *stmt;

	/* Replacement AST expression for each access reference. */
	isl_id_to_ast_expr *ref2expr;
};

/* Release the ppcg_stmt "user", including the mapping from
 * reference identifiers to AST expressions that it holds.
 * The pet statement it refers to is left untouched.
 * A NULL pointer is silently ignored.
 *
 * The argument is a void pointer such that this function can be used
 * as the free_user callback of an isl_id.
 */
static void ppcg_stmt_free(void *user)
{
	struct ppcg_stmt *ppcg_stmt = user;

	if (ppcg_stmt) {
		isl_id_to_ast_expr_free(ppcg_stmt->ref2expr);
		free(ppcg_stmt);
	}
}

/* Open the file to which the generated code should be written.
 *
 * If "output" is not NULL, then it is used as the name of the output file.
 * Otherwise, the name is derived from "input", the entire path of
 * the input file, by appending ".ppcg" followed by the extension of
 * the input file to the base name extracted by ppcg_extract_base_name.
 * This means file.c becomes file.ppcg.c.
 * If the input file has no extension, then ".c" is used.
 *
 * The extension is looked up in the last path component only,
 * such that a '.' in a directory name (as in "./file" or "dir.d/file")
 * is not mistaken for the start of the extension.
 *
 * Return NULL, after printing a message to stderr, if the derived name
 * does not fit in PATH_MAX bytes or if the file cannot be opened
 * for writing.
 */
static FILE *get_output_file(const char *input, const char *output)
{
	char name[PATH_MAX];
	const char ppcg_marker[] = ".ppcg";
	FILE *file;

	if (!output) {
		const char *base, *ext;
		int len, n;

		base = strrchr(input, '/');
		base = base ? base + 1 : input;
		ext = strrchr(base, '.');
		if (!ext)
			ext = ".c";

		/* ppcg_extract_base_name is not told the size of "name".
		 * Make sure up front that even the entire input path
		 * followed by the marker, the extension and
		 * the terminating NUL would fit.
		 */
		if (strlen(input) + sizeof(ppcg_marker) + strlen(ext) >
		    sizeof(name)) {
			fprintf(stderr, "Input file name '%s' is too long\n",
				input);
			return NULL;
		}

		len = ppcg_extract_base_name(name, input);
		n = snprintf(name + len, sizeof(name) - len, "%s%s",
			     ppcg_marker, ext);
		if (n < 0 || (size_t) n >= sizeof(name) - len) {
			fprintf(stderr, "Input file name '%s' is too long\n",
				input);
			return NULL;
		}

		output = name;
	}

	file = fopen(output, "w");
	if (!file) {
		fprintf(stderr, "Unable to open '%s' for writing\n", output);
		return NULL;
	}

	return file;
}

/* Data used to annotate for nodes in the ast.
 *
 * An object of this type is stored as the user pointer of the isl_id
 * created by ast_build_before_for and is inspected afterwards
 * by ast_build_after_for and print_for.
 */
struct ast_node_userinfo {
	/* The for node is an openmp parallel for node. */
	int is_openmp;
};

/* Information used while building the ast.
 *
 * The fields are set up by init_build_info and
 * "contraction" is released again by clear_build_info.
 */
struct ast_build_userinfo {
	/* The current ppcg scop. */
	struct ppcg_scop *scop;

	/* Are we currently in a parallel for loop? */
	int in_parallel_for;

	/* The contraction of the entire schedule tree. */
	isl_union_pw_multi_aff *contraction;
};

/* Check if the current scheduling dimension is parallel.
 *
 * We check for parallelism by verifying that the loop does not carry any
 * dependences.
 *
 * If any expansion nodes are present in the schedule tree,
 * then they are assumed to be situated near the leaves of the schedule tree,
 * underneath any node that may result in a for loop.
 * In particular, these expansions may have been introduced
 * by the call to isl_schedule_expand inside ppcg_compute_grouping_schedule.
 * The dependence relations are formulated in terms of the expanded
 * domains, while, by assumption, the partial schedule returned
 * by isl_ast_build_get_schedule refers to the contracted domains.
 * Plug in the contraction such that the schedule would also
 * refer to the expanded domains.
 * Note that if the schedule tree does not contain any expansions,
 * then the contraction is an identity function.
 *
 * If the live_range_reordering option is set, then this currently
 * includes the order dependences.  In principle, non-zero order dependences
 * could be allowed, but this would require privatization and/or expansion.
 *
 * Parallelism test: if the distance is zero in all outer dimensions, then it
 * has to be zero in the current dimension as well.
 * Implementation: first, translate dependences into time space, then force
 * outer dimensions to be equal.  If the distance is zero in the current
 * dimension, then the loop is parallel.
 * The distance is zero in the current dimension if it is a subset of a map
 * with equal values for the current dimension.
 *
 * Return 1 if the dimension is known to be parallel and 0 otherwise.
 * In particular, if any of the isl computations fails or if
 * there is no current schedule dimension, then the loop is conservatively
 * reported as not being parallel, since marking a loop parallel
 * without proof could result in invalid code.
 */
static int ast_schedule_dim_is_parallel(__isl_keep isl_ast_build *build,
	struct ast_build_userinfo *build_info)
{
	struct ppcg_scop *scop = build_info->scop;
	isl_union_map *schedule, *deps;
	isl_map *schedule_deps, *test;
	isl_space *schedule_space;
	isl_bool empty, is_parallel;
	int i, n_dim, dimension;

	schedule_space = isl_ast_build_get_schedule_space(build);
	n_dim = isl_space_dim(schedule_space, isl_dim_out);
	isl_space_free(schedule_space);
	/* Also covers a failure to obtain the schedule space. */
	if (n_dim < 1)
		return 0;
	dimension = n_dim - 1;

	schedule = isl_ast_build_get_schedule(build);
	schedule = isl_union_map_preimage_domain_union_pw_multi_aff(schedule,
		isl_union_pw_multi_aff_copy(build_info->contraction));

	deps = isl_union_map_copy(scop->dep_flow);
	deps = isl_union_map_union(deps, isl_union_map_copy(scop->dep_false));
	if (scop->options->live_range_reordering) {
		isl_union_map *order = isl_union_map_copy(scop->dep_order);
		deps = isl_union_map_union(deps, order);
	}
	deps = isl_union_map_apply_range(deps, isl_union_map_copy(schedule));
	deps = isl_union_map_apply_domain(deps, schedule);

	/* isl_bool has three values; only a definite "true" means
	 * that there are no dependences at all.
	 */
	empty = isl_union_map_is_empty(deps);
	if (empty != isl_bool_false) {
		isl_union_map_free(deps);
		return empty == isl_bool_true;
	}

	schedule_deps = isl_map_from_union_map(deps);

	for (i = 0; i < dimension; i++)
		schedule_deps = isl_map_equate(schedule_deps, isl_dim_out, i,
					       isl_dim_in, i);

	test = isl_map_universe(isl_map_get_space(schedule_deps));
	test = isl_map_equate(test, isl_dim_out, dimension, isl_dim_in,
			      dimension);
	is_parallel = isl_map_is_subset(schedule_deps, test);

	isl_map_free(test);
	isl_map_free(schedule_deps);

	return is_parallel == isl_bool_true;
}

/* Mark the for node described by "node_info" as an openmp parallel loop
 * if it is parallel and not nested inside a loop that has already been
 * marked, i.e., if it is the outermost parallel for node.
 * In that case, also record in "build_info" that the AST generation
 * is now inside a parallel for loop.
 */
static void mark_openmp_parallel(__isl_keep isl_ast_build *build,
	struct ast_build_userinfo *build_info,
	struct ast_node_userinfo *node_info)
{
	if (!build_info->in_parallel_for &&
	    ast_schedule_dim_is_parallel(build, build_info)) {
		node_info->is_openmp = 1;
		build_info->in_parallel_for = 1;
	}
}

/* Allocate an ast_node_userinfo structure and initialize it
 * with default values, i.e., the node is not an openmp parallel for node.
 *
 * Return NULL if the structure could not be allocated.
 * The result is meant to be released using free_ast_node_userinfo.
 */
static struct ast_node_userinfo *allocate_ast_node_userinfo(void)
{
	struct ast_node_userinfo *node_info;

	node_info = malloc(sizeof(*node_info));
	if (!node_info)
		return NULL;
	node_info->is_openmp = 0;

	return node_info;
}

/* Free the ast_node_userinfo structure "ptr".
 *
 * The argument is a void pointer such that this function can be used
 * as the free_user callback of an isl_id.
 */
static void free_ast_node_userinfo(void *ptr)
{
	free(ptr);
}

/* This method is executed before the construction of a for node. It creates
 * an isl_id that is used to annotate the subsequently generated ast for nodes.
 * The isl_id owns an ast_node_userinfo structure, which is freed
 * along with the isl_id.
 *
 * In this function we also run the following analyses:
 *
 * 	- Detection of openmp parallel loops
 *
 * "user" points to the ast_build_userinfo of the current AST generation.
 *
 * Return NULL if the annotation could not be created.
 * In that case, no memory is leaked and the state in "user"
 * is left unchanged.
 */
static __isl_give isl_id *ast_build_before_for(
	__isl_keep isl_ast_build *build, void *user)
{
	isl_id *id;
	struct ast_build_userinfo *build_info = user;
	struct ast_node_userinfo *node_info;

	node_info = allocate_ast_node_userinfo();
	if (!node_info)
		return NULL;

	id = isl_id_alloc(isl_ast_build_get_ctx(build), "", node_info);
	if (!id) {
		/* No isl_id took ownership of "node_info". */
		free_ast_node_userinfo(node_info);
		return NULL;
	}
	id = isl_id_set_free_user(id, free_ast_node_userinfo);

	mark_openmp_parallel(build, build_info, node_info);

	return id;
}

/* This method is executed after the construction of a for node.
 *
 * The annotation that was attached to "node" is looked up and,
 * if it marks the node as an openmp parallel for node,
 * the 'in_parallel_for' flag of the ast_build_userinfo "user" is reset,
 * since the AST generation is leaving the parallel loop.
 *
 * "node" itself is returned unmodified.
 */
static __isl_give isl_ast_node *ast_build_after_for(
	__isl_take isl_ast_node *node, __isl_keep isl_ast_build *build,
	void *user)
{
	struct ast_build_userinfo *build_info = user;
	isl_id *annotation = isl_ast_node_get_annotation(node);
	struct ast_node_userinfo *node_info = isl_id_get_user(annotation);

	if (node_info && node_info->is_openmp)
		build_info->in_parallel_for = 0;

	isl_id_free(annotation);

	return node;
}

/* Return the statement in scop->pet->stmts for which the tuple identifier
 * of the domain is "id".
 * The identifiers are compared as pointers.
 *
 * If there is no such statement, then an internal error is reported
 * and NULL is returned.
 */
static struct pet_stmt *find_stmt(struct ppcg_scop *scop, __isl_keep isl_id *id)
{
	int pos;

	for (pos = 0; pos < scop->pet->n_stmt; ++pos) {
		struct pet_stmt *candidate = scop->pet->stmts[pos];
		isl_id *tuple_id = isl_set_get_tuple_id(candidate->domain);
		int found = tuple_id == id;

		isl_id_free(tuple_id);
		if (found)
			return candidate;
	}

	isl_die(isl_id_get_ctx(id), isl_error_internal,
		"statement not found", return NULL);
}

/* Print the user statement "node" of the generated AST to "p".
 *
 * The ppcg_stmt attached to the node by at_each_domain provides
 * the original statement as well as the AST expressions to print
 * in the place of its accesses.
 * "print_options" is not needed for printing the statement body and
 * is simply freed.
 */
static __isl_give isl_printer *print_user(__isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options,
	__isl_keep isl_ast_node *node, void *user)
{
	isl_id *annotation = isl_ast_node_get_annotation(node);
	struct ppcg_stmt *ppcg_stmt = isl_id_get_user(annotation);

	isl_id_free(annotation);
	isl_ast_print_options_free(print_options);

	return pet_stmt_print_body(ppcg_stmt->stmt, p, ppcg_stmt->ref2expr);
}


/* Print a for loop node as an openmp parallel loop.
 *
 * An openmp parallel loop is printed as a normal for loop
 * preceded by a line containing "#pragma omp parallel for".
 *
 * Variables that are declared within the body of this for loop are
 * automatically openmp 'private'. Iterators declared outside of the
 * for loop are automatically openmp 'shared'. As ppcg declares all iterators
 * at the position where they are assigned, there is no need to explicitly mark
 * variables. Their automatically assigned type is already correct.
 *
 * This function only generates valid OpenMP code, if the ast was generated
 * with the 'atomic-bounds' option enabled.
 */
static __isl_give isl_printer *print_for_with_openmp(
	__isl_keep isl_ast_node *node, __isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options)
{
	p = isl_printer_end_line(isl_printer_print_str(
		isl_printer_start_line(p), "#pragma omp parallel for"));

	return isl_ast_node_for_print(node, p, print_options);
}

/* Print the for node "node" to "p".
 *
 * If the annotation of the node marks it as an openmp parallel for node,
 * then it is printed with an openmp pragma in front.
 * Otherwise, including when there is no annotation,
 * it is printed as a normal for node.
 */
static __isl_give isl_printer *print_for(__isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options,
	__isl_keep isl_ast_node *node, void *user)
{
	isl_id *annotation = isl_ast_node_get_annotation(node);
	struct ast_node_userinfo *info = NULL;

	if (annotation)
		info = isl_id_get_user(annotation);

	if (info && info->is_openmp)
		p = print_for_with_openmp(node, p, print_options);
	else
		p = isl_ast_node_for_print(node, p, print_options);

	isl_id_free(annotation);

	return p;
}

/* Index transformation callback for pet_stmt_build_ast_exprs.
 *
 * "index" expresses the array indices in terms of statement iterators.
 * "user" points to an isl_pw_multi_aff expressing the statement iterators
 * in terms of AST loop iterators.  It is not modified; a copy is used.
 *
 * The result expresses the array indices in terms of
 * AST loop iterators.
 */
static __isl_give isl_multi_pw_aff *pullback_index(
	__isl_take isl_multi_pw_aff *index, __isl_keep isl_id *id, void *user)
{
	isl_pw_multi_aff *iterator_map = user;

	return isl_multi_pw_aff_pullback_pw_multi_aff(index,
				isl_pw_multi_aff_copy(iterator_map));
}

/* Transform the accesses in the statement associated to the domain
 * called by "node" to refer to the AST loop iterators, construct
 * corresponding AST expressions using "build",
 * collect them in a ppcg_stmt and annotate the node with the ppcg_stmt.
 *
 * "user" points to the ppcg_scop in which the statement is looked up.
 *
 * The ppcg_stmt is owned by the annotation and is freed along with it.
 * If the statement cannot be found, if the AST expressions cannot
 * be constructed or if the annotation cannot be created, then
 * the ppcg_stmt and "node" are freed and NULL is returned.
 */
static __isl_give isl_ast_node *at_each_domain(__isl_take isl_ast_node *node,
	__isl_keep isl_ast_build *build, void *user)
{
	struct ppcg_scop *scop = user;
	isl_ast_expr *expr, *arg;
	isl_ctx *ctx;
	isl_id *id;
	isl_map *map;
	isl_pw_multi_aff *iterator_map;
	struct ppcg_stmt *stmt;

	ctx = isl_ast_node_get_ctx(node);
	stmt = isl_calloc_type(ctx, struct ppcg_stmt);
	if (!stmt)
		goto error;

	/* The first argument of the call expression identifies
	 * the statement.
	 */
	expr = isl_ast_node_user_get_expr(node);
	arg = isl_ast_expr_get_op_arg(expr, 0);
	isl_ast_expr_free(expr);
	id = isl_ast_expr_get_id(arg);
	isl_ast_expr_free(arg);
	stmt->stmt = find_stmt(scop, id);
	isl_id_free(id);
	if (!stmt->stmt)
		goto error;

	/* The reverse of the schedule maps the AST loop iterators
	 * to the statement iterators.
	 */
	map = isl_map_from_union_map(isl_ast_build_get_schedule(build));
	map = isl_map_reverse(map);
	iterator_map = isl_pw_multi_aff_from_map(map);
	stmt->ref2expr = pet_stmt_build_ast_exprs(stmt->stmt, build,
				    &pullback_index, iterator_map, NULL, NULL);
	isl_pw_multi_aff_free(iterator_map);
	if (!stmt->ref2expr)
		goto error;

	id = isl_id_alloc(ctx, NULL, stmt);
	/* Without an isl_id, nothing else would free "stmt". */
	if (!id)
		goto error;
	id = isl_id_set_free_user(id, &ppcg_stmt_free);
	return isl_ast_node_set_annotation(node, id);
error:
	ppcg_stmt_free(stmt);
	return isl_ast_node_free(node);
}

/* Callback for isl_schedule_foreach_schedule_node_top_down.
 *
 * "user" points to an integer (initialized to 0 by the caller) that
 * is updated to the maximum of the schedule depths of the leaf nodes
 * for which this function is called.
 *
 * Return isl_bool_true for non-leaf nodes such that their descendants
 * are visited as well.
 */
static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user)
{
	int *max_depth = user;

	if (isl_schedule_node_get_type(node) == isl_schedule_node_leaf) {
		int leaf_depth = isl_schedule_node_get_schedule_depth(node);

		if (*max_depth < leaf_depth)
			*max_depth = leaf_depth;
		return isl_bool_false;
	}

	return isl_bool_true;
}

/* This function is called for each node in a CPU AST.
 * "user" points to the printer to which macro definitions are printed.
 *
 * For a user node, print the macro definitions needed for
 * the AST expressions that are substituted for the accesses of
 * the original user statement, as stored in the annotation of the node.
 * A user node without such an annotation is treated as an error.
 * For other nodes, return true such that descendants are also
 * visited.
 */
static isl_bool at_node(__isl_keep isl_ast_node *node, void *user)
{
	isl_printer **printer = user;
	struct ppcg_stmt *ppcg_stmt;
	isl_id *annotation;

	if (isl_ast_node_get_type(node) != isl_ast_node_user)
		return isl_bool_true;

	annotation = isl_ast_node_get_annotation(node);
	ppcg_stmt = isl_id_get_user(annotation);
	isl_id_free(annotation);
	if (!ppcg_stmt)
		return isl_bool_error;

	*printer = ppcg_print_body_macros(*printer, ppcg_stmt->ref2expr);

	return *printer ? isl_bool_false : isl_bool_error;
}

/* Print the macro definitions required by the CPU AST "node" to "p".
 *
 * First print those needed for the user statements inside the AST
 * (through at_node) and then those needed for the AST itself.
 * If the traversal of the AST fails, then "p" is freed and
 * NULL is returned.
 */
static __isl_give isl_printer *cpu_print_macros(__isl_take isl_printer *p,
	__isl_keep isl_ast_node *node)
{
	isl_stat visited;

	visited = isl_ast_node_foreach_descendant_top_down(node, &at_node, &p);
	if (visited < 0)
		return isl_printer_free(p);

	return ppcg_print_macros(p, node);
}

/* Initialize the fields of "build_info" for generating an AST
 * for "scop" from "schedule".
 *
 * Initially, the AST generation is not inside any parallel for loop.
 *
 * The contraction of the entire schedule tree is extracted
 * right underneath the root node.
 *
 * Return isl_stat_error if the contraction could not be extracted.
 */
static isl_stat init_build_info(struct ast_build_userinfo *build_info,
	struct ppcg_scop *scop, __isl_keep isl_schedule *schedule)
{
	isl_schedule_node *child;

	child = isl_schedule_node_child(isl_schedule_get_root(schedule), 0);

	build_info->in_parallel_for = 0;
	build_info->scop = scop;
	build_info->contraction =
		isl_schedule_node_get_subtree_contraction(child);

	isl_schedule_node_free(child);

	return isl_stat_non_null(build_info->contraction);
}

/* Clear all memory allocated by "build_info".
 *
 * Only the contraction is owned by "build_info";
 * the structure itself and the scop it refers to are not freed.
 */
static void clear_build_info(struct ast_build_userinfo *build_info)
{
	isl_union_pw_multi_aff_free(build_info->contraction);
}

/* Code generate the scop 'scop' using "schedule"
 * and print the corresponding C code to 'p'.
 *
 * The AST loop iterators are named "c" followed by a number, with
 * as many names being generated as the maximal schedule depth of
 * the leaves of "schedule".
 * If the openmp option is set, then the for nodes are annotated
 * during AST generation such that the outermost parallel loops
 * can be printed with an openmp pragma.
 * The macro definitions needed by the AST are printed in front of it.
 */
static __isl_give isl_printer *print_scop(struct ppcg_scop *scop,
	__isl_take isl_schedule *schedule, __isl_take isl_printer *p,
	struct ppcg_options *options)
{
	isl_ctx *ctx = isl_printer_get_ctx(p);
	struct ast_build_userinfo build_info;
	isl_ast_print_options *print_options;
	isl_ast_build *build;
	isl_ast_node *tree;
	int max_depth = 0;

	if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth,
						&max_depth) < 0) {
		isl_schedule_free(schedule);
		isl_printer_free(p);
		return NULL;
	}

	build = isl_ast_build_alloc(ctx);
	build = isl_ast_build_set_iterators(build,
			ppcg_scop_generate_names(scop, max_depth, "c"));
	build = isl_ast_build_set_at_each_domain(build, &at_each_domain, scop);

	if (options->openmp) {
		/* On failure, continue with a NULL build. */
		if (init_build_info(&build_info, scop, schedule) < 0)
			build = isl_ast_build_free(build);

		build = isl_ast_build_set_before_each_for(build,
							&ast_build_before_for,
							&build_info);
		build = isl_ast_build_set_after_each_for(build,
							&ast_build_after_for,
							&build_info);
	}

	tree = isl_ast_build_node_from_schedule(build, schedule);
	isl_ast_build_free(build);

	if (options->openmp)
		clear_build_info(&build_info);

	print_options = isl_ast_print_options_alloc(ctx);
	print_options = isl_ast_print_options_set_print_user(print_options,
							&print_user, NULL);
	print_options = isl_ast_print_options_set_print_for(print_options,
							&print_for, NULL);

	p = cpu_print_macros(p, tree);
	p = isl_ast_node_print(tree, p, print_options);

	isl_ast_node_free(tree);

	return p;
}

/* Tile the band node "node" with tile sizes "sizes" and
 * mark all members of the resulting tile node as "atomic".
 * The resulting node is returned.
 */
static __isl_give isl_schedule_node *tile(__isl_take isl_schedule_node *node,
	__isl_take isl_multi_val *sizes)
{
	isl_schedule_node *tiled;

	tiled = isl_schedule_node_band_tile(node, sizes);

	return ppcg_set_schedule_node_type(tiled, isl_ast_loop_atomic);
}

/* Tile "node", if it is a band node with at least 2 members.
 * All tile sizes are set to the value of the "tile_size" option
 * of the ppcg_scop that "user" points to.
 * Any other node is returned unmodified.
 */
static __isl_give isl_schedule_node *tile_band(
	__isl_take isl_schedule_node *node, void *user)
{
	struct ppcg_scop *scop = user;

	if (isl_schedule_node_get_type(node) == isl_schedule_node_band &&
	    isl_schedule_node_band_n_member(node) > 1) {
		isl_multi_val *sizes;

		sizes = ppcg_multi_val_from_int(
				isl_schedule_node_band_get_space(node),
				scop->options->tile_size);
		node = tile(node, sizes);
	}

	return node;
}

/* Construct schedule constraints from the dependences in ps
 * for the purpose of computing a schedule for a CPU.
 *
 * The proximity constraints are always the flow dependences.
 *
 * With live-range reordering, the order dependences are used
 * as conditional validity constraints with the flow dependences
 * as condition.  That is, a live-range (flow dependence) will be either
 * local to an iteration of a band or all adjacent order dependences
 * will be respected by the band.
 * The validity constraints are then the union of the flow dependences
 * and the forced dependences, while the coincidence constraints
 * additionally contain the order dependences.
 *
 * Without live-range reordering, both the validity
 * and the coincidence constraints are the union of the flow
 * dependences and the false dependences.
 *
 * The coincidence constraints are only set when the "openmp"
 * option is set.  The way openmp pragmas are introduced
 * does not rely on the coincident property of the schedule band members,
 * but the coincidence constraints do affect the way the schedule
 * is constructed, such that more schedule dimensions should be detected
 * as parallel by ast_schedule_dim_is_parallel.
 * Since the order dependences are also taken into account by
 * ast_schedule_dim_is_parallel, they are also added to
 * the coincidence constraints.  If the openmp handling learns
 * how to privatize some memory, then the corresponding order
 * dependences can be removed from the coincidence constraints.
 */
static __isl_give isl_schedule_constraints *construct_cpu_schedule_constraints(
	struct ppcg_scop *ps)
{
	int reordering = ps->options->live_range_reordering;
	int openmp = ps->options->openmp;
	isl_schedule_constraints *sc;
	isl_union_map *validity, *extra;

	sc = isl_schedule_constraints_on_domain(isl_union_set_copy(ps->domain));
	if (reordering)
		sc = isl_schedule_constraints_set_conditional_validity(sc,
				isl_union_map_copy(ps->tagged_dep_flow),
				isl_union_map_copy(ps->tagged_dep_order));

	extra = reordering ? ps->dep_forced : ps->dep_false;
	validity = isl_union_map_union(isl_union_map_copy(ps->dep_flow),
					isl_union_map_copy(extra));

	if (openmp) {
		isl_union_map *coincidence;

		coincidence = isl_union_map_copy(validity);
		if (reordering)
			coincidence = isl_union_map_union(coincidence,
					isl_union_map_copy(ps->dep_order));
		sc = isl_schedule_constraints_set_coincidence(sc, coincidence);
	}

	sc = isl_schedule_constraints_set_validity(sc, validity);
	sc = isl_schedule_constraints_set_proximity(sc,
					isl_union_map_copy(ps->dep_flow));

	return sc;
}

/* Compute a schedule for the scop "ps".
 *
 * The schedule constraints derived from the dependences in "ps"
 * are handed to ppcg_compute_schedule, together with the input schedule,
 * which may be used to group statement instances.
 *
 * Return NULL if "ps" is NULL.
 */
static __isl_give isl_schedule *compute_cpu_schedule(struct ppcg_scop *ps)
{
	isl_schedule_constraints *constraints;

	if (!ps)
		return NULL;

	constraints = construct_cpu_schedule_constraints(ps);

	return ppcg_compute_schedule(constraints, ps->schedule, ps->options);
}

/* Return a schedule for the ppcg_scop that "user" points to.
 *
 * If the reschedule option is set, then a new schedule is computed.
 * Otherwise, a copy of the original schedule is returned.
 * Return NULL if "user" is NULL.
 */
static __isl_give isl_schedule *optionally_compute_schedule(void *user)
{
	struct ppcg_scop *ps = user;

	if (!ps)
		return NULL;
	if (ps->options->reschedule)
		return compute_cpu_schedule(ps);

	return isl_schedule_copy(ps->schedule);
}

/* Obtain a schedule for "ps" through ppcg_get_schedule, which is given
 * optionally_compute_schedule as the callback for computing a schedule,
 * and tile the bands of the result if requested by the user
 * through the tile option.
 *
 * Return NULL if "ps" is NULL.
 */
static __isl_give isl_schedule *get_schedule(struct ppcg_scop *ps,
	struct ppcg_options *options)
{
	isl_schedule *schedule;

	if (!ps)
		return NULL;

	schedule = ppcg_get_schedule(isl_union_set_get_ctx(ps->domain), options,
				    &optionally_compute_schedule, ps);
	if (!ps->options->tile)
		return schedule;

	return isl_schedule_map_schedule_node_bottom_up(schedule,
							&tile_band, ps);
}

/* Generate CPU code for the scop "ps" using "schedule" and
 * print the corresponding C code to "p", including variable declarations.
 *
 * The output starts with a comment line and an empty line.
 * If there are any hidden declarations, then they are printed,
 * together with the generated code, inside a separate block.
 * The context of "ps" is inserted into the schedule before
 * the code is generated.
 */
static __isl_give isl_printer *print_cpu_with_schedule(
	__isl_take isl_printer *p, struct ppcg_scop *ps,
	__isl_take isl_schedule *schedule, struct ppcg_options *options)
{
	int any_hidden;
	isl_set *context;

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "/* ppcg generated CPU code */");
	p = isl_printer_end_line(p);

	/* Empty line. */
	p = isl_printer_end_line(isl_printer_start_line(p));

	p = ppcg_set_macro_names(p);
	p = ppcg_print_exposed_declarations(p, ps);
	any_hidden = ppcg_scop_any_hidden_declarations(ps);
	if (any_hidden) {
		p = ppcg_start_block(p);
		p = ppcg_print_hidden_declarations(p, ps);
	}

	context = isl_set_from_params(isl_set_copy(ps->context));
	schedule = isl_schedule_insert_context(schedule, context);
	if (options->debug->dump_final_schedule)
		isl_schedule_dump(schedule);

	p = print_scop(ps, schedule, p, options);

	return any_hidden ? ppcg_end_block(p) : p;
}

/* Generate CPU code for the scop "ps" and print the corresponding C code
 * to "p", including variable declarations.
 *
 * The code is generated from a copy of the schedule stored in "ps".
 */
__isl_give isl_printer *print_cpu(__isl_take isl_printer *p,
	struct ppcg_scop *ps, struct ppcg_options *options)
{
	return print_cpu_with_schedule(p, ps, isl_schedule_copy(ps->schedule),
					options);
}

/* Generate CPU code for "scop" and print it to "p".
 *
 * The schedule obtained from get_schedule (possibly recomputed and/or
 * tiled, depending on "options") is used to print the code.
 */
static __isl_give isl_printer *generate(__isl_take isl_printer *p,
	struct ppcg_scop *scop, struct ppcg_options *options)
{
	return print_cpu_with_schedule(p, scop, get_schedule(scop, options),
					options);
}

/* Wrapper around generate for use as a ppcg_transform callback.
 * "user" points to the ppcg_options.
 */
static __isl_give isl_printer *print_cpu_wrap(__isl_take isl_printer *p,
	struct ppcg_scop *scop, void *user)
{
	return generate(p, scop, (struct ppcg_options *) user);
}

/* Transform the code in the file called "input" by replacing
 * all scops by corresponding CPU code and write the results to a file
 * called "output".
 * If "output" is NULL, then the name of the output file is derived
 * from "input" by get_output_file.
 *
 * Return -1 if the output file cannot be opened.
 * Otherwise, return the result of ppcg_transform, except that
 * an apparently successful transformation is turned into a failure (-1)
 * if the output file cannot be closed properly, since in that case
 * some of the generated code may not have been written out.
 */
int generate_cpu(isl_ctx *ctx, struct ppcg_options *options,
	const char *input, const char *output)
{
	FILE *output_file;
	int r;

	output_file = get_output_file(input, output);
	if (!output_file)
		return -1;

	r = ppcg_transform(ctx, input, output_file, options,
					&print_cpu_wrap, options);

	/* fclose flushes buffered output; a failure means data loss. */
	if (fclose(output_file) != 0) {
		fprintf(stderr, "Unable to close the output file\n");
		if (r == 0)
			r = -1;
	}

	return r;
}


================================================
FILE: src/cpu.h
================================================
/* Interface of the CPU code generation. */
#ifndef _CPU_H
#define _CPU_H

#include <isl/ctx.h>

#include "ppcg.h"

#ifdef __cplusplus
extern "C"
{
#endif

	struct ppcg_options;

	/* Generate CPU code for the scop "ps" from its original schedule and
	 * print the corresponding C code to "p", including variable declarations.
	 */
	__isl_give isl_printer *print_cpu(__isl_take isl_printer *p,
																		struct ppcg_scop *ps, struct ppcg_options *options);
	/* Transform the code in the file called "input" by replacing
	 * all scops by corresponding CPU code and write the results to a file
	 * called "output".
	 * Return -1 if the output file cannot be opened; otherwise
	 * the return value reports the outcome of the transformation.
	 */
	int generate_cpu(isl_ctx *ctx, struct ppcg_options *options,
									 const char *input, const char *output);

#ifdef __cplusplus
}
#endif

#endif


================================================
FILE: src/examples/chemv.c
================================================
/*
 * Copyright 2014      ARM Ltd.
 *
 * Use of this software is governed by the MIT license
 */

#include <stdio.h>
#include <stdlib.h>

/* Single precision complex number,
 * stored as a real part "Re" and an imaginary part "Im".
 */
struct ComplexFloat
{
	float Re;
	float Im;
};

/* chemv - complex hermitian matrix-vector multiplication
 * The function body was taken from a VOBLA-generated BLAS library.
 *
 * The scalars alpha and beta are passed as separate real and
 * imaginary parts.  Only column 0 of X and Y is accessed.
 *
 * The computation consists of three loop nests:
 * - the first multiplies each Y[i][0] by beta;
 * - the second adds alpha * AT[i][i].Re * X[i][0] to Y[i][0],
 *   i.e., the contribution of the diagonal, of which only
 *   the real part is read;
 * - the third visits the elements AT[i2][j] with j = 1 + i2 + i3 > i2
 *   and uses each of them twice: alpha * conj(AT[i2][j]) * X[j][0]
 *   is added to Y[i2][0] and alpha * AT[i2][j] * X[i2][0]
 *   is added to Y[j][0].
 */
void chemv(int n, float alpha_re, float alpha_im,
	int ldAT, struct ComplexFloat AT[restrict const static n][ldAT],
	int incX, struct ComplexFloat X[restrict const static n][incX],
	float beta_re, float beta_im,
	int incY, struct ComplexFloat Y[restrict const static n][incY])
{
#pragma scop
	for (int i0 = 0; i0 <= (n-1); i0 += 1) {
		float var5_Re;
		float var5_Im;
		var5_Re = ((Y[i0][0].Re*beta_re)-(Y[i0][0].Im*beta_im));
		var5_Im = ((Y[i0][0].Im*beta_re)+(Y[i0][0].Re*beta_im));
		Y[i0][0].Re = var5_Re;
		Y[i0][0].Im = var5_Im;
	}
	for (int i1 = 0; i1 <= ((n-1)+1)-1; i1 += 1) {
		float var2_Re;
		float var3_Im;
		float var2_Im;
		float var4_Im;
		float var4_Re;
		float var3_Re;
		var2_Re = (alpha_re*AT[i1][i1].Re);
		var2_Im = (alpha_im*AT[i1][i1].Re);
		var3_Re = ((var2_Re*X[i1][0].Re)-(var2_Im*X[i1][0].Im));
		var3_Im = ((var2_Im*X[i1][0].Re)+(var2_Re*X[i1][0].Im));
		var4_Re = (Y[i1][0].Re+var3_Re);
		var4_Im = (Y[i1][0].Im+var3_Im);
		Y[i1][0].Re = var4_Re;
		Y[i1][0].Im = var4_Im;
	}
	for (int i2 = 0; i2 <= ((n-1)-1); i2 += 1) {
		for (int i3 = 0; i3 <= (n-1)-(1+i2); i3 += 1) {
			float var99_Re;
			float var96_Re;
			float var98_Im;
			float var96_Im;
			float var94_Im;
			float var95_Im;
			float var94_Re;
			float var95_Re;
			float var97_Im;
			float var99_Im;
			float var97_Re;
			float var98_Re;
			var94_Re = ((alpha_re*AT[i2][((1+i2)+i3)].Re)-
				(alpha_im*(-AT[i2][((1+i2)+i3)].Im)));
			var94_Im = ((alpha_im*AT[i2][((1+i2)+i3)].Re)+
				(alpha_re*(-AT[i2][((1+i2)+i3)].Im)));
			var95_Re = ((var94_Re*X[((i3+i2)+1)][0].Re)-
				(var94_Im*X[((i3+i2)+1)][0].Im));
			var95_Im = ((var94_Im*X[((i3+i2)+1)][0].Re)+
				(var94_Re*X[((i3+i2)+1)][0].Im));
			var96_Re = (Y[i2][0].Re+var95_Re);
			var96_Im = (Y[i2][0].Im+var95_Im);
			Y[i2][0].Re = var96_Re;
			Y[i2][0].Im = var96_Im;
			var97_Re = ((alpha_re*AT[i2][((1+i2)+i3)].Re)-
				(alpha_im*AT[i2][((1+i2)+i3)].Im));
			var97_Im = ((alpha_im*AT[i2][((1+i2)+i3)].Re)+
				(alpha_re*AT[i2][((1+i2)+i3)].Im));
			var98_Re = ((var97_Re*X[i2][0].Re)-
				(var97_Im*X[i2][0].Im));
			var98_Im = ((var97_Im*X[i2][0].Re)+
				(var97_Re*X[i2][0].Im));
			var99_Re = (Y[((i3+i2)+1)][0].Re+var98_Re);
			var99_Im = (Y[((i3+i2)+1)][0].Im+var98_Im);
			Y[((i3+i2)+1)][0].Re = var99_Re;
			Y[((i3+i2)+1)][0].Im = var99_Im;
		}
	}
#pragma endscop
}

/* Call chemv on a 37 x 37 matrix and vectors filled with values that
 * are simple functions of the indices and
 * print the real and imaginary part of each element of the result,
 * one element per line.
 */
int main()
{
	const int n = 37;
	const int incX = 1;
	const int incY = 1;
	const int ldAT = n;
	struct ComplexFloat AT[n][ldAT];
	struct ComplexFloat X[n][incX];
	struct ComplexFloat Y[n][incY];

	/* Initialize all the elements that are read by chemv. */
	for (int i = 0; i < n; i++) {
		X[i][0] = (struct ComplexFloat){i + 5, i * 2};
		Y[i][0] = (struct ComplexFloat){i * 3, i + 7};
		for (int j = 0; j < ldAT; j++) {
			AT[i][j] = (struct ComplexFloat){i + j, i + 3};
		}
	}

	chemv(n, 3.14f, 1.59f, ldAT, AT, incX, X, 2.71f, 8.28f, incY, Y);

	for (int i = 0; i < n; i++)
		printf("%0.2f %0.2f\n", Y[i][0].Re, Y[i][0].Im);

	return EXIT_SUCCESS;
}


================================================
FILE: src/get_submodules.sh
================================================
#!/bin/sh
# Fetch the git submodules of this directory as well as
# the imath submodule of the isl submodule.
# Meant to be run from the directory that contains this script.

# Stop at the first failing command instead of continuing
# with a partially initialized tree.
set -e

git submodule init
git submodule update
# Only run git inside isl if changing to that directory succeeded.
(cd isl && git submodule init imath && git submodule update imath)


================================================
FILE: src/grouping.c
================================================
/*
 * Copyright 2016      Sven Verdoolaege
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege.
 */

#include <isl/ctx.h>
#include <isl/id.h>
#include <isl/val.h>
#include <isl/space.h>
#include <isl/aff.h>
#include <isl/set.h>
#include <isl/map.h>
#include <isl/union_set.h>
#include <isl/union_map.h>
#include <isl/schedule.h>
#include <isl/schedule_node.h>

#include "grouping.h"
#include "schedule.h"

/* Internal data structure for use during the detection of statements
 * that can be grouped.
 *
 * "sc" contains the original schedule constraints (not a copy).
 * The validity constraints of "sc" are adjusted based on the groups
 * found so far.
 * "dep" contains the intersection of the validity and the proximity
 * constraints in "sc".  It may be NULL if it has not been computed yet.
 * "group_id" is the identifier for the next group that is extracted.
 *
 * "domain" is the set of statement instances that belong to any of the groups.
 * "contraction" maps the elements of "domain" to the corresponding group
 * instances.
 * "schedule" schedules the statements in each group relatively to each other.
 * These last three fields are NULL if no groups have been found so far.
 *
 * All fields except "sc" are released by ppcg_grouping_clear.
 */
struct ppcg_grouping {
	/* Not owned by this structure. */
	isl_schedule_constraints *sc;

	/* Computed on demand by ppcg_grouping_compute_dep. */
	isl_union_map *dep;
	int group_id;

	isl_union_set *domain;
	isl_union_pw_multi_aff *contraction;
	isl_schedule *schedule;
};

/* Clear all memory allocated by "grouping".
 *
 * The schedule constraints are not freed since
 * "grouping" only holds a reference to the original.
 * The structure itself is not freed either.
 */
static void ppcg_grouping_clear(struct ppcg_grouping *grouping)
{
	isl_schedule_free(grouping->schedule);
	isl_union_pw_multi_aff_free(grouping->contraction);
	isl_union_set_free(grouping->domain);
	isl_union_map_free(grouping->dep);
}

/* Compute the intersection of the proximity and validity dependences
 * in grouping->sc and store the result in grouping->dep, unless
 * this intersection has been computed before.
 */
static isl_stat ppcg_grouping_compute_dep(struct ppcg_grouping *grouping)
{
	isl_union_map *validity, *proximity;

	if (grouping->dep)
		return isl_stat_ok;

	validity = isl_schedule_constraints_get_validity(grouping->sc);
	proximity = isl_schedule_constraints_get_proximity(grouping->sc);
	grouping->dep = isl_union_map_intersect(validity, proximity);

	if (!grouping->dep)
		return isl_stat_error;

	return isl_stat_ok;
}

/* Information extracted from one or more consecutive leaves
 * in the input schedule.
 *
 * "list" contains the sets of statement instances in the leaves,
 * one element in the list for each original leaf.
 * "domain" contains the union of the sets in "list".
 * "prefix" contains the prefix schedule of these elements.
 *
 * Arrays of these structures are created by extract_leaves and
 * released by ppcg_grouping_leaf_free.
 */
struct ppcg_grouping_leaf {
	isl_union_set *domain;
	isl_union_set_list *list;
	isl_multi_union_pw_aff *prefix;
};

/* Free all memory allocated for "leaves".
 */
static void ppcg_grouping_leaf_free(int n, struct ppcg_grouping_leaf leaves[n])
{
	int i;

	if (!leaves)
		return;

	for (i = 0; i < n; ++i) {
		isl_union_set_free(leaves[i].domain);
		isl_union_set_list_free(leaves[i].list);
		isl_multi_union_pw_aff_free(leaves[i].prefix);
	}

	free(leaves);
}

/* Short-hand for retrieving the prefix schedule at "node"
 * in the form of an isl_multi_union_pw_aff.
 *
 * "node" is only inspected; the caller owns the result.
 */
static __isl_give isl_multi_union_pw_aff *get_prefix(
	__isl_keep isl_schedule_node *node)
{
	return isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node);
}

/* Return an array of "n" elements with information extracted from
 * the "n" children of "node" starting at "first", all of which
 * are known to be filtered leaves.
 *
 * For each such child, the domain and the prefix schedule are taken
 * at the leaf underneath the filter.
 *
 * Return NULL on error.  In particular, if any of the fields
 * of any of the elements could not be extracted, then the entire
 * array is freed again, such that the caller never gets to see
 * a partially initialized array.
 * On success, the caller is responsible for freeing the result
 * using ppcg_grouping_leaf_free.
 */
struct ppcg_grouping_leaf *extract_leaves(__isl_keep isl_schedule_node *node,
	int first, int n)
{
	int i;
	isl_ctx *ctx;
	struct ppcg_grouping_leaf *leaves;

	if (!node)
		return NULL;

	ctx = isl_schedule_node_get_ctx(node);
	leaves = isl_calloc_array(ctx, struct ppcg_grouping_leaf, n);
	if (!leaves)
		return NULL;

	for (i = 0; i < n; ++i) {
		isl_schedule_node *child;
		isl_union_set *domain;

		child = isl_schedule_node_get_child(node, first + i);
		child = isl_schedule_node_child(child, 0);
		domain = isl_schedule_node_get_domain(child);
		leaves[i].domain = isl_union_set_copy(domain);
		leaves[i].list = isl_union_set_list_from_union_set(domain);
		leaves[i].prefix = get_prefix(child);
		isl_schedule_node_free(child);

		/* The array was zero-initialized, so the entries that
		 * have not been filled in yet can safely be freed as well.
		 */
		if (!leaves[i].domain || !leaves[i].list ||
		    !leaves[i].prefix) {
			ppcg_grouping_leaf_free(n, leaves);
			return NULL;
		}
	}

	return leaves;
}

/* Internal data structure used by merge_leaves.
 *
 * "src" and "dst" point to the two consecutive leaves that are
 * under investigation for being merged.
 * "merge" is initially set to 0 and is set to 1 as soon as
 * it turns out that it is useful to merge the two leaves.
 *
 * The structure is passed as the "user" argument of check_merge.
 */
struct ppcg_merge_leaves_data {
	/* Set to 1 by check_merge when the leaves should be merged. */
	int merge;
	/* Leaf containing the sources of the constraints. */
	struct ppcg_grouping_leaf *src;
	/* Leaf containing the targets of the constraints. */
	struct ppcg_grouping_leaf *dst;
};

/* Does the relation "map" between instances of two statements A and B
 * involve all instances of A in the domain of "src" as well as
 * all instances of B in the domain of "dst"?
 *
 * That is, check that the instances of A in "src" form a subset of
 * the domain of "map" and, if so, that the instances of B in "dst"
 * form a subset of the range of "map".
 */
static isl_bool covers_src_and_dst(__isl_keep isl_map *map,
	struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst)
{
	isl_space *stmt_space;
	isl_set *instances, *related;
	isl_bool covered;

	/* First consider the source side of the relation. */
	stmt_space = isl_space_domain(isl_map_get_space(map));
	instances = isl_union_set_extract_set(src->domain, stmt_space);
	related = isl_map_domain(isl_map_copy(map));
	covered = isl_set_is_subset(instances, related);
	isl_set_free(instances);
	isl_set_free(related);
	if (covered != isl_bool_true)
		return covered;

	/* The sources are covered; now consider the target side. */
	stmt_space = isl_space_range(isl_map_get_space(map));
	instances = isl_union_set_extract_set(dst->domain, stmt_space);
	related = isl_map_range(isl_map_copy(map));
	covered = isl_set_is_subset(instances, related);
	isl_set_free(instances);
	isl_set_free(related);

	return covered;
}

/* Are the pairs of instances related by "map" (between two statements
 * A and B) executed together in the input schedule, i.e., do the prefix
 * schedules of "src" and "dst" assign the same value to both elements
 * of each pair?
 *
 * The prefix schedules of the two leaves are combined, the pairs
 * of "map" with equal prefix schedule value are selected and
 * "map" is then required to be a subset of this selection.
 */
static isl_bool matches_prefix(__isl_keep isl_map *map,
	struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst)
{
	isl_union_map *pairs, *same_prefix;
	isl_multi_union_pw_aff *combined;
	isl_bool preserved;

	combined = isl_multi_union_pw_aff_union_add(
			isl_multi_union_pw_aff_copy(src->prefix),
			isl_multi_union_pw_aff_copy(dst->prefix));

	pairs = isl_union_map_from_map(isl_map_copy(map));
	same_prefix = isl_union_map_eq_at_multi_union_pw_aff(
			isl_union_map_copy(pairs), combined);

	preserved = isl_union_map_is_subset(pairs, same_prefix);

	isl_union_map_free(same_prefix);
	isl_union_map_free(pairs);

	return preserved;
}

/* Callback for isl_union_map_foreach_map that decides whether
 * the two consecutive leaves in "user" (a ppcg_merge_leaves_data)
 * should be merged based on the validity and proximity
 * schedule constraints "map" between a statement in the first leaf and
 * a statement in the second leaf.
 *
 * The leaves are merged if "map" covers all instances of both
 * statements, if it is bijective and if each pair of related instances
 * is assigned the same value by the prefix schedules, meaning that
 * the two instances are executed consecutively in the input schedule.
 * In that case it is both possible (according to the input schedule)
 * and desirable (according to the schedule constraints) to execute
 * the related instances together.
 *
 * If the leaves should be merged, then data->merge is set and
 * isl_stat_error is returned to cut the iteration short.
 * The caller therefore needs to look at data->merge to tell
 * this situation apart from a genuine error.
 */
static isl_stat check_merge(__isl_take isl_map *map, void *user)
{
	struct ppcg_merge_leaves_data *data = user;
	isl_bool suitable;

	suitable = covers_src_and_dst(map, data->src, data->dst);
	if (suitable == isl_bool_true)
		suitable = isl_map_is_bijective(map);
	if (suitable == isl_bool_true)
		suitable = matches_prefix(map, data->src, data->dst);

	isl_map_free(map);

	if (suitable == isl_bool_false)
		return isl_stat_ok;
	if (suitable == isl_bool_true)
		data->merge = 1;

	return isl_stat_error;
}

/* Merge the leaves at position "pos" and "pos + 1" in "leaves".
 */
static isl_stat merge_pair(int n, struct ppcg_grouping_leaf leaves[n], int pos)
{
	int i;

	leaves[pos].domain = isl_union_set_union(leaves[pos].domain,
						leaves[pos + 1].domain);
	leaves[pos].list = isl_union_set_list_concat(leaves[pos].list,
						leaves[pos + 1].list);
	leaves[pos].prefix = isl_multi_union_pw_aff_union_add(
				leaves[pos].prefix, leaves[pos + 1].prefix);
	for (i = pos + 1; i + 1 < n; ++i)
		leaves[i] = leaves[i + 1];
	leaves[n - 1].domain = NULL;
	leaves[n - 1].list = NULL;
	leaves[n - 1].prefix = NULL;

	if (!leaves[pos].domain || !leaves[pos].list || !leaves[pos].prefix)
		return isl_stat_error;

	return isl_stat_ok;
}

/* Merge pairs of consecutive leaves in "leaves" taking into account
 * the intersection of validity and proximity schedule constraints "dep".
 *
 * If a leaf has been merged with the next leaf, then the combination
 * is checked again for merging with the next leaf.
 * That is, if the leaves are A, B and C, then B may not have been
 * merged with C, but after merging A and B, it could still be useful
 * to merge the combination AB with C.
 *
 * Two leaves A and B are merged if there are instances of at least
 * one pair of statements, one statement in A and one B, such that
 * the validity and proximity schedule constraints between them
 * make them suitable for merging according to check_merge.
 *
 * Return the final number of leaves in the sequence, or -1 on error.
 */
static int merge_leaves(int n, struct ppcg_grouping_leaf leaves[n],
	__isl_keep isl_union_map *dep)
{
	int i;
	struct ppcg_merge_leaves_data data;

	/* The leaves are visited from last to first. */
	for (i = n - 1; i >= 0; --i) {
		isl_union_map *dep_i;
		isl_stat ok;

		/* There is no next leaf to merge with. */
		if (i + 1 >= n)
			continue;

		/* Only consider constraints from leaf i to leaf i + 1. */
		dep_i = isl_union_map_copy(dep);
		dep_i = isl_union_map_intersect_domain(dep_i,
				isl_union_set_copy(leaves[i].domain));
		dep_i = isl_union_map_intersect_range(dep_i,
				isl_union_set_copy(leaves[i + 1].domain));
		data.merge = 0;
		data.src = &leaves[i];
		data.dst = &leaves[i + 1];
		ok = isl_union_map_foreach_map(dep_i, &check_merge, &data);
		isl_union_map_free(dep_i);
		/* check_merge also returns isl_stat_error to stop
		 * the iteration once it has decided to merge, so a negative
		 * result only signals a failure if data.merge is not set.
		 */
		if (ok < 0 && !data.merge)
			return -1;
		if (!data.merge)
			continue;
		if (merge_pair(n, leaves, i) < 0)
			return -1;
		--n;
		/* Cancel out the decrement of the loop such that
		 * position i is visited again with its new next leaf.
		 */
		++i;
	}

	return n;
}

/* Build a schedule over "domain" consisting of a single sequence node
 * that executes the elements of "list" one after the other.
 */
static __isl_give isl_schedule *schedule_from_domain_and_list(
	__isl_keep isl_union_set *domain, __isl_keep isl_union_set_list *list)
{
	isl_schedule *initial, *result;
	isl_schedule_node *pos;
	isl_union_set_list *filters;

	initial = isl_schedule_from_domain(isl_union_set_copy(domain));
	pos = isl_schedule_get_root(initial);
	isl_schedule_free(initial);

	/* Insert the sequence right underneath the domain node. */
	pos = isl_schedule_node_child(pos, 0);
	filters = isl_union_set_list_copy(list);
	pos = isl_schedule_node_insert_sequence(pos, filters);

	result = isl_schedule_node_get_schedule(pos);
	isl_schedule_node_free(pos);

	return result;
}

/* Construct an identifier for a new group in "grouping".
 *
 * Names of the form G_n are tried for increasing values of n,
 * starting at grouping->group_id, until one is found such that
 * "space" with this name as tuple identifier does not appear
 * in the domain of the original schedule constraints.
 * grouping->group_id is advanced past every value that was tried.
 *
 * "space" is only used for checking the candidate names and
 * is freed before returning.
 */
static isl_id *construct_group_id(struct ppcg_grouping *grouping,
	__isl_take isl_space *space)
{
	isl_ctx *ctx;
	isl_id *id;
	isl_bool unused;
	isl_union_set *domain;

	if (!space)
		return NULL;

	ctx = isl_space_get_ctx(space);
	domain = isl_schedule_constraints_get_domain(grouping->sc);

	for (;;) {
		char name[20];
		isl_id *candidate;
		isl_set *clash;

		snprintf(name, sizeof(name), "G_%d", grouping->group_id++);
		candidate = isl_id_alloc(ctx, name, NULL);
		space = isl_space_set_tuple_id(space, isl_dim_set, candidate);
		clash = isl_union_set_extract_set(domain,
						isl_space_copy(space));
		unused = isl_set_plain_is_empty(clash);
		isl_set_free(clash);
		/* Stop on the first unused name, or on error. */
		if (unused != isl_bool_false)
			break;
	}

	if (unused < 0)
		space = isl_space_free(space);

	id = isl_space_get_tuple_id(space, isl_dim_set);

	isl_union_set_free(domain);
	isl_space_free(space);

	return id;
}

/* Construct the contraction for a new group in "grouping"
 * from the prefix schedule "prefix" and the instance set "domain".
 *
 * The instances of the new group are the values of "prefix".
 * The group is given an identifier that conflicts neither with
 * earlier groups nor with the statements in the domain of the original
 * schedule constraints (see construct_group_id).
 * In the general case, "prefix" with this identifier as output tuple
 * is converted to an isl_union_pw_multi_aff.
 * A zero-dimensional "prefix" cannot be converted this way, so
 * in that case all elements of "domain" are mapped
 * to the single instance of the group instead.
 */
static isl_union_pw_multi_aff *group_contraction_from_prefix_and_domain(
	struct ppcg_grouping *grouping,
	__isl_keep isl_multi_union_pw_aff *prefix,
	__isl_keep isl_union_set *domain)
{
	isl_id *group;
	isl_space *space;
	isl_multi_val *zero;
	int n_out;

	space = isl_multi_union_pw_aff_get_space(prefix);
	if (!space)
		return NULL;
	n_out = isl_space_dim(space, isl_dim_set);
	group = construct_group_id(grouping, space);

	if (n_out != 0) {
		isl_multi_union_pw_aff *named;

		named = isl_multi_union_pw_aff_copy(prefix);
		named = isl_multi_union_pw_aff_set_tuple_id(named,
							isl_dim_out, group);
		return isl_union_pw_multi_aff_from_multi_union_pw_aff(named);
	}

	space = isl_multi_union_pw_aff_get_space(prefix);
	space = isl_space_set_tuple_id(space, isl_dim_set, group);
	zero = isl_multi_val_zero(space);

	return isl_union_pw_multi_aff_multi_val_on_domain(
					isl_union_set_copy(domain), zero);
}

/* Drop from "sc" the validity schedule constraints between statement
 * instances that end up in the same group instance under
 * the contraction described by "prefix" and "domain".
 *
 * Since the values of "prefix" serve as the instances of the group,
 * these are the validity constraints between instances with
 * the same prefix schedule value.
 * A zero-dimensional "prefix" carries no information about the domain,
 * so in that case the constraints between pairs of instances
 * in "domain" are dropped instead.
 */
static __isl_give isl_schedule_constraints *remove_group_validity(
	__isl_take isl_schedule_constraints *sc,
	__isl_keep isl_multi_union_pw_aff *prefix,
	__isl_keep isl_union_set *domain)
{
	isl_union_map *validity, *internal;

	validity = isl_schedule_constraints_get_validity(sc);
	internal = isl_union_map_copy(validity);

	if (isl_multi_union_pw_aff_dim(prefix, isl_dim_out) != 0) {
		internal = isl_union_map_eq_at_multi_union_pw_aff(internal,
					isl_multi_union_pw_aff_copy(prefix));
	} else {
		internal = isl_union_map_intersect_domain(internal,
						isl_union_set_copy(domain));
		internal = isl_union_map_intersect_range(internal,
						isl_union_set_copy(domain));
	}

	validity = isl_union_map_subtract(validity, internal);

	return isl_schedule_constraints_set_validity(sc, validity);
}

/* Extend "grouping" with groups corresponding to merged
 * leaves in the list of potentially merged leaves "leaves".
 *
 * The "list" field of each element in "leaves" contains a list
 * of the instances sets of the original leaves that have been
 * merged into this element.  If at least two of the original leaves
 * have been merged into a given element, then add the corresponding
 * group to "grouping" and remove validity schedule constraints
 * between statement instances that get mapped to the same group instance.
 * In particular, the domain is extended with the statement instances
 * of the merged leaves, the contraction is extended with a mapping
 * of these statement instances to instances of a new group and
 * the schedule is extended with a schedule that executes
 * the statement instances according to the order of the leaves
 * in which they appear.
 * Since the instances of the groups should already be scheduled apart
 * in the schedule into which this schedule will be plugged in,
 * the schedules of the individual groups are combined independently
 * of each other (as a set).
 */
static isl_stat add_groups(struct ppcg_grouping *grouping,
	int n, struct ppcg_grouping_leaf leaves[n])
{
	int i;

	for (i = 0; i < n; ++i) {
		int n_leaf;
		isl_schedule *schedule;
		isl_union_set *domain;
		isl_union_pw_multi_aff *upma;

		n_leaf = isl_union_set_list_n_union_set(leaves[i].list);
		if (n_leaf < 0)
			return isl_stat_error;
		if (n_leaf <= 1)
			continue;
		schedule = schedule_from_domain_and_list(leaves[i].domain,
							leaves[i].list);
		upma = group_contraction_from_prefix_and_domain(grouping,
					leaves[i].prefix, leaves[i].domain);
		grouping->sc = remove_group_validity(grouping->sc,
					leaves[i].prefix, leaves[i].domain);

		domain = isl_union_set_copy(leaves[i].domain);
		if (grouping->domain) {
			domain = isl_union_set_union(domain, grouping->domain);
			upma = isl_union_pw_multi_aff_union_add(upma,
						grouping->contraction);
			schedule = isl_schedule_set(schedule,
						grouping->schedule);
		}
		grouping->domain = domain;
		grouping->contraction = upma;
		grouping->schedule = schedule;

		if (!grouping->domain || !grouping->contraction ||
		    !grouping->schedule)
			return isl_stat_error;
	}

	return isl_stat_ok;
}

/* Look for any pairs of consecutive leaves among the "n" children of "node"
 * starting at "first" that should be merged together.
 * Store the results in "grouping".
 *
 * First make sure the intersection of validity and proximity
 * schedule constraints is available and extract the required
 * information from the "n" leaves.
 * Then try and merge consecutive leaves based on the validity
 * and proximity constraints.
 * If any pairs were successfully merged, then add groups
 * corresponding to the merged leaves to "grouping".
 *
 * Return isl_stat_error if any of these steps fails, including
 * the merging itself.  The extracted leaf information is released
 * on every path, using the original number of leaves since
 * merge_pair clears the entries that are no longer in use.
 */
static isl_stat group_subsequence(__isl_keep isl_schedule_node *node,
	int first, int n, struct ppcg_grouping *grouping)
{
	int n_merge;
	isl_stat r = isl_stat_ok;
	struct ppcg_grouping_leaf *leaves;

	if (ppcg_grouping_compute_dep(grouping) < 0)
		return isl_stat_error;

	leaves = extract_leaves(node, first, n);
	if (!leaves)
		return isl_stat_error;

	n_merge = merge_leaves(n, leaves, grouping->dep);
	if (n_merge < 0)
		r = isl_stat_error;
	else if (n_merge < n)
		r = add_groups(grouping, n_merge, leaves);

	ppcg_grouping_leaf_free(n, leaves);

	return r;
}

/* Callback for isl_schedule_foreach_schedule_node_top_down that
 * looks for consecutive leaves that should be merged underneath
 * sequence nodes and records the results in the ppcg_grouping
 * passed through "user".
 *
 * Nodes that are not sequences are ignored.
 * For a sequence node, group_subsequence is called on each
 * maximal run of consecutive children that are (filtered) leaves.
 */
static isl_bool detect_groups(__isl_keep isl_schedule_node *node, void *user)
{
	struct ppcg_grouping *grouping = user;
	int pos, n_child;
	int start = -1;

	if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence)
		return isl_bool_true;

	n_child = isl_schedule_node_n_children(node);
	if (n_child < 0)
		return isl_bool_error;

	for (pos = 0; pos < n_child; ++pos) {
		isl_schedule_node *grandchild;
		int is_leaf;

		/* Look at the node underneath the filter. */
		grandchild = isl_schedule_node_get_child(node, pos);
		grandchild = isl_schedule_node_child(grandchild, 0);
		is_leaf = isl_schedule_node_get_type(grandchild) ==
						isl_schedule_node_leaf;
		isl_schedule_node_free(grandchild);

		if (is_leaf) {
			/* Start a new run unless one is in progress. */
			if (start < 0)
				start = pos;
			continue;
		}
		if (start < 0)
			continue;
		/* A non-leaf ends the current run. */
		if (group_subsequence(node, start, pos - start, grouping) < 0)
			return isl_bool_error;
		start = -1;
	}
	/* Handle a run that extends up to the last child. */
	if (start >= 0 &&
	    group_subsequence(node, start, n_child - start, grouping) < 0)
		return isl_bool_error;

	return isl_bool_true;
}

/* Complete "grouping" to cover all statement instances in the domain
 * of grouping->sc.
 *
 * In particular, grouping->domain is set to the full set of statement
 * instances; grouping->contraction is extended with an identity
 * contraction on the additional instances and grouping->schedule
 * is extended with an independent schedule on those additional instances.
 * In the extension of grouping->contraction, the additional instances
 * are split into those that belong to different statements and those
 * that belong to some of the same statements.  The first group
 * is replaced by its universe in order to simplify the contraction extension.
 */
static void complete_grouping(struct ppcg_grouping *grouping)
{
	isl_union_set *domain, *left, *overlap;
	isl_union_pw_multi_aff *upma;
	isl_schedule *schedule;

	/* "left" collects the instances that are not part of any group. */
	domain = isl_schedule_constraints_get_domain(grouping->sc);
	left = isl_union_set_subtract(isl_union_set_copy(domain),
				    isl_union_set_copy(grouping->domain));
	/* Schedule the remaining instances independently of the groups. */
	schedule = isl_schedule_from_domain(isl_union_set_copy(left));
	schedule = isl_schedule_set(schedule, grouping->schedule);
	grouping->schedule = schedule;

	/* The old grouping->domain is consumed here, after which
	 * the field is replaced by the full domain.
	 */
	overlap = isl_union_set_universe(grouping->domain);
	grouping->domain = domain;
	/* "overlap" holds the remaining instances of statements that
	 * also have instances in some group; these are kept as is.
	 */
	overlap = isl_union_set_intersect(isl_union_set_copy(left), overlap);
	/* The other remaining instances are replaced by their universe. */
	left = isl_union_set_subtract(left, isl_union_set_copy(overlap));
	left = isl_union_set_universe(left);
	left = isl_union_set_union(left, overlap);
	/* Map the remaining instances to themselves. */
	upma = isl_union_set_identity_union_pw_multi_aff(left);
	upma = isl_union_pw_multi_aff_union_add(upma, grouping->contraction);
	grouping->contraction = upma;
}

/* Print a message on stdout stating that the grouping described
 * by "contraction" is used during scheduling.
 *
 * Nothing is printed unless the verbose debug option is set.
 */
static void report_grouping(__isl_keep isl_union_pw_multi_aff *contraction,
	struct ppcg_options *options)
{
	isl_printer *out;

	if (options->debug->verbose) {
		out = isl_printer_to_file(
			isl_union_pw_multi_aff_get_ctx(contraction), stdout);
		out = isl_printer_print_str(out,
			"Scheduling performed with grouping ");
		out = isl_printer_print_union_pw_multi_aff(out, contraction);
		out = isl_printer_print_str(out,
			" (use --no-group-chains to disable)");
		out = isl_printer_end_line(out);
		isl_printer_free(out);
	}
}

/* Compute a schedule on the domain of "sc" that respects the schedule
 * constraints in "sc", after trying to combine groups of statements.
 *
 * "schedule" is a known correct schedule that is used while combining
 * groups of statements.
 * In particular, statements that are executed consecutively in a sequence
 * in this schedule and where all instances of the second depend on
 * the instance of the first that is executed in the same iteration
 * of outer band nodes are grouped together into a single statement.
 * The schedule constraints are then mapped to these groups of statements
 * and the resulting schedule is expanded again to refer to the original
 * statements.
 *
 * If no groups are found, then the schedule is computed directly
 * from the schedule constraints.
 *
 * Ownership of "sc" is handed over to "grouping" on entry.
 * While groups are being detected, grouping.sc may get updated and,
 * on failure, it may get freed and set to NULL.  The error path
 * therefore releases grouping.sc rather than the original pointer,
 * which may no longer be valid at that point.
 */
__isl_give isl_schedule *ppcg_compute_grouping_schedule(
	__isl_take isl_schedule_constraints *sc,
	__isl_keep isl_schedule *schedule, struct ppcg_options *options)
{
	struct ppcg_grouping grouping = { sc };
	isl_union_pw_multi_aff *contraction;
	isl_union_map *umap;
	isl_schedule *res, *expansion;

	grouping.group_id = 0;
	if (isl_schedule_foreach_schedule_node_top_down(schedule,
			&detect_groups, &grouping) < 0)
		goto error;
	if (!grouping.contraction) {
		ppcg_grouping_clear(&grouping);
		return ppcg_compute_non_grouping_schedule(grouping.sc, options);
	}
	complete_grouping(&grouping);
	contraction = isl_union_pw_multi_aff_copy(grouping.contraction);
	report_grouping(contraction, options);
	umap = isl_union_map_from_union_pw_multi_aff(contraction);

	/* Express the schedule constraints in terms of the groups. */
	sc = isl_schedule_constraints_apply(grouping.sc, umap);
	grouping.sc = NULL;

	res = ppcg_compute_non_grouping_schedule(sc, options);

	/* Expand the groups in the result to the original statements. */
	contraction = isl_union_pw_multi_aff_copy(grouping.contraction);
	expansion = isl_schedule_copy(grouping.schedule);
	res = isl_schedule_expand(res, contraction, expansion);

	ppcg_grouping_clear(&grouping);
	return res;
error:
	ppcg_grouping_clear(&grouping);
	isl_schedule_constraints_free(grouping.sc);
	return NULL;
}


================================================
FILE: src/grouping.h
================================================
#ifndef PPCG_GROUPING_H
#define PPCG_GROUPING_H

#include <isl/schedule.h>

#include "ppcg_options.h"

/* Compute a schedule that respects the schedule constraints "sc",
 * after trying to combine groups of statements based on
 * the known correct schedule "schedule".
 */
__isl_give isl_schedule *ppcg_compute_grouping_schedule(
		__isl_take isl_schedule_constraints *sc,
		__isl_keep isl_schedule *schedule, struct ppcg_options *options);

#endif


================================================
FILE: src/hybrid.c
================================================
/*
 * Copyright 2013      Ecole Normale Superieure
 * Copyright 2015      Sven Verdoolaege
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege,
 * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
 */

#include <string.h>

#include <isl/space.h>
#include <isl/constraint.h>
#include <isl/val.h>
#include <isl/aff.h>
#include <isl/set.h>
#include <isl/map.h>
#include <isl/union_set.h>
#include <isl/union_map.h>

#include "hybrid.h"
#include "schedule.h"

/* The hybrid tiling implemented in this file is based on
 * Grosser et al., "Hybrid Hexagonal/Classical Tiling for GPUs".
 */

/* Bounds on relative dependence distances in input to hybrid tiling.
 * upper is an upper bound on the relative dependence distances
 * in the first space dimension
 * -lower is a lower bound on the relative dependence distances
 * in all space dimensions.
 *
 * In particular,
 *
 *	d_i >= -lower_i d_0
 * and
 *	d_1 <= upper d_0
 *
 * for each dependence distance vector d, where d_1 is the component
 * corresponding to the first space dimension.
 *
 * upper and lower are always non-negative.
 * Some of the values may be NaN if no bound could be found.
 */
struct ppcg_ht_bounds {
	/* Upper bound for the first space dimension (NaN if unknown). */
	isl_val *upper;
	/* Lower bounds, one per space dimension (NaN where unknown). */
	isl_multi_val *lower;
};

/* Release "bounds" together with the bounds it contains.
 * Always return NULL.
 */
__isl_null ppcg_ht_bounds *ppcg_ht_bounds_free(
	__isl_take ppcg_ht_bounds *bounds)
{
	if (bounds) {
		isl_multi_val_free(bounds->lower);
		isl_val_free(bounds->upper);
		free(bounds);
	}

	return NULL;
}

/* Allocate a ppcg_ht_bounds object for a band living in "space",
 * with the upper bound and all lower bounds set to NaN.
 */
__isl_give ppcg_ht_bounds *ppcg_ht_bounds_alloc(__isl_take isl_space *space)
{
	int pos, n_dim;
	isl_ctx *ctx;
	ppcg_ht_bounds *bounds;

	if (!space)
		return NULL;

	ctx = isl_space_get_ctx(space);
	bounds = isl_alloc_type(ctx, struct ppcg_ht_bounds);
	if (!bounds) {
		isl_space_free(space);
		return NULL;
	}

	bounds->upper = isl_val_nan(ctx);
	bounds->lower = isl_multi_val_zero(space);
	n_dim = isl_multi_val_dim(bounds->lower, isl_dim_set);
	/* Replace each of the zero elements by NaN. */
	for (pos = 0; pos < n_dim; ++pos)
		bounds->lower = isl_multi_val_set_val(bounds->lower, pos,
						isl_val_copy(bounds->upper));

	if (bounds->lower && bounds->upper)
		return bounds;

	return ppcg_ht_bounds_free(bounds);
}

/* Print the lower and upper bounds in "bounds" to stderr.
 * Nothing is printed if "bounds" is NULL.
 */
void ppcg_ht_bounds_dump(__isl_keep ppcg_ht_bounds *bounds)
{
	if (bounds) {
		fprintf(stderr, "lower: ");
		isl_multi_val_dump(bounds->lower);
		fprintf(stderr, "upper: ");
		isl_val_dump(bounds->upper);
	}
}

/* Return a copy of the upper bound on the relative dependence
 * distances in the first space dimension, or NULL if "bounds" is NULL.
 */
__isl_give isl_val *ppcg_ht_bounds_get_upper(__isl_keep ppcg_ht_bounds *bounds)
{
	return bounds ? isl_val_copy(bounds->upper) : NULL;
}

/* Store "upper" in "bounds" as the upper bound on the relative
 * dependence distances in the first space dimension,
 * releasing the previous value.
 *
 * If either argument is NULL, then both are freed and NULL is returned.
 */
__isl_give ppcg_ht_bounds *ppcg_ht_bounds_set_upper(
	__isl_take ppcg_ht_bounds *bounds, __isl_take isl_val *upper)
{
	if (bounds && upper) {
		isl_val_free(bounds->upper);
		bounds->upper = upper;
		return bounds;
	}

	ppcg_ht_bounds_free(bounds);
	isl_val_free(upper);
	return NULL;
}

/* Return the lower bound on the relative dependence distances
 * in space dimension "pos", or NULL if "bounds" is NULL.
 */
__isl_give isl_val *ppcg_ht_bounds_get_lower(__isl_keep ppcg_ht_bounds *bounds,
	int pos)
{
	return bounds ? isl_multi_val_get_val(bounds->lower, pos) : NULL;
}

/* Store "lower" in "bounds" as the lower bound on the relative
 * dependence distances in space dimension "pos".
 *
 * If either "bounds" or "lower" is NULL, then both are freed and
 * NULL is returned.  "bounds" is also freed if the update fails.
 */
__isl_give ppcg_ht_bounds *ppcg_ht_bounds_set_lower(
	__isl_take ppcg_ht_bounds *bounds, int pos, __isl_take isl_val *lower)
{
	if (!bounds || !lower) {
		ppcg_ht_bounds_free(bounds);
		isl_val_free(lower);
		return NULL;
	}

	bounds->lower = isl_multi_val_set_val(bounds->lower, pos, lower);
	if (bounds->lower)
		return bounds;

	return ppcg_ht_bounds_free(bounds);
}

/* Can the bounds on relative dependence distances recorded in "bounds"
 * be used to perform hybrid tiling?
 * In particular, have appropriate lower and upper bounds been found?
 * Any NaN indicates that no corresponding bound was found.
 *
 * Return isl_bool_error if "bounds" is NULL or if any of the bounds
 * cannot be inspected.
 */
isl_bool ppcg_ht_bounds_is_valid(__isl_keep ppcg_ht_bounds *bounds)
{
	isl_bool is_nan;
	int i, n;

	if (!bounds)
		return isl_bool_error;
	is_nan = isl_val_is_nan(bounds->upper);
	if (is_nan < 0)
		return isl_bool_error;
	if (is_nan)
		return isl_bool_false;

	n = isl_multi_val_dim(bounds->lower, isl_dim_set);
	for (i = 0; i < n; ++i) {
		isl_val *v;

		v = isl_multi_val_get_val(bounds->lower, i);
		is_nan = isl_val_is_nan(v);
		/* Drop the reference before any early return,
		 * such that it is not leaked when a NaN is found.
		 */
		isl_val_free(v);
		if (is_nan < 0)
			return isl_bool_error;
		if (is_nan)
			return isl_bool_false;
	}

	return isl_bool_true;
}

/* Structure that represents the basic hexagonal tiling,
 * along with information that is needed to perform the hybrid tiling.
 *
 * "ref" is the number of references to this object;
 * see ppcg_ht_tiling_copy and ppcg_ht_tiling_free.
 *
 * "bounds" are the bounds on the dependence distances that
 * define the hexagonal shape and the required skewing in the remaining
 * space dimensions.
 *
 * "input_node" points to the input pair of band nodes.
 * "input_schedule" is the partial schedule of this input pair of band nodes.
 * The space of this schedule is [P -> C], where P is the space
 * of the parent node and C is the space of the child node.
 *
 * "space_sizes" represent the total size of a tile for the space
 * dimensions, i.e., those corresponding to the child node.
 * The space of "space_sizes" is C.
 * If S_0 is the original tile size in the first space dimension,
 * then the first entry of "space_sizes" is equal to
 * W = 2*S_0 + floor(d_l h) + floor(d_u h).
 * The remaining entries are the same as in the original tile sizes.
 *
 * The basic hexagonal tiling "hex" is defined
 * in a "ts" (time-space) space and corresponds to the phase-1 tiles.
 * "time_tile" maps the "ts" space to outer time tile.
 * It is equal to ts[t, s] -> floor(t/(2 * S_t)), with S_t the original tile
 * size corresponding to the parent node.
 * "local_time" maps the "ts" space to the time dimension inside each tile.
 * It is equal to ts[t, s] -> t mod (2 S_t), with S_t the original tile
 * size corresponding to the parent node.
 * "shift_space" shifts the tiles at time tile T = floor(t/(2 S_t))
 * in the space dimension such that they align to a multiple of W.
 * It is equal to ts[t, s] -> s + (-(2 * shift_s)*T) % W,
 * with shift_s = S_0 + floor(d_u h).
 * "shift_phase" is the shift taken to go from phase 0 to phase 1.
 * It is equal to ts[t, s] -> ts[t + S_t, s + shift_s],
 * with shift_s = S_0 + floor(d_u h).
 *
 * "project_ts" projects the space of the input schedule to the ts-space.
 * It is equal to [P[t] -> C[s_0, ...]] -> ts[t, s_0].
 */
struct ppcg_ht_tiling {
	int ref;

	ppcg_ht_bounds *bounds;
	isl_schedule_node *input_node;
	isl_multi_union_pw_aff *input_schedule;

	isl_multi_val *space_sizes;

	isl_aff *time_tile;
	isl_aff *local_time;
	isl_aff *shift_space;
	isl_multi_aff *shift_phase;
	isl_set *hex;

	isl_multi_aff *project_ts;
};
typedef struct ppcg_ht_tiling ppcg_ht_tiling;

/* Return the space [P -> C] of the pair of band nodes that form
 * the input to the hybrid tiling "tile", where P is the space
 * of the parent node and C is the space of the child node.
 * This is the space of the input schedule stored in "tile".
 */
__isl_give isl_space *ppcg_ht_tiling_get_input_space(
	__isl_keep ppcg_ht_tiling *tile)
{
	return tile ? isl_multi_union_pw_aff_get_space(tile->input_schedule) :
			NULL;
}

/* Drop one reference to "tiling".
 * Once the last reference is gone, "tiling" is freed
 * along with all its fields.
 * Always return NULL.
 */
static __isl_null ppcg_ht_tiling *ppcg_ht_tiling_free(
	__isl_take ppcg_ht_tiling *tiling)
{
	if (!tiling)
		return NULL;

	tiling->ref--;
	if (tiling->ref > 0)
		return NULL;

	isl_multi_aff_free(tiling->project_ts);
	isl_set_free(tiling->hex);
	isl_multi_aff_free(tiling->shift_phase);
	isl_aff_free(tiling->shift_space);
	isl_aff_free(tiling->local_time);
	isl_aff_free(tiling->time_tile);
	isl_multi_val_free(tiling->space_sizes);
	isl_multi_union_pw_aff_free(tiling->input_schedule);
	isl_schedule_node_free(tiling->input_node);
	ppcg_ht_bounds_free(tiling->bounds);
	free(tiling);

	return NULL;
}

/* Take an additional reference to "tiling" and return it.
 * NULL is passed through unchanged.
 */
__isl_give ppcg_ht_tiling *ppcg_ht_tiling_copy(
	__isl_keep ppcg_ht_tiling *tiling)
{
	if (tiling)
		tiling->ref++;

	return tiling;
}

/* Return the isl_ctx of the input schedule of "tiling",
 * or NULL if "tiling" is NULL.
 */
isl_ctx *ppcg_ht_tiling_get_ctx(__isl_keep ppcg_ht_tiling *tiling)
{
	return tiling ?
		isl_multi_union_pw_aff_get_ctx(tiling->input_schedule) : NULL;
}

/* Representation of one of the two phases of hybrid tiling.
 *
 * "tiling" points to the shared tiling data.
 *
 * "time_tile", "local_time" and "shift_space" are equal to the corresponding
 * fields of "tiling", pulled back to the input space.
 * In case of phase 0, these expressions have also been moved
 * from phase 1 to phase 0.
 *
 * "domain" contains the hexagonal tiling of this phase.
 *
 * "space_shift" is the shift that should be added to the space band
 * in order to be able to apply rectangular tiling to the space.
 * For phase 1, it is equal to
 *
 *	[P[t] -> C[s_0, s_i]] -> C[(-(2 * shift_s)*T) % W, dl_i * u]
 *
 * with shift_s = S_0 + floor(d_u h),
 * T equal to "time_tile" and u equal to "local_time".
 * For phase 0, it is equal to
 *
 *	[P[t] -> C[s_0, s_i]] -> C[shift_s + (-(2 * shift_s)*T) % W, dl_i * u]
 *
 * "space_tile" is the space tiling.  It is equal to
 *
 *	[P[t] -> C[s]] -> C[floor((s + space_shift)/space_size)]
 */
struct ppcg_ht_phase {
	/* Reference to the shared tiling, dropped in ppcg_ht_phase_free. */
	ppcg_ht_tiling *tiling;

	isl_aff *time_tile;
	isl_aff *local_time;
	isl_aff *shift_space;
	isl_set *domain;

	isl_multi_aff *space_shift;
	isl_multi_aff *space_tile;
};

/* Release "phase" together with all its fields, including
 * its reference to the shared tiling.
 * Always return NULL.
 */
static __isl_null ppcg_ht_phase *ppcg_ht_phase_free(
	__isl_take ppcg_ht_phase *phase)
{
	if (phase) {
		isl_multi_aff_free(phase->space_tile);
		isl_multi_aff_free(phase->space_shift);
		isl_set_free(phase->domain);
		isl_aff_free(phase->shift_space);
		isl_aff_free(phase->local_time);
		isl_aff_free(phase->time_tile);
		ppcg_ht_tiling_free(phase->tiling);
		free(phase);
	}

	return NULL;
}

/* Free the ppcg_ht_phase object that "user" points to.
 *
 * This wrapper around ppcg_ht_phase_free has the signature expected
 * by isl_id_set_free_user.
 */
static void ppcg_ht_phase_free_wrap(void *user)
{
	ppcg_ht_phase_free(user);
}

/* Return a copy of the domain of hybrid tiling phase "phase",
 * or NULL if "phase" is NULL.
 */
static __isl_give isl_set *ppcg_ht_phase_get_domain(ppcg_ht_phase *phase)
{
	return phase ? isl_set_copy(phase->domain) : NULL;
}

/* Return the space [P -> C] of the pair of band nodes that form
 * the input to the hybrid tiling that "phase" belongs to,
 * where P is the space of the parent node and C is the space
 * of the child node.
 * The space is obtained from the tiling shared by the phases.
 */
static __isl_give isl_space *ppcg_ht_phase_get_input_space(
	__isl_keep ppcg_ht_phase *phase)
{
	return phase ? ppcg_ht_tiling_get_input_space(phase->tiling) : NULL;
}

/* Construct the lower left constraint of the hexagonal tile
 * over the local space "ls", i.e.,
 *
 *	du a - b <= (2h+1) du - duh
 *
 * which is encoded as the inequality
 *
 *	-du a + b + (2h+1) du - duh >= 0
 *
 * where duh = floor(du * h).
 *
 * This constraint corresponds to (6) in
 * "Hybrid Hexagonal/Classical Tiling for GPUs".
 */
static __isl_give isl_constraint *hex_lower_left(__isl_take isl_local_space *ls,
	__isl_keep isl_val *h, __isl_keep isl_val *du, __isl_keep isl_val *duh)
{
	isl_val *cst, *coef_a;
	isl_aff *expr;

	/* cst = (2h+1) du - duh */
	cst = isl_val_mul_ui(isl_val_copy(h), 2);
	cst = isl_val_add_ui(cst, 1);
	cst = isl_val_mul(cst, isl_val_copy(du));
	cst = isl_val_sub(cst, isl_val_copy(duh));
	coef_a = isl_val_neg(isl_val_copy(du));

	expr = isl_aff_val_on_domain(ls, cst);
	expr = isl_aff_set_coefficient_val(expr, isl_dim_in, 0, coef_a);
	expr = isl_aff_set_coefficient_si(expr, isl_dim_in, 1, 1);

	return isl_inequality_from_aff(expr);
}

/* Construct the lower constraint of the hexagonal tile
 * over the local space "ls", i.e.,
 *
 *	a <= 2h+1
 *
 * which is encoded as the inequality
 *
 *	-a + 2h+1 >= 0
 *
 * This constraint corresponds to (7) in
 * "Hybrid Hexagonal/Classical Tiling for GPUs".
 */
static __isl_give isl_constraint *hex_lower(__isl_take isl_local_space *ls,
	__isl_keep isl_val *h)
{
	isl_val *bound;
	isl_aff *expr;

	/* bound = 2h+1 */
	bound = isl_val_mul_ui(isl_val_copy(h), 2);
	bound = isl_val_add_ui(bound, 1);

	expr = isl_aff_val_on_domain(ls, bound);
	expr = isl_aff_set_coefficient_si(expr, isl_dim_in, 0, -1);

	return isl_inequality_from_aff(expr);
}

/* Construct the lower right constraint of the hexagonal tile
 * over the local space "ls", i.e.,
 *
 *	dl a + b <= (2h+1) dl + duh + (s0-1)
 *
 * which is encoded as the inequality
 *
 *	-dl a - b + (2h+1) dl + duh + (s0-1) >= 0
 *
 * where duh = floor(du * h).
 *
 * This constraint corresponds to (8) in
 * "Hybrid Hexagonal/Classical Tiling for GPUs".
 */
static __isl_give isl_constraint *hex_lower_right(
	__isl_take isl_local_space *ls, __isl_keep isl_val *h,
	__isl_keep isl_val *s0, __isl_keep isl_val *dl, __isl_keep isl_val *duh)
{
	isl_val *cst, *coef_a;
	isl_aff *expr;

	/* cst = (2h+1) dl + duh + s0 - 1 */
	cst = isl_val_mul_ui(isl_val_copy(h), 2);
	cst = isl_val_add_ui(cst, 1);
	cst = isl_val_mul(cst, isl_val_copy(dl));
	cst = isl_val_add(cst, isl_val_copy(duh));
	cst = isl_val_add(cst, isl_val_copy(s0));
	cst = isl_val_sub_ui(cst, 1);
	coef_a = isl_val_neg(isl_val_copy(dl));

	expr = isl_aff_val_on_domain(ls, cst);
	expr = isl_aff_set_coefficient_val(expr, isl_dim_in, 0, coef_a);
	expr = isl_aff_set_coefficient_si(expr, isl_dim_in, 1, -1);

	return isl_inequality_from_aff(expr);
}

/* Construct the upper left constraint of the hexagonal tile
 * over the local space "ls", i.e.,
 *
 *	dl a + b >= h dl - (d - 1)/d				with d = den(dl)
 *
 * which is encoded as the inequality
 *
 *	dl a + b - h dl + (d - 1)/d >= 0
 *
 * This constraint corresponds to (10) in
 * "Hybrid Hexagonal/Classical Tiling for GPUs".
 */
static __isl_give isl_constraint *hex_upper_left(__isl_take isl_local_space *ls,
	__isl_keep isl_val *h, __isl_keep isl_val *dl)
{
	isl_val *den, *cst, *h_dl;
	isl_aff *expr;

	/* cst = (d - 1)/d - h dl */
	den = isl_val_get_den_val(dl);
	cst = isl_val_sub_ui(isl_val_copy(den), 1);
	cst = isl_val_div(cst, den);
	h_dl = isl_val_mul(isl_val_copy(h), isl_val_copy(dl));
	cst = isl_val_sub(cst, h_dl);

	expr = isl_aff_val_on_domain(ls, cst);
	expr = isl_aff_set_coefficient_val(expr, isl_dim_in, 0,
						isl_val_copy(dl));
	expr = isl_aff_set_coefficient_si(expr, isl_dim_in, 1, 1);

	return isl_inequality_from_aff(expr);
}

/* Construct the upper right constraint of the hexagonal tile
 * over the local space "ls", i.e.,
 *
 *	du a - b >= du h - duh - (s0-1) - dlh - (d - 1)/d	with d = den(du)
 *
 * which is encoded as the inequality
 *
 *	du a - b - du h + duh + (s0-1) + dlh + (d - 1)/d >= 0
 *
 * where dlh = floor(dl * h) and duh = floor(du * h).
 *
 * This constraint corresponds to (12) in
 * "Hybrid Hexagonal/Classical Tiling for GPUs".
 */
static __isl_give isl_constraint *hex_upper_right(
	__isl_take isl_local_space *ls, __isl_keep isl_val *h,
	__isl_keep isl_val *s0, __isl_keep isl_val *du,
	__isl_keep isl_val *dlh, __isl_keep isl_val *duh)
{
	isl_val *den, *cst, *h_du;
	isl_aff *expr;

	/* cst = (d - 1)/d - h du + duh + dlh + s0 - 1 */
	den = isl_val_get_den_val(du);
	cst = isl_val_sub_ui(isl_val_copy(den), 1);
	cst = isl_val_div(cst, den);
	h_du = isl_val_mul(isl_val_copy(h), isl_val_copy(du));
	cst = isl_val_sub(cst, h_du);
	cst = isl_val_add(cst, isl_val_copy(duh));
	cst = isl_val_add(cst, isl_val_copy(dlh));
	cst = isl_val_add(cst, isl_val_copy(s0));
	cst = isl_val_sub_ui(cst, 1);

	expr = isl_aff_val_on_domain(ls, cst);
	expr = isl_aff_set_coefficient_val(expr, isl_dim_in, 0,
						isl_val_copy(du));
	expr = isl_aff_set_coefficient_si(expr, isl_dim_in, 1, -1);

	return isl_inequality_from_aff(expr);
}

/* Construct the upper constraint of the hexagonal tile
 * over the local space "ls", i.e.,
 *
 *	a >= 0
 *
 * This constraint corresponds to (13) in
 * "Hybrid Hexagonal/Classical Tiling for GPUs".
 */
static __isl_give isl_constraint *hex_upper(__isl_take isl_local_space *ls)
{
	return isl_inequality_from_aff(
			isl_aff_var_on_domain(ls, isl_dim_set, 0));
}

/* Construct the basic hexagonal tile shape as the intersection
 * of the six hex_* constraints.
 * "space" is the 2D space in which the hexagon should be constructed.
 * h is st-1, with st the tile size in the time dimension
 * s0 is the tile size in the space dimension
 * dl is a bound on the negative relative dependence distances, i.e.,
 *
 *	d_s >= -dl d_t
 *
 * du is a bound on the positive relative dependence distances, i.e.,
 *
 *	d_s <= du d_t
 *
 * with (d_t,d_s) any dependence distance vector.
 * dlh = floor(dl * h)
 * duh = floor(du * h)
 *
 * The shape of the hexagon is as follows:
 *
 *		0 dlh   dlh+s0-1
 *		   ______                __
 * 0		  /      \_             /
 *		 /         \_          /
 * h		/            \ ______ /
 * h+1		\_           //      \\_
 *		  \_        //         \\_
 * 2h+1		    \______//            \\
 *		0   duh   duh+s0-1
 *		             duh+s0-1+dlh
 *		                  duh+s0-1+dlh+1+s0+1
 *
 * The next hexagon is shifted by duh + dlh + 2 * s0.
 *
 * The slope of the "/" constraints is dl.
 * The slope of the "\_" constraints is du.
 */
static __isl_give isl_set *compute_hexagon(__isl_take isl_space *space,
	__isl_keep isl_val *h, __isl_keep isl_val *s0,
	__isl_keep isl_val *dl, __isl_keep isl_val *du,
	__isl_keep isl_val *dlh, __isl_keep isl_val *duh)
{
	isl_local_space *ls;
	isl_basic_set *hexagon;

	ls = isl_local_space_from_space(space);

	/* The three lower constraints. */
	hexagon = isl_basic_set_from_constraint(
		hex_lower_left(isl_local_space_copy(ls), h, du, duh));
	hexagon = isl_basic_set_add_constraint(hexagon,
		hex_lower(isl_local_space_copy(ls), h));
	hexagon = isl_basic_set_add_constraint(hexagon,
		hex_lower_right(isl_local_space_copy(ls), h, s0, dl, duh));

	/* The three upper constraints; the last one consumes "ls". */
	hexagon = isl_basic_set_add_constraint(hexagon,
		hex_upper_left(isl_local_space_copy(ls), h, dl));
	hexagon = isl_basic_set_add_constraint(hexagon,
		hex_upper_right(isl_local_space_copy(ls), h, s0, du, dlh, duh));
	hexagon = isl_basic_set_add_constraint(hexagon, hex_upper(ls));

	return isl_set_from_basic_set(hexagon);
}

/* Tuple name of the ts-space.
 */
static const char *ts_space_name = "ts";

/* Allocate a two-dimensional, parameter-free set space and
 * name it "ts", resulting in the space ts[t, s].
 */
static __isl_give isl_space *construct_ts_space(isl_ctx *ctx)
{
	return isl_space_set_tuple_name(isl_space_set_alloc(ctx, 0, 2),
					isl_dim_set, ts_space_name);
}

/* Tuple name of the local ts-space.
 */
static const char *local_ts_space_name = "local_ts";

/* Allocate a two-dimensional, parameter-free set space and
 * name it "local_ts", resulting in the space local_ts[t, s].
 */
static __isl_give isl_space *construct_local_ts_space(isl_ctx *ctx)
{
	return isl_space_set_tuple_name(isl_space_set_alloc(ctx, 0, 2),
					isl_dim_set, local_ts_space_name);
}

/* Compute the total size of a tile for the space dimensions,
 * i.e., those corresponding to the child node
 * of the input pattern.
 * "tile_sizes" contains the original tile sizes, including
 * the tile size corresponding to the parent node; only its
 * range factor (the sizes of the child node) is kept.
 * If S_0 is the original tile size in the first space dimension,
 * then the first entry of the result is replaced by
 * W = 2*S_0 + floor(d_l h) + floor(d_u h).
 * The remaining entries are the same as in the original tile sizes.
 * "dlh" is equal to floor(d_l h).
 * "duh" is equal to floor(d_u h).
 */
static __isl_give isl_multi_val *compute_space_sizes(
	__isl_keep isl_multi_val *tile_sizes,
	__isl_keep isl_val *dlh, __isl_keep isl_val *duh)
{
	isl_val *width;
	isl_multi_val *sizes;

	sizes = isl_multi_val_factor_range(isl_multi_val_copy(tile_sizes));

	width = isl_val_mul_ui(isl_multi_val_get_val(sizes, 0), 2);
	width = isl_val_add(width, isl_val_copy(duh));
	width = isl_val_add(width, isl_val_copy(dlh));

	return isl_multi_val_set_val(sizes, 0, width);
}

/* Compute the offset of phase 1 with respect to phase 0
 * in the ts-space ("space").
 * In particular, return
 *
 *	ts[st, s0 + duh]
 */
static __isl_give isl_multi_val *compute_phase_shift(
	__isl_keep isl_space *space, __isl_keep isl_val *st,
	__isl_keep isl_val *s0, __isl_keep isl_val *duh)
{
	isl_val *shift_s;
	isl_multi_val *shift;

	shift_s = isl_val_add(isl_val_copy(duh), isl_val_copy(s0));

	shift = isl_multi_val_zero(isl_space_copy(space));
	shift = isl_multi_val_set_val(shift, 0, isl_val_copy(st));
	shift = isl_multi_val_set_val(shift, 1, shift_s);

	return shift;
}

/* Return the function
 *
 *	ts[t, s] -> floor(t/(2 * st))
 *
 * representing the time tile.
 * "space" is the space ts[t, s].
 * "st" is the tile size in the time dimension.
 */
static __isl_give isl_aff *compute_time_tile(__isl_keep isl_space *space,
	__isl_keep isl_val *st)
{
	isl_val *period;
	isl_aff *tile;
	isl_local_space *ls;

	period = isl_val_mul_ui(isl_val_copy(st), 2);

	ls = isl_local_space_from_space(isl_space_copy(space));
	tile = isl_aff_var_on_domain(ls, isl_dim_set, 0);
	tile = isl_aff_scale_down_val(tile, period);

	return isl_aff_floor(tile);
}

/* Compute a shift in the space dimension for tiles
 * at time tile T = floor(t/(2 * S_t))
 * such that they align to a multiple of the total space tile dimension W.
 * In particular, compute
 *
 *	ts[t, s] -> s + (-(2 * shift_s)*T) % W
 *
 * where shift_s is the shift of phase 1 with respect to phase 0
 * in the space dimension (the element at position 1 of "phase_shift").
 * W is stored in the first element of "space_sizes".
 * "time_tile" is the function
 *
 *	ts[t, s] -> floor(t/(2 * S_T))
 *
 * Since phase 1 is shifted by shift_s with respect to phase 0,
 * the next line of phase 0 (at T+1) is shifted by 2*shift_s
 * with respect to the previous line (at T).
 * A shift of -(2 * shift_s)*T therefore allows the basic pattern
 * (which starts at 0) to be applied.
 * However, this shift will be used to obtain the tile coordinate
 * in the first space dimension and if the original values
 * in the space dimension are non-negative, then the shift should
 * not make them negative.  Moreover, the shift should be as minimal
 * as possible.
 * Since the pattern repeats itself with a period of W in the space
 * dimension, the shift can be replaced by (-(2 * shift_s)*T) % W.
 */
static __isl_give isl_aff *compute_shift_space(__isl_keep isl_aff *time_tile,
	__isl_keep isl_multi_val *space_sizes,
	__isl_keep isl_multi_val *phase_shift)
{
	isl_val *factor, *width;
	isl_aff *coord, *shift;
	isl_local_space *ls;

	/* shift = (-(2 * shift_s) * T) % W */
	factor = isl_val_mul_ui(isl_multi_val_get_val(phase_shift, 1), 2);
	factor = isl_val_neg(factor);
	width = isl_multi_val_get_val(space_sizes, 0);
	shift = isl_aff_scale_val(isl_aff_copy(time_tile), factor);
	shift = isl_aff_mod_val(shift, width);

	/* coord = s */
	ls = isl_local_space_from_space(isl_aff_get_domain_space(time_tile));
	coord = isl_aff_var_on_domain(ls, isl_dim_set, 1);

	return isl_aff_add(coord, shift);
}

/* Given the phase_shift ts[S_t, S_0 + floor(d_u h)],
 * compute a function that applies the shift, i.e.,
 *
 *	ts[t, s] -> ts[t + S_t, s + S_0 + floor(d_u h)],
 *
 * as the sum of the constant function with value "phase_shift"
 * and the identity function.
 */
static __isl_give isl_multi_aff *compute_shift_phase(
	__isl_keep isl_multi_val *phase_shift)
{
	isl_multi_aff *cst, *id;

	cst = isl_multi_aff_multi_val_on_space(
				isl_multi_val_get_space(phase_shift),
				isl_multi_val_copy(phase_shift));
	id = isl_multi_aff_identity(isl_multi_aff_get_space(cst));

	return isl_multi_aff_add(cst, id);
}

/* Compute a mapping from the ts-space to the local coordinates
 * within each tile.  In particular, compute
 *
 *	ts[t, s] -> local_ts[t % (2 S_t), (s + (-(2 * shift_s)*T) % W) % W]
 *
 * The domain of the result is the domain of "shift_space".
 * "local_ts" is the space local_ts[t, s]
 * "shift_space" is equal to ts[t, s] -> s + (-(2 * shift_s)*T) % W
 * "st" is the tile size in the time dimension S_t.
 * The first element of "space_sizes" is equal to W.
 */
static __isl_give isl_multi_aff *compute_localize(
	__isl_keep isl_space *local_ts, __isl_keep isl_aff *shift_space,
	__isl_keep isl_val *st, __isl_keep isl_multi_val *space_sizes)
{
	isl_val *period, *width;
	isl_space *map_space;
	isl_aff *time, *pos;
	isl_multi_aff *res;

	/* Start from the identity ts[t, s] -> local_ts[t, s]. */
	map_space = isl_space_map_from_domain_and_range(
				isl_aff_get_domain_space(shift_space),
				isl_space_copy(local_ts));
	res = isl_multi_aff_identity(map_space);

	/* Replace t by t % (2 S_t). */
	period = isl_val_mul_ui(isl_val_copy(st), 2);
	time = isl_aff_mod_val(isl_multi_aff_get_aff(res, 0), period);
	res = isl_multi_aff_set_aff(res, 0, time);

	/* Replace s by shift_space % W. */
	width = isl_multi_val_get_val(space_sizes, 0);
	pos = isl_aff_mod_val(isl_aff_copy(shift_space), width);
	res = isl_multi_aff_set_aff(res, 1, pos);

	return res;
}

/* Set the project_ts field of "tiling".
 *
 * This field projects the space of the input schedule to the ts-space.
 * It is equal to [P[t] -> C[s_0, ...]] -> ts[t, s_0], i.e.,
 * all but the first two dimensions of the input space are projected out.
 * "tiling" is freed and NULL is returned if the projection
 * cannot be constructed.
 */
static __isl_give ppcg_ht_tiling *ppcg_ht_tiling_set_project_ts(
	__isl_take ppcg_ht_tiling *tiling)
{
	int n_dim;
	isl_space *input;
	isl_multi_aff *ma;

	if (!tiling)
		return NULL;

	input = ppcg_ht_tiling_get_input_space(tiling);
	n_dim = isl_space_dim(input, isl_dim_set);
	ma = isl_multi_aff_project_out_map(input, isl_dim_set, 2, n_dim - 2);
	ma = isl_multi_aff_set_tuple_name(ma, isl_dim_out, ts_space_name);
	if (ma) {
		tiling->project_ts = ma;
		return tiling;
	}

	return ppcg_ht_tiling_free(tiling);
}

/* Construct a hybrid tiling description from bounds on the dependence
 * distances "bounds".
 * "input_node" points to the original parent node.
 * "input_schedule" is the combined schedule of the parent and child
 * node in the input.
 * "tile_sizes" are the original, user specified tile sizes.
 *
 * "bounds" is taken over by this function: it is freed on the early
 * error path and stored in the result otherwise.
 * The remaining arguments are only copied.
 * Return NULL on error.
 */
static __isl_give ppcg_ht_tiling *ppcg_ht_bounds_construct_tiling(
	__isl_take ppcg_ht_bounds *bounds,
	__isl_keep isl_schedule_node *input_node,
	__isl_keep isl_multi_union_pw_aff *input_schedule,
	__isl_keep isl_multi_val *tile_sizes)
{
	isl_ctx *ctx;
	ppcg_ht_tiling *tiling;
	isl_multi_val *space_sizes, *phase_shift;
	isl_aff *time_tile, *shift_space;
	isl_multi_aff *localize;
	isl_val *h, *duh, *dlh;
	isl_val *st, *s0, *du, *dl;
	isl_space *ts, *local_ts;

	if (!bounds || !input_node || !input_schedule || !tile_sizes)
		goto error;

	ctx = isl_multi_union_pw_aff_get_ctx(input_schedule);
	tiling = isl_calloc_type(ctx, struct ppcg_ht_tiling);
	if (!tiling)
		goto error;
	tiling->ref = 1;

	/* st and s0 are the tile sizes of the time dimension and
	 * of the first space dimension; h = st - 1.
	 * du and dl are the upper bound and the first lower bound
	 * stored in "bounds".
	 */
	st = isl_multi_val_get_val(tile_sizes, 0);
	h = isl_val_sub_ui(isl_val_copy(st), 1);
	s0 = isl_multi_val_get_val(tile_sizes, 1);
	du = ppcg_ht_bounds_get_upper(bounds);
	dl = ppcg_ht_bounds_get_lower(bounds, 0);

	/* duh = floor(du * h) and dlh = floor(dl * h). */
	duh = isl_val_floor(isl_val_mul(isl_val_copy(du), isl_val_copy(h)));
	dlh = isl_val_floor(isl_val_mul(isl_val_copy(dl), isl_val_copy(h)));

	ts = construct_ts_space(ctx);
	local_ts = construct_local_ts_space(ctx);

	space_sizes = compute_space_sizes(tile_sizes, dlh, duh);
	phase_shift = compute_phase_shift(ts, st, s0, duh);
	time_tile = compute_time_tile(ts, st);
	shift_space = compute_shift_space(time_tile, space_sizes, phase_shift);
	localize = compute_localize(local_ts, shift_space, st, space_sizes);
	/* "ts" is only kept by the helpers above, so it is released here;
	 * "local_ts" is consumed by compute_hexagon below.
	 */
	isl_space_free(ts);

	tiling->input_node = isl_schedule_node_copy(input_node);
	tiling->input_schedule = isl_multi_union_pw_aff_copy(input_schedule);
	tiling->space_sizes = space_sizes;
	tiling->bounds = bounds;
	/* The local time is extracted before "localize" is handed over
	 * to the preimage computation.
	 * The hexagon is constructed in the local_ts space and then
	 * pulled back over "localize" to the ts-space.
	 */
	tiling->local_time = isl_multi_aff_get_aff(localize, 0);
	tiling->hex = compute_hexagon(local_ts, h, s0, dl, du, dlh, duh);
	tiling->hex = isl_set_preimage_multi_aff(tiling->hex, localize);
	tiling->time_tile = time_tile;
	tiling->shift_space = shift_space;
	tiling->shift_phase = compute_shift_phase(phase_shift);
	isl_multi_val_free(phase_shift);

	isl_val_free(duh);
	isl_val_free(dlh);
	isl_val_free(du);
	isl_val_free(dl);
	isl_val_free(s0);
	isl_val_free(st);
	isl_val_free(h);

	/* From here on, "bounds" is owned by "tiling" and is presumably
	 * released by ppcg_ht_tiling_free (not visible here).
	 */
	if (!tiling->input_schedule || !tiling->local_time || !tiling->hex ||
	    !tiling->shift_space || !tiling->shift_phase)
		return ppcg_ht_tiling_free(tiling);

	tiling = ppcg_ht_tiling_set_project_ts(tiling);

	return tiling;
error:
	ppcg_ht_bounds_free(bounds);
	return NULL;
}

/* Are all members of the band node "node" coincident?
 * Return the result for the first member that is not (known to be)
 * coincident, which may be an error, and isl_bool_true
 * if there is no such member.
 */
static isl_bool all_coincident(__isl_keep isl_schedule_node *node)
{
	int pos = 0;
	int n_member = isl_schedule_node_band_n_member(node);

	while (pos < n_member) {
		isl_bool coincident;

		coincident = isl_schedule_node_band_member_get_coincident(node,
									pos);
		if (!coincident || coincident < 0)
			return coincident;
		++pos;
	}

	return isl_bool_true;
}

/* Does "node" satisfy the properties of the inner node in the input
 * pattern for hybrid tiling?
 * That is, is it a band node with at least one member,
 * all of which are coincident?
 */
static isl_bool has_child_properties(__isl_keep isl_schedule_node *node)
{
	if (!node)
		return isl_bool_error;
	if (isl_schedule_node_get_type(node) == isl_schedule_node_band &&
	    isl_schedule_node_band_n_member(node) >= 1)
		return all_coincident(node);
	return isl_bool_false;
}

/* Does "node" satisfy the properties of the outer node in the input
 * pattern for hybrid tiling?
 * That is, is it a band node with exactly one member?
 */
static isl_bool has_parent_properties(__isl_keep isl_schedule_node *node)
{
	int single_band;

	if (!node)
		return isl_bool_error;

	single_band =
	    isl_schedule_node_get_type(node) == isl_schedule_node_band &&
	    isl_schedule_node_band_n_member(node) == 1;

	return single_band ? isl_bool_true : isl_bool_false;
}

/* Does the parent of "node" satisfy the input pattern for hybrid tiling?
 * That is, does "node" satisfy the properties of the inner node and
 * does the parent of "node" satisfy the properties of the outer node?
 * The parent is only inspected if "node" itself qualifies.
 */
isl_bool ppcg_ht_parent_has_input_pattern(__isl_keep isl_schedule_node *node)
{
	isl_bool ok;
	isl_schedule_node *parent;

	ok = has_child_properties(node);
	if (ok < 0 || !ok)
		return ok;

	parent = isl_schedule_node_parent(isl_schedule_node_copy(node));
	ok = has_parent_properties(parent);
	isl_schedule_node_free(parent);

	return ok;
}

/* Does "node" satisfy the input pattern for hybrid tiling?
 * That is, does "node" satisfy the properties of the outer node and
 * does the child of "node" satisfy the properties of the inner node?
 * The child is only inspected if "node" itself qualifies.
 */
isl_bool ppcg_ht_has_input_pattern(__isl_keep isl_schedule_node *node)
{
	isl_bool ok;
	isl_schedule_node *child;

	ok = has_parent_properties(node);
	if (ok < 0 || !ok)
		return ok;

	child = isl_schedule_node_get_child(node, 0);
	ok = has_child_properties(child);
	isl_schedule_node_free(child);

	return ok;
}

/* Check that "node" satisfies the input pattern for hybrid tiling.
 * Report an isl_error_invalid error if it does not.
 * Return isl_stat_ok if the pattern is satisfied and
 * isl_stat_error otherwise.
 */
static isl_stat check_input_pattern(__isl_keep isl_schedule_node *node)
{
	isl_bool ok;

	ok = ppcg_ht_has_input_pattern(node);
	if (ok < 0)
		return isl_stat_error;
	if (ok)
		return isl_stat_ok;

	isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
		"invalid input pattern for hybrid tiling",
		return isl_stat_error);
}

/* Extract the input schedule from "node", i.e., the range product
 * of the partial schedule of "node" (the parent in the input pattern)
 * and the partial schedule of its first child.
 */
static __isl_give isl_multi_union_pw_aff *extract_input_schedule(
	__isl_keep isl_schedule_node *node)
{
	isl_schedule_node *child;
	isl_multi_union_pw_aff *outer, *inner;

	outer = isl_schedule_node_band_get_partial_schedule(node);

	child = isl_schedule_node_get_child(node, 0);
	inner = isl_schedule_node_band_get_partial_schedule(child);
	isl_schedule_node_free(child);

	return isl_multi_union_pw_aff_range_product(outer, inner);
}

/* Collect all dependences from "scop" that are relevant for performing
 * hybrid tiling on "node" and its child and map them to the schedule
 * space of this pair of nodes.
 *
 * In case live range reordering is not used,
 * the flow and the false dependences are collected.
 * In case live range reordering is used,
 * the flow and the forced dependences are collected, as well
 * as the order dependences that are adjacent to non-local
 * flow dependences.
 *
 * In all cases, only dependences that map to the same instance
 * of the outer part of the schedule are considered.
 *
 * The result is a single map in the space obtained from the space of
 * the input schedule of "node" by isl_space_map_from_set.
 */
static __isl_give isl_map *collect_deps(struct ppcg_scop *scop,
	__isl_keep isl_schedule_node *node)
{
	isl_space *space;
	isl_multi_union_pw_aff *prefix, *partial;
	isl_union_map *flow, *other, *dep, *umap;
	isl_map *map;

	/* "prefix" is the schedule leading up to "node";
	 * "partial" is the combined schedule of "node" and its child.
	 */
	prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node);
	partial = extract_input_schedule(node);
	space = isl_multi_union_pw_aff_get_space(partial);

	/* Only keep flow dependences with equal prefix schedule values. */
	flow = isl_union_map_copy(scop->dep_flow);
	flow = isl_union_map_eq_at_multi_union_pw_aff(flow,
					isl_multi_union_pw_aff_copy(prefix));
	if (!scop->options->live_range_reordering) {
		other = isl_union_map_copy(scop->dep_false);
		/* Last use of "prefix" in this branch: it is handed over. */
		other = isl_union_map_eq_at_multi_union_pw_aff(other, prefix);
	} else {
		isl_union_map *local, *non_local, *order, *adj;
		isl_union_set *domain, *range;

		other = isl_union_map_copy(scop->dep_forced);
		other = isl_union_map_eq_at_multi_union_pw_aff(other,
					isl_multi_union_pw_aff_copy(prefix));
		/* Local flow dependences are those that also have equal
		 * values of "partial"; the others are non-local.
		 */
		local = isl_union_map_copy(flow);
		local = isl_union_map_eq_at_multi_union_pw_aff(local,
					isl_multi_union_pw_aff_copy(partial));
		non_local = isl_union_map_copy(flow);
		non_local = isl_union_map_subtract(non_local, local);

		/* Last use of "prefix" in this branch: it is handed over. */
		order = isl_union_map_copy(scop->dep_order);
		order = isl_union_map_eq_at_multi_union_pw_aff(order, prefix);
		/* Order dependences ending at the source of
		 * a non-local flow dependence.
		 */
		adj = isl_union_map_copy(order);
		domain = isl_union_map_domain(isl_union_map_copy(non_local));
		domain = isl_union_set_coalesce(domain);
		adj = isl_union_map_intersect_range(adj, domain);
		other = isl_union_map_union(other, adj);

		/* Order dependences starting at the sink of
		 * a non-local flow dependence.
		 * "order" and "non_local" are handed over here.
		 */
		adj = order;
		range = isl_union_map_range(non_local);
		range = isl_union_set_coalesce(range);
		adj = isl_union_map_intersect_domain(adj, range);
		other = isl_union_map_union(other, adj);
	}
	dep = isl_union_map_union(flow, other);

	/* Map both sides of the dependences to the schedule space;
	 * "partial" is handed over to the conversion.
	 */
	umap = isl_union_map_from_multi_union_pw_aff(partial);
	dep = isl_union_map_apply_domain(dep, isl_union_map_copy(umap));
	dep = isl_union_map_apply_range(dep, umap);

	/* Extract the single map living in the schedule space. */
	space = isl_space_map_from_set(space);
	map = isl_union_map_extract_map(dep, space);
	isl_union_map_free(dep);

	map = isl_map_coalesce(map);

	return map;
}

/* Given a constraint of the form
 *
 *	a i_0 + b i_1 >= 0
 * or
 *	a i_0 + b i_1 = 0
 *
 * use it to update one or both of the non-negative bounds
 * in "list" = (min, max) such that
 *
 *	i_1 >= -min i_0
 * and
 *	i_1 <= max i_0
 *
 * If b = 0, then the constraint cannot be used and
 * "list" is returned unchanged.
 * Otherwise, the constraint is equivalent to
 *
 *	sgn(b) i_1 >= - a/abs(b) i_0
 * i.e.,
 *	i_1 >= - a/abs(b) i_0
 * or
 *	i_1 <= a/abs(b) i_0
 *
 * Set the first or second element of "list" to max(0, a/abs(b)),
 * according to the sign of "b".  In case the constraint
 * is an equality, also set the other element, to max(0, -a/abs(b)),
 * taking into account the sign change.
 */
static __isl_give isl_val_list *list_set_min_max(__isl_take isl_val_list *list,
	__isl_keep isl_constraint *c)
{
	isl_val *coef0, *coef1, *ratio, *mirror = NULL;
	int sgn1, first;
	isl_bool is_eq, zero, negative;

	is_eq = isl_constraint_is_equality(c);
	if (is_eq < 0)
		return isl_val_list_free(list);

	coef1 = isl_constraint_get_coefficient_val(c, isl_dim_set, 1);
	zero = isl_val_is_zero(coef1);
	if (zero == isl_bool_true) {
		isl_val_free(coef1);
		return list;
	}
	coef0 = isl_constraint_get_coefficient_val(c, isl_dim_set, 0);
	sgn1 = isl_val_sgn(coef1);

	/* ratio = a/abs(b); an equality also needs its negation. */
	ratio = isl_val_div(coef0, isl_val_abs(coef1));
	if (is_eq)
		mirror = isl_val_neg(isl_val_copy(ratio));

	/* A positive b bounds i_1 from below, updating "min". */
	first = sgn1 > 0 ? 0 : 1;
	negative = isl_val_is_neg(ratio);
	if (negative == isl_bool_true)
		ratio = isl_val_set_si(ratio, 0);
	list = isl_val_list_set_val(list, first, ratio);

	if (!is_eq) {
		if (negative < 0)
			return isl_val_list_free(list);
		return list;
	}

	/* The equality also provides the opposite bound. */
	negative = isl_val_is_neg(mirror);
	if (negative == isl_bool_true)
		mirror = isl_val_set_si(mirror, 0);
	list = isl_val_list_set_val(list, 1 - first, mirror);

	if (negative < 0)
		return isl_val_list_free(list);
	return list;
}

/* If constraint "c" passes through the origin, i.e., if its constant
 * term is zero, then try and use it to update the non-negative bounds
 * in the list (min, max) that "user" points to, such that
 *
 *	i_1 >= -min i_0
 * and
 *	i_1 <= max i_0
 *
 * "c" is freed in all cases.
 */
static isl_stat set_min_max(__isl_take isl_constraint *c, void *user)
{
	isl_val_list **bounds = user;
	isl_val *cst;
	isl_bool through_origin;

	cst = isl_constraint_get_constant_val(c);
	through_origin = isl_val_is_zero(cst);
	isl_val_free(cst);

	if (through_origin == isl_bool_true)
		*bounds = list_set_min_max(*bounds, c);
	isl_constraint_free(c);

	if (through_origin < 0)
		return isl_stat_error;
	return isl_stat_ok;
}

/* Given a set of dependence distance vectors "dist", compute
 * pair of non-negative bounds min and max such that
 *
 *	d_pos >= -min d_0
 * and
 *	d_pos <= max d_0
 *
 * and return the pair (min, max).
 * Both bounds start out as NaN and keep that value
 * if no bound can be found in the corresponding direction.
 *
 * The dependence distances are first projected onto the (d_0, d_pos).
 * Then the zero dependence distance is added and the convex hull is computed.
 * Finally, the bounds are extracted from the constraints of the convex hull
 * that pass through the origin.
 */
static __isl_give isl_val_list *min_max_dist(__isl_keep isl_set *dist, int pos)
{
	isl_ctx *ctx;
	isl_val_list *bounds;
	isl_set *proj;
	isl_point *origin;
	isl_basic_set *hull;
	int n_dim;

	ctx = isl_set_get_ctx(dist);
	bounds = isl_val_list_alloc(ctx, 2);
	bounds = isl_val_list_add(bounds, isl_val_nan(ctx));
	bounds = isl_val_list_add(bounds, isl_val_nan(ctx));

	/* Only keep dimensions 0 and "pos". */
	proj = isl_set_copy(dist);
	n_dim = isl_set_dim(proj, isl_dim_set);
	if (proj && pos >= n_dim)
		isl_die(ctx, isl_error_internal, "position out of bounds",
			proj = isl_set_free(proj));
	proj = isl_set_project_out(proj, isl_dim_set,
					pos + 1, n_dim - (pos + 1));
	proj = isl_set_project_out(proj, isl_dim_set, 1, pos - 1);

	/* Add the origin and take the convex hull. */
	origin = isl_point_zero(isl_set_get_space(proj));
	proj = isl_set_union(proj, isl_set_from_point(origin));
	hull = isl_set_convex_hull(isl_set_remove_divs(proj));

	if (isl_basic_set_foreach_constraint(hull, &set_min_max, &bounds) < 0)
		bounds = isl_val_list_free(bounds);
	isl_basic_set_free(hull);

	return bounds;
}

/* Given a schedule node "node" that, together with its child,
 * satisfies the input pattern for hybrid tiling, compute bounds
 * on the relative dependence distances of the child node with
 * respect to the parent node.  These bounds are needed to
 * construct a hybrid tiling.
 * Return NULL if "scop" or "node" is NULL or if "node" does not
 * satisfy the input pattern.
 *
 * First all relevant dependences are collected and mapped
 * to the schedule space of the pair of nodes.  Then, the
 * dependence distances are computed in this space and
 * the parameters are projected out.
 *
 * These dependence distances are then projected onto a two-dimensional
 * space consisting of the single schedule dimension of the outer node
 * and one of the schedule dimensions of the inner node.
 * The maximal and minimal relative dependence distances are extracted
 * from these projections.
 * This process is repeated for each of the schedule dimensions
 * of the inner node.  For the first dimension, both minimal and
 * maximal relative dependence distances are stored in the result.
 * For the other dimensions, only the minimal relative dependence
 * distance is stored.
 */
__isl_give ppcg_ht_bounds *ppcg_ht_compute_bounds(struct ppcg_scop *scop,
	__isl_keep isl_schedule_node *node)
{
	ppcg_ht_bounds *bounds;
	isl_schedule_node *inner;
	isl_set *deltas;
	isl_val_list *min_max;
	int n_param, n_inner, pos;

	if (!scop || !node)
		return NULL;
	if (check_input_pattern(node) < 0)
		return NULL;

	inner = isl_schedule_node_get_child(node, 0);
	n_inner = isl_schedule_node_band_n_member(inner);
	bounds = ppcg_ht_bounds_alloc(isl_schedule_node_band_get_space(inner));
	isl_schedule_node_free(inner);
	if (!bounds)
		return NULL;

	deltas = isl_map_deltas(collect_deps(scop, node));
	n_param = isl_set_dim(deltas, isl_dim_param);
	deltas = isl_set_project_out(deltas, isl_dim_param, 0, n_param);

	/* First inner dimension: both bounds are recorded. */
	min_max = min_max_dist(deltas, 1);
	bounds = ppcg_ht_bounds_set_lower(bounds, 0,
					isl_val_list_get_val(min_max, 0));
	bounds = ppcg_ht_bounds_set_upper(bounds,
					isl_val_list_get_val(min_max, 1));
	isl_val_list_free(min_max);

	/* Remaining inner dimensions: only the lower bound is recorded. */
	for (pos = 1; pos < n_inner; ++pos) {
		min_max = min_max_dist(deltas, 1 + pos);
		bounds = ppcg_ht_bounds_set_lower(bounds, pos,
					isl_val_list_get_val(min_max, 0));
		isl_val_list_free(min_max);
	}

	isl_set_free(deltas);

	return bounds;
}

/* Check if the fields of "phase" that are set when a phase
 * is constructed, aligned or pulled back are valid, i.e.,
 * the tiling, the time tile, the local time, the space shift
 * function and the domain.  Free "phase" if any of them is not.
 * The "space_shift" and "space_tile" fields are not checked
 * since they are only filled in afterwards.
 *
 * Return "phase" if it is valid and NULL otherwise.
 */
static __isl_give ppcg_ht_phase *check_phase(__isl_take ppcg_ht_phase *phase)
{
	if (!phase)
		return NULL;

	/* "time_tile" is transformed together with the other fields and
	 * may therefore have become NULL as well.
	 */
	if (!phase->tiling || !phase->time_tile || !phase->local_time ||
	    !phase->shift_space || !phase->domain)
		return ppcg_ht_phase_free(phase);

	return phase;
}

/* Construct a ppcg_ht_phase object, that simply copies
 * information from "tiling": a reference to the tiling itself,
 * its time tile, local time and space shift, and its hexagon
 * as the domain of the phase.
 * That is, the result is defined over the "ts" space and
 * corresponds to phase 1.
 * Return NULL on error.
 */
static __isl_give ppcg_ht_phase *construct_phase(
	__isl_keep ppcg_ht_tiling *tiling)
{
	ppcg_ht_phase *phase;

	if (!tiling)
		return NULL;

	phase = isl_calloc_type(ppcg_ht_tiling_get_ctx(tiling),
				struct ppcg_ht_phase);
	if (!phase)
		return NULL;

	phase->tiling = ppcg_ht_tiling_copy(tiling);
	phase->domain = isl_set_copy(tiling->hex);
	phase->time_tile = isl_aff_copy(tiling->time_tile);
	phase->local_time = isl_aff_copy(tiling->local_time);
	phase->shift_space = isl_aff_copy(tiling->shift_space);

	return check_phase(phase);
}

/* Align the parameters of the time tile, the local time, the space
 * shift function and the domain of "phase" to those of "space".
 * "space" is consumed in all cases.
 */
static __isl_give ppcg_ht_phase *phase_align_params(
	__isl_take ppcg_ht_phase *phase, __isl_take isl_space *space)
{
	if (!phase) {
		isl_space_free(space);
		return NULL;
	}

	phase->time_tile = isl_aff_align_params(phase->time_tile,
							isl_space_copy(space));
	phase->local_time = isl_aff_align_params(phase->local_time,
							isl_space_copy(space));
	phase->shift_space = isl_aff_align_params(phase->shift_space,
							isl_space_copy(space));
	/* The last user takes over "space". */
	phase->domain = isl_set_align_params(phase->domain, space);

	return check_phase(phase);
}

/* Pull back "phase" over "ma".
 * That is, take a phase defined over the range of "ma" and
 * turn it into a phase defined over the domain of "ma".
 * The parameters of "phase" are first aligned to those of "ma".
 * "ma" is consumed in all cases.
 */
static __isl_give ppcg_ht_phase *pullback_phase(__isl_take ppcg_ht_phase *phase,
	__isl_take isl_multi_aff *ma)
{
	phase = phase_align_params(phase, isl_multi_aff_get_space(ma));
	if (!phase) {
		isl_multi_aff_free(ma);
		return NULL;
	}

	phase->time_tile = isl_aff_pullback_multi_aff(phase->time_tile,
							isl_multi_aff_copy(ma));
	phase->local_time = isl_aff_pullback_multi_aff(phase->local_time,
							isl_multi_aff_copy(ma));
	phase->shift_space = isl_aff_pullback_multi_aff(phase->shift_space,
							isl_multi_aff_copy(ma));
	/* The last user takes over "ma". */
	phase->domain = isl_set_preimage_multi_aff(phase->domain, ma);

	return check_phase(phase);
}

/* Pull back "phase" over the shift_phase function of its tiling,
 * which shifts phase 0 to phase 1.  The pullback therefore takes
 * a phase 1 description and turns it into a phase 0 description.
 */
static __isl_give ppcg_ht_phase *shift_phase(__isl_take ppcg_ht_phase *phase)
{
	isl_multi_aff *shift;

	if (!phase)
		return NULL;

	shift = isl_multi_aff_copy(phase->tiling->shift_phase);
	return pullback_phase(phase, shift);
}

/* Take a "phase" defined over the ts-space and plug in the projection
 * from the input schedule space to the ts-space, i.e.,
 * the project_ts function of its tiling.
 * The result is then defined over this input schedule space.
 */
static __isl_give ppcg_ht_phase *lift_phase(__isl_take ppcg_ht_phase *phase)
{
	isl_multi_aff *project;

	if (!phase)
		return NULL;

	project = isl_multi_aff_copy(phase->tiling->project_ts);
	return pullback_phase(phase, project);
}

/* Compute the shift that needs to be added to the space band
 * such that rectangular tiling can be applied to the space, and
 * store the result in phase->space_shift.
 *
 * The shift in the first dimension is shift_space - s.
 * For phase 1, this is equal to
 *
 *	(-(2 * shift_s)*T) % W
 *
 * For phase 0, the "s" in shift_space has been replaced
 * by "s + shift_s", so the shift is equal to
 *
 *	shift_s + (-(2 * shift_s)*T) % W
 *
 * The shift in each of the other dimensions i is
 *
 *	dl_i * local_time.
 *
 * On failure, "phase" is freed and the result of ppcg_ht_phase_free()
 * is returned.
 */
static __isl_give ppcg_ht_phase *compute_space_shift(
	__isl_take ppcg_ht_phase *phase)
{
	int pos, n_out;
	isl_space *shift_space;
	isl_local_space *dom_ls;
	isl_aff *first, *s_var;
	isl_multi_aff *shift;

	if (!phase)
		return NULL;

	shift_space = ppcg_ht_phase_get_input_space(phase);
	shift_space = isl_space_range_map(isl_space_unwrap(shift_space));
	shift = isl_multi_aff_zero(shift_space);

	/* First dimension: shift_space - s. */
	first = isl_aff_copy(phase->shift_space);
	dom_ls = isl_local_space_from_space(isl_aff_get_domain_space(first));
	s_var = isl_aff_var_on_domain(dom_ls, isl_dim_set, 1);
	first = isl_aff_sub(first, s_var);
	shift = isl_multi_aff_set_aff(shift, 0, first);

	/* Remaining dimensions: lower bound times local time. */
	n_out = isl_multi_aff_dim(shift, isl_dim_out);
	for (pos = 1; pos < n_out; ++pos) {
		isl_val *lower;
		isl_aff *scaled;

		lower = ppcg_ht_bounds_get_lower(phase->tiling->bounds, pos);
		scaled = isl_aff_copy(phase->local_time);
		scaled = isl_aff_scale_val(scaled, lower);
		shift = isl_multi_aff_set_aff(shift, pos, scaled);
	}

	if (!shift)
		return ppcg_ht_phase_free(phase);

	phase->space_shift = shift;
	return phase;
}

/* Compute the space tiling and store it in phase->space_tile.
 * The space tiling is of the form
 *
 *	[P[t] -> C[s]] -> C[floor((s + space_shift)/space_size)]
 *
 * phase->space_shift needs to have been set already.
 * On failure, "phase" is freed and the result of ppcg_ht_phase_free()
 * is returned.
 */
static __isl_give ppcg_ht_phase *compute_space_tile(
	__isl_take ppcg_ht_phase *phase)
{
	isl_space *input_space;
	isl_multi_aff *point, *space_tile;
	isl_multi_val *sizes;

	if (!phase)
		return NULL;

	/* [P[t] -> C[s]] -> C[s] */
	input_space = ppcg_ht_phase_get_input_space(phase);
	point = isl_multi_aff_range_map(isl_space_unwrap(input_space));

	space_tile = isl_multi_aff_add(isl_multi_aff_copy(phase->space_shift),
					point);
	sizes = isl_multi_val_copy(phase->tiling->space_sizes);
	space_tile = isl_multi_aff_scale_down_multi_val(space_tile, sizes);
	space_tile = isl_multi_aff_floor(space_tile);

	if (!space_tile)
		return ppcg_ht_phase_free(phase);

	phase->space_tile = space_tile;
	return phase;
}

/* Construct a representation of one of the two phases of hybrid tiling
 * "tiling".  If "shift" is not set, then the phase is derived directly
 * from the hexagonal tile shape in "tiling", which represents
 * the phase-1 tiles.  If "shift" is set, then this tile shape is shifted
 * back over tiling->shift_phase to obtain the phase-0 tiles.
 *
 * The steps are: copy the data from "tiling", optionally shift
 * the phase, move the result from the "ts" space of "tiling" to
 * the space of the input pattern and, finally, compute
 * the corresponding space shift and space tiling.
 */
static __isl_give ppcg_ht_phase *ppcg_ht_tiling_compute_phase(
	__isl_keep ppcg_ht_tiling *tiling, int shift)
{
	ppcg_ht_phase *result;

	result = construct_phase(tiling);
	if (shift)
		result = shift_phase(result);
	result = lift_phase(result);

	return compute_space_tile(compute_space_shift(result));
}

/* Construct a function that is equal to the time tile of "phase0"
 * on the domain of "phase0" and to the time tile of "phase1"
 * on the domain of "phase1".
 * The two domains are assumed to form a partition of the input
 * schedule space.
 *
 * Return NULL if either phase is NULL.
 */
static __isl_give isl_pw_multi_aff *combine_time_tile(
	__isl_keep ppcg_ht_phase *phase0, __isl_keep ppcg_ht_phase *phase1)
{
	isl_aff *tile0, *tile1;
	isl_pw_aff *piece0, *piece1;

	if (!phase0 || !phase1)
		return NULL;

	tile0 = isl_aff_copy(phase0->time_tile);
	piece0 = isl_pw_aff_alloc(ppcg_ht_phase_get_domain(phase0), tile0);

	tile1 = isl_aff_copy(phase1->time_tile);
	piece1 = isl_pw_aff_alloc(ppcg_ht_phase_get_domain(phase1), tile1);

	return isl_pw_multi_aff_from_pw_aff(
				isl_pw_aff_union_add(piece0, piece1));
}

/* Name used in mark nodes that contain a pointer to a ppcg_ht_phase.
 * The name is a string literal and is never modified, so both
 * the characters and the pointer itself are const.
 */
static const char *const ppcg_phase_name = "phase";

/* Does "id" contain a pointer to a ppcg_ht_phase?
 * That is, is its name equal to ppcg_phase_name?
 *
 * Return isl_bool_error if the name of "id" cannot be obtained.
 */
static isl_bool is_phase_id(__isl_keep isl_id *id)
{
	const char *id_name = isl_id_get_name(id);

	if (!id_name)
		return isl_bool_error;
	if (strcmp(id_name, ppcg_phase_name) == 0)
		return isl_bool_true;
	return isl_bool_false;
}

/* Given a mark node with an identifier that points to a ppcg_ht_phase,
 * extract this ppcg_ht_phase pointer.
 *
 * Return NULL if "node" is NULL or if the name of the identifier
 * cannot be determined.  An internal error is reported (and NULL
 * returned) if "node" is not a mark node or if its identifier
 * is not a phase identifier.
 */
__isl_keep ppcg_ht_phase *ppcg_ht_phase_extract_from_mark(
	__isl_keep isl_schedule_node *node)
{
	isl_id *mark_id;
	isl_bool matches;
	void *user;

	if (!node)
		return NULL;
	if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
		isl_die(isl_schedule_node_get_ctx(node), isl_error_internal,
			"not a phase mark", return NULL);

	/* Read everything needed from the identifier before releasing it. */
	mark_id = isl_schedule_node_mark_get_id(node);
	matches = is_phase_id(mark_id);
	user = isl_id_get_user(mark_id);
	isl_id_free(mark_id);

	if (matches < 0)
		return NULL;
	if (matches)
		return user;

	isl_die(isl_schedule_node_get_ctx(node), isl_error_internal,
		"not a phase mark", return NULL);
}

/* Insert a mark node at "node" holding a pointer to "phase".
 *
 * The identifier of the mark is named ppcg_phase_name and
 * is given ppcg_ht_phase_free_wrap as the function for freeing
 * its user pointer.
 * If "node" is NULL or the identifier cannot be allocated,
 * then both "phase" and "node" are freed and NULL is returned.
 */
static __isl_give isl_schedule_node *insert_phase(
	__isl_take isl_schedule_node *node, __isl_take ppcg_ht_phase *phase)
{
	isl_id *mark;

	if (!node)
		goto error;

	mark = isl_id_alloc(isl_schedule_node_get_ctx(node),
				ppcg_phase_name, phase);
	if (!mark)
		goto error;
	mark = isl_id_set_free_user(mark, &ppcg_ht_phase_free_wrap);

	return isl_schedule_node_insert_mark(node, mark);
error:
	ppcg_ht_phase_free(phase);
	isl_schedule_node_free(node);
	return NULL;
}

/* Construct a mapping from the elements of the original pair of bands
 * to which tiling was applied that belong to a tile of "phase"
 * to that tile, preserving the values for the outer bands.
 *
 * The mapping is of the form
 *
 *	[[outer] -> [P -> C]] -> [[outer] -> [tile]]
 *
 * where tile is the concatenation of the time_tile and
 * the space_tile of "phase", restricted to the domain of "phase".
 * The number of outer dimensions is the schedule depth
 * of phase->tiling->input_node.
 */
static __isl_give isl_map *construct_tile_map(__isl_keep ppcg_ht_phase *phase)
{
	int n_outer;
	isl_space *outer_space;
	isl_multi_aff *outer_id;
	isl_multi_aff *tile;
	isl_map *tile_map;

	/* Identity on the outer schedule dimensions. */
	n_outer = isl_schedule_node_get_schedule_depth(
						phase->tiling->input_node);
	outer_space = isl_space_params(isl_aff_get_space(phase->time_tile));
	outer_space = isl_space_set_from_params(outer_space);
	outer_space = isl_space_add_dims(outer_space, isl_dim_set, n_outer);
	outer_space = isl_space_map_from_set(outer_space);
	outer_id = isl_multi_aff_identity(outer_space);

	/* [P -> C] -> [time tile; space tile] on the phase domain. */
	tile = isl_multi_aff_from_aff(isl_aff_copy(phase->time_tile));
	tile = isl_multi_aff_flat_range_product(tile,
				isl_multi_aff_copy(phase->space_tile));
	tile_map = isl_map_from_multi_aff(tile);
	tile_map = isl_map_intersect_domain(tile_map,
						isl_set_copy(phase->domain));

	return isl_map_product(isl_map_from_multi_aff(outer_id), tile_map);
}

/* Return a description of the full tiles of "phase" at the point
 * in the original schedule tree where the tiling was applied.
 *
 * A mapping
 *
 *	[[outer] -> [P -> C]] -> [[outer] -> [tile]]
 *
 * is constructed from the input schedule dimensions up to and
 * including the original pair of bands to which hybrid tiling
 * was applied, to schedule dimensions in which this pair
 * has been replaced by the tiles.
 *
 * Applying this mapping to the set of all executed values of the input
 * schedule dimensions yields all tiles.  Applying the inverse
 * to these tiles yields all values that would map to any of the tiles.
 * Removing the executed values from these leaves the values that
 * belong to a tile but are not executed, and mapping those
 * to the tiles again gives the partial tiles.
 * The full tiles are then the tiles that are not partial,
 * in the form
 *
 *	[[outer] -> [tile]]
 */
static __isl_give isl_set *compute_full_tile(__isl_keep ppcg_ht_phase *phase)
{
	isl_schedule_node *input_node;
	isl_union_set *instances;
	isl_union_map *outer, *sched;
	isl_multi_union_pw_aff *input_sched;
	isl_map *to_tile, *from_tile;
	isl_set *executed, *tiles, *missing;

	to_tile = construct_tile_map(phase);
	from_tile = isl_map_reverse(isl_map_copy(to_tile));

	/* All executed values of [[outer] -> [P -> C]]. */
	input_node = phase->tiling->input_node;
	outer = isl_schedule_node_get_prefix_schedule_union_map(input_node);
	instances = isl_schedule_node_get_domain(input_node);
	input_sched = isl_multi_union_pw_aff_copy(
					phase->tiling->input_schedule);
	sched = isl_union_map_from_multi_union_pw_aff(input_sched);
	sched = isl_union_map_range_product(outer, sched);
	executed = isl_set_from_union_set(
				isl_union_set_apply(instances, sched));
	executed = isl_set_coalesce(executed);

	tiles = isl_set_apply(isl_set_copy(executed), isl_map_copy(to_tile));

	/* Tiles containing at least one value that is not executed. */
	missing = isl_set_apply(isl_set_copy(tiles), from_tile);
	missing = isl_set_subtract(missing, executed);
	missing = isl_set_apply(missing, to_tile);

	return isl_set_subtract(tiles, missing);
}

/* For each member of the band "node", set the AST loop type
 * of the isolated part to that of the non-isolated part.
 */
static __isl_give isl_schedule_node *set_isolate_loop_type(
	__isl_take isl_schedule_node *node)
{
	int pos;
	int n_member = isl_schedule_node_band_n_member(node);

	for (pos = 0; pos < n_member; ++pos) {
		enum isl_ast_loop_type loop_type;

		loop_type = isl_schedule_node_band_member_get_ast_loop_type(
								node, pos);
		node = isl_schedule_node_band_member_set_isolate_ast_loop_type(
							node, pos, loop_type);
	}

	return node;
}

/* If options->isolate_full_tiles is set, then mark the full tiles
 * in "node" for isolation.  The full tiles are derived from "phase".
 * "node" may point to a part of the tiling, e.g., the space tiling.
 *
 * The full tiles are originally computed in the form
 *
 *	[[outer] -> [tile]]
 *
 * However, the band that "node" points to may only contain
 * a subset of the tile dimensions.
 * The description above is therefore treated as
 *
 *	[[outer] -> [before; this; after]]
 *
 * where before is of size "pos", this is of size "dim" and
 * after is of size "out - pos - dim".
 * The after part is first projected out.  Then the range is split
 * into a before part and a this part and finally the before part
 * is moved to the domain, resulting in
 *
 *	[[outer; before] -> [this]]
 *
 * This description is then added as the "isolate" option
 * to the AST build options of the band.
 *
 * The AST loop type for the isolated part is set to be the same
 * as that of the non-isolated part.
 */
static __isl_give isl_schedule_node *ppcg_ht_phase_isolate_full_tile_node(
	__isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node,
	struct ppcg_options *options)
{
	int depth, n_member;
	int n_outer, n_tile, first, end;
	isl_space *tile_space;
	isl_multi_aff *before, *band;
	isl_map *full;
	isl_set *isolate;
	isl_union_set *build_options;

	if (!options->isolate_full_tiles)
		return node;

	depth = isl_schedule_node_get_schedule_depth(node);
	n_member = isl_schedule_node_band_n_member(node);

	/* [outer] -> [before; this; after] */
	full = isl_set_unwrap(compute_full_tile(phase));
	n_outer = isl_map_dim(full, isl_dim_in);
	n_tile = isl_map_dim(full, isl_dim_out);
	first = depth - n_outer;
	end = first + n_member;

	/* Drop the "after" part. */
	full = isl_map_project_out(full, isl_dim_out, end, n_tile - end);

	/* [before; this] -> [[before] -> [this]] */
	tile_space = isl_space_range(isl_map_get_space(full));
	before = isl_multi_aff_project_out_map(isl_space_copy(tile_space),
						isl_dim_set, first, n_member);
	band = isl_multi_aff_project_out_map(tile_space,
						isl_dim_set, 0, first);
	before = isl_multi_aff_range_product(before, band);
	full = isl_map_apply_range(full, isl_map_from_multi_aff(before));

	/* Move "before" to the domain: [outer; before] -> [this] */
	full = isl_map_uncurry(full);
	full = isl_map_flatten_domain(full);
	isolate = isl_set_set_tuple_name(isl_map_wrap(full), "isolate");

	build_options = isl_schedule_node_band_get_ast_build_options(node);
	build_options = isl_union_set_add_set(build_options, isolate);
	node = isl_schedule_node_band_set_ast_build_options(node,
							build_options);

	return set_isolate_loop_type(node);
}

/* Insert a band node for performing the space tiling for "phase" at "node".
 * In particular, insert a band node with partial schedule
 *
 *	[P[t] -> C[s]] -> C[floor((s + space_shift)/space_size)]
 *
 * pulled back over the input schedule.
 * "options" determines whether full tiles should be separated
 * from partial tiles.
 *
 * The first tile dimension iterates over the hexagons in the same
 * phase, which are independent by construction.  The first dimension
 * is therefore marked coincident.
 * All dimensions are also marked for being generated as atomic loops
 * because separation is usually not desirable on tile loops.
 *
 * If "phase" is NULL, then "node" is freed.
 */
static __isl_give isl_schedule_node *insert_space_tiling(
	__isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node,
	struct ppcg_options *options)
{
	isl_multi_aff *tile;
	isl_multi_union_pw_aff *partial;

	if (!phase)
		return isl_schedule_node_free(node);

	tile = isl_multi_aff_copy(phase->space_tile);
	partial = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule);
	partial = isl_multi_union_pw_aff_apply_multi_aff(partial, tile);

	node = isl_schedule_node_insert_partial_schedule(node, partial);
	node = ppcg_set_schedule_node_type(node, isl_ast_loop_atomic);
	node = ppcg_ht_phase_isolate_full_tile_node(phase, node, options);

	return isl_schedule_node_band_member_set_coincident(node, 0, 1);
}

/* Given a pointer "node" to (a copy of) the original child node
 * in the input pattern, adjust its partial schedule such that
 * it starts at zero within each tile.
 *
 * That is, replace "s" by (s + space_shift) % space_sizes.
 *
 * "phase" is only read; "node" is consumed and the updated node
 * is returned.
 * If "phase" is NULL (e.g., because ppcg_ht_phase_extract_from_mark
 * failed), then "node" is freed and NULL is returned instead of
 * dereferencing the NULL pointer, in the same way
 * as insert_space_tiling handles a missing phase.
 */
__isl_give isl_schedule_node *ppcg_ht_phase_shift_space_point(
	__isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node)
{
	isl_multi_val *space_sizes;
	isl_multi_aff *space_shift;
	isl_multi_union_pw_aff *mupa;

	if (!phase)
		return isl_schedule_node_free(node);

	/* Express the space shift in terms of the statement instances. */
	space_shift = isl_multi_aff_copy(phase->space_shift);
	mupa = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule);
	mupa = isl_multi_union_pw_aff_apply_multi_aff(mupa, space_shift);
	node = isl_schedule_node_band_shift(node, mupa);

	/* Reduce modulo the space tile sizes. */
	space_sizes = isl_multi_val_copy(phase->tiling->space_sizes);
	node = isl_schedule_node_band_mod(node, space_sizes);

	return node;
}

/* Does
 *
 *	s0 > delta + 2 * {delta * h} - 1
 *
 * hold, with {} the fractional part?
 *
 * None of the arguments is consumed.
 */
static isl_bool wide_enough(__isl_keep isl_val *s0, __isl_keep isl_val *delta,
	__isl_keep isl_val *h)
{
	isl_val *product, *bound;
	isl_bool wide;

	/* {delta * h} = delta * h - floor(delta * h) */
	product = isl_val_mul(isl_val_copy(delta), isl_val_copy(h));
	bound = isl_val_sub(product, isl_val_floor(isl_val_copy(product)));

	/* delta + 2 * {delta * h} - 1 */
	bound = isl_val_mul_ui(bound, 2);
	bound = isl_val_add(bound, isl_val_copy(delta));
	bound = isl_val_sub_ui(bound, 1);

	wide = isl_val_gt(s0, bound);
	isl_val_free(bound);

	return wide;
}

/* Is the tile size specified by "sizes" wide enough in the first space
 * dimension, i.e., the base of the hexagon?  This ensures that,
 * after hybrid tiling using "bounds" and these sizes,
 * neighboring hexagons in the same phase are far enough apart
 * that they do not depend on each other.
 * The test is only meaningful if the bounds are valid, so the result
 * of ppcg_ht_bounds_is_valid is returned directly if the bounds
 * are not (known to be) valid.
 *
 * Let st be (half) the size in the time dimension and s0 the base
 * size in the first space dimension.  Let delta be the dependence
 * distance in either positive or negative direction.  In principle,
 * it should be enough to have s0 + 1 > delta, i.e., s0 >= delta.
 * However, in case of fractional delta, the tile is not extended
 * with delta * (st - 1), but instead with floor(delta * (st - 1)).
 * The condition therefore needs to be adjusted to
 *
 *	s0 + 1 > delta + 2 {delta * (st - 1)}
 *
 * (with {} the fractional part) to account for the two slanted sides.
 * The condition in the paper "Hybrid Hexagonal/Classical Tiling for GPUs"
 * translates to
 *
 *	s0 >= delta + {delta * (st - 1)}
 *
 * Since 1 > frac(delta * (st - 1)), this condition implies
 * the condition above.
 *
 * The condition is checked for both directions, i.e., for the lower
 * bound in the first space dimension and for the upper bound.
 */
isl_bool ppcg_ht_bounds_supports_sizes(__isl_keep ppcg_ht_bounds *bounds,
	__isl_keep isl_multi_val *sizes)
{
	isl_val *base, *height;
	isl_val *distance;
	isl_bool supported;

	supported = ppcg_ht_bounds_is_valid(bounds);
	if (supported <= 0)
		return supported;

	/* h = st - 1 */
	height = isl_val_sub_ui(isl_multi_val_get_val(sizes, 0), 1);
	base = isl_multi_val_get_val(sizes, 1);

	distance = ppcg_ht_bounds_get_lower(bounds, 0);
	supported = wide_enough(base, distance, height);
	isl_val_free(distance);

	/* Only test the other direction if the first test succeeded. */
	distance = ppcg_ht_bounds_get_upper(bounds);
	if (supported == isl_bool_true)
		supported = wide_enough(base, distance, height);
	isl_val_free(distance);

	isl_val_free(base);
	isl_val_free(height);

	return supported;
}

/* Check that the tile will be wide enough in the first space
 * dimension, i.e., the base of the hexagon.  This ensures that
 * neighboring hexagons in the same phase are far enough apart
 * that they do not depend on each other.
 *
 * Return isl_stat_ok if ppcg_ht_bounds_supports_sizes holds.
 * Report an error and return isl_stat_error if it does not hold;
 * return isl_stat_error if the test itself fails.
 */
static isl_stat check_width(__isl_keep ppcg_ht_bounds *bounds,
	__isl_keep isl_multi_val *sizes)
{
	isl_bool supported = ppcg_ht_bounds_supports_sizes(bounds, sizes);

	if (supported < 0)
		return isl_stat_error;
	if (supported)
		return isl_stat_ok;

	isl_die(isl_multi_val_get_ctx(sizes), isl_error_invalid,
		"base of hybrid tiling hexagon not sufficiently wide",
		return isl_stat_error);
}

/* Given valid bounds on the relative dependence distances for
 * the pair of nested nodes that "node" points to, as well as sufficiently
 * wide tile sizes "sizes", insert the corresponding time and space tiling
 * at "node", along with a pair of phase nodes that can be used
 * to make further changes.
 * The space of "sizes" should be the product of the spaces
 * of the schedules of the pair of parent and child nodes.
 * "options" determines whether full tiles should be separated
 * from partial tiles.
 *
 * In particular, given an input of the form
 *
 *	P - C - ...
 *
 * the output has the form
 *
 *	        /- F0 - M0 - CT0 - P - C - ...
 *	PT - seq
 *	        \- F1 - M1 - CT1 - P - C - ...
 *
 * PT is the global time tiling.  Within each of these tiles,
 * two phases are executed in order.  Within each phase, the schedule
 * space is further subdivided into tiles through CT0 and CT1.
 * The first dimension of each of these iterates over the hexagons
 * within a phase and these are independent by construction.
 * The F0 and F1 filters filter the statement instances that belong
 * to the corresponding phase.  The M0 and M1 marks contain a pointer
 * to a ppcg_ht_phase object that can be used to perform further changes.
 *
 * After checking that input satisfies the requirements,
 * a data structure is constructed that represents the tiling and
 * two additional data structures are constructed for the two phases
 * of the tiling.  These are then used to define the filters F0 and F1 and
 * combined to construct the time tiling PT.
 * Then the time tiling node PT is inserted, followed by
 * the sequence with the two filters, the CT space tiling nodes and
 * the phase markers M.
 *
 * All three isl arguments are consumed.  NULL is returned
 * if any of them is NULL or if the input pattern or width checks fail.
 */
__isl_give isl_schedule_node *ppcg_ht_bounds_insert_tiling(
	__isl_take ppcg_ht_bounds *bounds, __isl_take isl_multi_val *sizes,
	__isl_take isl_schedule_node *node, struct ppcg_options *options)
{
	isl_ctx *ctx;
	isl_union_set *phase0;
	isl_union_set *phase1;
	isl_multi_union_pw_aff *input, *dom_time;
	isl_union_pw_multi_aff *upma;
	isl_pw_multi_aff *time;
	isl_union_set_list *phases;
	ppcg_ht_tiling *tiling;
	ppcg_ht_phase *phase_0;
	ppcg_ht_phase *phase_1;

	if (!node || !sizes || !bounds)
		goto error;
	if (check_input_pattern(node) < 0 || check_width(bounds, sizes) < 0)
		goto error;

	ctx = isl_schedule_node_get_ctx(node);

	input = extract_input_schedule(node);

	/* "bounds" is handed over here; it is not freed again below.
	 * NOTE(review): presumably ppcg_ht_bounds_construct_tiling takes
	 * ownership of "bounds" - confirm against its definition.
	 */
	tiling = ppcg_ht_bounds_construct_tiling(bounds, node, input, sizes);
	/* Phase 0 is the shifted variant; phase 1 is the unshifted one. */
	phase_0 = ppcg_ht_tiling_compute_phase(tiling, 1);
	phase_1 = ppcg_ht_tiling_compute_phase(tiling, 0);
	time = combine_time_tile(phase_0, phase_1);
	/* Drop the local reference; the phases are still used below
	 * (through phase->tiling), so they presumably hold their own.
	 */
	ppcg_ht_tiling_free(tiling);

	/* Express the phase domains in terms of statement instances
	 * to obtain the filters F0 and F1.
	 */
	upma = isl_union_pw_multi_aff_from_multi_union_pw_aff(
					isl_multi_union_pw_aff_copy(input));
	phase0 = isl_union_set_from_set(ppcg_ht_phase_get_domain(phase_0));
	phase0 = isl_union_set_preimage_union_pw_multi_aff(phase0,
					isl_union_pw_multi_aff_copy(upma));
	phase1 = isl_union_set_from_set(ppcg_ht_phase_get_domain(phase_1));
	phase1 = isl_union_set_preimage_union_pw_multi_aff(phase1, upma);

	phases = isl_union_set_list_alloc(ctx, 2);
	phases = isl_union_set_list_add(phases, phase0);
	phases = isl_union_set_list_add(phases, phase1);

	/* Insert the time tiling PT; this consumes "input" and "time". */
	dom_time = isl_multi_union_pw_aff_apply_pw_multi_aff(input, time);
	node = isl_schedule_node_insert_partial_schedule(node, dom_time);

	/* Move below PT. */
	node = isl_schedule_node_child(node, 0);

	/* Insert the sequence and move to the child of the first filter. */
	node = isl_schedule_node_insert_sequence(node, phases);
	node = isl_schedule_node_child(node, 0);
	node = isl_schedule_node_child(node, 0);
	/* CT0 is inserted first and then M0 on top of it;
	 * insert_phase consumes phase_0.
	 */
	node = insert_space_tiling(phase_0, node, options);
	node = insert_phase(node, phase_0);
	/* Back to the first filter, on to the second filter and its child. */
	node = isl_schedule_node_parent(node);
	node = isl_schedule_node_next_sibling(node);
	node = isl_schedule_node_child(node, 0);
	/* CT1 and then M1; insert_phase consumes phase_1. */
	node = insert_space_tiling(phase_1, node, options);
	node = insert_phase(node, phase_1);
	/* Back up to the second filter and then the sequence node. */
	node = isl_schedule_node_parent(node);
	node = isl_schedule_node_parent(node);

	/* Return a pointer to the PT node. */
	node = isl_schedule_node_parent(node);

	isl_multi_val_free(sizes);
	return node;
error:
	isl_multi_val_free(sizes);
	isl_schedule_node_free(node);
	ppcg_ht_bounds_free(bounds);
	return NULL;
}

/* Given a branch "node" that contains a sequence node with two phases
 * of hybrid tiling as input, call "fn" on each of the two phase marker
 * nodes, passing along "user".
 *
 * That is, the input is as follows
 *
 *	         /- F0 - M0 - ...
 *	... - seq
 *	         \- F1 - M1 - ...
 *
 * and "fn" is called on M0 and on M1 (in that order).
 *
 * The sequence node is located by following first children
 * starting from "node".  The returned node is at the same tree depth
 * as the input node.  NULL is returned if one of the positions
 * on which "fn" needs to be called cannot be reached.
 */
__isl_give isl_schedule_node *hybrid_tile_foreach_phase(
	__isl_take isl_schedule_node *node,
	__isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node,
		void *user), void *user)
{
	int start_depth, end_depth;

	start_depth = isl_schedule_node_get_tree_depth(node);

	/* Descend until the sequence node (or an error) is reached. */
	for (;;) {
		if (!node)
			break;
		if (isl_schedule_node_get_type(node) ==
		    isl_schedule_node_sequence)
			break;
		node = isl_schedule_node_child(node, 0);
	}

	/* First filter and then its child. */
	node = isl_schedule_node_child(isl_schedule_node_child(node, 0), 0);
	if (!node)
		return NULL;
	node = fn(node, user);

	/* Back to the first filter, on to the second and then its child. */
	node = isl_schedule_node_next_sibling(isl_schedule_node_parent(node));
	node = isl_schedule_node_child(node, 0);
	if (!node)
		return NULL;
	node = fn(node, user);

	/* Back to the sequence node. */
	node = isl_schedule_node_parent(isl_schedule_node_parent(node));

	/* Undo the initial descent. */
	end_depth = isl_schedule_node_get_tree_depth(node);
	return isl_schedule_node_ancestor(node, end_depth - start_depth);
}

/* This function is called on each of the two phase marks
 * in a hybrid tiling tree.
 * Drop the phase mark at "node".
 *
 * "node" is returned unchanged if it is not a mark node or
 * if its identifier is not a phase identifier.
 * "node" is freed if the identifier cannot be checked.
 * "user" is not used.
 */
static __isl_give isl_schedule_node *drop_phase_mark(
	__isl_take isl_schedule_node *node, void *user)
{
	isl_id *mark_id;
	isl_bool matches;

	if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
		return node;

	mark_id = isl_schedule_node_mark_get_id(node);
	matches = is_phase_id(mark_id);
	isl_id_free(mark_id);

	if (matches < 0)
		return isl_schedule_node_free(node);
	if (!matches)
		return node;

	return isl_schedule_node_delete(node);
}

/* Given a branch "node" that contains a sequence node with two phases
 * of hybrid tiling as input, remove the two phase marker nodes
 * by calling drop_phase_mark on each of them.
 *
 * That is, the input is as follows
 *
 *	         /- F0 - M0 - ...
 *	... - seq
 *	         \- F1 - M1 - ...
 *
 * and the output is
 *
 *	         /- F0 - ...
 *	... - seq
 *	         \- F1 - ...
 */
__isl_give isl_schedule_node *hybrid_tile_drop_phase_marks(
	__isl_take isl_schedule_node *node)
{
	node = hybrid_tile_foreach_phase(node, drop_phase_mark, NULL);

	return node;
}


================================================
FILE: src/hybrid.h
================================================
#ifndef HYBRID_H
#define HYBRID_H

#include <isl/val.h>
#include <isl/schedule_node.h>

#include "ppcg.h"

/* Both types are only forward-declared here; their layout
 * is not exposed by this header.
 */
struct ppcg_ht_bounds;
typedef struct ppcg_ht_bounds ppcg_ht_bounds;

struct ppcg_ht_phase;
typedef struct ppcg_ht_phase ppcg_ht_phase;

/* Input pattern tests on "node" (or, presumably, on its parent
 * for the second function); "node" is only inspected.
 */
isl_bool ppcg_ht_has_input_pattern(__isl_keep isl_schedule_node *node);
isl_bool ppcg_ht_parent_has_input_pattern(__isl_keep isl_schedule_node *node);

/* Construction and inspection of ppcg_ht_bounds objects.
 * ppcg_ht_bounds_supports_sizes checks whether the tile sizes "sizes"
 * are wide enough in the first space dimension for hybrid tiling
 * using "bounds"; it returns the result of ppcg_ht_bounds_is_valid
 * directly if the bounds are not valid.
 */
__isl_give ppcg_ht_bounds *ppcg_ht_compute_bounds(struct ppcg_scop *scop,
	__isl_keep isl_schedule_node *node);
void ppcg_ht_bounds_dump(__isl_keep ppcg_ht_bounds *bounds);
isl_bool ppcg_ht_bounds_is_valid(__isl_keep ppcg_ht_bounds *bounds);
isl_bool ppcg_ht_bounds_supports_sizes(__isl_keep ppcg_ht_bounds *bounds,
	__isl_keep isl_multi_val *sizes);

/* Insert the time and space tiling described by "bounds" and "sizes"
 * at "node", along with a pair of phase mark nodes.
 * "bounds", "sizes" and "node" are all consumed.
 */
__isl_give isl_schedule_node *ppcg_ht_bounds_insert_tiling(
	__isl_take ppcg_ht_bounds *bounds, __isl_take isl_multi_val *sizes,
	__isl_take isl_schedule_node *node, struct ppcg_options *options);
__isl_null ppcg_ht_bounds *ppcg_ht_bounds_free(
	__isl_take ppcg_ht_bounds *bounds);

/* Extract the ppcg_ht_phase pointer stored in the phase mark "node".
 * The result is not a new reference (__isl_keep).
 */
__isl_keep ppcg_ht_phase *ppcg_ht_phase_extract_from_mark(
	__isl_keep isl_schedule_node *node);
/* Adjust the partial schedule of the band "node" such that
 * it starts at zero within each space tile of "phase".
 */
__isl_give isl_schedule_node *ppcg_ht_phase_shift_space_point(
	__isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node);
/* Call "fn" on the child of each of the two filters of the sequence
 * node that is found by following first children from "node".
 */
__isl_give isl_schedule_node *hybrid_tile_foreach_phase(
	__isl_take isl_schedule_node *node,
	__isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node,
		void *user),
	void *user);
/* Remove the two phase mark nodes from the branch "node". */
__isl_give isl_schedule_node *hybrid_tile_drop_phase_marks(
	__isl_take isl_schedule_node *node);

#endif


================================================
FILE: src/json.hpp
================================================
/*
    __ _____ _____ _____
 __|  |   __|     |   | |  JSON for Modern C++
|  |  |__   |  |  | | | |  version 3.9.1
|_____|_____|_____|_|___|  https://github.com/nlohmann/json

Licensed under the MIT License <http://opensource.org/licenses/MIT>.
SPDX-License-Identifier: MIT
Copyright (c) 2013-2019 Niels Lohmann <http://nlohmann.me>.

Permission is hereby  granted, free of charge, to any  person obtaining a copy
of this software and associated  documentation files (the "Software"), to deal
in the Software  without restriction, including without  limitation the rights
to  use, copy,  modify, merge,  publish, distribute,  sublicense, and/or  sell
copies  of  the Software,  and  to  permit persons  to  whom  the Software  is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE  IS PROVIDED "AS  IS", WITHOUT WARRANTY  OF ANY KIND,  EXPRESS OR
IMPLIED,  INCLUDING BUT  NOT  LIMITED TO  THE  WARRANTIES OF  MERCHANTABILITY,
FITNESS FOR  A PARTICULAR PURPOSE AND  NONINFRINGEMENT. IN NO EVENT  SHALL THE
AUTHORS  OR COPYRIGHT  HOLDERS  BE  LIABLE FOR  ANY  CLAIM,  DAMAGES OR  OTHER
LIABILITY, WHETHER IN AN ACTION OF  CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE  OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#ifndef INCLUDE_NLOHMANN_JSON_HPP_
#define INCLUDE_NLOHMANN_JSON_HPP_

#define NLOHMANN_JSON_VERSION_MAJOR 3
#define NLOHMANN_JSON_VERSION_MINOR 9
#define NLOHMANN_JSON_VERSION_PATCH 1

#include <algorithm> // all_of, find, for_each
#include <cstddef> // nullptr_t, ptrdiff_t, size_t
#include <functional> // hash, less
#include <initializer_list> // initializer_list
#include <iosfwd> // istream, ostream
#include <iterator> // random_access_iterator_tag
#include <memory> // unique_ptr
#include <numeric> // accumulate
#include <string> // string, stoi, to_string
#include <utility> // declval, forward, move, pair, swap
#include <vector> // vector

// #include <nlohmann/adl_serializer.hpp>


#include <utility>

// #include <nlohmann/detail/conversions/from_json.hpp>


#include <algorithm> // transform
#include <array> // array
#include <forward_list> // forward_list
#include <iterator> // inserter, front_inserter, end
#include <map> // map
#include <string> // string
#include <tuple> // tuple, make_tuple
#include <type_traits> // is_arithmetic, is_same, is_enum, underlying_type, is_convertible
#include <unordered_map> // unordered_map
#include <utility> // pair, declval
#include <valarray> // valarray

// #include <nlohmann/detail/exceptions.hpp>


#include <exception> // exception
#include <stdexcept> // runtime_error
#include <string> // to_string
#include <vector> // vector

// #include <nlohmann/detail/value_t.hpp>


#include <array> // array
#include <cstddef> // size_t
#include <cstdint> // uint8_t
#include <string> // string

namespace nlohmann
{
namespace detail
{
///////////////////////////
// JSON type enumeration //
///////////////////////////

/*!
@brief the JSON type enumeration

Enumerates the different JSON types. It is used internally to tell the
stored values apart, and the type predicates @ref basic_json::is_null(),
@ref basic_json::is_object(), @ref basic_json::is_array(),
@ref basic_json::is_string(), @ref basic_json::is_boolean(),
@ref basic_json::is_number() (with @ref basic_json::is_number_integer(),
@ref basic_json::is_number_unsigned(), and @ref basic_json::is_number_float()),
@ref basic_json::is_discarded(), @ref basic_json::is_primitive(), and
@ref basic_json::is_structured() rely on it.

@note Numbers have three entries (number_integer, number_unsigned, and
number_float) because the library keeps these three kinds apart:
@ref basic_json::number_unsigned_t holds unsigned integers,
@ref basic_json::number_integer_t holds signed integers, and
@ref basic_json::number_float_t holds floating-point numbers or
approximations of integers that do not fit in their respective type.

@sa see @ref basic_json::basic_json(const value_t value_type) -- create a JSON
value with the default value for a given type

@since version 1.0.0
*/
enum class value_t : std::uint8_t
{
    null,             ///< null value
    object,           ///< object (unordered set of name/value pairs)
    array,            ///< array (ordered collection of values)
    string,           ///< string value
    boolean,          ///< boolean value
    number_integer,   ///< number value (signed integer)
    number_unsigned,  ///< number value (unsigned integer)
    number_float,     ///< number value (floating-point)
    binary,           ///< binary array (ordered collection of bytes)
    discarded         ///< discarded by the parser callback function
};

/*!
@brief comparison operator for JSON types

Orders the types similarly to Python:
- order: null < boolean < number < object < array < string < binary
- the three number types share one rank, and no type is smaller than itself
- discarded values are not comparable: any comparison involving one is false
- Python represents binary as a b"" string that compares directly with a
  string; a binary array comparing equal in rank to a string would be
  surprising in a JSON file, so binary gets a rank of its own.

@since version 1.0.0
*/
inline bool operator<(const value_t lhs, const value_t rhs) noexcept
{
    // rank of each type, indexed by the numeric value of value_t;
    // value_t::discarded has no entry in this table
    static constexpr std::array<std::uint8_t, 9> rank = {{
            0 /* null */, 3 /* object */, 4 /* array */, 5 /* string */,
            1 /* boolean */, 2 /* integer */, 2 /* unsigned */, 2 /* float */,
            6 /* binary */
        }
    };

    const auto lhs_pos = static_cast<std::size_t>(lhs);
    const auto rhs_pos = static_cast<std::size_t>(rhs);

    // values without a rank never compare as smaller (or larger)
    if (lhs_pos >= rank.size() || rhs_pos >= rank.size())
    {
        return false;
    }

    return rank[lhs_pos] < rank[rhs_pos];
}
}  // namespace detail
}  // namespace nlohmann

// #include <nlohmann/detail/string_escape.hpp>


#include <string>
// #include <nlohmann/detail/macro_scope.hpp>


#include <utility> // pair
// #include <nlohmann/thirdparty/hedley/hedley.hpp>


/* Hedley - https://nemequ.github.io/hedley
 * Created by Evan Nemerson <evan@nemerson.com>
 *
 * To the extent possible under law, the author(s) have dedicated all
 * copyright and related and neighboring rights to this software to
 * the public domain worldwide. This software is distributed without
 * any warranty.
 *
 * For details, see <http://creativecommons.org/publicdomain/zero/1.0/>.
 * SPDX-License-Identifier: CC0-1.0
 */

#if !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < 15)
/* Discard any JSON_HEDLEY_VERSION that is already defined and record 15 as the
   version of the macro definitions that follow. */
#if defined(JSON_HEDLEY_VERSION)
    #undef JSON_HEDLEY_VERSION
#endif
#define JSON_HEDLEY_VERSION 15

/* JSON_HEDLEY_STRINGIFY_EX(token): turn the argument into a string literal
   exactly as written. */
#ifdef JSON_HEDLEY_STRINGIFY_EX
#  undef JSON_HEDLEY_STRINGIFY_EX
#endif
#define JSON_HEDLEY_STRINGIFY_EX(token) #token

/* JSON_HEDLEY_STRINGIFY(token): extra level of indirection so that a macro
   passed as the argument is expanded before it is stringified. */
#ifdef JSON_HEDLEY_STRINGIFY
#  undef JSON_HEDLEY_STRINGIFY
#endif
#define JSON_HEDLEY_STRINGIFY(token) JSON_HEDLEY_STRINGIFY_EX(token)

/* JSON_HEDLEY_CONCAT_EX(lhs,rhs): paste the two tokens together as written. */
#ifdef JSON_HEDLEY_CONCAT_EX
#  undef JSON_HEDLEY_CONCAT_EX
#endif
#define JSON_HEDLEY_CONCAT_EX(lhs,rhs) lhs##rhs

/* JSON_HEDLEY_CONCAT(lhs,rhs): same, but macro arguments are expanded first. */
#ifdef JSON_HEDLEY_CONCAT
#  undef JSON_HEDLEY_CONCAT
#endif
#define JSON_HEDLEY_CONCAT(lhs,rhs) JSON_HEDLEY_CONCAT_EX(lhs,rhs)

/* JSON_HEDLEY_CONCAT3_EX(first,second,third): paste three tokens together as
   written. */
#ifdef JSON_HEDLEY_CONCAT3_EX
#  undef JSON_HEDLEY_CONCAT3_EX
#endif
#define JSON_HEDLEY_CONCAT3_EX(first,second,third) first##second##third

/* JSON_HEDLEY_CONCAT3(first,second,third): same, but macro arguments are
   expanded first. */
#ifdef JSON_HEDLEY_CONCAT3
#  undef JSON_HEDLEY_CONCAT3
#endif
#define JSON_HEDLEY_CONCAT3(first,second,third) JSON_HEDLEY_CONCAT3_EX(first,second,third)

/* JSON_HEDLEY_VERSION_ENCODE(major,minor,revision): pack a three-part version
   into one integer, major * 1000000 + minor * 1000 + revision. */
#ifdef JSON_HEDLEY_VERSION_ENCODE
#  undef JSON_HEDLEY_VERSION_ENCODE
#endif
#define JSON_HEDLEY_VERSION_ENCODE(maj,min,rev) (((maj) * 1000000) + ((min) * 1000) + (rev))

/* The three decoders below split an integer of the form
   major * 1000000 + minor * 1000 + revision back into its parts. */

/* Major part: everything above the low six decimal digits. */
#ifdef JSON_HEDLEY_VERSION_DECODE_MAJOR
#  undef JSON_HEDLEY_VERSION_DECODE_MAJOR
#endif
#define JSON_HEDLEY_VERSION_DECODE_MAJOR(encoded) ((encoded) / 1000000)

/* Minor part: decimal digits four to six. */
#ifdef JSON_HEDLEY_VERSION_DECODE_MINOR
#  undef JSON_HEDLEY_VERSION_DECODE_MINOR
#endif
#define JSON_HEDLEY_VERSION_DECODE_MINOR(encoded) (((encoded) % 1000000) / 1000)

/* Revision part: the low three decimal digits. */
#ifdef JSON_HEDLEY_VERSION_DECODE_REVISION
#  undef JSON_HEDLEY_VERSION_DECODE_REVISION
#endif
#define JSON_HEDLEY_VERSION_DECODE_REVISION(encoded) ((encoded) % 1000)

/* JSON_HEDLEY_GNUC_VERSION: encoded __GNUC__ / __GNUC_MINOR__ /
   __GNUC_PATCHLEVEL__ (patch 0 when no patch level is reported). Left
   undefined when __GNUC__ is not defined. */
#ifdef JSON_HEDLEY_GNUC_VERSION
#  undef JSON_HEDLEY_GNUC_VERSION
#endif
#ifdef __GNUC__
#  ifdef __GNUC_PATCHLEVEL__
#    define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__)
#  else
#    define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, 0)
#  endif
#endif

/* JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch): true when the detected
   version is at least major.minor.patch; (0) when nothing was detected. */
#ifdef JSON_HEDLEY_GNUC_VERSION_CHECK
#  undef JSON_HEDLEY_GNUC_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_GNUC_VERSION
#  define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GNUC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_MSVC_VERSION: encoded from _MSC_FULL_VER (values from 140000000
   upward are split with a wider patch field than smaller ones) or, when that
   is unavailable, from _MSC_VER. Never defined when __ICL is defined. */
#if defined(JSON_HEDLEY_MSVC_VERSION)
    #undef JSON_HEDLEY_MSVC_VERSION
#endif
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) && !defined(__ICL)
    #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, (_MSC_FULL_VER % 10000000) / 100000, (_MSC_FULL_VER % 100000) / 100)
#elif defined(_MSC_FULL_VER) && !defined(__ICL)
    #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, (_MSC_FULL_VER % 1000000) / 10000, (_MSC_FULL_VER % 10000) / 10)
#elif defined(_MSC_VER) && !defined(__ICL)
    #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0)
#endif

/* JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch): true when the compiler's
   own version number is at least major.minor.patch; (0) when no MSVC version
   was detected. The comparison is made against _MSC_FULL_VER / _MSC_VER
   directly, using the digit layout that matches the _MSC_VER range.
   Every parameter is parenthesized so that an argument such as `18 + 1` is
   scaled as a whole rather than having only its last term multiplied. */
#if defined(JSON_HEDLEY_MSVC_VERSION_CHECK)
    #undef JSON_HEDLEY_MSVC_VERSION_CHECK
#endif
#if !defined(JSON_HEDLEY_MSVC_VERSION)
    #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (0)
#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
    #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= (((major) * 10000000) + ((minor) * 100000) + (patch)))
#elif defined(_MSC_VER) && (_MSC_VER >= 1200)
    #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= (((major) * 1000000) + ((minor) * 10000) + (patch)))
#else
    #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_VER >= (((major) * 100) + (minor)))
#endif

/* JSON_HEDLEY_INTEL_VERSION: encoded from __INTEL_COMPILER (hundreds as major,
   remainder as minor) with __INTEL_COMPILER_UPDATE as patch when available,
   0 otherwise. Only defined when __INTEL_COMPILER is set and __ICL is not. */
#ifdef JSON_HEDLEY_INTEL_VERSION
#  undef JSON_HEDLEY_INTEL_VERSION
#endif
#if defined(__INTEL_COMPILER) && !defined(__ICL)
#  ifdef __INTEL_COMPILER_UPDATE
#    define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER_UPDATE)
#  else
#    define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0)
#  endif
#endif

/* JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch): true when the detected
   version is at least major.minor.patch; (0) when nothing was detected. */
#ifdef JSON_HEDLEY_INTEL_VERSION_CHECK
#  undef JSON_HEDLEY_INTEL_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_INTEL_VERSION
#  define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_INTEL_CL_VERSION: defined only when __INTEL_COMPILER,
   __INTEL_COMPILER_UPDATE and __ICL are all defined; __INTEL_COMPILER is used
   as the major part and __INTEL_COMPILER_UPDATE as the minor part. */
#ifdef JSON_HEDLEY_INTEL_CL_VERSION
#  undef JSON_HEDLEY_INTEL_CL_VERSION
#endif
#if defined(__ICL) && defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE)
#  define JSON_HEDLEY_INTEL_CL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER, __INTEL_COMPILER_UPDATE, 0)
#endif

/* JSON_HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch): true when the
   detected version is at least major.minor.patch; (0) when nothing was
   detected. */
#ifdef JSON_HEDLEY_INTEL_CL_VERSION_CHECK
#  undef JSON_HEDLEY_INTEL_CL_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_INTEL_CL_VERSION
#  define JSON_HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_CL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_PGI_VERSION: encoded __PGIC__ / __PGIC_MINOR__ /
   __PGIC_PATCHLEVEL__; defined only when __PGI and all three are defined. */
#ifdef JSON_HEDLEY_PGI_VERSION
#  undef JSON_HEDLEY_PGI_VERSION
#endif
#if defined(__PGI) && defined(__PGIC__) && defined(__PGIC_MINOR__) && defined(__PGIC_PATCHLEVEL__)
#  define JSON_HEDLEY_PGI_VERSION JSON_HEDLEY_VERSION_ENCODE(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__)
#endif

/* JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch): true when the detected
   version is at least major.minor.patch; (0) when nothing was detected. */
#ifdef JSON_HEDLEY_PGI_VERSION_CHECK
#  undef JSON_HEDLEY_PGI_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_PGI_VERSION
#  define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PGI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_SUNPRO_VERSION: decoded from __SUNPRO_C, or from __SUNPRO_CC
   when __SUNPRO_C is not defined. Values above 0x1000 are read with two hex
   digits each for major and minor and one (scaled by 10) for patch; smaller
   values are read with one hex digit per component. */
#ifdef JSON_HEDLEY_SUNPRO_VERSION
#  undef JSON_HEDLEY_SUNPRO_VERSION
#endif
#if defined(__SUNPRO_C)
#  if (__SUNPRO_C > 0x1000)
#    define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_C >> 16) & 0xf) * 10) + ((__SUNPRO_C >> 12) & 0xf), (((__SUNPRO_C >> 8) & 0xf) * 10) + ((__SUNPRO_C >> 4) & 0xf), (__SUNPRO_C & 0xf) * 10)
#  else
#    define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_C >> 8) & 0xf, (__SUNPRO_C >> 4) & 0xf, (__SUNPRO_C) & 0xf)
#  endif
#elif defined(__SUNPRO_CC)
#  if (__SUNPRO_CC > 0x1000)
#    define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_CC >> 16) & 0xf) * 10) + ((__SUNPRO_CC >> 12) & 0xf), (((__SUNPRO_CC >> 8) & 0xf) * 10) + ((__SUNPRO_CC >> 4) & 0xf), (__SUNPRO_CC & 0xf) * 10)
#  else
#    define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_CC >> 8) & 0xf, (__SUNPRO_CC >> 4) & 0xf, (__SUNPRO_CC) & 0xf)
#  endif
#endif

/* JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch): true when the detected
   version is at least major.minor.patch; (0) when nothing was detected. */
#ifdef JSON_HEDLEY_SUNPRO_VERSION_CHECK
#  undef JSON_HEDLEY_SUNPRO_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_SUNPRO_VERSION
#  define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_SUNPRO_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_EMSCRIPTEN_VERSION: encoded __EMSCRIPTEN_major__ /
   __EMSCRIPTEN_minor__ / __EMSCRIPTEN_tiny__; defined only when
   __EMSCRIPTEN__ is defined. */
#ifdef JSON_HEDLEY_EMSCRIPTEN_VERSION
#  undef JSON_HEDLEY_EMSCRIPTEN_VERSION
#endif
#ifdef __EMSCRIPTEN__
#  define JSON_HEDLEY_EMSCRIPTEN_VERSION JSON_HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__)
#endif

/* JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch): true when the
   detected version is at least major.minor.patch; (0) when nothing was
   detected. */
#ifdef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK
#  undef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_EMSCRIPTEN_VERSION
#  define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_EMSCRIPTEN_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_ARM_VERSION: requires __CC_ARM; decoded from
   __ARMCOMPILER_VERSION when that is defined, otherwise from __ARMCC_VERSION
   (same digit layout for both). */
#ifdef JSON_HEDLEY_ARM_VERSION
#  undef JSON_HEDLEY_ARM_VERSION
#endif
#ifdef __CC_ARM
#  if defined(__ARMCOMPILER_VERSION)
#    define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCOMPILER_VERSION / 1000000, (__ARMCOMPILER_VERSION % 1000000) / 10000, (__ARMCOMPILER_VERSION % 10000) / 100)
#  elif defined(__ARMCC_VERSION)
#    define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCC_VERSION / 1000000, (__ARMCC_VERSION % 1000000) / 10000, (__ARMCC_VERSION % 10000) / 100)
#  endif
#endif

/* JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch): true when the detected
   version is at least major.minor.patch; (0) when nothing was detected. */
#ifdef JSON_HEDLEY_ARM_VERSION_CHECK
#  undef JSON_HEDLEY_ARM_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_ARM_VERSION
#  define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_ARM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_IBM_VERSION: taken from the __ibmxl_* macros when __ibmxl__ is
   defined; otherwise decoded from __xlC__ (high byte major, low byte minor),
   with the patch read from __xlC_ver__ when available and 0 otherwise. */
#ifdef JSON_HEDLEY_IBM_VERSION
#  undef JSON_HEDLEY_IBM_VERSION
#endif
#if defined(__ibmxl__)
#  define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ibmxl_version__, __ibmxl_release__, __ibmxl_modification__)
#elif defined(__xlC__)
#  ifdef __xlC_ver__
#    define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, (__xlC_ver__ >> 8) & 0xff)
#  else
#    define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, 0)
#  endif
#endif

/* JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch): true when the detected
   version is at least major.minor.patch; (0) when nothing was detected. */
#ifdef JSON_HEDLEY_IBM_VERSION_CHECK
#  undef JSON_HEDLEY_IBM_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_IBM_VERSION
#  define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IBM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_TI_VERSION: decoded from __TI_COMPILER_VERSION__, but only when
   one of __TMS470__, __TI_ARM__, __MSP430__ or __TMS320C2000__ is defined and
   the reported value is at least 16000000. */
#ifdef JSON_HEDLEY_TI_VERSION
#  undef JSON_HEDLEY_TI_VERSION
#endif
#if \
    defined(__TI_COMPILER_VERSION__) && \
    (defined(__TMS470__) || defined(__TI_ARM__) || defined(__MSP430__) || defined(__TMS320C2000__)) && \
    (__TI_COMPILER_VERSION__ >= 16000000)
#  define JSON_HEDLEY_TI_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
#endif

/* JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch): true when the detected
   version is at least major.minor.patch; (0) when nothing was detected. */
#ifdef JSON_HEDLEY_TI_VERSION_CHECK
#  undef JSON_HEDLEY_TI_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_TI_VERSION
#  define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_TI_CL2000_VERSION: decoded from __TI_COMPILER_VERSION__ when
   __TMS320C2000__ is defined as well. */
#ifdef JSON_HEDLEY_TI_CL2000_VERSION
#  undef JSON_HEDLEY_TI_CL2000_VERSION
#endif
#if defined(__TMS320C2000__) && defined(__TI_COMPILER_VERSION__)
#  define JSON_HEDLEY_TI_CL2000_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
#endif

/* JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch): true when the
   detected version is at least major.minor.patch; (0) when nothing was
   detected. */
#ifdef JSON_HEDLEY_TI_CL2000_VERSION_CHECK
#  undef JSON_HEDLEY_TI_CL2000_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_TI_CL2000_VERSION
#  define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL2000_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_TI_CL430_VERSION: decoded from __TI_COMPILER_VERSION__ when
   __MSP430__ is defined as well. */
#ifdef JSON_HEDLEY_TI_CL430_VERSION
#  undef JSON_HEDLEY_TI_CL430_VERSION
#endif
#if defined(__MSP430__) && defined(__TI_COMPILER_VERSION__)
#  define JSON_HEDLEY_TI_CL430_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
#endif

/* JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch): true when the
   detected version is at least major.minor.patch; (0) when nothing was
   detected. */
#ifdef JSON_HEDLEY_TI_CL430_VERSION_CHECK
#  undef JSON_HEDLEY_TI_CL430_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_TI_CL430_VERSION
#  define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL430_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_TI_ARMCL_VERSION: decoded from __TI_COMPILER_VERSION__ when
   __TMS470__ or __TI_ARM__ is defined as well. */
#ifdef JSON_HEDLEY_TI_ARMCL_VERSION
#  undef JSON_HEDLEY_TI_ARMCL_VERSION
#endif
#if (defined(__TMS470__) || defined(__TI_ARM__)) && defined(__TI_COMPILER_VERSION__)
#  define JSON_HEDLEY_TI_ARMCL_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
#endif

/* JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch): true when the
   detected version is at least major.minor.patch; (0) when nothing was
   detected. */
#ifdef JSON_HEDLEY_TI_ARMCL_VERSION_CHECK
#  undef JSON_HEDLEY_TI_ARMCL_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_TI_ARMCL_VERSION
#  define JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_ARMCL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_TI_CL6X_VERSION: decoded from __TI_COMPILER_VERSION__ when
   __TMS320C6X__ is defined as well. */
#ifdef JSON_HEDLEY_TI_CL6X_VERSION
#  undef JSON_HEDLEY_TI_CL6X_VERSION
#endif
#if defined(__TMS320C6X__) && defined(__TI_COMPILER_VERSION__)
#  define JSON_HEDLEY_TI_CL6X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
#endif

/* JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch): true when the
   detected version is at least major.minor.patch; (0) when nothing was
   detected. */
#ifdef JSON_HEDLEY_TI_CL6X_VERSION_CHECK
#  undef JSON_HEDLEY_TI_CL6X_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_TI_CL6X_VERSION
#  define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL6X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_TI_CL7X_VERSION: decoded from __TI_COMPILER_VERSION__ when
   __C7000__ is defined as well. */
#ifdef JSON_HEDLEY_TI_CL7X_VERSION
#  undef JSON_HEDLEY_TI_CL7X_VERSION
#endif
#if defined(__C7000__) && defined(__TI_COMPILER_VERSION__)
#  define JSON_HEDLEY_TI_CL7X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
#endif

/* JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch): true when the
   detected version is at least major.minor.patch; (0) when nothing was
   detected. */
#ifdef JSON_HEDLEY_TI_CL7X_VERSION_CHECK
#  undef JSON_HEDLEY_TI_CL7X_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_TI_CL7X_VERSION
#  define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL7X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_TI_CLPRU_VERSION: decoded from __TI_COMPILER_VERSION__ when
   __PRU__ is defined as well. */
#ifdef JSON_HEDLEY_TI_CLPRU_VERSION
#  undef JSON_HEDLEY_TI_CLPRU_VERSION
#endif
#if defined(__PRU__) && defined(__TI_COMPILER_VERSION__)
#  define JSON_HEDLEY_TI_CLPRU_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
#endif

/* JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch): true when the
   detected version is at least major.minor.patch; (0) when nothing was
   detected. */
#ifdef JSON_HEDLEY_TI_CLPRU_VERSION_CHECK
#  undef JSON_HEDLEY_TI_CLPRU_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_TI_CLPRU_VERSION
#  define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CLPRU_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_CRAY_VERSION: requires _CRAYC; encoded _RELEASE_MAJOR /
   _RELEASE_MINOR with _RELEASE_PATCHLEVEL as patch when it is defined and 0
   otherwise. */
#ifdef JSON_HEDLEY_CRAY_VERSION
#  undef JSON_HEDLEY_CRAY_VERSION
#endif
#if defined(_CRAYC) && defined(_RELEASE_PATCHLEVEL)
#  define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, _RELEASE_PATCHLEVEL)
#elif defined(_CRAYC)
#  define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, 0)
#endif

/* JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch): true when the detected
   version is at least major.minor.patch; (0) when nothing was detected. */
#ifdef JSON_HEDLEY_CRAY_VERSION_CHECK
#  undef JSON_HEDLEY_CRAY_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_CRAY_VERSION
#  define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_CRAY_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_IAR_VERSION: requires __IAR_SYSTEMS_ICC__; decoded from __VER__.
   Values above 1000 are split into three groups of three decimal digits,
   smaller values into hundreds (major) and remainder (minor) with patch 0. */
#ifdef JSON_HEDLEY_IAR_VERSION
#  undef JSON_HEDLEY_IAR_VERSION
#endif
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ > 1000)
#  define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), (__VER__ % 1000))
#elif defined(__IAR_SYSTEMS_ICC__)
#  define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE(__VER__ / 100, __VER__ % 100, 0)
#endif

/* JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch): true when the detected
   version is at least major.minor.patch; (0) when nothing was detected. */
#ifdef JSON_HEDLEY_IAR_VERSION_CHECK
#  undef JSON_HEDLEY_IAR_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_IAR_VERSION
#  define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IAR_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_TINYC_VERSION: decoded from __TINYC__ (thousands as major, the
   hundreds digit as minor, the low two decimal digits as patch). */
#ifdef JSON_HEDLEY_TINYC_VERSION
#  undef JSON_HEDLEY_TINYC_VERSION
#endif
#ifdef __TINYC__
#  define JSON_HEDLEY_TINYC_VERSION JSON_HEDLEY_VERSION_ENCODE(__TINYC__ / 1000, (__TINYC__ / 100) % 10, __TINYC__ % 100)
#endif

/* JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch): true when the detected
   version is at least major.minor.patch; (0) when nothing was detected. */
#ifdef JSON_HEDLEY_TINYC_VERSION_CHECK
#  undef JSON_HEDLEY_TINYC_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_TINYC_VERSION
#  define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TINYC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_DMC_VERSION: decoded from __DMC__ (bits above the low byte as
   major, then one hex digit each for minor and patch). */
#ifdef JSON_HEDLEY_DMC_VERSION
#  undef JSON_HEDLEY_DMC_VERSION
#endif
#ifdef __DMC__
#  define JSON_HEDLEY_DMC_VERSION JSON_HEDLEY_VERSION_ENCODE(__DMC__ >> 8, (__DMC__ >> 4) & 0xf, __DMC__ & 0xf)
#endif

/* JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch): true when the detected
   version is at least major.minor.patch; (0) when nothing was detected. */
#ifdef JSON_HEDLEY_DMC_VERSION_CHECK
#  undef JSON_HEDLEY_DMC_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_DMC_VERSION
#  define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_DMC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_COMPCERT_VERSION: decoded from __COMPCERT_VERSION__, two decimal
   digits each for minor and patch and the remaining leading digits as major. */
#ifdef JSON_HEDLEY_COMPCERT_VERSION
#  undef JSON_HEDLEY_COMPCERT_VERSION
#endif
#ifdef __COMPCERT_VERSION__
#  define JSON_HEDLEY_COMPCERT_VERSION JSON_HEDLEY_VERSION_ENCODE(__COMPCERT_VERSION__ / 10000, (__COMPCERT_VERSION__ / 100) % 100, __COMPCERT_VERSION__ % 100)
#endif

/* JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch): true when the
   detected version is at least major.minor.patch; (0) when nothing was
   detected. */
#ifdef JSON_HEDLEY_COMPCERT_VERSION_CHECK
#  undef JSON_HEDLEY_COMPCERT_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_COMPCERT_VERSION
#  define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_COMPCERT_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_PELLES_VERSION: decoded from __POCC__ (hundreds as major,
   remainder as minor, patch always 0). */
#ifdef JSON_HEDLEY_PELLES_VERSION
#  undef JSON_HEDLEY_PELLES_VERSION
#endif
#ifdef __POCC__
#  define JSON_HEDLEY_PELLES_VERSION JSON_HEDLEY_VERSION_ENCODE(__POCC__ / 100, __POCC__ % 100, 0)
#endif

/* JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch): true when the detected
   version is at least major.minor.patch; (0) when nothing was detected. */
#ifdef JSON_HEDLEY_PELLES_VERSION_CHECK
#  undef JSON_HEDLEY_PELLES_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_PELLES_VERSION
#  define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PELLES_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_MCST_LCC_VERSION: requires __LCC__ and __LCC_MINOR__; __LCC__
   supplies major (hundreds) and minor (remainder), __LCC_MINOR__ the patch. */
#ifdef JSON_HEDLEY_MCST_LCC_VERSION
#  undef JSON_HEDLEY_MCST_LCC_VERSION
#endif
#if defined(__LCC_MINOR__) && defined(__LCC__)
#  define JSON_HEDLEY_MCST_LCC_VERSION JSON_HEDLEY_VERSION_ENCODE(__LCC__ / 100, __LCC__ % 100, __LCC_MINOR__)
#endif

/* JSON_HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch): true when the
   detected version is at least major.minor.patch; (0) when nothing was
   detected. */
#ifdef JSON_HEDLEY_MCST_LCC_VERSION_CHECK
#  undef JSON_HEDLEY_MCST_LCC_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_MCST_LCC_VERSION
#  define JSON_HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_MCST_LCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_GCC_VERSION: equal to JSON_HEDLEY_GNUC_VERSION, but only when
   none of the other compilers listed below has been detected, i.e. when the
   __GNUC__ macros were not set by one of them. */
#ifdef JSON_HEDLEY_GCC_VERSION
#  undef JSON_HEDLEY_GCC_VERSION
#endif
#if defined(JSON_HEDLEY_GNUC_VERSION) && !( \
        defined(__clang__) || \
        defined(JSON_HEDLEY_INTEL_VERSION) || \
        defined(JSON_HEDLEY_PGI_VERSION) || \
        defined(JSON_HEDLEY_ARM_VERSION) || \
        defined(JSON_HEDLEY_CRAY_VERSION) || \
        defined(JSON_HEDLEY_TI_VERSION) || \
        defined(JSON_HEDLEY_TI_ARMCL_VERSION) || \
        defined(JSON_HEDLEY_TI_CL430_VERSION) || \
        defined(JSON_HEDLEY_TI_CL2000_VERSION) || \
        defined(JSON_HEDLEY_TI_CL6X_VERSION) || \
        defined(JSON_HEDLEY_TI_CL7X_VERSION) || \
        defined(JSON_HEDLEY_TI_CLPRU_VERSION) || \
        defined(__COMPCERT__) || \
        defined(JSON_HEDLEY_MCST_LCC_VERSION))
#  define JSON_HEDLEY_GCC_VERSION JSON_HEDLEY_GNUC_VERSION
#endif

/* JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch): true when the detected
   version is at least major.minor.patch; (0) when nothing was detected. */
#ifdef JSON_HEDLEY_GCC_VERSION_CHECK
#  undef JSON_HEDLEY_GCC_VERSION_CHECK
#endif
#ifndef JSON_HEDLEY_GCC_VERSION
#  define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (0)
#else
#  define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
#endif

/* JSON_HEDLEY_HAS_ATTRIBUTE(attribute): forwards to __has_attribute when the
   compiler provides it (for IAR only from version 8.5.9 on); otherwise (0). */
#ifdef JSON_HEDLEY_HAS_ATTRIBUTE
#  undef JSON_HEDLEY_HAS_ATTRIBUTE
#endif
#if defined(__has_attribute) && (!defined(JSON_HEDLEY_IAR_VERSION) || JSON_HEDLEY_IAR_VERSION_CHECK(8,5,9))
#  define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute)
#else
#  define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) (0)
#endif

/* JSON_HEDLEY_GNUC_HAS_ATTRIBUTE / JSON_HEDLEY_GCC_HAS_ATTRIBUTE: use
   JSON_HEDLEY_HAS_ATTRIBUTE when __has_attribute exists; without it, fall back
   to comparing the GNUC / GCC version against major.minor.patch. */
#ifdef JSON_HEDLEY_GNUC_HAS_ATTRIBUTE
#  undef JSON_HEDLEY_GNUC_HAS_ATTRIBUTE
#endif
#ifdef JSON_HEDLEY_GCC_HAS_ATTRIBUTE
#  undef JSON_HEDLEY_GCC_HAS_ATTRIBUTE
#endif
#ifdef __has_attribute
#  define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_HAS_ATTRIBUTE(attribute)
#  define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_HAS_ATTRIBUTE(attribute)
#else
#  define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
#  define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
#endif

/* JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute): forwards to __has_cpp_attribute
   in C++ when the compiler provides it (for SunPro only from 5.15.0 on);
   otherwise (0). */
#ifdef JSON_HEDLEY_HAS_CPP_ATTRIBUTE
#  undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE
#endif
#if defined(__cplusplus) && defined(__has_cpp_attribute) && (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0))
#  define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute)
#else
#  define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0)
#endif

/* JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute): queries the namespaced
   attribute ns::attribute through JSON_HEDLEY_HAS_CPP_ATTRIBUTE. Yields (0)
   outside C++, without __has_cpp_attribute, and on PGI, IAR, SunPro before
   5.15.0 and MSVC before 19.20.0. */
#ifdef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS
#  undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS
#endif
#if defined(__cplusplus) && defined(__has_cpp_attribute)
#  if \
    !defined(JSON_HEDLEY_PGI_VERSION) && \
    !defined(JSON_HEDLEY_IAR_VERSION) && \
    (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \
    (!defined(JSON_HEDLEY_MSVC_VERSION) || JSON_HEDLEY_MSVC_VERSION_CHECK(19,20,0))
#    define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute)
#  else
#    define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0)
#  endif
#else
#  define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0)
#endif

/* JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE / JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE:
   forward to __has_cpp_attribute in C++ when it exists; otherwise fall back
   to comparing the GNUC / GCC version against major.minor.patch. */
#ifdef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE
#  undef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE
#endif
#ifdef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE
#  undef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE
#endif
#if defined(__cplusplus) && defined(__has_cpp_attribute)
#  define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute)
#  define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute)
#else
#  define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
#  define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
#endif

/* Builtin detection.
   JSON_HEDLEY_HAS_BUILTIN(builtin) forwards to __has_builtin, or is (0) when
   the compiler lacks it. The GNUC_ / GCC_ variants also forward to
   __has_builtin, but fall back to a GNUC / GCC version comparison against
   major.minor.patch when it is missing. */
#ifdef JSON_HEDLEY_HAS_BUILTIN
#  undef JSON_HEDLEY_HAS_BUILTIN
#endif
#ifdef JSON_HEDLEY_GNUC_HAS_BUILTIN
#  undef JSON_HEDLEY_GNUC_HAS_BUILTIN
#endif
#ifdef JSON_HEDLEY_GCC_HAS_BUILTIN
#  undef JSON_HEDLEY_GCC_HAS_BUILTIN
#endif
#ifdef __has_builtin
#  define JSON_HEDLEY_HAS_BUILTIN(builtin) __has_builtin(builtin)
#  define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin)
#  define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin)
#else
#  define JSON_HEDLEY_HAS_BUILTIN(builtin) (0)
#  define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
#  define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
#endif

/* Feature detection.
   JSON_HEDLEY_HAS_FEATURE(feature) forwards to __has_feature, or is (0) when
   the compiler lacks it. The GNUC_ / GCC_ variants also forward to
   __has_feature, but fall back to a GNUC / GCC version comparison against
   major.minor.patch when it is missing. */
#ifdef JSON_HEDLEY_HAS_FEATURE
#  undef JSON_HEDLEY_HAS_FEATURE
#endif
#ifdef JSON_HEDLEY_GNUC_HAS_FEATURE
#  undef JSON_HEDLEY_GNUC_HAS_FEATURE
#endif
#ifdef JSON_HEDLEY_GCC_HAS_FEATURE
#  undef JSON_HEDLEY_GCC_HAS_FEATURE
#endif
#ifdef __has_feature
#  define JSON_HEDLEY_HAS_FEATURE(feature) __has_feature(feature)
#  define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature)
#  define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature)
#else
#  define JSON_HEDLEY_HAS_FEATURE(feature) (0)
#  define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
#  define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
#endif

/* Extension detection.
   JSON_HEDLEY_HAS_EXTENSION(extension) forwards to __has_extension, or is (0)
   when the compiler lacks it. The GNUC_ / GCC_ variants also forward to
   __has_extension, but fall back to a GNUC / GCC version comparison against
   major.minor.patch when it is missing. */
#ifdef JSON_HEDLEY_HAS_EXTENSION
#  undef JSON_HEDLEY_HAS_EXTENSION
#endif
#ifdef JSON_HEDLEY_GNUC_HAS_EXTENSION
#  undef JSON_HEDLEY_GNUC_HAS_EXTENSION
#endif
#ifdef JSON_HEDLEY_GCC_HAS_EXTENSION
#  undef JSON_HEDLEY_GCC_HAS_EXTENSION
#endif
#ifdef __has_extension
#  define JSON_HEDLEY_HAS_EXTENSION(extension) __has_extension(extension)
#  define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension)
#  define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension)
#else
#  define JSON_HEDLEY_HAS_EXTENSION(extension) (0)
#  define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
#  define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
#endif

/* __declspec attribute detection.
   JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) forwards to
   __has_declspec_attribute, or is (0) when the compiler lacks it. The GNUC_ /
   GCC_ variants also forward to __has_declspec_attribute, but fall back to a
   GNUC / GCC version comparison against major.minor.patch when it is
   missing. */
#ifdef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE
#  undef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE
#endif
#ifdef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE
#  undef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE
#endif
#ifdef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE
#  undef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE
#endif
#ifdef __has_declspec_attribute
#  define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) __has_declspec_attribute(attribute)
#  define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute)
#  define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute)
#else
#  define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) (0)
#  define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
#  define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
#endif

/* Warning-option detection.
   JSON_HEDLEY_HAS_WARNING(warning) forwards to __has_warning, or is (0) when
   the compiler lacks it. The GNUC_ / GCC_ variants also forward to
   __has_warning, but fall back to a GNUC / GCC version comparison against
   major.minor.patch when it is missing. */
#ifdef JSON_HEDLEY_HAS_WARNING
#  undef JSON_HEDLEY_HAS_WARNING
#endif
#ifdef JSON_HEDLEY_GNUC_HAS_WARNING
#  undef JSON_HEDLEY_GNUC_HAS_WARNING
#endif
#ifdef JSON_HEDLEY_GCC_HAS_WARNING
#  undef JSON_HEDLEY_GCC_HAS_WARNING
#endif
#ifdef __has_warning
#  define JSON_HEDLEY_HAS_WARNING(warning) __has_warning(warning)
#  define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning)
#  define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning)
#else
#  define JSON_HEDLEY_HAS_WARNING(warning) (0)
#  define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
#  define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
#endif

/* JSON_HEDLEY_PRAGMA(value): emit `value` as a pragma from inside a macro.
   Uses _Pragma with the stringified argument when C99 or one of the listed
   compiler versions is detected, __pragma on MSVC 15.0.0 and later, and
   expands to nothing otherwise.
   Like every other macro in this section, a previous definition is removed
   first so that a differing earlier definition is replaced rather than
   redefined. */
#if defined(JSON_HEDLEY_PRAGMA)
    #undef JSON_HEDLEY_PRAGMA
#endif
#if \
    (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
    defined(__clang__) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \
    JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,0,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0) || \
    JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,17) || \
    JSON_HEDLEY_SUNPRO_VERSION_CHECK(8,0,0) || \
    (JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) && defined(__C99_PRAGMA_OPERATOR))
    #define JSON_HEDLEY_PRAGMA(value) _Pragma(#value)
#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0)
    #define JSON_HEDLEY_PRAGMA(value) __pragma(value)
#else
    #define JSON_HEDLEY_PRAGMA(value)
#endif

/* JSON_HEDLEY_DIAGNOSTIC_PUSH / JSON_HEDLEY_DIAGNOSTIC_POP: save and restore
   the compiler's diagnostic state using the pragma spelling of the first
   compiler matched below. The tests are tried in the order written (clang is
   checked before Intel and GCC). Both macros expand to nothing when no
   branch matches. */
#if defined(JSON_HEDLEY_DIAGNOSTIC_PUSH)
    #undef JSON_HEDLEY_DIAGNOSTIC_PUSH
#endif
#if defined(JSON_HEDLEY_DIAGNOSTIC_POP)
    #undef JSON_HEDLEY_DIAGNOSTIC_POP
#endif
#if defined(__clang__)
    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop")
#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0)
    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push")
    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop")
#elif \
    JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push))
    #define JSON_HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop))
#elif JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0)
    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("push")
    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("pop")
#elif \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,4,0) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push")
    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop")
#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0)
    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
#else
    /* no known way to save/restore diagnostics: both macros are empty */
    #define JSON_HEDLEY_DIAGNOSTIC_PUSH
    #define JSON_HEDLEY_DIAGNOSTIC_POP
#endif

/* JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for
   HEDLEY INTERNAL USE ONLY.  API subject to change without notice. */
/* Expands its argument between JSON_HEDLEY_DIAGNOSTIC_PUSH and
   JSON_HEDLEY_DIAGNOSTIC_POP with "-Wc++98-compat" ignored. This happens only
   in C++ and only when JSON_HEDLEY_HAS_WARNING reports that warning.
   "-Wc++17-extensions" and "-Wc++1z-extensions" are ignored as well when they
   are reported (the latter is only tested when the former is known).
   In every other case the macro expands to its argument unchanged. */
#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_)
    #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_
#endif
#if defined(__cplusplus)
#  if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat")
#    if JSON_HEDLEY_HAS_WARNING("-Wc++17-extensions")
#      if JSON_HEDLEY_HAS_WARNING("-Wc++1z-extensions")
#        define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \
    JSON_HEDLEY_DIAGNOSTIC_PUSH \
    _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \
    _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \
    _Pragma("clang diagnostic ignored \"-Wc++1z-extensions\"") \
    xpr \
    JSON_HEDLEY_DIAGNOSTIC_POP
#      else
#        define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \
    JSON_HEDLEY_DIAGNOSTIC_PUSH \
    _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \
    _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \
    xpr \
    JSON_HEDLEY_DIAGNOSTIC_POP
#      endif
#    else
#      define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \
    JSON_HEDLEY_DIAGNOSTIC_PUSH \
    _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \
    xpr \
    JSON_HEDLEY_DIAGNOSTIC_POP
#    endif
#  endif
#endif
/* Fallback: nothing to suppress, so pass the argument straight through. */
#if !defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x
#endif

/* JSON_HEDLEY_CONST_CAST(T, value): cast `value` to type `T`.
   - C++:  uses const_cast<T>.
   - C with a known cast-qual warning (clang's -Wcast-qual, GCC >= 4.6,
     Intel >= 13): a C-style cast inside a statement expression, with the
     cast-qual diagnostic disabled between a diagnostic push and pop.
   - otherwise: a plain C-style cast. */
#ifdef JSON_HEDLEY_CONST_CAST
#  undef JSON_HEDLEY_CONST_CAST
#endif
#if defined(__cplusplus)
#  define JSON_HEDLEY_CONST_CAST(T, value) (const_cast<T>(value))
#elif \
    JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_HAS_WARNING("-Wcast-qual")
#  define JSON_HEDLEY_CONST_CAST(T, value) \
    (__extension__ ({ \
        JSON_HEDLEY_DIAGNOSTIC_PUSH \
        JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \
        ((T) (value)); \
        JSON_HEDLEY_DIAGNOSTIC_POP \
    }))
#else
#  define JSON_HEDLEY_CONST_CAST(T, value) ((T) (value))
#endif

/* JSON_HEDLEY_REINTERPRET_CAST(T, value): reinterpret_cast<T> in C++,
   an ordinary C-style cast to T when compiled as C. */
#ifdef JSON_HEDLEY_REINTERPRET_CAST
#  undef JSON_HEDLEY_REINTERPRET_CAST
#endif
#if !defined(__cplusplus)
#  define JSON_HEDLEY_REINTERPRET_CAST(T, value) ((T) (value))
#else
#  define JSON_HEDLEY_REINTERPRET_CAST(T, value) (reinterpret_cast<T>(value))
#endif

/* JSON_HEDLEY_STATIC_CAST(T, value): static_cast<T> in C++, an ordinary
   C-style cast to T when compiled as C. */
#ifdef JSON_HEDLEY_STATIC_CAST
#  undef JSON_HEDLEY_STATIC_CAST
#endif
#if !defined(__cplusplus)
#  define JSON_HEDLEY_STATIC_CAST(T, value) ((T) (value))
#else
#  define JSON_HEDLEY_STATIC_CAST(T, value) (static_cast<T>(value))
#endif

/* JSON_HEDLEY_CPP_CAST(T, expr)
 *
 * In C++ this performs a C-style cast of `expr` to `T`.  Where a way to
 * silence the compiler's old-style-cast diagnostic is known (clang's
 * -Wold-style-cast, IAR's Pe137) the cast is bracketed by a diagnostic
 * push / suppress / pop.  When compiled as C no cast is applied and the
 * macro expands to `(expr)`.
 */
#if defined(JSON_HEDLEY_CPP_CAST)
    #undef JSON_HEDLEY_CPP_CAST
#endif
#if defined(__cplusplus)
#  if JSON_HEDLEY_HAS_WARNING("-Wold-style-cast")
#    define JSON_HEDLEY_CPP_CAST(T, expr) \
    JSON_HEDLEY_DIAGNOSTIC_PUSH \
    _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") \
    ((T) (expr)) \
    JSON_HEDLEY_DIAGNOSTIC_POP
#  elif JSON_HEDLEY_IAR_VERSION_CHECK(8,3,0)
     /* The cast has to sit between the suppression and the pop; without
        it the macro would expand to pragmas only and produce no value. */
#    define JSON_HEDLEY_CPP_CAST(T, expr) \
    JSON_HEDLEY_DIAGNOSTIC_PUSH \
    _Pragma("diag_suppress=Pe137") \
    ((T) (expr)) \
    JSON_HEDLEY_DIAGNOSTIC_POP
#  else
#    define JSON_HEDLEY_CPP_CAST(T, expr) ((T) (expr))
#  endif
#else
#  define JSON_HEDLEY_CPP_CAST(T, expr) (expr)
#endif

/* JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
 *
 * Expands to the compiler-specific pragma that turns off "deprecated"
 * diagnostics, or to nothing when none of the checks below match.
 * Branches are tried top to bottom and the first match wins, so the order
 * is significant (for example, the PGI 20.7 check must precede the
 * PGI 17.10 check).  The numeric / symbolic codes are each vendor's own
 * diagnostic identifiers.
 */
#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED)
    #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
#endif
#if JSON_HEDLEY_HAS_WARNING("-Wdeprecated-declarations")
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"")
#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warning(disable:1478 1786)")
#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:1478 1786))
#elif JSON_HEDLEY_PGI_VERSION_CHECK(20,7,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1216,1444,1445")
#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444")
#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:4996))
#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444")
/* TI compilers: either a new enough release, or an older release that
   additionally defines __TI_GNU_ATTRIBUTE_SUPPORT__. */
#elif \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718")
/* SunPro uses different message tags when compiling C versus C++. */
#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && !defined(__cplusplus)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)")
#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && defined(__cplusplus)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,symdeprecated,symdeprecated2)")
#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress=Pe1444,Pe1215")
#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warn(disable:2241)")
#else
    /* Unknown compiler: nothing to suppress. */
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
#endif

/* JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
 *
 * Expands to the compiler-specific pragma that turns off the "unknown
 * pragma" diagnostic, or to nothing when no check below matches.
 * Branches are tried top to bottom; the first match wins.
 */
#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS)
    #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
#endif
#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas")
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"")
#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("warning(disable:161)")
#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:161))
#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675")
#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"")
#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:4068))
/* TI compilers.  The CL6X >= 8.0.0 case is handled here; a second,
   identical CL6X-only branch that used to follow was unreachable and has
   been dropped. */
#elif \
    JSON_HEDLEY_TI_VERSION_CHECK(16,9,0) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163")
#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161")
#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 161")
#else
    /* Unknown compiler: nothing to suppress. */
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
#endif

/* JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES
 *
 * Expands to the compiler-specific pragma that turns off diagnostics about
 * unrecognised attributes, or to nothing when no check below matches.
 * Branches are tried top to bottom; the first match wins.
 */
#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES)
    #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES
#endif
#if JSON_HEDLEY_HAS_WARNING("-Wunknown-attributes")
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"")
#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0)
    /* GCC reports unknown / ignored attributes under -Wattributes.  This
       branch previously named -Wdeprecated-declarations, which left the
       attribute warning enabled and silenced an unrelated one instead. */
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("GCC diagnostic ignored \"-Wattributes\"")
#elif JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("warning(disable:1292)")
#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:1292))
#elif JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:5030))
#elif JSON_HEDLEY_PGI_VERSION_CHECK(20,7,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097,1098")
#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097")
#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("error_messages(off,attrskipunsup)")
#elif \
    JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1173")
#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress=Pe1097")
#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097")
#else
    /* Unknown compiler: nothing to suppress. */
    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES
#endif

/* JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL: pragma that ignores the
   cast-qualifier diagnostic on clang-style, Intel and GCC compilers;
   empty everywhere else. */
#ifdef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
#  undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
#endif
#if JSON_HEDLEY_HAS_WARNING("-Wcast-qual")
#  define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \
     _Pragma("clang diagnostic ignored \"-Wcast-qual\"")
#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
#  define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \
     _Pragma("warning(disable:2203 2331)")
#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0)
#  define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \
     _Pragma("GCC diagnostic ignored \"-Wcast-qual\"")
#else
#  define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
#endif

/* JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION: pragma that ignores the
   unused-function diagnostic on clang-style, GCC, MSVC and MCST LCC
   compilers; empty everywhere else. */
#ifdef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION
#  undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION
#endif
#if JSON_HEDLEY_HAS_WARNING("-Wunused-function")
#  define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION \
     _Pragma("clang diagnostic ignored \"-Wunused-function\"")
#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0)
#  define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION \
     _Pragma("GCC diagnostic ignored \"-Wunused-function\"")
#elif JSON_HEDLEY_MSVC_VERSION_CHECK(1,0,0)
#  define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION \
     __pragma(warning(disable:4505))
#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
#  define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION \
     _Pragma("diag_suppress 3142")
#else
#  define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION
#endif

/* JSON_HEDLEY_DEPRECATED(since)
 * JSON_HEDLEY_DEPRECATED_FOR(since, replacement)
 *
 * Declaration annotations that mark an entity as deprecated.  `since` and
 * `replacement` are stringified into the message ("Since <since>" and
 * "Since <since>; use <replacement>") on compilers whose deprecation
 * syntax accepts a message; on the others the arguments are ignored.
 * Branches are tried top to bottom, message-capable forms first, and the
 * final fallback expands to nothing.
 */
#if defined(JSON_HEDLEY_DEPRECATED)
    #undef JSON_HEDLEY_DEPRECATED
#endif
#if defined(JSON_HEDLEY_DEPRECATED_FOR)
    #undef JSON_HEDLEY_DEPRECATED_FOR
#endif
/* __declspec(deprecated("...")) with a message. */
#if \
    JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since))
    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement))
/* GNU-style __attribute__((__deprecated__("..."))) with a message. */
#elif \
    (JSON_HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) && !defined(JSON_HEDLEY_IAR_VERSION)) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \
    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \
    JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(18,1,0) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__("Since " #since)))
    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__("Since " #since "; use " #replacement)))
/* Standard C++14 [[deprecated("...")]], wrapped so that the attribute
   syntax itself does not trigger C++98-compat warnings. */
#elif defined(__cplusplus) && (__cplusplus >= 201402L)
    #define JSON_HEDLEY_DEPRECATED(since) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since)]])
    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since "; use " #replacement)]])
/* GNU-style attribute without a message; the arguments are dropped. */
#elif \
    JSON_HEDLEY_HAS_ATTRIBUTE(deprecated) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
    JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0)
    #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__))
    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__))
/* __declspec(deprecated) without a message.
   NOTE(review): the INTEL_CL check below repeats the one in the first
   branch, so it can never be the operand that selects this branch. */
#elif \
    JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
    JSON_HEDLEY_PELLES_VERSION_CHECK(6,50,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated)
    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated)
#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
    #define JSON_HEDLEY_DEPRECATED(since) _Pragma("deprecated")
    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) _Pragma("deprecated")
#else
    /* No known deprecation syntax: both macros expand to nothing. */
    #define JSON_HEDLEY_DEPRECATED(since)
    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement)
#endif

/* JSON_HEDLEY_UNAVAILABLE(available_since): on compilers with the GNU
   `warning` attribute, attaches the message
   "Not available until <available_since>" to the declaration; expands to
   nothing elsewhere. */
#ifdef JSON_HEDLEY_UNAVAILABLE
#  undef JSON_HEDLEY_UNAVAILABLE
#endif
#if \
    JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
    JSON_HEDLEY_HAS_ATTRIBUTE(warning)
#  define JSON_HEDLEY_UNAVAILABLE(available_since) \
     __attribute__((__warning__("Not available until " #available_since)))
#else
#  define JSON_HEDLEY_UNAVAILABLE(available_since)
#endif

/* JSON_HEDLEY_WARN_UNUSED_RESULT
 * JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg)
 *
 * Function annotations asking the compiler to diagnose callers that
 * discard the return value.  `msg` is only forwarded in the branch where
 * the nodiscard attribute reports a value >= 201907L; every other branch
 * ignores it.  Both macros expand to nothing when no branch matches.
 */
#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT)
    #undef JSON_HEDLEY_WARN_UNUSED_RESULT
#endif
#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT_MSG)
    #undef JSON_HEDLEY_WARN_UNUSED_RESULT_MSG
#endif
/* GNU-style __attribute__((__warn_unused_result__)); takes no message. */
#if \
    JSON_HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \
    JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
    #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) __attribute__((__warn_unused_result__))
/* [[nodiscard]] reporting >= 201907L: the message form is used. */
#elif (JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L)
    #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
    #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]])
/* Older [[nodiscard]]: the message is dropped. */
#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard)
    #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
    #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
#elif defined(_Check_return_) /* SAL */
    #define JSON_HEDLEY_WARN_UNUSED_RESULT _Check_return_
    #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_
#else
    /* No known syntax: both macros expand to nothing. */
    #define JSON_HEDLEY_WARN_UNUSED_RESULT
    #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg)
#endif

/* JSON_HEDLEY_SENTINEL(position): expands to the GNU `sentinel` attribute
   with the given position argument where it is supported; expands to
   nothing elsewhere. */
#ifdef JSON_HEDLEY_SENTINEL
#  undef JSON_HEDLEY_SENTINEL
#endif
#if \
    JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
    JSON_HEDLEY_HAS_ATTRIBUTE(sentinel)
#  define JSON_HEDLEY_SENTINEL(position) \
     __attribute__((__sentinel__(position)))
#else
#  define JSON_HEDLEY_SENTINEL(position)
#endif

/* JSON_HEDLEY_NO_RETURN
 *
 * Function annotation stating that the function does not return to its
 * caller.  Expands to whichever spelling the detected compiler / language
 * standard provides (keyword, attribute, __declspec or pragma), or to
 * nothing when no branch matches.  Branches are tried top to bottom; the
 * first match wins.
 */
#if defined(JSON_HEDLEY_NO_RETURN)
    #undef JSON_HEDLEY_NO_RETURN
#endif
#if JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
    #define JSON_HEDLEY_NO_RETURN __noreturn
#elif \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__))
/* Standard spellings: C11 keyword, then C++11 attribute. */
#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
    #define JSON_HEDLEY_NO_RETURN _Noreturn
#elif defined(__cplusplus) && (__cplusplus >= 201103L)
    #define JSON_HEDLEY_NO_RETURN JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]])
/* Pre-C11 / pre-C++11 compilers that understand the GNU attribute. */
#elif \
    JSON_HEDLEY_HAS_ATTRIBUTE(noreturn) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,2,0) || \
    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0)
    #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__))
#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
    #define JSON_HEDLEY_NO_RETURN _Pragma("does_not_return")
#elif \
    JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_NO_RETURN __declspec(noreturn)
#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus)
    #define JSON_HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;")
#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0)
    #define JSON_HEDLEY_NO_RETURN __attribute((noreturn))
#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0)
    #define JSON_HEDLEY_NO_RETURN __declspec(noreturn)
#else
    /* No known spelling: the annotation is dropped. */
    #define JSON_HEDLEY_NO_RETURN
#endif

/* JSON_HEDLEY_NO_ESCAPE: expands to the `noescape` attribute when the
   compiler reports support for it, and to nothing otherwise. */
#ifdef JSON_HEDLEY_NO_ESCAPE
#  undef JSON_HEDLEY_NO_ESCAPE
#endif
#if !(JSON_HEDLEY_HAS_ATTRIBUTE(noescape))
#  define JSON_HEDLEY_NO_ESCAPE
#else
#  define JSON_HEDLEY_NO_ESCAPE __attribute__((__noescape__))
#endif

/* JSON_HEDLEY_ASSUME(expr)
 * JSON_HEDLEY_UNREACHABLE()
 * JSON_HEDLEY_UNREACHABLE_RETURN(value)
 *
 * Optimizer hints.  The three macros are defined in stages, each stage
 * consulting what the earlier ones managed to define, so the order of the
 * sections below must be preserved.
 */
#if defined(JSON_HEDLEY_UNREACHABLE)
    #undef JSON_HEDLEY_UNREACHABLE
#endif
#if defined(JSON_HEDLEY_UNREACHABLE_RETURN)
    #undef JSON_HEDLEY_UNREACHABLE_RETURN
#endif
#if defined(JSON_HEDLEY_ASSUME)
    #undef JSON_HEDLEY_ASSUME
#endif
/* Stage 1: ASSUME from a native compiler primitive, if one exists.
   ASSUME is left undefined when none of these branches match. */
#if \
    JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_ASSUME(expr) __assume(expr)
#elif JSON_HEDLEY_HAS_BUILTIN(__builtin_assume)
    #define JSON_HEDLEY_ASSUME(expr) __builtin_assume(expr)
#elif \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0)
    #if defined(__cplusplus)
        #define JSON_HEDLEY_ASSUME(expr) std::_nassert(expr)
    #else
        #define JSON_HEDLEY_ASSUME(expr) _nassert(expr)
    #endif
#endif
/* Stage 2: UNREACHABLE from __builtin_unreachable, otherwise from a
   stage-1 ASSUME.  It may still be undefined after this stage. */
#if \
    (JSON_HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(JSON_HEDLEY_ARM_VERSION))) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \
    JSON_HEDLEY_PGI_VERSION_CHECK(18,10,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_IBM_VERSION_CHECK(13,1,5) || \
    JSON_HEDLEY_CRAY_VERSION_CHECK(10,0,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_UNREACHABLE() __builtin_unreachable()
#elif defined(JSON_HEDLEY_ASSUME)
    #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0)
#endif
/* Stage 3: if there was no native ASSUME, build one from UNREACHABLE, or
   fall back to merely evaluating `expr` and discarding the result. */
#if !defined(JSON_HEDLEY_ASSUME)
    #if defined(JSON_HEDLEY_UNREACHABLE)
        #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, ((expr) ? 1 : (JSON_HEDLEY_UNREACHABLE(), 1)))
    #else
        #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, expr)
    #endif
#endif
/* Stage 4: UNREACHABLE_RETURN.  With a real UNREACHABLE (from stage 2) the
   return statement is replaced by the hint, except on the listed TI
   compilers where `value` is still returned after ASSUME(0).  Without
   one, it is an ordinary `return (value)`. */
#if defined(JSON_HEDLEY_UNREACHABLE)
    #if  \
        JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \
        JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0)
        #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (JSON_HEDLEY_STATIC_CAST(void, JSON_HEDLEY_ASSUME(0)), (value))
    #else
        #define JSON_HEDLEY_UNREACHABLE_RETURN(value) JSON_HEDLEY_UNREACHABLE()
    #endif
#else
    #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (value)
#endif
/* Stage 5: guarantee UNREACHABLE exists.  This runs after stage 4 on
   purpose, so a fallback-only UNREACHABLE does not change how
   UNREACHABLE_RETURN was defined above. */
#if !defined(JSON_HEDLEY_UNREACHABLE)
    #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0)
#endif

/* JSON_HEDLEY_NON_NULL(...): expands to the GNU `nonnull` attribute with
   the given argument list where supported, and to nothing elsewhere.
   The macro is variadic, so its definition is bracketed by a diagnostic
   push/pop that ignores pedantic and variadic-macro warnings. */
JSON_HEDLEY_DIAGNOSTIC_PUSH
#if JSON_HEDLEY_HAS_WARNING("-Wpedantic")
#  pragma clang diagnostic ignored "-Wpedantic"
#endif
#if defined(__cplusplus) && JSON_HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic")
#  pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
#endif
#if JSON_HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros",4,0,0)
#  if defined(__clang__)
#    pragma clang diagnostic ignored "-Wvariadic-macros"
#  elif defined(JSON_HEDLEY_GCC_VERSION)
#    pragma GCC diagnostic ignored "-Wvariadic-macros"
#  endif
#endif
#ifdef JSON_HEDLEY_NON_NULL
#  undef JSON_HEDLEY_NON_NULL
#endif
#if \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_HAS_ATTRIBUTE(nonnull)
#  define JSON_HEDLEY_NON_NULL(...) __attribute__((__nonnull__(__VA_ARGS__)))
#else
#  define JSON_HEDLEY_NON_NULL(...)
#endif
JSON_HEDLEY_DIAGNOSTIC_POP

/* JSON_HEDLEY_PRINTF_FORMAT(string_idx, first_to_check)
 *
 * Function annotation asking the compiler to check a printf-style format
 * string.  Both arguments are forwarded unchanged to the underlying
 * `format` attribute (or Pelles C `vaformat` declspec).  Expands to
 * nothing when no branch matches.
 */
#if defined(JSON_HEDLEY_PRINTF_FORMAT)
    #undef JSON_HEDLEY_PRINTF_FORMAT
#endif
/* MinGW: choose the ms_printf or gnu_printf archetype depending on whether
   __USE_MINGW_ANSI_STDIO is defined.  These two branches must come before
   the generic GNU branch below. */
#if defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && !defined(__USE_MINGW_ANSI_STDIO)
    #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(ms_printf, string_idx, first_to_check)))
#elif defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && defined(__USE_MINGW_ANSI_STDIO)
    #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(gnu_printf, string_idx, first_to_check)))
/* Generic GNU-style format attribute with the __printf__ archetype. */
#elif \
    JSON_HEDLEY_HAS_ATTRIBUTE(format) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \
    JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(__printf__, string_idx, first_to_check)))
#elif JSON_HEDLEY_PELLES_VERSION_CHECK(6,0,0)
    #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __declspec(vaformat(printf,string_idx,first_to_check))
#else
    /* No known syntax: the annotation is dropped. */
    #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check)
#endif

/* JSON_HEDLEY_CONSTEXPR: the `constexpr` keyword (wrapped against
   C++98-compat warnings) when compiling as C++11 or later; empty in C and
   in older C++. */
#ifdef JSON_HEDLEY_CONSTEXPR
#  undef JSON_HEDLEY_CONSTEXPR
#endif
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#  define JSON_HEDLEY_CONSTEXPR JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr)
#else
#  define JSON_HEDLEY_CONSTEXPR
#endif

/* Branch-prediction hints.
 *
 *   JSON_HEDLEY_LIKELY(expr), JSON_HEDLEY_UNLIKELY(expr)
 *       Evaluate to !!(expr); where __builtin_expect is used the expected
 *       result passed to it is 1 / 0 respectively.
 *   JSON_HEDLEY_PREDICT(expr, value, probability)
 *       Evaluates to (expr).  With __builtin_expect_with_probability the
 *       hint carries `probability`; in the __builtin_expect fallback the
 *       hint is only applied when probability >= 0.9.
 *   JSON_HEDLEY_PREDICT_TRUE(expr, probability)
 *   JSON_HEDLEY_PREDICT_FALSE(expr, probability)
 *       Evaluate to !!(expr).  In the __builtin_expect fallback a
 *       probability >= 0.9 hints the named outcome, <= 0.1 hints the
 *       opposite one, and anything in between gives no hint.
 *   JSON_HEDLEY_UNPREDICTABLE(expr)
 *       __builtin_unpredictable when available, otherwise
 *       JSON_HEDLEY_PREDICT(expr, 1, 0.5).
 *
 * On compilers with neither builtin every macro reduces to a plain
 * evaluation of its expression.
 */
#if defined(JSON_HEDLEY_PREDICT)
    #undef JSON_HEDLEY_PREDICT
#endif
/* PREDICT_TRUE / PREDICT_FALSE are (re)defined below like the other
   macros, so they get the same #undef guard to avoid a redefinition
   diagnostic when an earlier, different definition is already visible. */
#if defined(JSON_HEDLEY_PREDICT_TRUE)
    #undef JSON_HEDLEY_PREDICT_TRUE
#endif
#if defined(JSON_HEDLEY_PREDICT_FALSE)
    #undef JSON_HEDLEY_PREDICT_FALSE
#endif
#if defined(JSON_HEDLEY_LIKELY)
    #undef JSON_HEDLEY_LIKELY
#endif
#if defined(JSON_HEDLEY_UNLIKELY)
    #undef JSON_HEDLEY_UNLIKELY
#endif
#if defined(JSON_HEDLEY_UNPREDICTABLE)
    #undef JSON_HEDLEY_UNPREDICTABLE
#endif
#if JSON_HEDLEY_HAS_BUILTIN(__builtin_unpredictable)
    #define JSON_HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr))
#endif
/* Preferred: the compiler accepts an explicit probability. */
#if \
  (JSON_HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) && !defined(JSON_HEDLEY_PGI_VERSION)) || \
  JSON_HEDLEY_GCC_VERSION_CHECK(9,0,0) || \
  JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
#  define JSON_HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability(  (expr), (value), (probability))
#  define JSON_HEDLEY_PREDICT_TRUE(expr, probability)   __builtin_expect_with_probability(!!(expr),    1   , (probability))
#  define JSON_HEDLEY_PREDICT_FALSE(expr, probability)  __builtin_expect_with_probability(!!(expr),    0   , (probability))
#  define JSON_HEDLEY_LIKELY(expr)                      __builtin_expect                 (!!(expr),    1                  )
#  define JSON_HEDLEY_UNLIKELY(expr)                    __builtin_expect                 (!!(expr),    0                  )
/* Fallback: only __builtin_expect is available, so the probability is
   reduced to a yes / no / no-hint decision using the 0.9 and 0.1
   thresholds. */
#elif \
  (JSON_HEDLEY_HAS_BUILTIN(__builtin_expect) && !defined(JSON_HEDLEY_INTEL_CL_VERSION)) || \
  JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \
  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
  (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \
  JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
  JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
  JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
  JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \
  JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \
  JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \
  JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
  JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
  JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
  JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,27) || \
  JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
  JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
#  define JSON_HEDLEY_PREDICT(expr, expected, probability) \
    (((probability) >= 0.9) ? __builtin_expect((expr), (expected)) : (JSON_HEDLEY_STATIC_CAST(void, expected), (expr)))
#  define JSON_HEDLEY_PREDICT_TRUE(expr, probability) \
    (__extension__ ({ \
        double hedley_probability_ = (probability); \
        ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 1) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 0) : !!(expr))); \
    }))
#  define JSON_HEDLEY_PREDICT_FALSE(expr, probability) \
    (__extension__ ({ \
        double hedley_probability_ = (probability); \
        ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 0) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 1) : !!(expr))); \
    }))
#  define JSON_HEDLEY_LIKELY(expr)   __builtin_expect(!!(expr), 1)
#  define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
/* No builtin at all: evaluate the expression and nothing more. */
#else
#  define JSON_HEDLEY_PREDICT(expr, expected, probability) (JSON_HEDLEY_STATIC_CAST(void, expected), (expr))
#  define JSON_HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr))
#  define JSON_HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr))
#  define JSON_HEDLEY_LIKELY(expr) (!!(expr))
#  define JSON_HEDLEY_UNLIKELY(expr) (!!(expr))
#endif
#if !defined(JSON_HEDLEY_UNPREDICTABLE)
    #define JSON_HEDLEY_UNPREDICTABLE(expr) JSON_HEDLEY_PREDICT(expr, 1, 0.5)
#endif

/* JSON_HEDLEY_MALLOC
 *
 * Function annotation for allocator-like functions.  Expands to the GNU
 * `malloc` attribute, the SunPro "returns_new_memory" pragma, or
 * __declspec(restrict), depending on the detected compiler; expands to
 * nothing when no branch matches.
 */
#if defined(JSON_HEDLEY_MALLOC)
    #undef JSON_HEDLEY_MALLOC
#endif
#if \
    JSON_HEDLEY_HAS_ATTRIBUTE(malloc) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_MALLOC __attribute__((__malloc__))
/* SunPro releases that fail the 5.11 check above but pass 5.10. */
#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
    #define JSON_HEDLEY_MALLOC _Pragma("returns_new_memory")
#elif \
    JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_MALLOC __declspec(restrict)
#else
    /* No known syntax: the annotation is dropped. */
    #define JSON_HEDLEY_MALLOC
#endif

/* JSON_HEDLEY_PURE
 *
 * Function annotation.  Expands to the GNU `pure` attribute, the SunPro
 * "does_not_write_global_data" pragma, or the TI "FUNC_IS_PURE;" pragma
 * (C++ only), depending on the detected compiler; expands to nothing when
 * no branch matches.
 */
#if defined(JSON_HEDLEY_PURE)
    #undef JSON_HEDLEY_PURE
#endif
#if \
  JSON_HEDLEY_HAS_ATTRIBUTE(pure) || \
  JSON_HEDLEY_GCC_VERSION_CHECK(2,96,0) || \
  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
  JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
  JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
  JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
  JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
  (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
  JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
  (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
  JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
  (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
  JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
  (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
  JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
  JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
  JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
  JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
  JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
#  define JSON_HEDLEY_PURE __attribute__((__pure__))
/* SunPro releases that fail the 5.11 check above but pass 5.10. */
#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
#  define JSON_HEDLEY_PURE _Pragma("does_not_write_global_data")
/* TI compilers not matched by the attribute branch, C++ only. */
#elif defined(__cplusplus) && \
    ( \
      JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \
      JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) || \
      JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) \
    )
#  define JSON_HEDLEY_PURE _Pragma("FUNC_IS_PURE;")
#else
   /* No known syntax: the annotation is dropped. */
#  define JSON_HEDLEY_PURE
#endif

/* JSON_HEDLEY_CONST
 *
 * Function annotation.  Expands to the GNU `const` attribute or the SunPro
 * "no_side_effect" pragma, depending on the detected compiler.  When
 * neither is available it falls back to JSON_HEDLEY_PURE (which may
 * itself be empty).
 */
#if defined(JSON_HEDLEY_CONST)
    #undef JSON_HEDLEY_CONST
#endif
#if \
    JSON_HEDLEY_HAS_ATTRIBUTE(const) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(2,5,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
    #define JSON_HEDLEY_CONST __attribute__((__const__))
/* SunPro releases that fail the 5.11 check above but pass 5.10. */
#elif \
    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
    #define JSON_HEDLEY_CONST _Pragma("no_side_effect")
#else
    /* Fall back to the PURE annotation. */
    #define JSON_HEDLEY_CONST JSON_HEDLEY_PURE
#endif

/* JSON_HEDLEY_RESTRICT
 * Spelling of the `restrict` qualifier for the current toolchain:
 * `restrict` for C99-or-later C, `__restrict` for the compilers listed in the
 * second branch, `_Restrict` for Sun Pro 5.3+ in C mode, and nothing
 * otherwise. */
#ifdef JSON_HEDLEY_RESTRICT
#  undef JSON_HEDLEY_RESTRICT
#endif
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__cplusplus)
#  define JSON_HEDLEY_RESTRICT restrict
#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
  JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \
  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
  JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \
  JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
  JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
  JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
  JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
  JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,4) || \
  JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \
  JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
  (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)) || \
  JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \
  defined(__clang__) || \
  JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
#  define JSON_HEDLEY_RESTRICT __restrict
#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,3,0) && !defined(__cplusplus)
#  define JSON_HEDLEY_RESTRICT _Restrict
#else
#  define JSON_HEDLEY_RESTRICT
#endif

/* JSON_HEDLEY_INLINE
 * Picks the inline keyword spelling: standard `inline` for C99+ / C++98+,
 * `__inline__` when JSON_HEDLEY_GCC_VERSION is defined or on ARM 6.2+,
 * `__inline` for the vendor compilers listed in the third branch, and
 * nothing otherwise. */
#ifdef JSON_HEDLEY_INLINE
#  undef JSON_HEDLEY_INLINE
#endif
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
  (defined(__cplusplus) && (__cplusplus >= 199711L))
#  define JSON_HEDLEY_INLINE inline
#elif defined(JSON_HEDLEY_GCC_VERSION) || \
  JSON_HEDLEY_ARM_VERSION_CHECK(6,2,0)
#  define JSON_HEDLEY_INLINE __inline__
#elif JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \
  JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \
  JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
  JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,1,0) || \
  JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \
  JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \
  JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \
  JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
  JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
  JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
#  define JSON_HEDLEY_INLINE __inline
#else
#  define JSON_HEDLEY_INLINE
#endif

/* JSON_HEDLEY_ALWAYS_INLINE
 * Strongest available "force inlining" spelling. Branch order:
 *   1. GNU-style `__always_inline__` attribute followed by JSON_HEDLEY_INLINE
 *   2. `__forceinline` (MSVC 12+, Intel CL 2021.1+)
 *   3. TI `FUNC_ALWAYS_INLINE` pragma (C++ only)
 *   4. IAR 8.0+ `inline=forced` pragma
 *   5. plain JSON_HEDLEY_INLINE as the fallback */
#ifdef JSON_HEDLEY_ALWAYS_INLINE
    #undef JSON_HEDLEY_ALWAYS_INLINE
#endif
#if JSON_HEDLEY_HAS_ATTRIBUTE(always_inline) || \
    JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
    JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
    JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
    JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0)
    #define JSON_HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) JSON_HEDLEY_INLINE
#elif JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_ALWAYS_INLINE __forceinline
#elif defined(__cplusplus) && ( \
        JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
        JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
        JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
        JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
        JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
        JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) \
    )
    #define JSON_HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;")
#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
    #define JSON_HEDLEY_ALWAYS_INLINE _Pragma("inline=forced")
#else
    #define JSON_HEDLEY_ALWAYS_INLINE JSON_HEDLEY_INLINE
#endif

/* JSON_HEDLEY_NEVER_INLINE
 * "Do not inline" annotation. Tries, in order: the GNU-style `__noinline__`
 * attribute, `__declspec(noinline)` (MSVC / Intel CL), the PGI `noinline`
 * pragma, the TI CL6X `FUNC_CANNOT_INLINE` pragma (C++ only), the IAR
 * `inline=never` pragma, CompCert's `__attribute((noinline))`, and Pelles C's
 * `__declspec(noinline)`. Expands to nothing when none of them match. */
#ifdef JSON_HEDLEY_NEVER_INLINE
#  undef JSON_HEDLEY_NEVER_INLINE
#endif
#if JSON_HEDLEY_HAS_ATTRIBUTE(noinline) || \
  JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \
  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
  JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
  JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
  JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
  JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
  (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
  JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
  (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
  JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
  (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
  JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
  (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
  JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
  JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
  JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
  JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
  JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0)
#  define JSON_HEDLEY_NEVER_INLINE __attribute__((__noinline__))
#elif JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
  JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
#  define JSON_HEDLEY_NEVER_INLINE __declspec(noinline)
#elif JSON_HEDLEY_PGI_VERSION_CHECK(10,2,0)
#  define JSON_HEDLEY_NEVER_INLINE _Pragma("noinline")
#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus)
#  define JSON_HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;")
#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
#  define JSON_HEDLEY_NEVER_INLINE _Pragma("inline=never")
#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0)
#  define JSON_HEDLEY_NEVER_INLINE __attribute((noinline))
#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0)
#  define JSON_HEDLEY_NEVER_INLINE __declspec(noinline)
#else
#  define JSON_HEDLEY_NEVER_INLINE
#endif

/* JSON_HEDLEY_PRIVATE / JSON_HEDLEY_PUBLIC / JSON_HEDLEY_IMPORT
 * Symbol visibility annotations.
 *   - Windows / Cygwin: PRIVATE is empty, PUBLIC is dllexport, IMPORT is
 *     dllimport.
 *   - Elsewhere: PRIVATE / PUBLIC map to the GNU-style "hidden" / "default"
 *     visibility attributes when detected (empty otherwise), and IMPORT is
 *     always `extern`. */
#ifdef JSON_HEDLEY_PRIVATE
    #undef JSON_HEDLEY_PRIVATE
#endif
#ifdef JSON_HEDLEY_PUBLIC
    #undef JSON_HEDLEY_PUBLIC
#endif
#ifdef JSON_HEDLEY_IMPORT
    #undef JSON_HEDLEY_IMPORT
#endif
#if defined(_WIN32) || defined(__CYGWIN__)
    #define JSON_HEDLEY_PRIVATE
    #define JSON_HEDLEY_PUBLIC   __declspec(dllexport)
    #define JSON_HEDLEY_IMPORT   __declspec(dllimport)
#else
    #if JSON_HEDLEY_HAS_ATTRIBUTE(visibility) || \
        JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
        JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
        JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
        JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
        JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
        ( \
          defined(__TI_EABI__) && \
          ( \
            (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
            JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) \
          ) \
        ) || \
        JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
        #define JSON_HEDLEY_PRIVATE __attribute__((__visibility__("hidden")))
        #define JSON_HEDLEY_PUBLIC  __attribute__((__visibility__("default")))
    #else
        #define JSON_HEDLEY_PRIVATE
        #define JSON_HEDLEY_PUBLIC
    #endif
    #define JSON_HEDLEY_IMPORT    extern
#endif

/* JSON_HEDLEY_NO_THROW
 * GNU-style `__nothrow__` attribute where detected, `__declspec(nothrow)` on
 * MSVC 13.1+ / Intel CL 2021.1+ / ARM 4.1+, empty otherwise. */
#ifdef JSON_HEDLEY_NO_THROW
#  undef JSON_HEDLEY_NO_THROW
#endif
#if JSON_HEDLEY_HAS_ATTRIBUTE(nothrow) || \
  JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
  JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
#  define JSON_HEDLEY_NO_THROW __attribute__((__nothrow__))
#elif JSON_HEDLEY_MSVC_VERSION_CHECK(13,1,0) || \
  JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \
  JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0)
#  define JSON_HEDLEY_NO_THROW __declspec(nothrow)
#else
#  define JSON_HEDLEY_NO_THROW
#endif

/* JSON_HEDLEY_FALL_THROUGH
 * Annotation for an intentional switch-case fall-through. Tries the GNU
 * attribute, then `[[clang::fallthrough]]`, then `[[fallthrough]]`, then the
 * SAL `__fallthrough` macro; expands to nothing when none is available. */
#ifdef JSON_HEDLEY_FALL_THROUGH
#  undef JSON_HEDLEY_FALL_THROUGH
#endif
#if JSON_HEDLEY_HAS_ATTRIBUTE(fallthrough) || \
  JSON_HEDLEY_GCC_VERSION_CHECK(7,0,0) || \
  JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
#  define JSON_HEDLEY_FALL_THROUGH __attribute__((__fallthrough__))
#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang,fallthrough)
#  define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]])
#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough)
#  define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]])
#elif defined(__fallthrough) /* SAL */
#  define JSON_HEDLEY_FALL_THROUGH __fallthrough
#else
#  define JSON_HEDLEY_FALL_THROUGH
#endif

/* JSON_HEDLEY_RETURNS_NON_NULL
 * GNU-style `__returns_nonnull__` attribute where detected, the SAL
 * `_Ret_notnull_` annotation when that macro exists, empty otherwise. */
#ifdef JSON_HEDLEY_RETURNS_NON_NULL
#  undef JSON_HEDLEY_RETURNS_NON_NULL
#endif
#if JSON_HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || \
  JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \
  JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
#  define JSON_HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__))
#elif defined(_Ret_notnull_) /* SAL */
#  define JSON_HEDLEY_RETURNS_NON_NULL _Ret_notnull_
#else
#  define JSON_HEDLEY_RETURNS_NON_NULL
#endif

/* JSON_HEDLEY_ARRAY_PARAM(name)
 * Expands to `(name)` in C99-or-later C when __STDC_NO_VLA__ is not defined
 * and the compiler is neither PGI nor TinyC; expands to nothing everywhere
 * else (including C++). */
#ifdef JSON_HEDLEY_ARRAY_PARAM
#  undef JSON_HEDLEY_ARRAY_PARAM
#endif
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
  !defined(__STDC_NO_VLA__) && !defined(__cplusplus) && \
  !defined(JSON_HEDLEY_PGI_VERSION) && !defined(JSON_HEDLEY_TINYC_VERSION)
#  define JSON_HEDLEY_ARRAY_PARAM(name) (name)
#else
#  define JSON_HEDLEY_ARRAY_PARAM(name)
#endif

/* Drop any earlier definitions of the constant-expression helpers before
 * they are (re)defined below. */
#ifdef JSON_HEDLEY_IS_CONSTANT
#  undef JSON_HEDLEY_IS_CONSTANT
#endif
#ifdef JSON_HEDLEY_REQUIRE_CONSTEXPR
#  undef JSON_HEDLEY_REQUIRE_CONSTEXPR
#endif
/* JSON_HEDLEY_IS_CONSTEXPR_ is for
   HEDLEY INTERNAL USE ONLY.  API subject to change without notice. */
#ifdef JSON_HEDLEY_IS_CONSTEXPR_
#  undef JSON_HEDLEY_IS_CONSTEXPR_
#endif
/* JSON_HEDLEY_IS_CONSTANT(expr): forwards to __builtin_constant_p on every
 * toolchain detected as providing that builtin. */
#if JSON_HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \
  JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
  JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \
  JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
  JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
  JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
  (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \
  JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
  JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
#  define JSON_HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr)
#endif
/* JSON_HEDLEY_IS_CONSTEXPR_(expr) -- defined for C only, never for C++.
 *
 * Evaluates to non-zero when `expr` is an integer constant expression.
 * Every variant relies on the same conditional-operator rule:
 * `(void*) ((expr) * 0)` is a null pointer constant only when `expr` is an
 * integer constant expression. In that case `1 ? <null> : (T*) x` has type
 * `T*`; otherwise it has type `void*`. The variants differ only in how they
 * observe the resulting type.
 *
 * Each variant must pass `(expr) * 0` through the integer cast; omitting
 * `expr` would make the macro ignore its argument.
 */
#if !defined(__cplusplus)
/* Variant 1: inspect the result type with __builtin_types_compatible_p. */
#  if \
       JSON_HEDLEY_HAS_BUILTIN(__builtin_types_compatible_p) || \
       JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
       JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
       JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
       JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
       JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \
       JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,24)
#if defined(__INTPTR_TYPE__)
    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*)
#else
    #include <stdint.h>
    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*)
#endif
/* Variant 2: inspect the result type with a C11 _Generic selection. */
#  elif \
       ( \
          defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
          !defined(JSON_HEDLEY_SUNPRO_VERSION) && \
          !defined(JSON_HEDLEY_PGI_VERSION) && \
          !defined(JSON_HEDLEY_IAR_VERSION)) || \
       (JSON_HEDLEY_HAS_EXTENSION(c_generic_selections) && !defined(JSON_HEDLEY_IAR_VERSION)) || \
       JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \
       JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \
       JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \
       JSON_HEDLEY_ARM_VERSION_CHECK(5,3,0)
#if defined(__INTPTR_TYPE__)
    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0)
#else
    #include <stdint.h>
    /* Fixed: the cast previously read `((intptr_t) * 0)`, which dropped
       `expr` and did not form a valid expression. */
    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0)
#endif
/* Variant 3: no type introspection available. Compares sizeof(void) with the
 * size of the dereferenced result: the struct pointee is sizeof(void) * 2
 * bytes, so the sizes differ exactly when the result is the struct pointer
 * (constant case) rather than void* (non-constant case). */
#  elif \
       defined(JSON_HEDLEY_GCC_VERSION) || \
       defined(JSON_HEDLEY_INTEL_VERSION) || \
       defined(JSON_HEDLEY_TINYC_VERSION) || \
       defined(JSON_HEDLEY_TI_ARMCL_VERSION) || \
       JSON_HEDLEY_TI_CL430_VERSION_CHECK(18,12,0) || \
       defined(JSON_HEDLEY_TI_CL2000_VERSION) || \
       defined(JSON_HEDLEY_TI_CL6X_VERSION) || \
       defined(JSON_HEDLEY_TI_CL7X_VERSION) || \
       defined(JSON_HEDLEY_TI_CLPRU_VERSION) || \
       defined(__clang__)
#    define JSON_HEDLEY_IS_CONSTEXPR_(expr) ( \
        sizeof(void) != \
        sizeof(*( \
                  1 ? \
                  ((void*) ((expr) * 0L) ) : \
                  ((struct { char v[sizeof(void) * 2]; } *) 1) \
                ) \
              ) \
                                            )
#  endif
#endif
/* Fallbacks for the public constant-expression helpers.
 *   JSON_HEDLEY_IS_CONSTANT(expr): unless already defined, uses
 *     JSON_HEDLEY_IS_CONSTEXPR_ when it exists, otherwise the constant (0).
 *   JSON_HEDLEY_REQUIRE_CONSTEXPR(expr): yields `expr` when it is detected as
 *     a constant expression and (-1) when it is not; with no detector
 *     available it simply yields `expr`. */
#if !defined(JSON_HEDLEY_IS_CONSTANT)
#  if defined(JSON_HEDLEY_IS_CONSTEXPR_)
#    define JSON_HEDLEY_IS_CONSTANT(expr) JSON_HEDLEY_IS_CONSTEXPR_(expr)
#  else
#    define JSON_HEDLEY_IS_CONSTANT(expr) (0)
#  endif
#endif
#if defined(JSON_HEDLEY_IS_CONSTEXPR_)
#  define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (JSON_HEDLEY_IS_CONSTEXPR_(expr) ? (expr) : (-1))
#else
#  define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (expr)
#endif

/* JSON_HEDLEY_BEGIN_C_DECLS / JSON_HEDLEY_END_C_DECLS / JSON_HEDLEY_C_DECL
 * C-linkage helpers: empty in C, `extern "C"` block delimiters (or the
 * single-declaration form) in C++. */
#ifdef JSON_HEDLEY_BEGIN_C_DECLS
#  undef JSON_HEDLEY_BEGIN_C_DECLS
#endif
#ifdef JSON_HEDLEY_END_C_DECLS
#  undef JSON_HEDLEY_END_C_DECLS
#endif
#ifdef JSON_HEDLEY_C_DECL
#  undef JSON_HEDLEY_C_DECL
#endif
#ifndef __cplusplus
#  define JSON_HEDLEY_BEGIN_C_DECLS
#  define JSON_HEDLEY_END_C_DECLS
#  define JSON_HEDLEY_C_DECL
#else
#  define JSON_HEDLEY_BEGIN_C_DECLS extern "C" {
#  define JSON_HEDLEY_END_C_DECLS }
#  define JSON_HEDLEY_C_DECL extern "C"
#endif

/* JSON_HEDLEY_STATIC_ASSERT(expr, message)
 * Maps to C11 `_Static_assert` in C, to `static_assert` in C++11 (or on
 * MSVC 16+ / Intel CL 2021.1+), and to nothing when neither is available. */
#ifdef JSON_HEDLEY_STATIC_ASSERT
    #undef JSON_HEDLEY_STATIC_ASSERT
#endif
#if !defined(__cplusplus) && ( \
        (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
        (JSON_HEDLEY_HAS_FEATURE(c_static_assert) && !defined(JSON_HEDLEY_INTEL_CL_VERSION)) || \
        JSON_HEDLEY_GCC_VERSION_CHECK(6,0,0) || \
        JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
        defined(_Static_assert) \
    )
    #define JSON_HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message)
#elif (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
    JSON_HEDLEY_MSVC_VERSION_CHECK(16,0,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_STATIC_ASSERT(expr, message) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message))
#else
    #define JSON_HEDLEY_STATIC_ASSERT(expr, message)
#endif

/* JSON_HEDLEY_NULL
 * Null pointer spelling: `nullptr` for C++11 and later; otherwise the
 * platform's NULL when that macro is defined; otherwise a cast of 0 to
 * void* (via JSON_HEDLEY_STATIC_CAST in C++, a plain C cast in C). */
#ifdef JSON_HEDLEY_NULL
#  undef JSON_HEDLEY_NULL
#endif
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#  define JSON_HEDLEY_NULL JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr)
#elif defined(NULL)
#  define JSON_HEDLEY_NULL NULL
#elif defined(__cplusplus)
#  define JSON_HEDLEY_NULL JSON_HEDLEY_STATIC_CAST(void*, 0)
#else
#  define JSON_HEDLEY_NULL ((void*) 0)
#endif

/* JSON_HEDLEY_MESSAGE(msg)
 * Emits a compile-time message through the compiler's `message` pragma.
 * When the compiler knows -Wunknown-pragmas, that diagnostic is disabled
 * around the pragma. Expands to nothing on unrecognised compilers. */
#ifdef JSON_HEDLEY_MESSAGE
    #undef JSON_HEDLEY_MESSAGE
#endif
#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas")
    #define JSON_HEDLEY_MESSAGE(msg) \
        JSON_HEDLEY_DIAGNOSTIC_PUSH \
        JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
        JSON_HEDLEY_PRAGMA(message msg) \
        JSON_HEDLEY_DIAGNOSTIC_POP
#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,4,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
    #define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message msg)
#elif JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0)
    #define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(_CRI message msg)
#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \
    JSON_HEDLEY_PELLES_VERSION_CHECK(2,0,0)
    /* IAR and Pelles C share the parenthesised form. */
    #define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg))
#else
    #define JSON_HEDLEY_MESSAGE(msg)
#endif

/* JSON_HEDLEY_WARNING(msg)
 * Emits a compile-time warning: `clang warning` / `GCC warning` pragmas where
 * detected, `message(msg)` on MSVC 15+ / Intel CL 2021.1+, and otherwise
 * whatever JSON_HEDLEY_MESSAGE does. */
#ifdef JSON_HEDLEY_WARNING
    #undef JSON_HEDLEY_WARNING
#endif
#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas")
    #define JSON_HEDLEY_WARNING(msg) \
        JSON_HEDLEY_DIAGNOSTIC_PUSH \
        JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
        JSON_HEDLEY_PRAGMA(clang warning msg) \
        JSON_HEDLEY_DIAGNOSTIC_POP
#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,8,0) || \
    JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \
    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
    #define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(GCC warning msg)
#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \
    JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
    #define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(message(msg))
#else
    #define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_MESSAGE(msg)
#endif

/* JSON_HEDLEY_REQUIRE(expr) / JSON_HEDLEY_REQUIRE_MSG(expr, msg)
 * Attach a `diagnose_if` attribute that reports an error when `expr` is
 * false; REQUIRE uses the stringified expression as the message. When the
 * compiler knows -Wgcc-compat, that warning is silenced around the
 * attribute. Both macros are empty when `diagnose_if` is unavailable. */
#ifdef JSON_HEDLEY_REQUIRE
    #undef JSON_HEDLEY_REQUIRE
#endif
#ifdef JSON_HEDLEY_REQUIRE_MSG
    #undef JSON_HEDLEY_REQUIRE_MSG
#endif
#if JSON_HEDLEY_HAS_ATTRIBUTE(diagnose_if)
    #if JSON_HEDLEY_HAS_WARNING("-Wgcc-compat")
        #define JSON_HEDLEY_REQUIRE(expr) \
            JSON_HEDLEY_DIAGNOSTIC_PUSH \
            _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \
            __attribute__((diagnose_if(!(expr), #expr, "error"))) \
            JSON_HEDLEY_DIAGNOSTIC_POP
        #define JSON_HEDLEY_REQUIRE_MSG(expr,msg) \
            JSON_HEDLEY_DIAGNOSTIC_PUSH \
            _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \
            __attribute__((diagnose_if(!(expr), msg, "error"))) \
            JSON_HEDLEY_DIAGNOSTIC_POP
    #else
        #define JSON_HEDLEY_REQUIRE(expr) __attribute__((diagnose_if(!(expr), #expr, "error")))
        #define JSON_HEDLEY_REQUIRE_MSG(expr,msg) __attribute__((diagnose_if(!(expr), msg, "error")))
    #endif
#else
    #define JSON_HEDLEY_REQUIRE(expr)
    #define JSON_HEDLEY_REQUIRE_MSG(expr,msg)
#endif

/* JSON_HEDLEY_FLAGS
 * `__flag_enum__` attribute when the compiler has it and either the code is
 * C or the compiler knows -Wbitfield-enum-conversion; empty otherwise. */
#ifdef JSON_HEDLEY_FLAGS
#  undef JSON_HEDLEY_FLAGS
#endif
#if JSON_HEDLEY_HAS_ATTRIBUTE(flag_enum) && (!defined(__cplusplus) || JSON_HEDLEY_HAS_WARNING("-Wbitfield-enum-conversion"))
#  define JSON_HEDLEY_FLAGS __attribute__((__flag_enum__))
#else
#  define JSON_HEDLEY_FLAGS
#endif

/* JSON_HEDLEY_FLAGS_CAST(T, expr)
 * Casts `expr` to `T`. On Intel 19+ the cast is wrapped in a statement
 * expression that disables warning 188 around it; everywhere else it is a
 * plain JSON_HEDLEY_STATIC_CAST. */
#ifdef JSON_HEDLEY_FLAGS_CAST
    #undef JSON_HEDLEY_FLAGS_CAST
#endif
#if JSON_HEDLEY_INTEL_VERSION_CHECK(19,0,0)
    #define JSON_HEDLEY_FLAGS_CAST(T, expr) (__extension__ ({ \
            JSON_HEDLEY_DIAGNOSTIC_PUSH \
            _Pragma("warning(disable:188)") \
            ((T) (expr)); \
            JSON_HEDLEY_DIAGNOSTIC_POP \
        }))
#else
    #define JSON_HEDLEY_FLAGS_CAST(T, expr) JSON_HEDLEY_STATIC_CAST(T, expr)
#endif

/* JSON_HEDLEY_EMPTY_BASES
 * `__declspec(empty_bases)` on MSVC in the range [19.0.23918, 20.0.0) and on
 * Intel CL 2021.1+; empty otherwise. */
#ifdef JSON_HEDLEY_EMPTY_BASES
#  undef JSON_HEDLEY_EMPTY_BASES
#endif
#if (JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,23918) && !JSON_HEDLEY_MSVC_VERSION_CHECK(20,0,0)) || \
  JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
#  define JSON_HEDLEY_EMPTY_BASES __declspec(empty_bases)
#else
#  define JSON_HEDLEY_EMPTY_BASES
#endif

/* Remaining macros are deprecated. */

/* Deprecated: GCC version check that is always false under clang and
 * otherwise forwards to JSON_HEDLEY_GCC_VERSION_CHECK. */
#ifdef JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK
#  undef JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK
#endif
#if !defined(__clang__)
#  define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
#else
#  define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) (0)
#endif

/* Deprecated alias of JSON_HEDLEY_HAS_ATTRIBUTE. */
#ifdef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE
#  undef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE
#endif
#define JSON_HEDLEY_CLANG_HAS_ATTRIBUTE(attribute) \
    JSON_HEDLEY_HAS_ATTRIBUTE(attribute)

/* Deprecated alias of JSON_HEDLEY_HAS_CPP_ATTRIBUTE. */
#ifdef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE
#  undef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE
#endif
#define JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE(attribute) \
    JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute)

/* Deprecated alias of JSON_HEDLEY_HAS_BUILTIN. */
#ifdef JSON_HEDLEY_CLANG_HAS_BUILTIN
#  undef JSON_HEDLEY_CLANG_HAS_BUILTIN
#endif
#define JSON_HEDLEY_CLANG_HAS_BUILTIN(builtin) \
    JSON_HEDLEY_HAS_BUILTIN(builtin)

/* Deprecated alias of JSON_HEDLEY_HAS_FEATURE. */
#ifdef JSON_HEDLEY_CLANG_HAS_FEATURE
#  undef JSON_HEDLEY_CLANG_HAS_FEATURE
#endif
#define JSON_HEDLEY_CLANG_HAS_FEATURE(feature) \
    JSON_HEDLEY_HAS_FEATURE(feature)

/* Deprecated alias of JSON_HEDLEY_HAS_EXTENSION. */
#ifdef JSON_HEDLEY_CLANG_HAS_EXTENSION
#  undef JSON_HEDLEY_CLANG_HAS_EXTENSION
#endif
#define JSON_HEDLEY_CLANG_HAS_EXTENSION(extension) \
    JSON_HEDLEY_HAS_EXTENSION(extension)

/* Deprecated alias of JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE.
 * The reset guard must name the macro that is defined below; it previously
 * tested the misspelled `..._DECLSPEC_DECLSPEC_ATTRIBUTE`, so an existing
 * definition was never removed before the redefinition. */
#if defined(JSON_HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE)
    #undef JSON_HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE
#endif
#define JSON_HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute)

/* Deprecated alias of JSON_HEDLEY_HAS_WARNING. */
#ifdef JSON_HEDLEY_CLANG_HAS_WARNING
#  undef JSON_HEDLEY_CLANG_HAS_WARNING
#endif
#define JSON_HEDLEY_CLANG_HAS_WARNING(warning) \
    JSON_HEDLEY_HAS_WARNING(warning)

#endif /* !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < X) */


// This file contains all internal macro definitions
// You MUST include macro_unscope.hpp at the end of json.hpp to undef all of them

// Refuse to build with compilers older than the minimum supported versions
// (Clang < 3.4.0, GCC < 4.8.0). The GCC test is skipped for Intel compilers,
// and the whole check is skipped when JSON_SKIP_UNSUPPORTED_COMPILER_CHECK
// is defined.
#ifndef JSON_SKIP_UNSUPPORTED_COMPILER_CHECK
#  if defined(__clang__)
#    if (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) < 30400
#      error "unsupported Clang version - see https://github.com/nlohmann/json#supported-compilers"
#    endif
#  elif defined(__GNUC__) && !(defined(__ICC) || defined(__INTEL_COMPILER))
#    if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800
#      error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers"
#    endif
#  endif
#endif

// Detect the active C++ language standard. Each level also defines the
// macros of every lower level, so JSON_HAS_CPP_20 implies JSON_HAS_CPP_17,
// which implies JSON_HAS_CPP_14.
#if (defined(__cplusplus) && __cplusplus >= 202002L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
#  define JSON_HAS_CPP_20
#  define JSON_HAS_CPP_17
#  define JSON_HAS_CPP_14
#elif (defined(__cplusplus) && __cplusplus >= 201703L) || (defined(_HAS_CXX17) && _HAS_CXX17 == 1) // fix for issue #464
#  define JSON_HAS_CPP_17
#  define JSON_HAS_CPP_14
#elif (defined(__cplusplus) && __cplusplus >= 201402L) || (defined(_HAS_CXX14) && _HAS_CXX14 == 1)
#  define JSON_HAS_CPP_14
#endif

// On clang, save the diagnostic state and silence -Wdocumentation from here
// on. No matching `pop` appears in this section of the file.
#ifdef __clang__
#  pragma GCC diagnostic push
#  pragma GCC diagnostic ignored "-Wdocumentation"
#endif

// Exception abstraction. When exceptions are unavailable (none of
// __cpp_exceptions / __EXCEPTIONS / _CPPUNWIND is defined) or the user defined
// JSON_NOEXCEPTION, throwing becomes std::abort() and the try / catch blocks
// degrade to `if(true)` / `if(false)`. Otherwise the macros map directly onto
// throw / try / catch.
#if defined(JSON_NOEXCEPTION) || !(defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND))
#  include <cstdlib>
#  define JSON_THROW(exception) std::abort()
#  define JSON_TRY if(true)
#  define JSON_CATCH(exception) if(false)
#  define JSON_INTERNAL_CATCH(exception) if(false)
#else
#  define JSON_THROW(exception) throw exception
#  define JSON_TRY try
#  define JSON_CATCH(exception) catch(exception)
#  define JSON_INTERNAL_CATCH(exception) catch(exception)
#endif

// User overrides for the exception macros. Each *_USER macro, when defined,
// replaces the corresponding default. JSON_CATCH_USER also replaces
// JSON_INTERNAL_CATCH, and JSON_INTERNAL_CATCH_USER (handled last) takes
// precedence over it.
#ifdef JSON_THROW_USER
#  undef JSON_THROW
#  define JSON_THROW JSON_THROW_USER
#endif
#ifdef JSON_TRY_USER
#  undef JSON_TRY
#  define JSON_TRY JSON_TRY_USER
#endif
#ifdef JSON_CATCH_USER
#  undef JSON_CATCH
#  define JSON_CATCH JSON_CATCH_USER
#  undef JSON_INTERNAL_CATCH
#  define JSON_INTERNAL_CATCH JSON_CATCH_USER
#endif
#ifdef JSON_INTERNAL_CATCH_USER
#  undef JSON_INTERNAL_CATCH
#  define JSON_INTERNAL_CATCH JSON_INTERNAL_CATCH_USER
#endif

// JSON_ASSERT(x): defaults to the standard assert() unless the user supplied
// their own definition before this point.
#ifndef JSON_ASSERT
#  include <cassert> // assert
#  define JSON_ASSERT(x) assert(x)
#endif

// Access specifier for members the test suite needs to reach: `public` when
// JSON_TESTS_PRIVATE is defined, `private` otherwise.
#ifndef JSON_TESTS_PRIVATE
#  define JSON_PRIVATE_UNLESS_TESTED private
#else
#  define JSON_PRIVATE_UNLESS_TESTED public
#endif

/*!
@brief macro to briefly define a mapping between an enum and JSON

Generates `to_json` and `from_json` function templates for @a ENUM_TYPE.
The variadic arguments are used verbatim as the initializer of a static array
of `std::pair<ENUM_TYPE, BasicJsonType>` entries. Both functions search that
array with `std::find_if` and fall back to the FIRST entry when no match is
found, so the first pair acts as the default mapping in both directions.

A `static_assert` rejects any @a ENUM_TYPE that is not an enum.

@note The names `m`, `it`, `e` and `j` are local to the generated functions
      and are in scope where the variadic arguments are expanded.

@def NLOHMANN_JSON_SERIALIZE_ENUM
@since version 3.4.0
*/
#define NLOHMANN_JSON_SERIALIZE_ENUM(ENUM_TYPE, ...)                                            \
    template<typename BasicJsonType>                                                            \
    inline void to_json(BasicJsonType& j, const ENUM_TYPE& e)                                   \
    {                                                                                           \
        static_assert(std::is_enum<ENUM_TYPE>::value, #ENUM_TYPE " must be an enum!");          \
        static const std::pair<ENUM_TYPE, BasicJsonType> m[] = __VA_ARGS__;                     \
        auto it = std::find_if(std::begin(m), std::end(m),                                      \
                               [e](const std::pair<ENUM_TYPE, BasicJsonType>& ej_pair) -> bool  \
        {                                                                                       \
            return ej_pair.first == e;                                                          \
        });                                                                                     \
        j = ((it != std::end(m)) ? it : std::begin(m))->second;                                 \
    }                                                                                           \
    template<typename BasicJsonType>                                                            \
    inline void from_json(const BasicJsonType& j, ENUM_TYPE& e)                                 \
    {                                                                                           \
        static_assert(std::is_enum<ENUM_TYPE>::value, #ENUM_TYPE " must be an enum!");          \
        static const std::pair<ENUM_TYPE, BasicJsonType> m[] = __VA_ARGS__;                     \
        auto it = std::find_if(std::begin(m), std::end(m),                                      \
                               [&j](const std::pair<ENUM_TYPE, BasicJsonType>& ej_pair) -> bool \
        {                                                                                       \
            return ej_pair.second == j;                                                         \
        });                                                                                     \
        e = ((it != std::end(m)) ? it : std::begin(m))->first;                                  \
    }

// Shorthand for the basic_json template header and its fully-parameterised
// type name, so specialisations do not have to repeat the whole parameter
// list. They may be removed in the future once the class is split.

// Template header: declares all ten template parameters of basic_json.
#define NLOHMANN_BASIC_JSON_TPL_DECLARATION                          \
    template<                                                        \
        template<typename, typename, typename...> class ObjectType,  \
        template<typename, typename...> class ArrayType,             \
        class StringType,                                            \
        class BooleanType,                                           \
        class NumberIntegerType,                                     \
        class NumberUnsignedType,                                    \
        class NumberFloatType,                                       \
        template<typename> class AllocatorType,                      \
        template<typename, typename = void> class JSONSerializer,    \
        class BinaryType>

// Type name: basic_json instantiated with the parameters declared above.
#define NLOHMANN_BASIC_JSON_TPL                                      \
    basic_json<                                                      \
        ObjectType, ArrayType, StringType, BooleanType,              \
        NumberIntegerType, NumberUnsignedType, NumberFloatType,      \
        AllocatorType, JSONSerializer, BinaryType>

// Macros to simplify conversion from/to types.
//
// NLOHMANN_JSON_PASTE(func, v1, ..., vN) dispatches on its argument count to
// NLOHMANN_JSON_PASTE<count>, where <count> includes `func` itself.

// Expands to its argument unchanged.
#define NLOHMANN_JSON_EXPAND(x) x

// Yields the 65th argument. Called with the caller's arguments followed by
// PASTE64 ... PASTE1, this selects the PASTE variant matching the number of
// caller arguments.
#define NLOHMANN_JSON_GET_MACRO( \
        _1, _2, _3, _4, _5, _6, _7, _8, \
        _9, _10, _11, _12, _13, _14, _15, _16, \
        _17, _18, _19, _20, _21, _22, _23, _24, \
        _25, _26, _27, _28, _29, _30, _31, _32, \
        _33, _34, _35, _36, _37, _38, _39, _40, \
        _41, _42, _43, _44, _45, _46, _47, _48, \
        _49, _50, _51, _52, _53, _54, _55, _56, \
        _57, _58, _59, _60, _61, _62, _63, _64, \
        NAME, ...) NAME

#define NLOHMANN_JSON_PASTE(...) \
    NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_GET_MACRO(__VA_ARGS__, \
        NLOHMANN_JSON_PASTE64, NLOHMANN_JSON_PASTE63, NLOHMANN_JSON_PASTE62, NLOHMANN_JSON_PASTE61, \
        NLOHMANN_JSON_PASTE60, NLOHMANN_JSON_PASTE59, NLOHMANN_JSON_PASTE58, NLOHMANN_JSON_PASTE57, \
        NLOHMANN_JSON_PASTE56, NLOHMANN_JSON_PASTE55, NLOHMANN_JSON_PASTE54, NLOHMANN_JSON_PASTE53, \
        NLOHMANN_JSON_PASTE52, NLOHMANN_JSON_PASTE51, NLOHMANN_JSON_PASTE50, NLOHMANN_JSON_PASTE49, \
        NLOHMANN_JSON_PASTE48, NLOHMANN_JSON_PASTE47, NLOHMANN_JSON_PASTE46, NLOHMANN_JSON_PASTE45, \
        NLOHMANN_JSON_PASTE44, NLOHMANN_JSON_PASTE43, NLOHMANN_JSON_PASTE42, NLOHMANN_JSON_PASTE41, \
        NLOHMANN_JSON_PASTE40, NLOHMANN_JSON_PASTE39, NLOHMANN_JSON_PASTE38, NLOHMANN_JSON_PASTE37, \
        NLOHMANN_JSON_PASTE36, NLOHMANN_JSON_PASTE35, NLOHMANN_JSON_PASTE34, NLOHMANN_JSON_PASTE33, \
        NLOHMANN_JSON_PASTE32, NLOHMANN_JSON_PASTE31, NLOHMANN_JSON_PASTE30, NLOHMANN_JSON_PASTE29, \
        NLOHMANN_JSON_PASTE28, NLOHMANN_JSON_PASTE27, NLOHMANN_JSON_PASTE26, NLOHMANN_JSON_PASTE25, \
        NLOHMANN_JSON_PASTE24, NLOHMANN_JSON_PASTE23, NLOHMANN_JSON_PASTE22, NLOHMANN_JSON_PASTE21, \
        NLOHMANN_JSON_PASTE20, NLOHMANN_JSON_PASTE19, NLOHMANN_JSON_PASTE18, NLOHMANN_JSON_PASTE17, \
        NLOHMANN_JSON_PASTE16, NLOHMANN_JSON_PASTE15, NLOHMANN_JSON_PASTE14, NLOHMANN_JSON_PASTE13, \
        NLOHMANN_JSON_PASTE12, NLOHMANN_JSON_PASTE11, NLOHMANN_JSON_PASTE10, NLOHMANN_JSON_PASTE9, \
        NLOHMANN_JSON_PASTE8, NLOHMANN_JSON_PASTE7, NLOHMANN_JSON_PASTE6, NLOHMANN_JSON_PASTE5, \
        NLOHMANN_JSON_PASTE4, NLOHMANN_JSON_PASTE3, NLOHMANN_JSON_PASTE2, NLOHMANN_JSON_PASTE1)(__VA_ARGS__))
// NLOHMANN_JSON_PASTE<N>(func, v1, ..., v<N-1>) applies `func` to each value
// in order. PASTE2 is the base case; every larger variant peels off the first
// value with PASTE2 and hands the rest to the next smaller variant.
#define NLOHMANN_JSON_PASTE2(func, v1) \
    func(v1)
#define NLOHMANN_JSON_PASTE3(func, v1, v2) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE2(func, v2)
#define NLOHMANN_JSON_PASTE4(func, v1, v2, v3) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE3(func, v2, v3)
#define NLOHMANN_JSON_PASTE5(func, v1, v2, v3, v4) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE4(func, v2, v3, v4)
#define NLOHMANN_JSON_PASTE6(func, v1, v2, v3, v4, v5) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE5(func, v2, v3, v4, v5)
#define NLOHMANN_JSON_PASTE7(func, v1, v2, v3, v4, v5, v6) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE6(func, v2, v3, v4, v5, v6)
#define NLOHMANN_JSON_PASTE8(func, v1, v2, v3, v4, v5, v6, v7) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE7(func, v2, v3, v4, v5, v6, v7)
#define NLOHMANN_JSON_PASTE9(func, v1, v2, v3, v4, v5, v6, v7, v8) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE8(func, v2, v3, v4, v5, v6, v7, v8)
#define NLOHMANN_JSON_PASTE10(func, v1, v2, v3, v4, v5, v6, v7, v8, v9) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE9(func, v2, v3, v4, v5, v6, v7, v8, v9)
#define NLOHMANN_JSON_PASTE11(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE10(func, v2, v3, v4, v5, v6, v7, v8, v9, v10)
#define NLOHMANN_JSON_PASTE12(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE11(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11)
#define NLOHMANN_JSON_PASTE13(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE12(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12)
#define NLOHMANN_JSON_PASTE14(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE13(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13)
#define NLOHMANN_JSON_PASTE15(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE14(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14)
#define NLOHMANN_JSON_PASTE16(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE15(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15)
#define NLOHMANN_JSON_PASTE17(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE16(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16)
#define NLOHMANN_JSON_PASTE18(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE17(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17)
#define NLOHMANN_JSON_PASTE19(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE18(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18)
#define NLOHMANN_JSON_PASTE20(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE19(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19)
#define NLOHMANN_JSON_PASTE21(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE20(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20)
#define NLOHMANN_JSON_PASTE22(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE21(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21)
#define NLOHMANN_JSON_PASTE23(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE22(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22)
#define NLOHMANN_JSON_PASTE24(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23) \
    NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE23(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23)
#define NLOHMANN_JSON_PASTE25(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE24(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24)
#define NLOHMANN_JSON_PASTE26(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE25(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25)
#define NLOHMANN_JSON_PASTE27(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE26(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26)
#define NLOHMANN_JSON_PASTE28(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE27(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27)
#define NLOHMANN_JSON_PASTE29(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE28(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28)
#define NLOHMANN_JSON_PASTE30(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE29(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29)
#define NLOHMANN_JSON_PASTE31(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE30(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30)
#define NLOHMANN_JSON_PASTE32(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE31(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31)
#define NLOHMANN_JSON_PASTE33(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE32(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32)
#define NLOHMANN_JSON_PASTE34(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE33(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33)
#define NLOHMANN_JSON_PASTE35(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE34(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34)
#define NLOHMANN_JSON_PASTE36(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE35(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35)
#define NLOHMANN_JSON_PASTE37(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE36(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36)
#define NLOHMANN_JSON_PASTE38(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE37(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37)
#define NLOHMANN_JSON_PASTE39(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE38(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38)
#define NLOHMANN_JSON_PASTE40(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE39(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39)
#define NLOHMANN_JSON_PASTE41(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE40(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40)
#define NLOHMANN_JSON_PASTE42(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE41(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41)
#define NLOHMANN_JSON_PASTE43(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE42(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42)
#define NLOHMANN_JSON_PASTE44(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE43(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43)
#define NLOHMANN_JSON_PASTE45(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE44(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44)
#define NLOHMANN_JSON_PASTE46(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE45(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45)
#define NLOHMANN_JSON_PASTE47(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE46(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46)
#define NLOHMANN_JSON_PASTE48(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE47(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47)
#define NLOHMANN_JSON_PASTE49(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE48(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48)
#define NLOHMANN_JSON_PASTE50(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE49(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49)
#define NLOHMANN_JSON_PASTE51(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE50(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50)
#define NLOHMANN_JSON_PASTE52(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE51(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51)
#define NLOHMANN_JSON_PASTE53(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE52(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52)
#define NLOHMANN_JSON_PASTE54(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE53(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53)
#define NLOHMANN_JSON_PASTE55(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE54(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54)
#define NLOHMANN_JSON_PASTE56(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE55(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55)
#define NLOHMANN_JSON_PASTE57(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE56(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56)
#define NLOHMANN_JSON_PASTE58(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE57(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57)
#define NLOHMANN_JSON_PASTE59(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE58(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58)
#define NLOHMANN_JSON_PASTE60(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE59(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59)
#define NLOHMANN_JSON_PASTE61(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE60(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60)
#define NLOHMANN_JSON_PASTE62(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE61(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61)
#define NLOHMANN_JSON_PASTE63(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE62(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62)
#define NLOHMANN_JSON_PASTE64(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE63(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63)

// Per-member statements pasted into generated to_json/from_json bodies. Both
// expect the enclosing function to name its JSON value `nlohmann_json_j` and its
// object `nlohmann_json_t`.
// NLOHMANN_JSON_TO: assign member v1 to the JSON entry keyed by the stringized
// member name (#v1).
#define NLOHMANN_JSON_TO(v1) nlohmann_json_j[#v1] = nlohmann_json_t.v1;
// NLOHMANN_JSON_FROM: look up the entry keyed by #v1 with at() and write it into
// member v1 through get_to().
#define NLOHMANN_JSON_FROM(v1) nlohmann_json_j.at(#v1).get_to(nlohmann_json_t.v1);

/*!
@brief macro defining `to_json`/`from_json` as friend functions of a type
@def NLOHMANN_DEFINE_TYPE_INTRUSIVE

Because the generated functions are declared `friend`, the macro has to be
placed inside the definition of @a Type. The variadic arguments are the member
names: each one is expanded through NLOHMANN_JSON_TO in `to_json` and through
NLOHMANN_JSON_FROM in `from_json`, so every member is stored under a key equal
to its own name.

@since version 3.9.0
*/
#define NLOHMANN_DEFINE_TYPE_INTRUSIVE(Type, ...)  \
    friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
    friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) }

/*!
@brief macro defining `to_json`/`from_json` as inline free functions for a type
@def NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE

The generated functions are plain `inline` functions rather than friends, so
the listed members are accessed as `nlohmann_json_t.member` from outside
@a Type and must be accessible there. The variadic arguments are the member
names: each one is expanded through NLOHMANN_JSON_TO in `to_json` and through
NLOHMANN_JSON_FROM in `from_json`, so every member is stored under a key equal
to its own name.

@since version 3.9.0
*/
#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Type, ...)  \
    inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
    inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) }

// Implicit conversions are enabled (1) unless the macro was already defined
// before this point.
#ifndef JSON_USE_IMPLICIT_CONVERSIONS
    #define JSON_USE_IMPLICIT_CONVERSIONS 1
#endif

// JSON_EXPLICIT expands to nothing when implicit conversions are enabled and to
// the `explicit` keyword otherwise.
#if JSON_USE_IMPLICIT_CONVERSIONS
    #define JSON_EXPLICIT
#else
    #define JSON_EXPLICIT explicit
#endif


namespace nlohmann
{
namespace detail
{

/*!
@brief replace all occurrences of a substring by another string

@param[in,out] s  the string to manipulate; changed so that all
               occurrences of @a f are replaced with @a t
@param[in]     f  the substring to replace with @a t
@param[in]     t  the string to replace @a f

@pre The search string @a f must not be empty. **This precondition is
enforced with an assertion.**

@note After each replacement the search resumes behind the inserted text, so
      occurrences of @a f that appear inside @a t are not replaced again.

@since version 2.0.0
*/
inline void replace_substring(std::string& s, const std::string& f,
                              const std::string& t)
{
    JSON_ASSERT(!f.empty());

    std::string::size_type pos = s.find(f);   // first occurrence of f
    while (pos != std::string::npos)
    {
        s.replace(pos, f.size(), t);          // swap this occurrence for t
        pos = s.find(f, pos + t.size());      // continue behind the inserted t
    }
}

/*!
 * @brief string escaping as described in RFC 6901 (Sect. 4)
 * @param[in] s string to escape
 * @return    escaped string
 *
 * Note the order of escaping "~" to "~0" and "/" to "~1" is important:
 * escaping "/" first would introduce "~" characters that the second pass
 * would then escape again.
 */
inline std::string escape(std::string s)
{
    replace_substring(s, "~", "~0");
    replace_substring(s, "/", "~1");
    return s;
}

/*!
 * @brief string unescaping as described in RFC 6901 (Sect. 4)
 * @param[in,out] s string to unescape; modified in place
 *
 * Note the order of unescaping "~1" to "/" and "~0" to "~" is important:
 * handling "~0" first would turn the input "~01" into "~1" and then into "/"
 * instead of the intended "~1".
 *
 * Declared `inline` like the other helpers in this header so that all
 * translation units share one definition.
 */
inline void unescape(std::string& s)
{
    replace_substring(s, "~1", "/");
    replace_substring(s, "~0", "~");
}

} // namespace detail
} // namespace nlohmann

// #include <nlohmann/detail/input/position_t.hpp>


#include <cstddef> // size_t

namespace nlohmann
{
namespace detail
{
/// struct to capture the start position of the current token
struct position_t
{
    /// the total number of characters read
    std::size_t chars_read_total = 0;
    /// the number of characters read in the current line
    std::size_t chars_read_current_line = 0;
    /// the number of lines read
    std::size_t lines_read = 0;

    /// conversion to size_t to preserve SAX interface; yields only
    /// chars_read_total, the line-related counters are not part of the result
    constexpr operator std::size_t() const
    {
        return chars_read_total;
    }
};

} // namespace detail
} // namespace nlohmann

// #include <nlohmann/detail/macro_scope.hpp>


namespace nlohmann
{
namespace detail
{
////////////////
// exceptions //
////////////////

/*!
@brief general exception of the @ref basic_json class

This class is an extension of `std::exception` objects with a member @a id for
exception ids. It is used as the base class for all exceptions thrown by the
@ref basic_json class. This class can hence be used as "wildcard" to catch
exceptions.

Subclasses:
- @ref parse_error for exceptions indicating a parse error
- @ref invalid_iterator for exceptions indicating errors with iterators
- @ref type_error for exceptions indicating executing a member function with
                  a wrong type
- @ref out_of_range for exceptions indicating access out of the defined range
- @ref other_error for exceptions indicating other library errors

@internal
@note To have nothrow-copy-constructible exceptions, we internally use
      `std::runtime_error` which can cope with arbitrary-length error messages.
      Intermediate strings are built with static functions and then passed to
      the actual constructor.
@endinternal

@liveexample{The following code shows how arbitrary library exceptions can be
caught.,exception}

@since version 3.0.0
*/
class exception : public std::exception
{
  public:
    /// returns the explanatory string held by the internal runtime_error
    const char* what() const noexcept override
    {
        return m.what();
    }

    /// the id of the exception
    const int id; // NOLINT(cppcoreguidelines-non-private-member-variables-in-classes)

  protected:
    /// stores the id and copies the message into the internal runtime_error
    JSON_HEDLEY_NON_NULL(3)
    exception(int id_, const char* what_arg) : id(id_), m(what_arg) {}

    /// builds the message prefix "[json.exception.<ename>.<id_>] "
    static std::string name(const std::string& ename, int id_)
    {
        std::string prefix = "[json.exception.";
        prefix += ename;
        prefix += ".";
        prefix += std::to_string(id_);
        prefix += "] ";
        return prefix;
    }

    /// With JSON_DIAGNOSTICS enabled: walks the m_parent chain upwards from
    /// @a leaf_element, collects the array index or object key under which
    /// each node is stored in its parent, and returns the escaped tokens as
    /// "(/tok/tok/...) " in root-to-leaf order, or "" when nothing was
    /// collected. With JSON_DIAGNOSTICS disabled: always returns "".
    template<typename BasicJsonType>
    static std::string diagnostics(const BasicJsonType& leaf_element)
    {
#if JSON_DIAGNOSTICS
        // tokens are gathered leaf-first and emitted in reverse further down
        std::vector<std::string> tokens;

        const auto* node = &leaf_element;
        while (node->m_parent != nullptr)
        {
            const auto* parent = node->m_parent;

            switch (parent->type())
            {
                case value_t::array:
                {
                    // identify the node by address to recover its index
                    const auto& siblings = *parent->m_value.array;
                    for (std::size_t idx = 0; idx < siblings.size(); ++idx)
                    {
                        if (&siblings[idx] == node)
                        {
                            tokens.emplace_back(std::to_string(idx));
                            break;
                        }
                    }
                    break;
                }

                case value_t::object:
                {
                    // identify the node by address to recover its key
                    for (const auto& member : *parent->m_value.object)
                    {
                        if (&member.second == node)
                        {
                            tokens.emplace_back(member.first.c_str());
                            break;
                        }
                    }
                    break;
                }

                default:   // LCOV_EXCL_LINE
                    break; // LCOV_EXCL_LINE
            }

            node = parent;
        }

        if (tokens.empty())
        {
            return "";
        }

        std::string path = "(";
        for (auto it = tokens.rbegin(); it != tokens.rend(); ++it)
        {
            path += "/";
            path += detail::escape(*it);
        }
        path += ") ";
        return path;
#else
        static_cast<void>(leaf_element);
        return "";
#endif
    }

  private:
    /// an exception object as storage for error messages
    std::runtime_error m;
};

/*!
@brief exception indicating a parse error

This exception is thrown by the library when a parse error occurs. Parse errors
can occur during the deserialization of JSON text, CBOR, MessagePack, as well
as when using JSON Patch.

Member @a byte holds the byte index of the last read character in the input
file.

Exceptions have ids 1xx.

name / id                      | example message | description
------------------------------ | --------------- | -------------------------
json.exception.parse_error.101 | parse error at 2: unexpected end of input; expected string literal | This error indicates a syntax error while deserializing a JSON text. The error message describes that an unexpected token (character) was encountered, and the member @a byte indicates the error position.
json.exception.parse_error.102 | parse error at 14: missing or wrong low surrogate | JSON uses the `\uxxxx` format to describe Unicode characters. Code points above 0xFFFF are split into two `\uxxxx` entries ("surrogate pairs"). This error indicates that the surrogate pair is incomplete or contains an invalid code point.
json.exception.parse_error.103 | parse error: code points above 0x10FFFF are invalid | Unicode supports code points up to 0x10FFFF. Code points above 0x10FFFF are invalid.
json.exception.parse_error.104 | parse error: JSON patch must be an array of objects | [RFC 6902](https://tools.ietf.org/html/rfc6902) requires a JSON Patch document to be a JSON document that represents an array of objects.
json.exception.parse_error.105 | parse error: operation must have string member 'op' | An operation of a JSON Patch document must contain exactly one "op" member, whose value indicates the operation to perform. Its value must be one of "add", "remove", "replace", "move", "copy", or "test"; other values are errors.
json.exception.parse_error.106 | parse error: array index '01' must not begin with '0' | An array index in a JSON Pointer ([RFC 6901](https://tools.ietf.org/html/rfc6901)) may be `0` or any number without a leading `0`.
json.exception.parse_error.107 | parse error: JSON pointer must be empty or begin with '/' - was: 'foo' | A JSON Pointer must be a Unicode string containing a sequence of zero or more reference tokens, each prefixed by a `/` character.
json.exception.parse_error.108 | parse error: escape character '~' must be followed with '0' or '1' | In a JSON Pointer, only `~0` and `~1` are valid escape sequences.
json.exception.parse_error.109 | parse error: array index 'one' is not a number | A JSON Pointer array index must be a number.
json.exception.parse_error.110 | parse error at 1: cannot read 2 bytes from vector | When parsing CBOR or MessagePack, the byte vector ends before the complete value has been read.
json.exception.parse_error.112 | parse error at 1: error reading CBOR; last byte: 0xF8 | Not all types of CBOR or MessagePack are supported. This exception occurs if an unsupported byte was read.
json.exception.parse_error.113 | parse error at 2: expected a CBOR string; last byte: 0x98 | While parsing a map key, a value that is not a string has been read.
json.exception.parse_error.114 | parse error: Unsupported BSON record type 0x0F | The parsing of the corresponding BSON record type is not implemented (yet).
json.exception.parse_error.115 | parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1A | A UBJSON high-precision number could not be parsed.

@note For an input with n bytes, 1 is the index of the first character and n+1
      is the index of the terminating null byte or the end of file. This also
      holds true when reading a byte vector (CBOR or MessagePack).

@liveexample{The following code shows how a `parse_error` exception can be
caught.,parse_error}

@sa - @ref exception for the base class of the library exceptions
@sa - @ref invalid_iterator for exceptions indicating errors with iterators
@sa - @ref type_error for exceptions indicating executing a member function with
                    a wrong type
@sa - @ref out_of_range for exceptions indicating access out of the defined range
@sa - @ref other_error for exceptions indicating other library errors

@since version 3.0.0
*/
class parse_error : public exception
{
  public:
    /*!
    @brief create a parse error exception from a line/column position

    The message is assembled as
    `<name>parse error<position>: <diagnostics><what_arg>`, where `<position>`
    is the text produced by position_string().

    @param[in] id_       the id of the exception
    @param[in] pos       the position where the error occurred; its
                         `chars_read_total` member is stored in @ref byte
    @param[in] what_arg  the explanatory string
    @param[in] context   JSON value handed to exception::diagnostics
    @return parse_error object
    */
    template<typename BasicJsonType>
    static parse_error create(int id_, const position_t& pos, const std::string& what_arg, const BasicJsonType& context)
    {
        std::string message = exception::name("parse_error", id_);
        message += "parse error";
        message += position_string(pos);
        message += ": ";
        message += exception::diagnostics(context);
        message += what_arg;
        return parse_error(id_, pos.chars_read_total, message.c_str());
    }

    /*!
    @brief create a parse error exception from a byte index

    Same message layout as the overload above, except that the position part
    reads ` at byte N` and is left out entirely when @a byte_ is 0.

    @param[in] id_       the id of the exception
    @param[in] byte_     byte index of the error (0 suppresses the position text)
    @param[in] what_arg  the explanatory string
    @param[in] context   JSON value handed to exception::diagnostics
    @return parse_error object
    */
    template<typename BasicJsonType>
    static parse_error create(int id_, std::size_t byte_, const std::string& what_arg, const BasicJsonType& context)
    {
        std::string message = exception::name("parse_error", id_);
        message += "parse error";
        if (byte_ != 0)
        {
            message += " at byte ";
            message += std::to_string(byte_);
        }
        message += ": ";
        message += exception::diagnostics(context);
        message += what_arg;
        return parse_error(id_, byte_, message.c_str());
    }

    /*!
    @brief byte index of the parse error

    The byte index of the last read character in the input file.

    @note For an input with n bytes, 1 is the index of the first character and
          n+1 is the index of the terminating null byte or the end of file.
          This also holds true when reading a byte vector (CBOR or MessagePack).
    */
    const std::size_t byte;

  private:
    // only reachable through the create() factories above
    parse_error(int id_, std::size_t byte_, const char* what_arg)
        : exception(id_, what_arg), byte(byte_) {}

    // renders " at line L, column C"; the line number is reported one-based
    // (lines_read + 1), the column is taken from chars_read_current_line as is
    static std::string position_string(const position_t& pos)
    {
        std::string text = " at line ";
        text += std::to_string(pos.lines_read + 1);
        text += ", column ";
        text += std::to_string(pos.chars_read_current_line);
        return text;
    }
};

/*!
@brief exception indicating errors with iterators

This exception is thrown if iterators passed to a library function do not match
the expected semantics.

Exceptions have ids 2xx.

name / id                           | example message | description
----------------------------------- | --------------- | -------------------------
json.exception.invalid_iterator.201 | iterators are not compatible | The iterators passed to constructor @ref basic_json(InputIT first, InputIT last) are not compatible, meaning they do not belong to the same container. Therefore, the range (@a first, @a last) is invalid.
json.exception.invalid_iterator.202 | iterator does not fit current value | In an erase or insert function, the passed iterator @a pos does not belong to the JSON value for which the function was called. It hence does not define a valid position for the deletion/insertion.
json.exception.invalid_iterator.203 | iterators do not fit current value | Either iterator passed to function @ref erase(IteratorType first, IteratorType last) does not belong to the JSON value from which values shall be erased. It hence does not define a valid range to delete values from.
json.exception.invalid_iterator.204 | iterators out of range | When an iterator range for a primitive type (number, boolean, or string) is passed to a constructor or an erase function, this range has to be exactly (@ref begin(), @ref end()), because this is the only way the single stored value is expressed. All other ranges are invalid.
json.exception.invalid_iterator.205 | iterator out of range | When an iterator for a primitive type (number, boolean, or string) is passed to an erase function, the iterator has to be the @ref begin() iterator, because it is the only way to address the stored value. All other iterators are invalid.
json.exception.invalid_iterator.206 | cannot construct with iterators from null | The iterators passed to constructor @ref basic_json(InputIT first, InputIT last) belong to a JSON null value and hence do not define a valid range.
json.exception.invalid_iterator.207 | cannot use key() for non-object iterators | The key() member function can only be used on iterators belonging to a JSON object, because other types do not have a concept of a key.
json.exception.invalid_iterator.208 | cannot use operator[] for object iterators | The operator[] to specify a concrete offset cannot be used on iterators belonging to a JSON object, because JSON objects are unordered.
json.exception.invalid_iterator.209 | cannot use offsets with object iterators | The offset operators (+, -, +=, -=) cannot be used on iterators belonging to a JSON object, because JSON objects are unordered.
json.exception.invalid_iterator.210 | iterators do not fit | The iterator range passed to the insert function is not compatible, meaning the iterators do not belong to the same container. Therefore, the range (@a first, @a last) is invalid.
json.exception.invalid_iterator.211 | passed iterators may not belong to container | The iterator range passed to the insert function must not be a subrange of the container to insert to.
json.exception.invalid_iterator.212 | cannot compare iterators of different containers | When two iterators are compared, they must belong to the same container.
json.exception.invalid_iterator.213 | cannot compare order of object iterators | The order of object iterators cannot be compared, because JSON objects are unordered.
json.exception.invalid_iterator.214 | cannot get value | Cannot get value for iterator: Either the iterator belongs to a null value or it is an iterator to a primitive type (number, boolean, or string), but the iterator is different to @ref begin().

@liveexample{The following code shows how an `invalid_iterator` exception can be
caught.,invalid_iterator}

@sa - @ref exception for the base class of the library exceptions
@sa - @ref parse_error for exceptions indicating a parse error
@sa - @ref type_error for exceptions indicating executing a member function with
                    a wrong type
@sa - @ref out_of_range for exceptions indicating access out of the defined range
@sa - @ref other_error for exceptions indicating other library errors

@since version 3.0.0
*/
class invalid_iterator : public exception
{
  public:
    /*!
    @brief create an invalid_iterator exception
    @param[in] id_       the id of the exception
    @param[in] what_arg  the explanatory string, appended last to the message
    @param[in] context   JSON value handed to exception::diagnostics
    @return invalid_iterator object
    */
    template<typename BasicJsonType>
    static invalid_iterator create(int id_, const std::string& what_arg, const BasicJsonType& context)
    {
        std::string message = exception::name("invalid_iterator", id_);
        message += exception::diagnostics(context);
        message += what_arg;
        return invalid_iterator(id_, message.c_str());
    }

  private:
    // NOTE(review): JSON_HEDLEY_NON_NULL is defined elsewhere; presumably it
    // marks what_arg (index 3 when the implicit `this` counts as 1) as non-null
    JSON_HEDLEY_NON_NULL(3)
    invalid_iterator(int id_, const char* what_arg)
        : exception(id_, what_arg) {}
};

/*!
@brief exception indicating executing a member function with a wrong type

This exception is thrown in case of a type error; that is, a library function is
executed on a JSON value whose type does not match the expected semantics.

Exceptions have ids 3xx.

name / id                     | example message | description
----------------------------- | --------------- | -------------------------
json.exception.type_error.301 | cannot create object from initializer list | To create an object from an initializer list, the initializer list must consist only of a list of pairs whose first element is a string. When this constraint is violated, an array is created instead.
json.exception.type_error.302 | type must be object, but is array | During implicit or explicit value conversion, the JSON type must be compatible to the target type. For instance, a JSON string can only be converted into string types, but not into numbers or boolean types.
json.exception.type_error.303 | incompatible ReferenceType for get_ref, actual type is object | To retrieve a reference to a value stored in a @ref basic_json object with @ref get_ref, the type of the reference must match the value type. For instance, for a JSON array, the @a ReferenceType must be @ref array_t &.
json.exception.type_error.304 | cannot use at() with string | The @ref at() member functions can only be executed for certain JSON types.
json.exception.type_error.305 | cannot use operator[] with string | The @ref operator[] member functions can only be executed for certain JSON types.
json.exception.type_error.306 | cannot use value() with string | The @ref value() member functions can only be executed for certain JSON types.
json.exception.type_error.307 | cannot use erase() with string | The @ref erase() member functions can only be executed for certain JSON types.
json.exception.type_error.308 | cannot use push_back() with string | The @ref push_back() and @ref operator+= member functions can only be executed for certain JSON types.
json.exception.type_error.309 | cannot use insert() with | The @ref insert() member functions can only be executed for certain JSON types.
json.exception.type_error.310 | cannot use swap() with number | The @ref swap() member functions can only be executed for certain JSON types.
json.exception.type_error.311 | cannot use emplace_back() with string | The @ref emplace_back() member function can only be executed for certain JSON types.
json.exception.type_error.312 | cannot use update() with string | The @ref update() member functions can only be executed for certain JSON types.
json.exception.type_error.313 | invalid value to unflatten | The @ref unflatten function converts an object whose keys are JSON Pointers back into an arbitrary nested JSON value. The JSON Pointers must not overlap, because then the resulting value would not be well defined.
json.exception.type_error.314 | only objects can be unflattened | The @ref unflatten function only works for an object whose keys are JSON Pointers.
json.exception.type_error.315 | values in object must be primitive | The @ref unflatten function only works for an object whose keys are JSON Pointers and whose values are primitive.
json.exception.type_error.316 | invalid UTF-8 byte at index 10: 0x7E | The @ref dump function only works with UTF-8 encoded strings; that is, if you assign a `std::string` to a JSON value, make sure it is UTF-8 encoded. |
json.exception.type_error.317 | JSON value cannot be serialized to requested format | The dynamic type of the object cannot be represented in the requested serialization format (e.g. a raw `true` or `null` JSON object cannot be serialized to BSON) |

@liveexample{The following code shows how a `type_error` exception can be
caught.,type_error}

@sa - @ref exception for the base class of the library exceptions
@sa - @ref parse_error for exceptions indicating a parse error
@sa - @ref invalid_iterator for exceptions indicating errors with iterators
@sa - @ref out_of_range for exceptions indicating access out of the defined range
@sa - @ref other_error for exceptions indicating other library errors

@since version 3.0.0
*/
class type_error : public exception
{
  public:
    /*!
    @brief create a type_error exception
    @param[in] id_       the id of the exception
    @param[in] what_arg  the explanatory string, appended last to the message
    @param[in] context   JSON value handed to exception::diagnostics
    @return type_error object
    */
    template<typename BasicJsonType>
    static type_error create(int id_, const std::string& what_arg, const BasicJsonType& context)
    {
        std::string message = exception::name("type_error", id_);
        message += exception::diagnostics(context);
        message += what_arg;
        return type_error(id_, message.c_str());
    }

  private:
    // NOTE(review): JSON_HEDLEY_NON_NULL is defined elsewhere; presumably it
    // marks what_arg (index 3 when the implicit `this` counts as 1) as non-null
    JSON_HEDLEY_NON_NULL(3)
    type_error(int id_, const char* what_arg) : exception(id_, what_arg) {}
};

/*!
@brief exception indicating access out of the defined range

This exception is thrown in case a library function is called on an input
parameter that exceeds the expected range, for instance in case of array
indices or nonexisting object keys.

Exceptions have ids 4xx.

name / id                       | example message | description
------------------------------- | --------------- | -------------------------
json.exception.out_of_range.401 | array index 3 is out of range | The provided array index @a i is larger than @a size-1.
json.exception.out_of_range.402 | array index '-' (3) is out of range | The special array index `-` in a JSON Pointer never describes a valid element of the array, but the index past the end. That is, it can only be used to add elements at this position, but not to read it.
json.exception.out_of_range.403 | key 'foo' not found | The provided key was not found in the JSON object.
json.exception.out_of_range.404 | unresolved reference token 'foo' | A reference token in a JSON Pointer could not be resolved.
json.exception.out_of_range.405 | JSON pointer has no parent | The JSON Patch operations 'remove' and 'add' can not be applied to the root element of the JSON value.
json.exception.out_of_range.406 | number overflow parsing '10E1000' | A parsed number could not be stored without changing it to NaN or INF.
json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON and BSON only support integer numbers up to 9223372036854775807. (until version 3.8.0) |
json.exception.out_of_range.408 | excessive array size: 8658170730974374167 | The size (following `#`) of an UBJSON array or object exceeds the maximal capacity. |
json.exception.out_of_range.409 | BSON key cannot contain code point U+0000 (at byte 2) | Key identifiers to be serialized to BSON cannot contain code point U+0000, since the key is stored as zero-terminated c-string |

@liveexample{The following code shows how an `out_of_range` exception can be
caught.,out_of_range}

@sa - @ref exception for the base class of the library exceptions
@sa - @ref parse_error for exceptions indicating a parse error
@sa - @ref invalid_iterator for exceptions indicating errors with iterators
@sa - @ref type_error for exceptions indicating executing a member function with
                    a wrong type
@sa - @ref other_error for exceptions indicating other library errors

@since version 3.0.0
*/
class out_of_range : public exception
{
  public:
    /*!
    @brief create an out_of_range exception
    @param[in] id_       the id of the exception
    @param[in] what_arg  the explanatory string, appended last to the message
    @param[in] context   JSON value handed to exception::diagnostics
    @return out_of_range object
    */
    template<typename BasicJsonType>
    static out_of_range create(int id_, const std::string& what_arg, const BasicJsonType& context)
    {
        std::string message = exception::name("out_of_range", id_);
        message += exception::diagnostics(context);
        message += what_arg;
        return out_of_range(id_, message.c_str());
    }

  private:
    // NOTE(review): JSON_HEDLEY_NON_NULL is defined elsewhere; presumably it
    // marks what_arg (index 3 when the implicit `this` counts as 1) as non-null
    JSON_HEDLEY_NON_NULL(3)
    out_of_range(int id_, const char* what_arg) : exception(id_, what_arg) {}
};

/*!
@brief exception indicating other library errors

This exception is thrown in case of errors that cannot be classified with the
other exception types.

Exceptions have ids 5xx.

name / id                      | example message | description
------------------------------ | --------------- | -------------------------
json.exception.other_error.501 | unsuccessful: {"op":"test","path":"/baz", "value":"bar"} | A JSON Patch operation 'test' failed. The unsuccessful operation is also printed.

@sa - @ref exception for the base class of the library exceptions
@sa - @ref parse_error for exceptions indicating a parse error
@sa - @ref invalid_iterator for exceptions indicating errors with iterators
@sa - @ref type_error for exceptions indicating executing a member function with
                    a wrong type
@sa - @ref out_of_range for exceptions indicating access out of the defined range

@liveexample{The following code shows how an `other_error` exception can be
caught.,other_error}

@since version 3.0.0
*/
class other_error : public exception
{
  public:
    /*!
    @brief create an other_error exception
    @param[in] id_       the id of the exception
    @param[in] what_arg  the explanatory string, appended last to the message
    @param[in] context   JSON value handed to exception::diagnostics
    @return other_error object
    */
    template<typename BasicJsonType>
    static other_error create(int id_, const std::string& what_arg, const BasicJsonType& context)
    {
        std::string message = exception::name("other_error", id_);
        message += exception::diagnostics(context);
        message += what_arg;
        return other_error(id_, message.c_str());
    }

  private:
    // NOTE(review): JSON_HEDLEY_NON_NULL is defined elsewhere; presumably it
    // marks what_arg (index 3 when the implicit `this` counts as 1) as non-null
    JSON_HEDLEY_NON_NULL(3)
    other_error(int id_, const char* what_arg) : exception(id_, what_arg) {}
};
}  // namespace detail
}  // namespace nlohmann

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/cpp_future.hpp>


#include <cstddef> // size_t
#include <type_traits> // conditional, enable_if, false_type, integral_constant, is_constructible, is_integral, is_same, remove_cv, remove_reference, true_type
#include <utility> // index_sequence, make_index_sequence, index_sequence_for

// #include <nlohmann/detail/macro_scope.hpp>


namespace nlohmann
{
namespace detail
{

// removes the reference from Type first, then its top-level const/volatile
template<typename Type>
using uncvref_t = typename std::remove_cv<typename std::remove_reference<Type>::type>::type;

#ifdef JSON_HAS_CPP_14

// C++14 ships these helpers in the standard library, so they are only
// imported into this namespace instead of being re-implemented
using std::index_sequence_for;
using std::make_index_sequence;
using std::index_sequence;
using std::enable_if_t;

#else

// C++11 stand-in for std::enable_if_t: names Result when Condition is true
// and has no type (substitution failure) otherwise
template<bool Condition, typename Result = void>
using enable_if_t = typename std::enable_if<Condition, Result>::type;

// The following code is taken from https://github.com/abseil/abseil-cpp/blob/10cb35e459f5ecca5b2ff107635da0bfa41011b4/absl/utility/utility.h
// which is part of Google Abseil (https://github.com/abseil/abseil-cpp), licensed under the Apache License 2.0.

//// START OF CODE FROM GOOGLE ABSEIL

// integer_sequence
//
// Class template representing a compile-time integer sequence. An instantiation
// of `integer_sequence<T, Ints...>` has a sequence of integers encoded in its
// type through its template arguments (which is a common need when
// working with C++11 variadic templates). `absl::integer_sequence` is designed
// to be a drop-in replacement for C++14's `std::integer_sequence`.
//
// Example:
//
//   template< class T, T... Ints >
//   void user_function(integer_sequence<T, Ints...>);
//
//   int main()
//   {
//     // user_function's `T` will be deduced to `int` and `Ints...`
//     // will be deduced to `0, 1, 2, 3, 4`.
//     user_function(make_integer_sequence<int, 5>());
//   }
template <typename T, T... Ints>
struct integer_sequence
{
    // element type of the encoded sequence
    typedef T value_type;

    // number of integers carried in the parameter pack
    static constexpr std::size_t size() noexcept
    {
        return sizeof...(Ints);
    }
};

// index_sequence
//
// Shorthand for an `integer_sequence` whose element type is `size_t`; meant
// as a drop-in replacement for C++14's `std::index_sequence`.
template <size_t... Indices>
using index_sequence = integer_sequence<size_t, Indices...>;

namespace utility_internal
{

// Extend<Seq, SeqSize, Rem>::type is Seq followed by a copy of Seq with every
// element shifted by SeqSize, plus one trailing element when Rem is 1.
// Only Rem == 0 and Rem == 1 are defined.
template <typename Seq, size_t SeqSize, size_t Rem>
struct Extend;

// Rem == 0: the length doubles exactly.
// SeqSize always equals sizeof...(Ints); it is passed in explicitly for efficiency.
template <typename T, T... Ints, size_t SeqSize>
struct Extend<integer_sequence<T, Ints...>, SeqSize, 0>
{
    typedef integer_sequence<T, Ints..., (Ints + SeqSize)...> type;
};

// Rem == 1: the length doubles and the element 2 * SeqSize is appended.
template <typename T, T... Ints, size_t SeqSize>
struct Extend<integer_sequence<T, Ints...>, SeqSize, 1>
{
    typedef integer_sequence<T, Ints..., (Ints + SeqSize)..., 2 * SeqSize> type;
};

// Recursion helper for 'make_integer_sequence<T, N>'.
// 'Gen<T, N>::type' is 'integer_sequence<T, 0, 1, ... N-1>', obtained by
// generating the sequence for N / 2 and extending it (halving recursion).
template <typename T, size_t N>
struct Gen
{
    typedef typename Extend<typename Gen<T, N / 2>::type, N / 2, N % 2>::type type;
};

// Recursion anchor: the empty sequence.
template <typename T>
struct Gen<T, 0>
{
    typedef integer_sequence<T> type;
};

}  // namespace utility_internal

// Compile-time sequences of integers

// make_integer_sequence
//
// Alias for `integer_sequence<T, 0, 1, ..., N-1>`; intended as a drop-in
// replacement for C++14's `std::make_integer_sequence`.
template <typename T, T Count>
using make_integer_sequence = typename utility_internal::Gen<T, Count>::type;

// make_index_sequence
//
// Alias for `index_sequence<0, 1, ..., N-1>`, i.e. `make_integer_sequence`
// fixed to `size_t`; intended as a drop-in replacement for C++14's
// `std::make_index_sequence`.
template <size_t Count>
using make_index_sequence = make_integer_sequence<size_t, Count>;

// index_sequence_for
//
// Turns a pack of types into the index sequence with one index per type;
// intended as a drop-in replacement for C++14's `std::index_sequence_for`.
template <typename... Types>
using index_sequence_for = make_index_sequence<sizeof...(Types)>;

//// END OF CODE FROM GOOGLE ABSEIL

#endif

// dispatch utility (taken from ranges-v3): priority_tag<N> derives from
// priority_tag<N - 1>, and the chain ends at priority_tag<0>
template<unsigned Rank> struct priority_tag : priority_tag<Rank - 1> {};
template<> struct priority_tag<0> {};

// taken from ranges-v3: holds one value-initialized constexpr object per type
template<typename Object>
struct static_const
{
    static constexpr Object value{};
};

// out-of-class definition of the static data member declared above
template<typename Object>
constexpr Object static_const<Object>::value;

}  // namespace detail
}  // namespace nlohmann

// #include <nlohmann/detail/meta/type_traits.hpp>


#include <limits> // numeric_limits
#include <type_traits> // false_type, is_constructible, is_integral, is_same, true_type
#include <utility> // declval
#include <tuple> // tuple

// #include <nlohmann/detail/iterators/iterator_traits.hpp>


#include <iterator> // random_access_iterator_tag

// #include <nlohmann/detail/meta/void_t.hpp>


namespace nlohmann
{
namespace detail
{
// maps any list of well-formed types to void; an ill-formed type in the list
// makes void_t<...> itself ill-formed
template<typename... Types> struct make_void
{
    typedef void type;
};
template<typename... Types> using void_t = typename make_void<Types...>::type;
} // namespace detail
}  // namespace nlohmann

// #include <nlohmann/detail/meta/cpp_future.hpp>


namespace nlohmann
{
namespace detail
{
// primary template: empty when Iter lacks any of the five nested member types
template<typename Iter, typename = void>
struct iterator_types {};

// selected only when Iter declares all five member types; they are
// re-exported unchanged
template<typename Iter>
struct iterator_types<
    Iter,
    void_t<typename Iter::difference_type, typename Iter::value_type, typename Iter::pointer,
           typename Iter::reference, typename Iter::iterator_category>>
{
    typedef typename Iter::difference_type difference_type;
    typedef typename Iter::value_type value_type;
    typedef typename Iter::pointer pointer;
    typedef typename Iter::reference reference;
    typedef typename Iter::iterator_category iterator_category;
};

// This is required as some compilers implement std::iterator_traits in a way that
// doesn't work with SFINAE. See https://github.com/nlohmann/json/issues/1341.
template<typename T, typename = void>
struct iterator_traits
{
};

template<typename T>
struct iterator_traits < T, enable_if_t < !std::is_pointer<T>::value >>
            : iterator_types<T>
{
};

// pointers to object types: the member types are spelled out here, with
// random access as the iterator category
template<typename T>
struct iterator_traits<T*, enable_if_t<std::is_object<T>::value>>
{
    using iterator_category = std::random_access_iterator_tag;
    using value_type = T;
    // qualified on purpose: <cstddef> guarantees std::ptrdiff_t, whereas the
    // global-namespace name is left unspecified by the standard
    using difference_type = std::ptrdiff_t;
    using pointer = T*;
    using reference = T&;
};
} // namespace detail
} // namespace nlohmann

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/cpp_future.hpp>

// #include <nlohmann/detail/meta/detected.hpp>


#include <type_traits>

// #include <nlohmann/detail/meta/void_t.hpp>


// https://en.cppreference.com/w/cpp/experimental/is_detected
namespace nlohmann
{
namespace detail
{
// placeholder type reported when detection fails; every special member is
// deleted so that it can be neither created, copied, assigned nor destroyed
struct nonesuch
{
    // construction
    nonesuch() = delete;
    nonesuch(const nonesuch&) = delete;
    nonesuch(const nonesuch&&) = delete;

    // assignment
    void operator=(const nonesuch&) = delete;
    void operator=(nonesuch&&) = delete;

    // destruction
    ~nonesuch() = delete;
};

// primary template: used when Op<Args...> is not a valid type;
// reports false_type and falls back to Default
template<typename Default,
         typename AlwaysVoid,
         template<class...> class Op,
         typename... Args>
struct detector
{
    typedef std::false_type value_t;
    typedef Default type;
};

// chosen when Op<Args...> is well-formed (void_t then yields void, matching
// the AlwaysVoid argument); reports true_type and the detected type
template<typename Default, template<class...> class Op, typename... Args>
struct detector<Default, void_t<Op<Args...>>, Op, Args...>
{
    typedef std::true_type value_t;
    typedef Op<Args...> type;
};

// true_type if Op<Args...> is well-formed, false_type otherwise
template<template<class...> class Op, typename... Args>
using is_detected = typename detector<nonesuch, void, Op, Args...>::value_t;

// Op<Args...> if it is well-formed, nonesuch otherwise
template<template<class...> class Op, typename... Args>
using detected_t = typename detector<nonesuch, void, Op, Args...>::type;

// the detector itself, with a caller-chosen fallback type instead of nonesuch
template<typename Default, template<class...> class Op, typename... Args>
using detected_or = detector<Default, void, Op, Args...>;

// Op<Args...> if it is well-formed, Default otherwise
template<typename Default, template<class...> class Op, typename... Args>
using detected_or_t = typename detected_or<Default, Op, Args...>::type;

// whether the detected type is exactly Expected
template<typename Expected, template<class...> class Op, typename... Args>
using is_detected_exact = std::is_same<Expected, detected_t<Op, Args...>>;

// whether the detected type is convertible to To
template<typename To, template<class...> class Op, typename... Args>
using is_detected_convertible =
    std::is_convertible<detected_t<Op, Args...>, To>;
}  // namespace detail
}  // namespace nlohmann

// #include <nlohmann/json_fwd.hpp>
#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_
#define INCLUDE_NLOHMANN_JSON_FWD_HPP_

#include <cstdint> // int64_t, uint64_t
#include <map> // map
#include <memory> // allocator
#include <string> // string
#include <vector> // vector

/*!
@brief namespace for Niels Lohmann
@see https://github.com/nlohmann
@since version 1.0.0
*/
namespace nlohmann
{
/*!
@brief default JSONSerializer template argument

This serializer ignores the template arguments and uses ADL
([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl))
for serialization.
*/
// forward declaration only; both template parameters default to void
template<typename T = void, typename SFINAE = void>
struct adl_serializer;

// Forward declaration of the basic_json class template. The default template
// arguments are fixed here: std::map for objects, std::vector for arrays,
// std::string for strings, bool, std::int64_t / std::uint64_t / double for
// numbers, std::allocator, adl_serializer as serializer, and
// std::vector<std::uint8_t> as binary type.
template<template<typename U, typename V, typename... Args> class ObjectType =
         std::map,
         template<typename U, typename... Args> class ArrayType = std::vector,
         class StringType = std::string, class BooleanType = bool,
         class NumberIntegerType = std::int64_t,
         class NumberUnsignedType = std::uint64_t,
         class NumberFloatType = double,
         template<typename U> class AllocatorType = std::allocator,
         template<typename T, typename SFINAE = void> class JSONSerializer =
         adl_serializer,
         class BinaryType = std::vector<std::uint8_t>>
class basic_json;

/*!
@brief JSON Pointer

A JSON pointer defines a string syntax for identifying a specific value
within a JSON document. It can be used with functions `at` and
`operator[]`. Furthermore, JSON pointers are the base for JSON patches.

@sa [RFC 6901](https://tools.ietf.org/html/rfc6901)

@since version 2.0.0
*/
// forward declaration only; parameterized on the basic_json specialization
template<typename BasicJsonType>
class json_pointer;

/*!
@brief default JSON class

This type is the default specialization of the @ref basic_json class which
uses the standard template types.

@since version 1.0.0
*/
// basic_json with every template argument left at its default
using json = basic_json<>;

// forward declaration only; used as the object container of ordered_json.
// NOTE(review): the name IgnoredLess suggests the comparator is unused -
// confirm against the definition of ordered_map
template<class Key, class T, class IgnoredLess, class Allocator>
struct ordered_map;

/*!
@brief ordered JSON class

This type preserves the insertion order of object keys.

@since version 3.9.0
*/
// basic_json with nlohmann::ordered_map as ObjectType; all remaining template
// arguments keep their defaults
using ordered_json = basic_json<nlohmann::ordered_map>;

}  // namespace nlohmann

#endif  // INCLUDE_NLOHMANN_JSON_FWD_HPP_


namespace nlohmann
{
/*!
@brief detail namespace with internal helper functions

This namespace collects functions that should not be exposed,
implementations of some @ref basic_json methods, and meta-programming helpers.

@since version 2.1.0
*/
namespace detail
{
/////////////
// helpers //
/////////////

// Note to maintainers:
//
// Every trait in this file expects a non CV-qualified type.
// The only exceptions are in the 'aliases for detected' section
// (i.e. those of the form: decltype(T::member_function(std::declval<T>())))
//
// In this case, T has to be properly CV-qualified to constrain the function arguments
// (e.g. to_json(BasicJsonType&, const T&))

// is_basic_json<T>: false_type for an arbitrary type ...
template<typename> struct is_basic_json : std::false_type {};

// ... and true_type for the specialization spelled with the
// NLOHMANN_BASIC_JSON_TPL* macros (defined elsewhere; presumably they expand
// to the template header and the argument list of basic_json - TODO confirm)
NLOHMANN_BASIC_JSON_TPL_DECLARATION
struct is_basic_json<NLOHMANN_BASIC_JSON_TPL> : std::true_type {};

//////////////////////
// json_ref helpers //
//////////////////////

// forward declaration of the json_ref class template
template<typename>
class json_ref;

// is_json_ref<T>: false_type in general ...
template<typename Candidate>
struct is_json_ref : std::false_type {};

// ... and true_type when the argument is an instantiation of json_ref
template<typename Wrapped>
struct is_json_ref<json_ref<Wrapped>> : std::true_type {};

//////////////////////////
// aliases for detected //
//////////////////////////

// Each alias names one nested member type of its argument and is ill-formed
// when that member type does not exist, which makes the aliases usable as
// the `Op` argument of the detection helpers.

template<typename Container>
using mapped_type_t = typename Container::mapped_type;

template<typename Container>
using key_type_t = typename Container::key_type;

template<typename Container>
using value_type_t = typename Container::value_type;

template<typename Container>
using difference_type_t = typename Container::difference_type;

template<typename Container>
using pointer_t = typename Container::pointer;

template<typename Container>
using reference_t = typename Container::reference;

template<typename Traits>
using iterator_category_t = typename Traits::iterator_category;

template<typename Container>
using iterator_t = typename Container::iterator;

// result type of the static call Serializer::to_json(args...)
template<typename Serializer, typename... Args>
using to_json_function = decltype(Serializer::to_json(std::declval<Args>()...));

// result type of the static call Serializer::from_json(args...)
template<typename Serializer, typename... Args>
using from_json_function = decltype(Serializer::from_json(std::declval<Args>()...));

// result type of calling the member template get<Target>() on a Source object
template<typename Source, typename Target>
using get_template_function = decltype(std::declval<Source>().template get<Target>());

// trait checking if JSONSerializer<T>::from_json(json const&, udt&) exists;
// this primary template is the negative answer
template<typename BasicJsonType, typename T, typename Enable = void>
struct has_from_json : std::false_type {};

// trait checking if j.get<T> is valid
// use this trait instead of std::is_constructible or std::is_convertible,
// both rely on, or make use of implicit conversions, and thus fail when T
// has several constructors/operator= (see https://github.com/nlohmann/json/issues/958)
template <typename BasicJsonType, typename T>
struct is_getable
{
    static constexpr bool value = is_detected<get_template_function, const BasicJsonType&, T>::value;
};

// evaluated only when T is not a basic_json type: true if the serializer's
// from_json(const BasicJsonType&, T&) is callable and returns exactly void
template<typename BasicJsonType, typename T>
struct has_from_json<BasicJsonType, T, enable_if_t<!is_basic_json<T>::value>>
{
    typedef typename BasicJsonType::template json_serializer<T, void> serializer;

    static constexpr bool value =
        std::is_same<void, detected_t<from_json_function, serializer, const BasicJsonType&, T&>>::value;
};

// This trait checks if JSONSerializer<T>::from_json(json const&) exists
// and returns exactly T; that overload serves user-defined types which are
// not default-constructible
template<typename BasicJsonType, typename T, typename Enable = void>
struct has_non_default_from_json : std::false_type {};

// evaluated only when T is not a basic_json type
template<typename BasicJsonType, typename T>
struct has_non_default_from_json<BasicJsonType, T, enable_if_t<!is_basic_json<T>::value>>
{
    typedef typename BasicJsonType::template json_serializer<T, void> serializer;

    static constexpr bool value =
        std::is_same<T, detected_t<from_json_function, serializer, const BasicJsonType&>>::value;
};

// This trait checks if BasicJsonType::json_serializer<T>::to_json exists
// (callable with BasicJsonType& and T, returning exactly void).
// The trait is not evaluated when T is a basic_json type, to avoid template
// instantiation infinite recursion.
template<typename BasicJsonType, typename T, typename Enable = void>
struct has_to_json : std::false_type {};

template<typename BasicJsonType, typename T>
struct has_to_json<BasicJsonType, T, enable_if_t<!is_basic_json<T>::value>>
{
    typedef typename BasicJsonType::template json_serializer<T, void> serializer;

    static constexpr bool value =
        std::is_same<void, detected_t<to_json_function, serializer, BasicJsonType&, T>>::value;
};


///////////////////
// is_ functions //
///////////////////

// anything that is not an iterator_traits instantiation: false
template<typename T, typename = void>
struct is_iterator_traits : std::false_type {};

// iterator_traits<T>: true only if all five member types are present
template<typename T>
struct is_iterator_traits<iterator_traits<T>>
{
  private:
    using traits = iterator_traits<T>;

    static constexpr bool has_value_type = is_detected<value_type_t, traits>::value;
    static constexpr bool has_difference_type = is_detected<difference_type_t, traits>::value;
    static constexpr bool has_pointer = is_detected<pointer_t, traits>::value;
    static constexpr bool has_iterator_category = is_detected<iterator_category_t, traits>::value;
    static constexpr bool has_reference = is_detected<reference_t, traits>::value;

  public:
    static constexpr auto value =
        has_value_type && has_difference_type && has_pointer &&
        has_iterator_category && has_reference;
};

// The following implementation of is_complete_type is taken from
// https://blogs.msdn.microsoft.com/vcblog/2015/12/02/partial-support-for-expression-sfinae-in-vs-2015-update-1/
// and is written by Xiang Fan who agreed to using it in this library.

// Primary template: used whenever sizeof(T) is ill-formed, i.e. T is incomplete.
template<typename T, typename = void>
struct is_complete_type : std::false_type {};

// decltype(void(sizeof(T))) is void exactly when sizeof(T) compiles, so this
// specialization matches the defaulted second argument only for complete types.
template<typename T>
struct is_complete_type<T, decltype(void(sizeof(T)))> : std::true_type {};

// Primary template: a type lacking mapped_type or key_type is not a
// compatible object type.
template<typename BasicJsonType, typename CompatibleObjectType,
         typename = void>
struct is_compatible_object_type_impl : std::false_type {};

// Selected when CompatibleObjectType exposes both mapped_type and key_type.
template<typename BasicJsonType, typename CompatibleObjectType>
struct is_compatible_object_type_impl <
    BasicJsonType, CompatibleObjectType,
    enable_if_t < is_detected<mapped_type_t, CompatibleObjectType>::value&&
    is_detected<key_type_t, CompatibleObjectType>::value >>
{
    using object_t = typename BasicJsonType::object_t;

    // macOS's is_constructible does not play well with nonesuch...
    // (hence the member types are named directly here, after the enable_if_t
    // above has established that they exist)
    // true iff object_t's key and mapped types can each be constructed from
    // the corresponding member type of CompatibleObjectType
    static constexpr bool value =
        std::is_constructible<typename object_t::key_type,
        typename CompatibleObjectType::key_type>::value &&
        std::is_constructible<typename object_t::mapped_type,
        typename CompatibleObjectType::mapped_type>::value;
};

// Public entry point; forwards to the _impl trait with its defaulted SFINAE slot.
template<typename BasicJsonType, typename CompatibleObjectType>
struct is_compatible_object_type
    : is_compatible_object_type_impl<BasicJsonType, CompatibleObjectType> {};

// Primary template: a type lacking mapped_type or key_type cannot be
// constructed from a JSON object.
template<typename BasicJsonType, typename ConstructibleObjectType,
         typename = void>
struct is_constructible_object_type_impl : std::false_type {};

// Selected when ConstructibleObjectType exposes both mapped_type and key_type.
template<typename BasicJsonType, typename ConstructibleObjectType>
struct is_constructible_object_type_impl <
    BasicJsonType, ConstructibleObjectType,
    enable_if_t < is_detected<mapped_type_t, ConstructibleObjectType>::value&&
    is_detected<key_type_t, ConstructibleObjectType>::value >>
{
    using object_t = typename BasicJsonType::object_t;

    // value is true when either
    //  (1) the target is default-constructible, move- or copy-assignable, its
    //      key_type is constructible from object_t::key_type and its
    //      mapped_type is identical to object_t::mapped_type, or
    //  (2) the target's mapped_type has a from_json (default or non-default
    //      form) for BasicJsonType.
    static constexpr bool value =
        (std::is_default_constructible<ConstructibleObjectType>::value &&
         (std::is_move_assignable<ConstructibleObjectType>::value ||
          std::is_copy_assignable<ConstructibleObjectType>::value) &&
         (std::is_constructible<typename ConstructibleObjectType::key_type,
          typename object_t::key_type>::value &&
          std::is_same <
          typename object_t::mapped_type,
          typename ConstructibleObjectType::mapped_type >::value)) ||
        (has_from_json<BasicJsonType,
         typename ConstructibleObjectType::mapped_type>::value ||
         has_non_default_from_json <
         BasicJsonType,
         typename ConstructibleObjectType::mapped_type >::value);
};

// Public entry point; forwards to the _impl trait with its defaulted SFINAE slot.
template<typename BasicJsonType, typename ConstructibleObjectType>
struct is_constructible_object_type
    : is_constructible_object_type_impl<BasicJsonType,
      ConstructibleObjectType> {};

// Primary template: false unless CompatibleStringType exposes a value_type
// identical to BasicJsonType::string_t::value_type (see specialization below).
template<typename BasicJsonType, typename CompatibleStringType,
         typename = void>
struct is_compatible_string_type_impl : std::false_type {};

// Selected when the character types match exactly; value then reports whether
// BasicJsonType::string_t can be constructed from CompatibleStringType.
template<typename BasicJsonType, typename CompatibleStringType>
struct is_compatible_string_type_impl <
    BasicJsonType, CompatibleStringType,
    enable_if_t<is_detected_exact<typename BasicJsonType::string_t::value_type,
    value_type_t, CompatibleStringType>::value >>
{
    static constexpr auto value =
        std::is_constructible<typename BasicJsonType::string_t, CompatibleStringType>::value;
};

// Public entry point; forwards to the _impl trait with its defaulted SFINAE slot.
// The second parameter is named CompatibleStringType to match the impl (it was
// previously spelled ConstructibleStringType, which belongs to the sibling
// is_constructible_string_type trait).
template<typename BasicJsonType, typename CompatibleStringType>
struct is_compatible_string_type
    : is_compatible_string_type_impl<BasicJsonType, CompatibleStringType> {};

// Primary template: false unless ConstructibleStringType exposes a value_type
// identical to BasicJsonType::string_t::value_type (see specialization below).
template<typename BasicJsonType, typename ConstructibleStringType,
         typename = void>
struct is_constructible_string_type_impl : std::false_type {};

// Selected when the character types match exactly; value then reports whether
// ConstructibleStringType can be constructed from BasicJsonType::string_t
// (the reverse direction of is_compatible_string_type).
template<typename BasicJsonType, typename ConstructibleStringType>
struct is_constructible_string_type_impl <
    BasicJsonType, ConstructibleStringType,
    enable_if_t<is_detected_exact<typename BasicJsonType::string_t::value_type,
    value_type_t, ConstructibleStringType>::value >>
{
    static constexpr auto value =
        std::is_constructible<ConstructibleStringType,
        typename BasicJsonType::string_t>::value;
};

// Public entry point; forwards to the _impl trait with its defaulted SFINAE slot.
template<typename BasicJsonType, typename ConstructibleStringType>
struct is_constructible_string_type
    : is_constructible_string_type_impl<BasicJsonType, ConstructibleStringType> {};

// Primary template: false unless the candidate looks like a container (see
// the conditions of the specialization below).
template<typename BasicJsonType, typename CompatibleArrayType, typename = void>
struct is_compatible_array_type_impl : std::false_type {};

// Selected when CompatibleArrayType exposes value_type and iterator member
// types and is not itself an iterator.
template<typename BasicJsonType, typename CompatibleArrayType>
struct is_compatible_array_type_impl <
    BasicJsonType, CompatibleArrayType,
    enable_if_t < is_detected<value_type_t, CompatibleArrayType>::value&&
    is_detected<iterator_t, CompatibleArrayType>::value&&
// This is needed because json_reverse_iterator has a ::iterator type...
// Therefore it is detected as a CompatibleArrayType.
// The real fix would be to have an Iterable concept.
    !is_iterator_traits <
    iterator_traits<CompatibleArrayType >>::value >>
{
    // true iff a BasicJsonType can be constructed from one element of the container
    static constexpr bool value =
        std::is_constructible<BasicJsonType,
        typename CompatibleArrayType::value_type>::value;
};

// Public entry point; forwards to the _impl trait with its defaulted SFINAE slot.
template<typename BasicJsonType, typename CompatibleArrayType>
struct is_compatible_array_type
    : is_compatible_array_type_impl<BasicJsonType, CompatibleArrayType> {};

// Primary template: false unless one of the two specializations below applies.
template<typename BasicJsonType, typename ConstructibleArrayType, typename = void>
struct is_constructible_array_type_impl : std::false_type {};

// Case 1: the target type is BasicJsonType::value_type itself -> always true.
template<typename BasicJsonType, typename ConstructibleArrayType>
struct is_constructible_array_type_impl <
    BasicJsonType, ConstructibleArrayType,
    enable_if_t<std::is_same<ConstructibleArrayType,
    typename BasicJsonType::value_type>::value >>
            : std::true_type {};

// Case 2: any other type that is default-constructible, move- or
// copy-assignable, exposes value_type and iterator member types, and whose
// value_type is a complete type. The remaining checks live in `value`.
template<typename BasicJsonType, typename ConstructibleArrayType>
struct is_constructible_array_type_impl <
    BasicJsonType, ConstructibleArrayType,
    enable_if_t < !std::is_same<ConstructibleArrayType,
    typename BasicJsonType::value_type>::value&&
    std::is_default_constructible<ConstructibleArrayType>::value&&
(std::is_move_assignable<ConstructibleArrayType>::value ||
 std::is_copy_assignable<ConstructibleArrayType>::value)&&
is_detected<value_type_t, ConstructibleArrayType>::value&&
is_detected<iterator_t, ConstructibleArrayType>::value&&
is_complete_type <
detected_t<value_type_t, ConstructibleArrayType >>::value >>
{
    static constexpr bool value =
        // This is needed because json_reverse_iterator has a ::iterator type,
        // furthermore, std::back_insert_iterator (and other iterators) have a
        // base class `iterator`... Therefore it is detected as a
        // ConstructibleArrayType. The real fix would be to have an Iterable
        // concept.
        !is_iterator_traits<iterator_traits<ConstructibleArrayType>>::value &&

        // the element type must either be the JSON array's own element type
        // or be convertible through one of the two from_json forms
        (std::is_same<typename ConstructibleArrayType::value_type,
         typename BasicJsonType::array_t::value_type>::value ||
         has_from_json<BasicJsonType,
         typename ConstructibleArrayType::value_type>::value ||
         has_non_default_from_json <
         BasicJsonType, typename ConstructibleArrayType::value_type >::value);
};

// Public entry point; forwards to the _impl trait with its defaulted SFINAE slot.
template<typename BasicJsonType, typename ConstructibleArrayType>
struct is_constructible_array_type
    : is_constructible_array_type_impl<BasicJsonType, ConstructibleArrayType> {};

// Primary template: false unless both types are integral and the candidate
// is not bool (see specialization below).
template<typename RealIntegerType, typename CompatibleNumberIntegerType,
         typename = void>
struct is_compatible_integer_type_impl : std::false_type {};

// Selected when both types are integral and the candidate is not bool.
template<typename RealIntegerType, typename CompatibleNumberIntegerType>
struct is_compatible_integer_type_impl <
    RealIntegerType, CompatibleNumberIntegerType,
    enable_if_t < std::is_integral<RealIntegerType>::value&&
    std::is_integral<CompatibleNumberIntegerType>::value&&
    !std::is_same<bool, CompatibleNumberIntegerType>::value >>
{
    // is there an assert somewhere on overflows?
    // NOTE(review): nothing in this trait compares the value ranges of the two
    // types; only constructibility and signedness are checked.
    using RealLimits = std::numeric_limits<RealIntegerType>;
    using CompatibleLimits = std::numeric_limits<CompatibleNumberIntegerType>;

    // true iff RealIntegerType is constructible from the candidate, the
    // candidate is an integer per numeric_limits, and both share signedness
    static constexpr auto value =
        std::is_constructible<RealIntegerType,
        CompatibleNumberIntegerType>::value &&
        CompatibleLimits::is_integer &&
        RealLimits::is_signed == CompatibleLimits::is_signed;
};

// Public entry point; forwards to the _impl trait with its defaulted SFINAE slot.
template<typename RealIntegerType, typename CompatibleNumberIntegerType>
struct is_compatible_integer_type
    : is_compatible_integer_type_impl<RealIntegerType,
      CompatibleNumberIntegerType> {};

// Primary template: incomplete types are never compatible.
template<typename BasicJsonType, typename CompatibleType, typename = void>
struct is_compatible_type_impl: std::false_type {};

// Selected for complete types only; a complete type is compatible exactly
// when has_to_json reports a to_json for it.
template<typename BasicJsonType, typename CompatibleType>
struct is_compatible_type_impl <
    BasicJsonType, CompatibleType,
    enable_if_t<is_complete_type<CompatibleType>::value >>
{
    static constexpr bool value =
        has_to_json<BasicJsonType, CompatibleType>::value;
};

// Public entry point; forwards to the _impl trait with its defaulted SFINAE slot.
template<typename BasicJsonType, typename CompatibleType>
struct is_compatible_type
    : is_compatible_type_impl<BasicJsonType, CompatibleType> {};

// Logical AND over a pack of traits, modelled on
// https://en.cppreference.com/w/cpp/types/conjunction
// An empty pack is true.
template<class...> struct conjunction : std::true_type {};
// A single trait is its own conjunction.
template<class Only> struct conjunction<Only> : Only {};
// Short-circuit: derive from the first trait whose value is false, otherwise
// from the conjunction of the remaining traits.
template<class Head, class... Tail>
struct conjunction<Head, Tail...>
    : std::conditional<static_cast<bool>(Head::value), conjunction<Tail...>, Head>::type {};

// Primary template: the second argument is not a std::tuple, so the answer is
// false.
template<typename Target, typename NotATuple>
struct is_constructible_tuple : std::false_type {};

// std::tuple case: true iff Target is constructible from every element type
// of the tuple taken individually.
template<typename Target, typename... Elements>
struct is_constructible_tuple<Target, std::tuple<Elements...>>
    : conjunction<std::is_constructible<Target, Elements>...> {};
}  // namespace detail
}  // namespace nlohmann

// #include <nlohmann/detail/value_t.hpp>


namespace nlohmann
{
namespace detail
{
template<typename BasicJsonType>
void from_json(const BasicJsonType& j, typename std::nullptr_t& n)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_null()))
    {
        JSON_THROW(type_error::create(302, "type must be null, but is " + std::string(j.type_name()), j));
    }
    n = nullptr;
}

// overloads for basic_json template parameters
template < typename BasicJsonType, typename ArithmeticType,
           enable_if_t < std::is_arithmetic<ArithmeticType>::value&&
                         !std::is_same<ArithmeticType, typename BasicJsonType::boolean_t>::value,
                         int > = 0 >
void get_arithmetic_value(const BasicJsonType& j, ArithmeticType& val)
{
    switch (static_cast<value_t>(j))
    {
        case value_t::number_unsigned:
        {
            val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_unsigned_t*>());
            break;
        }
        case value_t::number_integer:
        {
            val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_integer_t*>());
            break;
        }
        case value_t::number_float:
        {
            val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_float_t*>());
            break;
        }

        default:
            JSON_THROW(type_error::create(302, "type must be number, but is " + std::string(j.type_name()), j));
    }
}

template<typename BasicJsonType>
void from_json(const BasicJsonType& j, typename BasicJsonType::boolean_t& b)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_boolean()))
    {
        JSON_THROW(type_error::create(302, "type must be boolean, but is " + std::string(j.type_name()), j));
    }
    b = *j.template get_ptr<const typename BasicJsonType::boolean_t*>();
}

template<typename BasicJsonType>
void from_json(const BasicJsonType& j, typename BasicJsonType::string_t& s)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_string()))
    {
        JSON_THROW(type_error::create(302, "type must be string, but is " + std::string(j.type_name()), j));
    }
    s = *j.template get_ptr<const typename BasicJsonType::string_t*>();
}

template <
    typename BasicJsonType, typename ConstructibleStringType,
    enable_if_t <
        is_constructible_string_type<BasicJsonType, ConstructibleStringType>::value&&
        !std::is_same<typename BasicJsonType::string_t,
                      ConstructibleStringType>::value,
        int > = 0 >
void from_json(const BasicJsonType& j, ConstructibleStringType& s)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_string()))
    {
        JSON_THROW(type_error::create(302, "type must be string, but is " + std::string(j.type_name()), j));
    }

    s = *j.template get_ptr<const typename BasicJsonType::string_t*>();
}

// Reads a JSON number into the JSON type's floating-point type; the
// conversion and the type check are delegated to get_arithmetic_value.
template<typename BasicJsonType>
void from_json(const BasicJsonType& j, typename BasicJsonType::number_float_t& val)
{
    get_arithmetic_value(j, val);
}

// Reads a JSON number into the JSON type's unsigned integer type; the
// conversion and the type check are delegated to get_arithmetic_value.
template<typename BasicJsonType>
void from_json(const BasicJsonType& j, typename BasicJsonType::number_unsigned_t& val)
{
    get_arithmetic_value(j, val);
}

// Reads a JSON number into the JSON type's signed integer type; the
// conversion and the type check are delegated to get_arithmetic_value.
template<typename BasicJsonType>
void from_json(const BasicJsonType& j, typename BasicJsonType::number_integer_t& val)
{
    get_arithmetic_value(j, val);
}

// Reads a JSON number into an enumeration: the number is first read as the
// enum's underlying integer type (via get_arithmetic_value) and then cast to
// EnumType. No check is made that the number names a declared enumerator.
template<typename BasicJsonType, typename EnumType,
         enable_if_t<std::is_enum<EnumType>::value, int> = 0>
void from_json(const BasicJsonType& j, EnumType& e)
{
    using underlying_t = typename std::underlying_type<EnumType>::type;

    underlying_t raw;
    get_arithmetic_value(j, raw);
    e = static_cast<EnumType>(raw);
}

// forward_list doesn't have an insert method, so the list is rebuilt with
// push_front while walking the JSON array back to front; the resulting list
// therefore has the same order as the array.
// Raises type_error (id 302) when j does not hold an array.
template<typename BasicJsonType, typename T, typename Allocator,
         enable_if_t<is_getable<BasicJsonType, T>::value, int> = 0>
void from_json(const BasicJsonType& j, std::forward_list<T, Allocator>& l)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
    {
        JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j));
    }

    l.clear();

    const auto stop = j.rend();
    for (auto cursor = j.rbegin(); cursor != stop; ++cursor)
    {
        const BasicJsonType& element = *cursor;
        l.push_front(element.template get<T>());
    }
}

// valarray doesn't have an insert method, so it is resized to the array's
// length first and then filled slot by slot.
// Raises type_error (id 302) when j does not hold an array.
template<typename BasicJsonType, typename T,
         enable_if_t<is_getable<BasicJsonType, T>::value, int> = 0>
void from_json(const BasicJsonType& j, std::valarray<T>& l)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
    {
        JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j));
    }

    l.resize(j.size());

    auto slot = std::begin(l);
    const auto stop = j.end();
    for (auto cursor = j.begin(); cursor != stop; ++cursor)
    {
        const BasicJsonType& elem = *cursor;
        *slot = elem.template get<T>();
        ++slot;
    }
}

// Fills a built-in array of N elements from the first N entries of j, using
// j.at(i) for each position. Participates only when j.get<T>() is well-formed.
// No array-type check is done here; failures surface from at()/get().
template<typename BasicJsonType, typename T, std::size_t N>
auto from_json(const BasicJsonType& j, T (&arr)[N]) // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
-> decltype(j.template get<T>(), void())
{
    std::size_t position = 0;
    for (T& slot : arr)
    {
        slot = j.at(position).template get<T>();
        ++position;
    }
}

// Highest-priority array conversion (tag 3): the target is the JSON type's
// own array_t, so the stored array is copy-assigned directly.
template<typename BasicJsonType>
void from_json_array_impl(const BasicJsonType& j, typename BasicJsonType::array_t& arr, priority_tag<3> /*unused*/)
{
    const auto* stored = j.template get_ptr<const typename BasicJsonType::array_t*>();
    arr = *stored;
}

// Array conversion for std::array<T, N> (tag 2): each of the N slots is
// assigned from j.at(index). Participates only when j.get<T>() is well-formed.
template<typename BasicJsonType, typename T, std::size_t N>
auto from_json_array_impl(const BasicJsonType& j, std::array<T, N>& arr,
                          priority_tag<2> /*unused*/)
-> decltype(j.template get<T>(), void())
{
    for (std::size_t index = 0; index != N; ++index)
    {
        arr[index] = j.at(index).template get<T>();
    }
}

// Array conversion for containers that offer reserve() (tag 1). The elements
// are converted into a temporary container, which is then move-assigned to
// arr, so arr is only touched once every element has been converted.
template<typename BasicJsonType, typename ConstructibleArrayType>
auto from_json_array_impl(const BasicJsonType& j, ConstructibleArrayType& arr, priority_tag<1> /*unused*/)
-> decltype(
    arr.reserve(std::declval<typename ConstructibleArrayType::size_type>()),
    j.template get<typename ConstructibleArrayType::value_type>(),
    void())
{
    using std::end;
    using element_type = typename ConstructibleArrayType::value_type;

    ConstructibleArrayType staged;
    staged.reserve(j.size());

    // get<BasicJsonType>() returns *this, this won't call a from_json
    // method when value_type is BasicJsonType
    const auto convert = [](const BasicJsonType & element)
    {
        return element.template get<element_type>();
    };
    std::transform(j.begin(), j.end(), std::inserter(staged, end(staged)), convert);

    arr = std::move(staged);
}

// Fallback array conversion (tag 0) for containers without reserve(). The
// elements are converted into a temporary container, which is then
// move-assigned to arr.
template<typename BasicJsonType, typename ConstructibleArrayType>
void from_json_array_impl(const BasicJsonType& j, ConstructibleArrayType& arr,
                          priority_tag<0> /*unused*/)
{
    using std::end;
    using element_type = typename ConstructibleArrayType::value_type;

    ConstructibleArrayType staged;

    // get<BasicJsonType>() returns *this, this won't call a from_json
    // method when value_type is BasicJsonType
    const auto convert = [](const BasicJsonType & element)
    {
        return element.template get<element_type>();
    };
    std::transform(j.begin(), j.end(), std::inserter(staged, end(staged)), convert);

    arr = std::move(staged);
}

// Entry point for converting a JSON array into a container type.
// Participates only for types that satisfy is_constructible_array_type and
// are not also object-like, string-like, the JSON binary type, or a
// basic_json type. After verifying that j holds an array, the work is handed
// to from_json_array_impl starting at priority_tag<3>.
// Raises type_error (id 302) when j does not hold an array.
template < typename BasicJsonType, typename ConstructibleArrayType,
           enable_if_t <
               is_constructible_array_type<BasicJsonType, ConstructibleArrayType>::value&&
               !is_constructible_object_type<BasicJsonType, ConstructibleArrayType>::value&&
               !is_constructible_string_type<BasicJsonType, ConstructibleArrayType>::value&&
               !std::is_same<ConstructibleArrayType, typename BasicJsonType::binary_t>::value&&
               !is_basic_json<ConstructibleArrayType>::value,
               int > = 0 >
auto from_json(const BasicJsonType& j, ConstructibleArrayType& arr)
-> decltype(from_json_array_impl(j, arr, priority_tag<3> {}),
j.template get<typename ConstructibleArrayType::value_type>(),
void())
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
    {
        JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j));
    }

    from_json_array_impl(j, arr, priority_tag<3> {});
}

template<typename BasicJsonType>
void from_json(const BasicJsonType& j, typename BasicJsonType::binary_t& bin)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_binary()))
    {
        JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(j.type_name()), j));
    }

    bin = *j.template get_ptr<const typename BasicJsonType::binary_t*>();
}

// Reads a JSON object into a map-like type. Each member is converted to the
// target's value_type from (key, member.get<mapped_type>()); the entries are
// collected in a temporary container that is move-assigned to obj at the end.
// Raises type_error (id 302) when j does not hold an object.
template<typename BasicJsonType, typename ConstructibleObjectType,
         enable_if_t<is_constructible_object_type<BasicJsonType, ConstructibleObjectType>::value, int> = 0>
void from_json(const BasicJsonType& j, ConstructibleObjectType& obj)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_object()))
    {
        JSON_THROW(type_error::create(302, "type must be object, but is " + std::string(j.type_name()), j));
    }

    using value_type = typename ConstructibleObjectType::value_type;
    using mapped_type = typename ConstructibleObjectType::mapped_type;
    using source_entry = typename BasicJsonType::object_t::value_type;

    ConstructibleObjectType staged;
    const auto* inner_object = j.template get_ptr<const typename BasicJsonType::object_t*>();

    const auto convert_entry = [](source_entry const & member)
    {
        return value_type(member.first, member.second.template get<mapped_type>());
    };
    std::transform(inner_object->begin(), inner_object->end(),
                   std::inserter(staged, staged.begin()), convert_entry);

    obj = std::move(staged);
}

// overload for arithmetic types, not chosen for basic_json template arguments
// (BooleanType, etc..); note: Is it really necessary to provide explicit
// overloads for boolean_t etc. in case of a custom BooleanType which is not
// an arithmetic type?
template < typename BasicJsonType, typename ArithmeticType,
           enable_if_t <
               std::is_arithmetic<ArithmeticType>::value&&
               !std::is_same<ArithmeticType, typename BasicJsonType::number_unsigned_t>::value&&
               !std::is_same<ArithmeticType, typename BasicJsonType::number_integer_t>::value&&
               !std::is_same<ArithmeticType, typename BasicJsonType::number_float_t>::value&&
               !std::is_same<ArithmeticType, typename BasicJsonType::boolean_t>::value,
               int > = 0 >
void from_json(const BasicJsonType& j, ArithmeticType& val)
{
    switch (static_cast<value_t>(j))
    {
        case value_t::number_unsigned:
        {
            val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_unsigned_t*>());
            break;
        }
        case value_t::number_integer:
        {
            val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_integer_t*>());
            break;
        }
        case value_t::number_float:
        {
            val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_float_t*>());
            break;
        }
        case value_t::boolean:
        {
            val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::boolean_t*>());
            break;
        }

        default:
            JSON_THROW(type_error::create(302, "type must be number, but is " + std::string(j.type_name()), j));
    }
}

template<typename BasicJsonType, typename A1, typename A2>
void from_json(const BasicJsonType& j, std::pair<A1, A2>& p)
{
    p = {j.at(0).template get<A1>(), j.at(1).template get<A2>()};
}

// Assigns t from the JSON array j: for every index in Idx..., element
// j.at(Idx) is converted to the tuple's element type at that index, and the
// results are packed with std::make_tuple. The index_sequence argument only
// carries the index pack.
template<typename BasicJsonType, typename Tuple, std::size_t... Idx>
void from_json_tuple_impl(const BasicJsonType& j, Tuple& t, index_sequence<Idx...> /*unused*/)
{
    t = std::make_tuple(j.at(Idx).template get<typename std::tuple_element<Idx, Tuple>::type>()...);
}

template<typename BasicJsonType, typename... Args>
void from_json(const BasicJsonType& j, std::tuple<Args...>& t)
{
    from_json_tuple_impl(j, t, index_sequence_for<Args...> {});
}

template < typename BasicJsonType, typename Key, typename Value, typename Compare, typename Allocator,
           typename = enable_if_t < !std::is_constructible <
                                        typename BasicJsonType::string_t, Key >::value >>
void from_json(const BasicJsonType& j, std::map<Key, Value, Compare, Allocator>& m)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
    {
        JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j));
    }
    m.clear();
    for (const auto& p : j)
    {
        if (JSON_HEDLEY_UNLIKELY(!p.is_array()))
        {
            JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(p.type_name()), j));
        }
        m.emplace(p.at(0).template get<Key>(), p.at(1).template get<Value>());
    }
}

template < typename BasicJsonType, typename Key, typename Value, typename Hash, typename KeyEqual, typename Allocator,
           typename = enable_if_t < !std::is_constructible <
                                        typename BasicJsonType::string_t, Key >::value >>
void from_json(const BasicJsonType& j, std::unordered_map<Key, Value, Hash, KeyEqual, Allocator>& m)
{
    if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
    {
        JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j));
    }
    m.clear();
    for (const auto& p : j)
    {
        if (JSON_HEDLEY_UNLIKELY(!p.is_array()))
        {
            JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(p.type_name()), j));
        }
        m.emplace(p.at(0).template get<Key>(), p.at(1).template get<Value>());
    }
}

// Function object whose call operator forwards to an unqualified
// from_json(j, val) call. The operator only participates in overload
// resolution when that call is well-formed (trailing decltype), and it is
// noexcept exactly when the forwarded call is.
struct from_json_fn
{
    template<typename BasicJsonType, typename T>
    auto operator()(const BasicJsonType& j, T& val) const
    noexcept(noexcept(from_json(j, val)))
    -> decltype(from_json(j, val), void())
    {
        return from_json(j, val);
    }
};
}  // namespace detail

/// Anonymous namespace holding the default `from_json` function object.
/// `from_json` is a reference to the single detail::from_json_fn instance
/// provided by detail::static_const. For the rationale behind this pattern see
/// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4381.html
namespace // NOLINT(cert-dcl59-cpp,fuchsia-header-anon-namespaces,google-build-namespaces)
{
constexpr const auto& from_json = detail::static_const<detail::from_json_fn>::value; // NOLINT(misc-definitions-in-headers)
} // namespace
} // namespace nlohmann

// #include <nlohmann/detail/conversions/to_json.hpp>


#include <algorithm> // copy
#include <iterator> // begin, end
#include <string> // string
#include <tuple> // tuple, get
#include <type_traits> // is_same, is_constructible, is_floating_point, is_enum, underlying_type
#include <utility> // move, forward, declval, pair
#include <valarray> // valarray
#include <vector> // vector

// #include <nlohmann/detail/iterators/iteration_proxy.hpp>


#include <cstddef> // size_t
#include <iterator> // input_iterator_tag
#include <string> // string, to_string
#include <tuple> // tuple_size, get, tuple_element
#include <utility> // move

// #include <nlohmann/detail/meta/type_traits.hpp>

// #include <nlohmann/detail/value_t.hpp>


namespace nlohmann
{
namespace detail
{
// Writes the decimal text of value into target.
// to_string is called unqualified after `using std::to_string`, so an
// overload found by argument-dependent lookup is considered as well.
template<typename string_type>
void int_to_string( string_type& target, std::size_t value )
{
    using std::to_string; // For ADL

    auto digits = to_string(value);
    target = std::move(digits);
}
// Iterator-like wrapper that pairs an underlying JSON iterator with a key.
// Dereferencing yields the wrapper itself, so callers reach the entry through
// key() and value(). It also counts how often it has been incremented, which
// serves as the key while iterating an array.
template<typename IteratorType> class iteration_proxy_value
{
  public:
    using difference_type = std::ptrdiff_t;
    using value_type = iteration_proxy_value;
    using pointer = value_type * ;
    using reference = value_type & ;
    using iterator_category = std::input_iterator_tag;
    // the key type of the wrapped iterator, with reference and cv removed
    using string_type = typename std::remove_cv< typename std::remove_reference<decltype( std::declval<IteratorType>().key() ) >::type >::type;

  private:
    /// the wrapped iterator
    IteratorType anchor;
    /// number of increments so far; doubles as the array index
    std::size_t array_index = 0;
    /// the index that array_index_str currently spells out
    mutable std::size_t array_index_last = 0;
    /// cached text form of the array index (refreshed lazily in key())
    mutable string_type array_index_str = "0";
    /// empty key handed out by reference for primitive values
    const string_type empty_str{};

  public:
    explicit iteration_proxy_value(IteratorType it) noexcept
        : anchor(std::move(it))
    {}

    /// dereference operator (needed for range-based for); yields this proxy
    iteration_proxy_value& operator*()
    {
        return *this;
    }

    /// increment operator (needed for range-based for); advances the wrapped
    /// iterator and the index counter together
    iteration_proxy_value& operator++()
    {
        ++anchor;
        ++array_index;

        return *this;
    }

    /// equality operator (needed for InputIterator); compares wrapped iterators only
    bool operator==(const iteration_proxy_value& o) const
    {
        return anchor == o.anchor;
    }

    /// inequality operator (needed for range-based for); compares wrapped iterators only
    bool operator!=(const iteration_proxy_value& o) const
    {
        return anchor != o.anchor;
    }

    /// return key of the iterator: the member name for objects, the index as
    /// text for arrays, and an empty string for every other type
    const string_type& key() const
    {
        JSON_ASSERT(anchor.m_object != nullptr);

        const auto held = anchor.m_object->type();

        // use key from the object
        if (held == value_t::object)
        {
            return anchor.key();
        }

        // use an empty key for all primitive types
        if (held != value_t::array)
        {
            return empty_str;
        }

        // use integer array index as key; the text is only regenerated when
        // the index moved since the last call
        if (array_index_last != array_index)
        {
            int_to_string( array_index_str, array_index );
            array_index_last = array_index;
        }
        return array_index_str;
    }

    /// return value of the iterator
    typename IteratorType::reference value() const
    {
        return anchor.value();
    }
};

/// proxy class for the items() function
/// Holds a reference to a container and hands out iteration_proxy_value
/// objects wrapping the container's begin() and end() iterators, so the
/// proxy can be used directly in a range-based for loop.
template<typename IteratorType> class iteration_proxy
{
  private:
    /// the container to iterate (stored as a reference, not a copy)
    typename IteratorType::reference container;

  public:
    /// construct iteration proxy from a container
    explicit iteration_proxy(typename IteratorType::reference cont) noexcept
        : container(cont) {}

    /// return iterator begin (needed for range-based for)
    iteration_proxy_value<IteratorType> begin() noexcept
    {
        return iteration_proxy_value<IteratorType>(container.begin());
    }

    /// return iterator end (needed for range-based for)
    iteration_proxy_value<IteratorType> end() noexcept
    {
        return iteration_proxy_value<IteratorType>(container.end());
    }
};
// Structured Bindings Support
// For further reference see https://blog.tartanllama.xyz/structured-bindings/
// And see https://github.com/nlohmann/json/pull/1391
// get<0>: the first binding of an iteration_proxy_value is its key().
template<std::size_t N, typename IteratorType, enable_if_t<N == 0, int> = 0>
auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.key())
{
    return i.key();
}
// Structured Bindings Support
// For further reference see https://blog.tartanllama.xyz/structured-bindings/
// And see https://github.com/nlohmann/json/pull/1391
// get<1>: the second binding of an iteration_proxy_value is its value().
template<std::size_t N, typename IteratorType, enable_if_t<N == 1, int> = 0>
auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.value())
{
    return i.value();
}
}  // namespace detail
}  // namespace nlohmann

// These additions to namespace std are required to give the
// iteration_proxy_value class structured bindings support.
// For further reference see https://blog.tartanllama.xyz/structured-bindings/
// And see https://github.com/nlohmann/json/pull/1391
namespace std
{
#if defined(__clang__)
    // Fix: https://github.com/nlohmann/json/issues/1401
    // (the specializations below use the `class` key; the warning about
    // mismatched class/struct tags is silenced for their duration)
    #pragma clang diagnostic push
    #pragma clang diagnostic ignored "-Wmismatched-tags"
#endif
// An iteration_proxy_value decomposes into exactly two parts.
template<typename IteratorType>
class tuple_size<::nlohmann::detail::iteration_proxy_value<IteratorType>>
            : public std::integral_constant<std::size_t, 2> {};

// The type of part N is whatever get<N>() returns for the proxy value.
template<std::size_t N, typename IteratorType>
class tuple_element<N, ::nlohmann::detail::iteration_proxy_value<IteratorType >>
{
  public:
    using type = decltype(
                     get<N>(std::declval <
                            ::nlohmann::detail::iteration_proxy_value<IteratorType >> ()));
};
#if defined(__clang__)
    #pragma clang diagnostic pop
#endif
} // namespace std

// #include <nlohmann/detail/meta/cpp_future.hpp>

// #include <nlohmann/detail/meta/type_traits.hpp>

// #include <nlohmann/detail/value_t.hpp>


namespace nlohmann
{
namespace detail
{
//////////////////
// constructors //
//////////////////

// Family of helpers, one specialization per value_t, whose static construct()
// functions write a value of the matching kind into a BasicJsonType.
template<value_t> struct external_constructor;

template<>
struct external_constructor<value_t::boolean>
{
    // Makes j a boolean holding b: sets the type tag, stores the value, then
    // runs j's invariant check.
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, typename BasicJsonType::boolean_t b) noexcept
    {
        j.m_type = value_t::boolean;
        j.m_value = b;
        j.assert_invariant();
    }
};

// Writes a string into a BasicJsonType. Each overload sets the type tag,
// stores the value, then runs j's invariant check.
template<>
struct external_constructor<value_t::string>
{
    // copy overload for the JSON type's own string_t
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, const typename BasicJsonType::string_t& s)
    {
        j.m_type = value_t::string;
        j.m_value = s;
        j.assert_invariant();
    }

    // move overload for the JSON type's own string_t
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, typename BasicJsonType::string_t&& s)
    {
        j.m_type = value_t::string;
        j.m_value = std::move(s);
        j.assert_invariant();
    }

    // overload for any other string-like type: a string_t is created from str
    // through j.create<string_t>(str) and stored in m_value.string
    template < typename BasicJsonType, typename CompatibleStringType,
               enable_if_t < !std::is_same<CompatibleStringType, typename BasicJsonType::string_t>::value,
                             int > = 0 >
    static void construct(BasicJsonType& j, const CompatibleStringType& str)
    {
        j.m_type = value_t::string;
        j.m_value.string = j.template create<typename BasicJsonType::string_t>(str);
        j.assert_invariant();
    }
};

// Writes a binary value into a BasicJsonType. Each overload sets the type
// tag, stores a binary_t built from the argument, then runs j's invariant
// check.
template<>
struct external_constructor<value_t::binary>
{
    // copy overload: stores a copy of b
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, const typename BasicJsonType::binary_t& b)
    {
        j.m_type = value_t::binary;
        j.m_value = typename BasicJsonType::binary_t(b);
        j.assert_invariant();
    }

    // move overload: stores a binary_t move-constructed from b
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, typename BasicJsonType::binary_t&& b)
    {
        j.m_type = value_t::binary;
        j.m_value = typename BasicJsonType::binary_t(std::move(b));
        j.assert_invariant();
    }
};

// Turns a JSON value into a floating-point number.
template<>
struct external_constructor<value_t::number_float>
{
    // Tags j as value_t::number_float, assigns val to its value storage and
    // then runs j's invariant check.
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, typename BasicJsonType::number_float_t val) noexcept
    {
        j.m_type = value_t::number_float;
        j.m_value = val;
        j.assert_invariant();
    }
};

// Turns a JSON value into an unsigned integer number.
template<>
struct external_constructor<value_t::number_unsigned>
{
    // Tags j as value_t::number_unsigned, assigns val to its value storage
    // and then runs j's invariant check.
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, typename BasicJsonType::number_unsigned_t val) noexcept
    {
        j.m_type = value_t::number_unsigned;
        j.m_value = val;
        j.assert_invariant();
    }
};

// Turns a JSON value into a signed integer number.
template<>
struct external_constructor<value_t::number_integer>
{
    // Tags j as value_t::number_integer, assigns val to its value storage
    // and then runs j's invariant check.
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, typename BasicJsonType::number_integer_t val) noexcept
    {
        j.m_type = value_t::number_integer;
        j.m_value = val;
        j.assert_invariant();
    }
};

// Turns a JSON value into an array. Every overload tags j as value_t::array,
// fills the array storage, registers parents for the elements (set_parents /
// set_parent) and finally runs j's invariant check.
template<>
struct external_constructor<value_t::array>
{
    // Overload for an lvalue array_t: arr is assigned to j's value storage
    // as-is (no std::move).
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, const typename BasicJsonType::array_t& arr)
    {
        j.m_type = value_t::array;
        j.m_value = arr;
        j.set_parents();
        j.assert_invariant();
    }

    // Overload for an rvalue array_t: arr is forwarded with std::move.
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, typename BasicJsonType::array_t&& arr)
    {
        j.m_type = value_t::array;
        j.m_value = std::move(arr);
        j.set_parents();
        j.assert_invariant();
    }

    // Overload for any other range-like type. The enable_if_t removes it when
    // CompatibleArrayType is exactly array_t.
    template < typename BasicJsonType, typename CompatibleArrayType,
               enable_if_t < !std::is_same<CompatibleArrayType, typename BasicJsonType::array_t>::value,
                             int > = 0 >
    static void construct(BasicJsonType& j, const CompatibleArrayType& arr)
    {
        // bring std::begin/std::end into scope so the unqualified calls below
        // can also pick up begin/end found by argument-dependent lookup
        using std::begin;
        using std::end;
        j.m_type = value_t::array;
        // an array_t is built from the iterator range through j's create helper
        j.m_value.array = j.template create<typename BasicJsonType::array_t>(begin(arr), end(arr));
        j.set_parents();
        j.assert_invariant();
    }

    // Overload for std::vector<bool>: the elements are appended one by one as
    // plain bool values.
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, const std::vector<bool>& arr)
    {
        j.m_type = value_t::array;
        // assigning the enumerator (not a container) to the value storage;
        // the array pointer is dereferenced right below, so this presumably
        // creates an empty array -- TODO confirm against json_value
        j.m_value = value_t::array;
        j.m_value.array->reserve(arr.size());
        for (const bool x : arr)
        {
            j.m_value.array->push_back(x);
            // parent is registered per element, right after it was appended
            j.set_parent(j.m_value.array->back());
        }
        j.assert_invariant();
    }

    // Overload for std::valarray<T>, available when T is convertible to
    // BasicJsonType.
    template<typename BasicJsonType, typename T,
             enable_if_t<std::is_convertible<T, BasicJsonType>::value, int> = 0>
    static void construct(BasicJsonType& j, const std::valarray<T>& arr)
    {
        j.m_type = value_t::array;
        j.m_value = value_t::array;
        // size the array first, then overwrite the elements via std::copy
        j.m_value.array->resize(arr.size());
        // std::begin/std::end are only applied to a non-empty valarray
        if (arr.size() > 0)
        {
            std::copy(std::begin(arr), std::end(arr), j.m_value.array->begin());
        }
        j.set_parents();
        j.assert_invariant();
    }
};

// Turns a JSON value into an object. Every overload tags j as
// value_t::object, fills the object storage, calls set_parents() and finally
// runs j's invariant check.
template<>
struct external_constructor<value_t::object>
{
    // Overload for an lvalue object_t: obj is assigned to j's value storage
    // as-is (no std::move).
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, const typename BasicJsonType::object_t& obj)
    {
        j.m_type = value_t::object;
        j.m_value = obj;
        j.set_parents();
        j.assert_invariant();
    }

    // Overload for an rvalue object_t: obj is forwarded with std::move.
    template<typename BasicJsonType>
    static void construct(BasicJsonType& j, typename BasicJsonType::object_t&& obj)
    {
        j.m_type = value_t::object;
        j.m_value = std::move(obj);
        j.set_parents();
        j.assert_invariant();
    }

    // Overload for any other map-like type. The enable_if_t removes it when
    // CompatibleObjectType is exactly object_t.
    template < typename BasicJsonType, typename CompatibleObjectType,
               enable_if_t < !std::is_same<CompatibleObjectType, typename BasicJsonType::object_t>::value, int > = 0 >
    static void construct(BasicJsonType& j, const CompatibleObjectType& obj)
    {
        // bring std::begin/std::end into scope so the unqualified calls below
        // can also pick up begin/end found by argument-dependent lookup
        using std::begin;
        using std::end;

        j.m_type = value_t::object;
        // an object_t is built from the iterator range through j's create helper
        j.m_value.object = j.template create<typename BasicJsonType::object_t>(begin(obj), end(obj));
        j.set_parents();
        j.assert_invariant();
    }
};

/////////////
// to_json //
/////////////

// Boolean overload: participates only when T is exactly
// BasicJsonType::boolean_t, and forwards to the boolean external_constructor.
template<typename BasicJsonType, typename T,
         enable_if_t<std::is_same<T, typename BasicJsonType::boolean_t>::value, int> = 0>
void to_json(BasicJsonType& j, T b) noexcept
{
    external_constructor<value_t::boolean>::construct(j, b);
}

// String overload: participates for every type from which
// BasicJsonType::string_t can be constructed, and forwards to the string
// external_constructor.
template<typename BasicJsonType, typename CompatibleString,
         enable_if_t<std::is_constructible<typename BasicJsonType::string_t, CompatibleString>::value, int> = 0>
void to_json(BasicJsonType& j, const CompatibleString& s)
{
    external_constructor<value_t::string>::construct(j, s);
}

// String overload for an rvalue string_t: s is passed on with std::move so
// the rvalue overload of the string external_constructor is used.
template<typename BasicJsonType>
void to_json(BasicJsonType& j, typename BasicJsonType::string_t&& s)
{
    external_constructor<value_t::string>::construct(j, std::move(s));
}

// Floating-point overload: participates for every std::is_floating_point
// type; val is cast to BasicJsonType::number_float_t before it is stored.
template<typename BasicJsonType, typename FloatType,
         enable_if_t<std::is_floating_point<FloatType>::value, int> = 0>
void to_json(BasicJsonType& j, FloatType val) noexcept
{
    external_constructor<value_t::number_float>::construct(j, static_cast<typename BasicJsonType::number_float_t>(val));
}

// Unsigned-integer overload: participates when is_compatible_integer_type
// accepts the type against number_unsigned_t; val is cast to
// number_unsigned_t before it is stored.
template<typename BasicJsonType, typename CompatibleNumberUnsignedType,
         enable_if_t<is_compatible_integer_type<typename BasicJsonType::number_unsigned_t, CompatibleNumberUnsignedType>::value, int> = 0>
void to_json(BasicJsonType& j, CompatibleNumberUnsignedType val) noexcept
{
    external_constructor<value_t::number_unsigned>::construct(j, static_cast<typename BasicJsonType::number_unsigned_t>(val));
}

// Signed-integer overload: participates when is_compatible_integer_type
// accepts the type against number_integer_t; val is cast to number_integer_t
// before it is stored.
template<typename BasicJsonType, typename CompatibleNumberIntegerType,
         enable_if_t<is_compatible_integer_type<typename BasicJsonType::number_integer_t, CompatibleNumberIntegerType>::value, int> = 0>
void to_json(BasicJsonType& j, CompatibleNumberIntegerType val) noexcept
{
    external_constructor<value_t::number_integer>::construct(j, static_cast<typename BasicJsonType::number_integer_t>(val));
}

// Enum overload: an enumerator is stored as a number_integer holding its
// value in the enum's underlying integer type.
template<typename BasicJsonType, typename EnumType,
         enable_if_t<std::is_enum<EnumType>::value, int> = 0>
void to_json(BasicJsonType& j, EnumType e) noexcept
{
    using underlying_type = typename std::underlying_type<EnumType>::type;
    external_constructor<value_t::number_integer>::construct(j, static_cast<underlying_type>(e));
}

// std::vector<bool> overload: forwards to the array external_constructor,
// which has a dedicated overload for this container.
template<typename BasicJsonType>
void to_json(BasicJsonType& j, const std::vector<bool>& e)
{
    external_constructor<value_t::array>::construct(j, e);
}

// Generic array overload. It participates only for types that
// is_compatible_array_type accepts and that are not also a compatible object
// type, a compatible string type, BasicJsonType::binary_t, or a basic_json
// themselves -- those cases are excluded explicitly by the condition below.
template < typename BasicJsonType, typename CompatibleArrayType,
           enable_if_t < is_compatible_array_type<BasicJsonType,
                         CompatibleArrayType>::value&&
                         !is_compatible_object_type<BasicJsonType, CompatibleArrayType>::value&&
                         !is_compatible_string_type<BasicJsonType, CompatibleArrayType>::value&&
                         !std::is_same<typename BasicJsonType::binary_t, CompatibleArrayType>::value&&
                         !is_basic_json<CompatibleArrayType>::value,
                         int > = 0 >
void to_json(BasicJsonType& j, const CompatibleArrayType& arr)
{
    external_constructor<value_t::array>::construct(j, arr);
}

// Binary overload: forwards a BasicJsonType::binary_t to the binary
// external_constructor (copying overload).
template<typename BasicJsonType>
void to_json(BasicJsonType& j, const typename BasicJsonType::binary_t& bin)
{
    external_constructor<value_t::binary>::construct(j, bin);
}

// std::valarray overload: participates when T is convertible to
// BasicJsonType and forwards to the array external_constructor.
//
// arr is a const lvalue reference, so it is passed on as-is: wrapping it in
// std::move could never trigger a move (the result would be a const rvalue
// that still binds to the const-reference overload) and only obscured that
// the elements are copied.
template<typename BasicJsonType, typename T,
         enable_if_t<std::is_convertible<T, BasicJsonType>::value, int> = 0>
void to_json(BasicJsonType& j, const std::valarray<T>& arr)
{
    external_constructor<value_t::array>::construct(j, arr);
}

// Array overload for an rvalue array_t: arr is passed on with std::move so
// the rvalue overload of the array external_constructor is used.
template<typename BasicJsonType>
void to_json(BasicJsonType& j, typename BasicJsonType::array_t&& arr)
{
    external_constructor<value_t::array>::construct(j, std::move(arr));
}

// Generic object overload: participates for types accepted by
// is_compatible_object_type that are not a basic_json themselves.
template < typename BasicJsonType, typename CompatibleObjectType,
           enable_if_t < is_compatible_object_type<BasicJsonType, CompatibleObjectType>::value&& !is_basic_json<CompatibleObjectType>::value, int > = 0 >
void to_json(BasicJsonType& j, const CompatibleObjectType& obj)
{
    external_constructor<value_t::object>::construct(j, obj);
}

// Object overload for an rvalue object_t: obj is passed on with std::move so
// the rvalue overload of the object external_constructor is used.
template<typename BasicJsonType>
void to_json(BasicJsonType& j, typename BasicJsonType::object_t&& obj)
{
    external_constructor<value_t::object>::construct(j, std::move(obj));
}

// Built-in array overload (const T(&)[N]): stored as a JSON array. The
// condition removes this overload whenever string_t can be constructed from
// the array type, leaving such arguments to the string overload.
template <
    typename BasicJsonType, typename T, std::size_t N,
    enable_if_t < !std::is_constructible<typename BasicJsonType::string_t,
                  const T(&)[N]>::value, // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
                  int > = 0 >
void to_json(BasicJsonType& j, const T(&arr)[N]) // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
{
    external_constructor<value_t::array>::construct(j, arr);
}

// std::pair overload: participates when BasicJsonType is constructible from
// both member types; j is assigned the braced list { first, second }.
template < typename BasicJsonType, typename T1, typename T2, enable_if_t < std::is_constructible<BasicJsonType, T1>::value&& std::is_constructible<BasicJsonType, T2>::value, int > = 0 >
void to_json(BasicJsonType& j, const std::pair<T1, T2>& p)
{
    j = { p.first, p.second };
}

// for https://github.com/nlohmann/json/pull/1134
// Overload for iteration_proxy_value over BasicJsonType::iterator: j is
// assigned a braced list holding the single pair { key, value } taken from b.
template<typename BasicJsonType, typename T,
         enable_if_t<std::is_same<T, iteration_proxy_value<typename BasicJsonType::iterator>>::value, int> = 0>
void to_json(BasicJsonType& j, const T& b)
{
    j = { {b.key(), b.value()} };
}

// Helper for the tuple overload: expands the index pack so that j is assigned
// the braced list { std::get<0>(t), std::get<1>(t), ... }. The
// index_sequence argument only carries the indices and is otherwise unused.
template<typename BasicJsonType, typename Tuple, std::size_t... Idx>
void to_json_tuple_impl(BasicJsonType& j, const Tuple& t, index_sequence<Idx...> /*unused*/)
{
    j = { std::get<Idx>(t)... };
}

// Tuple overload: participates for types accepted by is_constructible_tuple
// and delegates to to_json_tuple_impl with one index per tuple element.
template<typename BasicJsonType, typename T, enable_if_t<is_constructible_tuple<BasicJsonType, T>::value, int > = 0>
void to_json(BasicJsonType& j, const T& t)
{
    to_json_tuple_impl(j, t, make_index_sequence<std::tuple_size<T>::value> {});
}

// Function object wrapping the unqualified call to_json(j, val).
struct to_json_fn
{
    // Forwards val to whichever to_json overload the unqualified call
    // selects. The trailing decltype removes this operator from overload
    // resolution when no such call is well-formed, and the noexcept
    // specification mirrors that of the selected to_json.
    template<typename BasicJsonType, typename T>
    auto operator()(BasicJsonType& j, T&& val) const noexcept(noexcept(to_json(j, std::forward<T>(val))))
    -> decltype(to_json(j, std::forward<T>(val)), void())
    {
        return to_json(j, std::forward<T>(val));
    }
};
}  // namespace detail

/// namespace to hold default `to_json` function
/// to see why this is required:
/// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4381.html
namespace // NOLINT(cert-dcl59-cpp,fuchsia-header-anon-namespaces,google-build-namespaces)
{
// nlohmann::to_json is a reference to the to_json_fn instance held by
// detail::static_const, i.e. a callable object rather than a function.
constexpr const auto& to_json = detail::static_const<detail::to_json_fn>::value; // NOLINT(misc-definitions-in-headers)
} // namespace
} // namespace nlohmann


namespace nlohmann
{

// Default serializer: both members simply forward to ::nlohmann::from_json
// and ::nlohmann::to_json. Neither template parameter of the struct is named
// or used inside this primary template.
template<typename, typename>
struct adl_serializer
{
    /*!
    @brief convert a JSON value to any value type

    This function is usually called by the `get()` function of the
    @ref basic_json class (either explicit or via conversion operators).

    The trailing return type removes this function from overload resolution
    when `::nlohmann::from_json(j, val)` is not a valid expression; the
    noexcept specification is taken from that same expression.

    @param[in] j        JSON value to read from
    @param[in,out] val  value to write to
    */
    template<typename BasicJsonType, typename ValueType>
    static auto from_json(BasicJsonType&& j, ValueType& val) noexcept(
        noexcept(::nlohmann::from_json(std::forward<BasicJsonType>(j), val)))
    -> decltype(::nlohmann::from_json(std::forward<BasicJsonType>(j), val), void())
    {
        ::nlohmann::from_json(std::forward<BasicJsonType>(j), val);
    }

    /*!
    @brief convert any value type to a JSON value

    This function is usually called by the constructors of the @ref basic_json
    class.

    The trailing return type removes this function from overload resolution
    when `::nlohmann::to_json(j, val)` is not a valid expression; the
    noexcept specification is taken from that same expression.

    @param[in,out] j  JSON value to write to
    @param[in] val    value to read from
    */
    template<typename BasicJsonType, typename ValueType>
    static auto to_json(BasicJsonType& j, ValueType&& val) noexcept(
        noexcept(::nlohmann::to_json(j, std::forward<ValueType>(val))))
    -> decltype(::nlohmann::to_json(j, std::forward<ValueType>(val)), void())
    {
        ::nlohmann::to_json(j, std::forward<ValueType>(val));
    }
};

}  // namespace nlohmann

// #include <nlohmann/byte_container_with_subtype.hpp>


#include <cstdint> // uint8_t
#include <tuple> // tie
#include <utility> // move

namespace nlohmann
{

/*!
@brief byte container that additionally carries an optional subtype

This class derives publicly from the template parameter @a BinaryType and adds
two pieces of state: a one-byte subtype and a flag telling whether a subtype
has been set. According to the original documentation the subtype is used by
BSON and MessagePack, and the wrapper exists so that users do not have to
provide a binary type with a specific naming scheme themselves.

@tparam BinaryType container to store bytes (`std::vector<std::uint8_t>` by
                   default)

@since version 3.8.0
*/
template<typename BinaryType>
class byte_container_with_subtype : public BinaryType
{
  public:
    /// the type of the underlying container
    using container_type = BinaryType;

    /// creates an empty container without a subtype
    byte_container_with_subtype() noexcept(noexcept(container_type()))
        : container_type()
    {}

    /// copies the bytes of @a b; the result has no subtype
    byte_container_with_subtype(const container_type& b) noexcept(noexcept(container_type(b)))
        : container_type(b)
    {}

    /// takes over the bytes of @a b; the result has no subtype
    byte_container_with_subtype(container_type&& b) noexcept(noexcept(container_type(std::move(b))))
        : container_type(std::move(b))
    {}

    /// copies the bytes of @a b and marks the result as having subtype @a subtype_
    byte_container_with_subtype(const container_type& b, std::uint8_t subtype_) noexcept(noexcept(container_type(b)))
        : container_type(b)
        , m_subtype(subtype_)
        , m_has_subtype(true)
    {}

    /// takes over the bytes of @a b and marks the result as having subtype @a subtype_
    byte_container_with_subtype(container_type&& b, std::uint8_t subtype_) noexcept(noexcept(container_type(std::move(b))))
        : container_type(std::move(b))
        , m_subtype(subtype_)
        , m_has_subtype(true)
    {}

    /// two values are equal when bytes, subtype and subtype flag all match
    bool operator==(const byte_container_with_subtype& rhs) const
    {
        const auto own_state = std::tie(static_cast<const BinaryType&>(*this), m_subtype, m_has_subtype);
        const auto other_state = std::tie(static_cast<const BinaryType&>(rhs), rhs.m_subtype, rhs.m_has_subtype);
        return own_state == other_state;
    }

    /// negation of operator==
    bool operator!=(const byte_container_with_subtype& rhs) const
    {
        const bool equal = (rhs == *this);
        return !equal;
    }

    /*!
    @brief sets the binary subtype

    Stores @a subtype_ and flags the value as having a subtype (which,
    per the original documentation, has implications for serialization).

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @sa see @ref subtype() -- return the binary subtype
    @sa see @ref clear_subtype() -- clears the binary subtype
    @sa see @ref has_subtype() -- returns whether or not the binary value has a
    subtype

    @since version 3.8.0
    */
    void set_subtype(std::uint8_t subtype_) noexcept
    {
        m_has_subtype = true;
        m_subtype = subtype_;
    }

    /*!
    @brief return the binary subtype

    Returns the stored subtype byte. A value that was default-constructed,
    constructed without a subtype, or cleared with clear_subtype() stores 0,
    so a result of 0 does not by itself tell whether a subtype is set; use
    has_subtype() to find out.

    @return the stored subtype byte

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @sa see @ref set_subtype() -- sets the binary subtype
    @sa see @ref clear_subtype() -- clears the binary subtype
    @sa see @ref has_subtype() -- returns whether or not the binary value has a
    subtype

    @since version 3.8.0
    */
    constexpr std::uint8_t subtype() const noexcept
    {
        return m_subtype;
    }

    /*!
    @brief return whether the value has a subtype

    @return true after a subtype was supplied (constructor or set_subtype())
            and not cleared since; false otherwise

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @sa see @ref subtype() -- return the binary subtype
    @sa see @ref set_subtype() -- sets the binary subtype
    @sa see @ref clear_subtype() -- clears the binary subtype

    @since version 3.8.0
    */
    constexpr bool has_subtype() const noexcept
    {
        return m_has_subtype;
    }

    /*!
    @brief clears the binary subtype

    Resets the stored subtype to 0 and flags the value as not having a
    subtype. Per the original documentation this has implications for
    serialization; for instance MessagePack will prefer the bin family over
    the ext family.

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @sa see @ref subtype() -- return the binary subtype
    @sa see @ref set_subtype() -- sets the binary subtype
    @sa see @ref has_subtype() -- returns whether or not the binary value has a
    subtype

    @since version 3.8.0
    */
    void clear_subtype() noexcept
    {
        m_has_subtype = false;
        m_subtype = 0;
    }

  private:
    /// the subtype byte; only meaningful while m_has_subtype is true
    std::uint8_t m_subtype = 0;
    /// whether a subtype has been set
    bool m_has_subtype = false;
};

}  // namespace nlohmann

// #include <nlohmann/detail/conversions/from_json.hpp>

// #include <nlohmann/detail/conversions/to_json.hpp>

// #include <nlohmann/detail/exceptions.hpp>

// #include <nlohmann/detail/hash.hpp>


#include <cstdint> // uint8_t
#include <cstddef> // size_t
#include <functional> // hash

// #include <nlohmann/detail/macro_scope.hpp>


namespace nlohmann
{
namespace detail
{

// Folds the hash value h into seed using the boost::hash_combine formula and
// returns the new seed.
inline std::size_t combine(std::size_t seed, std::size_t h) noexcept
{
    const std::size_t mixed = h + 0x9e3779b9 + (seed << 6U) + (seed >> 2U);
    return seed ^ mixed;
}

/*!
@brief hash a JSON value

The hash function tries to rely on std::hash where possible. Furthermore, the
type of the JSON value is taken into account to have different hash values for
null, 0, 0U, and false, etc.

@tparam BasicJsonType basic_json specialization
@param j JSON value to hash
@return hash value of j
*/
template<typename BasicJsonType>
std::size_t hash(const BasicJsonType& j)
{
    using string_t = typename BasicJsonType::string_t;
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;

    // the numeric value of the type tag is the initial seed in every branch,
    // so equal payloads of different types hash differently
    const auto type = static_cast<std::size_t>(j.type());
    switch (j.type())
    {
        // values without payload: only the type tag contributes
        case BasicJsonType::value_t::null:
        case BasicJsonType::value_t::discarded:
        {
            return combine(type, 0);
        }

        // objects: mix in the size, then every key hash followed by the
        // (recursive) hash of its value, in iteration order
        case BasicJsonType::value_t::object:
        {
            auto seed = combine(type, j.size());
            for (const auto& element : j.items())
            {
                const auto h = std::hash<string_t> {}(element.key());
                seed = combine(seed, h);
                seed = combine(seed, hash(element.value()));
            }
            return seed;
        }

        // arrays: mix in the size, then the (recursive) hash of every element
        case BasicJsonType::value_t::array:
        {
            auto seed = combine(type, j.size());
            for (const auto& element : j)
            {
                seed = combine(seed, hash(element));
            }
            return seed;
        }

        // scalar types: delegate to std::hash of the stored value
        case BasicJsonType::value_t::string:
        {
            const auto h = std::hash<string_t> {}(j.template get_ref<const string_t&>());
            return combine(type, h);
        }

        case BasicJsonType::value_t::boolean:
        {
            const auto h = std::hash<bool> {}(j.template get<bool>());
            return combine(type, h);
        }

        case BasicJsonType::value_t::number_integer:
        {
            const auto h = std::hash<number_integer_t> {}(j.template get<number_integer_t>());
            return combine(type, h);
        }

        case BasicJsonType::value_t::number_unsigned:
        {
            const auto h = std::hash<number_unsigned_t> {}(j.template get<number_unsigned_t>());
            return combine(type, h);
        }

        case BasicJsonType::value_t::number_float:
        {
            const auto h = std::hash<number_float_t> {}(j.template get<number_float_t>());
            return combine(type, h);
        }

        // binary: mix in the size, whether a subtype is present, the subtype
        // value itself, and finally every byte
        case BasicJsonType::value_t::binary:
        {
            auto seed = combine(type, j.get_binary().size());
            const auto h = std::hash<bool> {}(j.get_binary().has_subtype());
            seed = combine(seed, h);
            seed = combine(seed, j.get_binary().subtype());
            for (const auto byte : j.get_binary())
            {
                seed = combine(seed, std::hash<std::uint8_t> {}(byte));
            }
            return seed;
        }

        // any other type tag is treated as a programming error; 0 is returned
        // after the assertion
        default:                   // LCOV_EXCL_LINE
            JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
            return 0;              // LCOV_EXCL_LINE
    }
}

}  // namespace detail
}  // namespace nlohmann

// #include <nlohmann/detail/input/binary_reader.hpp>


#include <algorithm> // generate_n
#include <array> // array
#include <cmath> // ldexp
#include <cstddef> // size_t
#include <cstdint> // uint8_t, uint16_t, uint32_t, uint64_t
#include <cstdio> // snprintf
#include <cstring> // memcpy
#include <iterator> // back_inserter
#include <limits> // numeric_limits
#include <string> // char_traits, string
#include <utility> // make_pair, move
#include <vector> // vector

// #include <nlohmann/detail/exceptions.hpp>

// #include <nlohmann/detail/input/input_adapters.hpp>


#include <array> // array
#include <cstddef> // size_t
#include <cstdio> //FILE *
#include <cstring> // strlen
#include <istream> // istream
#include <iterator> // begin, end, iterator_traits, random_access_iterator_tag, distance, next
#include <memory> // shared_ptr, make_shared, addressof
#include <numeric> // accumulate
#include <string> // string, char_traits
#include <type_traits> // enable_if, is_base_of, is_pointer, is_integral, remove_pointer
#include <utility> // pair, declval

// #include <nlohmann/detail/iterators/iterator_traits.hpp>

// #include <nlohmann/detail/macro_scope.hpp>


namespace nlohmann
{
namespace detail
{
/// the supported input formats; the enumerators are named after the formats
/// (JSON, CBOR, MessagePack, UBJSON, BSON)
enum class input_format_t { json, cbor, msgpack, ubjson, bson };

////////////////////
// input adapters //
////////////////////

/*!
Input adapter for stdio file access. Each call to get_character() reads a
single character with std::fgetc; the adapter itself keeps no buffer. This is
a very low-level adapter.
*/
class file_input_adapter
{
  public:
    using char_type = char;

    /// wrap the stdio stream @a f; only the pointer is stored
    JSON_HEDLEY_NON_NULL(2)
    explicit file_input_adapter(std::FILE* f) noexcept
        : m_file(f)
    {}

    ~file_input_adapter() = default;

    // the class can be move-constructed, but neither copied nor assigned
    file_input_adapter(file_input_adapter&&) noexcept = default;
    file_input_adapter(const file_input_adapter&) = delete;
    file_input_adapter& operator=(file_input_adapter&&) = delete;
    file_input_adapter& operator=(const file_input_adapter&) = delete;

    /// return the result of std::fgetc on the wrapped stream
    std::char_traits<char>::int_type get_character() noexcept
    {
        const std::char_traits<char>::int_type next = std::fgetc(m_file);
        return next;
    }

  private:
    /// the file pointer to read from
    std::FILE* m_file;
};


/*!
Input adapter for a std::istream. Characters are read directly from the
stream's std::streambuf, which is captured once at construction; changing the
underlying std::streambuf in mid-input is therefore not supported. The
std::istream itself is kept so that its state can be maintained: eofbit is set
when the buffer is exhausted, and on destruction all stream flags except
eofbit are cleared, so that any input error will be detected by the first
subsequent read on the std::istream.

NOTE(review): earlier documentation stated that a UTF Byte Order Mark at the
beginning of the input is ignored; no BOM handling is visible in this class --
confirm where that happens.
*/
class input_stream_adapter
{
  public:
    using char_type = char;

    /// attach to stream @a i and remember its stream buffer
    explicit input_stream_adapter(std::istream& i)
        : is(&i), sb(i.rdbuf())
    {}

    /// take over the pointers of @a rhs and leave it detached
    input_stream_adapter(input_stream_adapter&& rhs) noexcept
        : is(rhs.is), sb(rhs.sb)
    {
        rhs.sb = nullptr;
        rhs.is = nullptr;
    }

    // copying and assignment are disabled because of the pointer members
    input_stream_adapter(const input_stream_adapter&) = delete;
    input_stream_adapter& operator=(input_stream_adapter&) = delete;
    input_stream_adapter& operator=(input_stream_adapter&&) = delete;

    ~input_stream_adapter()
    {
        // a moved-from adapter no longer refers to a stream
        if (is == nullptr)
        {
            return;
        }

        // we read through the streambuf and do not maintain the stream
        // flags, so drop everything except eof
        is->clear(is->rdstate() & std::ios::eofbit);
    }

    // std::istream/std::streambuf use std::char_traits<char>::to_int_type, to
    // ensure that std::char_traits<char>::eof() and the character 0xFF do not
    // end up as the same value, eg. 0xFFFFFFFF.
    std::char_traits<char>::int_type get_character()
    {
        const auto next = sb->sbumpc();

        // the istream interface is bypassed, so eof has to be flagged by hand
        if (JSON_HEDLEY_UNLIKELY(next == EOF))
        {
            is->clear(is->rdstate() | std::ios::eofbit);
        }

        return next;
    }

  private:
    /// the associated input stream
    std::istream* is = nullptr;
    /// the stream buffer of the associated input stream
    std::streambuf* sb = nullptr;
};

// General-purpose adapter over an iterator range [first, last). It might not
// be as fast as theoretically possible for some containers, but it is
// extremely versatile.
template<typename IteratorType>
class iterator_input_adapter
{
  public:
    using char_type = typename std::iterator_traits<IteratorType>::value_type;

    iterator_input_adapter(IteratorType first, IteratorType last)
        : current(std::move(first)), end(std::move(last))
    {}

    // Returns the next element converted with char_traits::to_int_type and
    // advances by one, or char_traits::eof() once the range is exhausted.
    typename std::char_traits<char_type>::int_type get_character()
    {
        using traits_type = std::char_traits<char_type>;

        if (JSON_HEDLEY_LIKELY(current != end))
        {
            const auto code_unit = traits_type::to_int_type(*current);
            ++current;
            return code_unit;
        }

        return traits_type::eof();
    }

  private:
    // position of the next element to hand out
    IteratorType current;
    // one past the last element
    IteratorType end;

    // the wide-string helpers need access to empty()
    template<typename BaseInputAdapter, size_t T>
    friend struct wide_string_input_helper;

    // true once every element has been consumed
    bool empty() const
    {
        return current == end;
    }
};


// Primary template, declared only: T selects a specialization by a size in
// bytes (wide_string_input_adapter passes sizeof(WideCharType)).
template<typename BaseInputAdapter, size_t T>
struct wide_string_input_helper;

// Specialization for 4-byte code units: each unit read from the input is
// treated as one UTF-32 code point and encoded as 1 to 4 UTF-8 bytes.
template<typename BaseInputAdapter>
struct wide_string_input_helper<BaseInputAdapter, 4>
{
    // UTF-32
    //
    // Reads one code unit from input and writes its UTF-8 encoding into
    // utf8_bytes. utf8_bytes_index is reset to 0 and utf8_bytes_filled is set
    // to the number of entries written. At the end of the input a single
    // eof() entry is written instead.
    static void fill_buffer(BaseInputAdapter& input,
                            std::array<std::char_traits<char>::int_type, 4>& utf8_bytes,
                            size_t& utf8_bytes_index,
                            size_t& utf8_bytes_filled)
    {
        utf8_bytes_index = 0;

        if (JSON_HEDLEY_UNLIKELY(input.empty()))
        {
            utf8_bytes[0] = std::char_traits<char>::eof();
            utf8_bytes_filled = 1;
        }
        else
        {
            // get the current character
            const auto wc = input.get_character();

            // UTF-32 to UTF-8 encoding
            if (wc < 0x80)
            {
                // single byte, value passed through unchanged
                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(wc);
                utf8_bytes_filled = 1;
            }
            else if (wc <= 0x7FF)
            {
                // two bytes: 110xxxxx 10xxxxxx
                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xC0u | ((static_cast<unsigned int>(wc) >> 6u) & 0x1Fu));
                utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
                utf8_bytes_filled = 2;
            }
            else if (wc <= 0xFFFF)
            {
                // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xE0u | ((static_cast<unsigned int>(wc) >> 12u) & 0x0Fu));
                utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | ((static_cast<unsigned int>(wc) >> 6u) & 0x3Fu));
                utf8_bytes[2] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
                utf8_bytes_filled = 3;
            }
            else if (wc <= 0x10FFFF)
            {
                // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xF0u | ((static_cast<unsigned int>(wc) >> 18u) & 0x07u));
                utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | ((static_cast<unsigned int>(wc) >> 12u) & 0x3Fu));
                utf8_bytes[2] = static_cast<std::char_traits<char>::int_type>(0x80u | ((static_cast<unsigned int>(wc) >> 6u) & 0x3Fu));
                utf8_bytes[3] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
                utf8_bytes_filled = 4;
            }
            else
            {
                // unknown character
                // (above 0x10FFFF: the raw value is handed on as one entry)
                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(wc);
                utf8_bytes_filled = 1;
            }
        }
    }
};

// Specialization for 2-byte code units: the input is treated as UTF-16 and
// one code point (one unit, or two for a surrogate pair) is encoded as 1 to
// 4 UTF-8 bytes per call.
template<typename BaseInputAdapter>
struct wide_string_input_helper<BaseInputAdapter, 2>
{
    // UTF-16
    //
    // Reads one or two code units from input and writes the UTF-8 encoding
    // into utf8_bytes. utf8_bytes_index is reset to 0 and utf8_bytes_filled
    // is set to the number of entries written. At the end of the input a
    // single eof() entry is written instead.
    static void fill_buffer(BaseInputAdapter& input,
                            std::array<std::char_traits<char>::int_type, 4>& utf8_bytes,
                            size_t& utf8_bytes_index,
                            size_t& utf8_bytes_filled)
    {
        utf8_bytes_index = 0;

        if (JSON_HEDLEY_UNLIKELY(input.empty()))
        {
            utf8_bytes[0] = std::char_traits<char>::eof();
            utf8_bytes_filled = 1;
        }
        else
        {
            // get the current character
            const auto wc = input.get_character();

            // UTF-16 to UTF-8 encoding
            if (wc < 0x80)
            {
                // single byte, value passed through unchanged
                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(wc);
                utf8_bytes_filled = 1;
            }
            else if (wc <= 0x7FF)
            {
                // two bytes: 110xxxxx 10xxxxxx
                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xC0u | ((static_cast<unsigned int>(wc) >> 6u)));
                utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
                utf8_bytes_filled = 2;
            }
            else if (0xD800 > wc || wc >= 0xE000)
            {
                // outside the surrogate range 0xD800..0xDFFF:
                // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xE0u | ((static_cast<unsigned int>(wc) >> 12u)));
                utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | ((static_cast<unsigned int>(wc) >> 6u) & 0x3Fu));
                utf8_bytes[2] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
                utf8_bytes_filled = 3;
            }
            else
            {
                // wc lies in the surrogate range; a second unit is needed
                if (JSON_HEDLEY_UNLIKELY(!input.empty()))
                {
                    // combine the low 10 bits of both units into a code point
                    // at or above 0x10000 and emit it as four bytes
                    // NOTE(review): no check is made here that wc is a high
                    // surrogate or that wc2 is a low surrogate
                    const auto wc2 = static_cast<unsigned int>(input.get_character());
                    const auto charcode = 0x10000u + (((static_cast<unsigned int>(wc) & 0x3FFu) << 10u) | (wc2 & 0x3FFu));
                    utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xF0u | (charcode >> 18u));
                    utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | ((charcode >> 12u) & 0x3Fu));
                    utf8_bytes[2] = static_cast<std::char_traits<char>::int_type>(0x80u | ((charcode >> 6u) & 0x3Fu));
                    utf8_bytes[3] = static_cast<std::char_traits<char>::int_type>(0x80u | (charcode & 0x3Fu));
                    utf8_bytes_filled = 4;
                }
                else
                {
                    // input ended after the first surrogate: the raw unit is
                    // handed on as a single entry
                    utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(wc);
                    utf8_bytes_filled = 1;
                }
            }
        }
    }
};

// Wraps another input adapter to convert wide character types into individual
// bytes: the wrapped adapter's code units are re-encoded into a small buffer
// by wide_string_input_helper, and get_character() hands that buffer out one
// entry at a time.
template<typename BaseInputAdapter, typename WideCharType>
class wide_string_input_adapter
{
  public:
    using char_type = char;

    wide_string_input_adapter(BaseInputAdapter base)
        : base_adapter(base) {}

    typename std::char_traits<char>::int_type get_character() noexcept
    {
        // refill once every buffered entry has been handed out
        const bool buffer_consumed = (utf8_bytes_index == utf8_bytes_filled);
        if (buffer_consumed)
        {
            fill_buffer<sizeof(WideCharType)>();

            JSON_ASSERT(utf8_bytes_filled > 0);
            JSON_ASSERT(utf8_bytes_index == 0);
        }

        // hand out the next buffered entry
        JSON_ASSERT(utf8_bytes_filled > 0);
        JSON_ASSERT(utf8_bytes_index < utf8_bytes_filled);
        const auto next_entry = utf8_bytes[utf8_bytes_index];
        ++utf8_bytes_index;
        return next_entry;
    }

  private:
    /// the wrapped adapter delivering the wide code units
    BaseInputAdapter base_adapter;

    /// delegate to the helper specialization selected by the size T
    template<size_t T>
    void fill_buffer()
    {
        wide_string_input_helper<BaseInputAdapter, T>::fill_buffer(base_adapter, utf8_bytes, utf8_bytes_index, utf8_bytes_filled);
    }

    /// a buffer for UTF-8 bytes
    std::array<std::char_traits<char>::int_type, 4> utf8_bytes = {{0, 0, 0, 0}};

    /// index into utf8_bytes of the next entry to hand out
    std::size_t utf8_bytes_index = 0;
    /// number of valid entries in utf8_bytes
    std::size_t utf8_bytes_filled = 0;
};


// Factory creating the input adapter for an iterator range. This primary
// template wraps the range directly in an iterator_input_adapter; the unused
// Enable parameter leaves room for partial specializations.
template<typename IteratorType, typename Enable = void>
struct iterator_input_adapter_factory
{
    using iterator_type = IteratorType;
    using char_type = typename std::iterator_traits<iterator_type>::value_type;
    using adapter_type = iterator_input_adapter<iterator_type>;

    // Build the adapter for the range [first, last); both iterators are moved
    // into the adapter.
    static adapter_type create(IteratorType first, IteratorType last)
    {
        return adapter_type(std::move(first), std::move(last));
    }
};

// Trait telling whether the elements an iterator refers to occupy more than
// one byte each.
template<typename T>
struct is_iterator_of_multibyte
{
    using value_type = typename std::iterator_traits<T>::value_type;
    enum
    {
        // sizeof is never 0, so "not exactly one byte" means "more than one"
        value = (sizeof(value_type) != 1)
    };
};

// Specialization for iterators whose value type is larger than one byte
// (see is_iterator_of_multibyte): the plain iterator adapter is wrapped in a
// wide_string_input_adapter parameterized with the iterator's value type.
template<typename IteratorType>
struct iterator_input_adapter_factory<IteratorType, enable_if_t<is_iterator_of_multibyte<IteratorType>::value>>
{
    using iterator_type = IteratorType;
    using char_type = typename std::iterator_traits<iterator_type>::value_type;
    using base_adapter_type = iterator_input_adapter<iterator_type>;
    using adapter_type = wide_string_input_adapter<base_adapter_type, char_type>;

    // Build the base adapter for [first, last) and wrap it.
    static adapter_type create(IteratorType first, IteratorType last)
    {
        return adapter_type(base_adapter_type(std::move(first), std::move(last)));
    }
};

// General purpose iterator-based input: the adapter type and its construction
// are both delegated to iterator_input_adapter_factory.
template<typename IteratorType>
typename iterator_input_adapter_factory<IteratorType>::adapter_type input_adapter(IteratorType first, IteratorType last)
{
    return iterator_input_adapter_factory<IteratorType>::create(first, last);
}

// Convenience shorthand from container to iterator.
// The using-declarations below make unqualified begin(container) and
// end(container) calls find std::begin/std::end as well as ADL candidates.
// They live in a dedicated namespace so that they do not leak into the
// enclosing scope.

namespace container_input_adapter_factory_impl
{

using std::begin;
using std::end;

// Primary template: deliberately empty, so it offers no adapter_type for
// types on which begin()/end() cannot be called.
template<typename ContainerType, typename Enable = void>
struct container_input_adapter_factory {};

// Selected when begin(container) and end(container) are well-formed; the
// adapter is whatever the iterator-based input_adapter yields for them.
template<typename ContainerType>
struct container_input_adapter_factory< ContainerType,
    void_t<decltype(begin(std::declval<ContainerType>()), end(std::declval<ContainerType>()))>>
{
    using adapter_type = decltype(input_adapter(begin(std::declval<ContainerType>()), end(std::declval<ContainerType>())));

    static adapter_type create(const ContainerType& container)
    {
        return input_adapter(begin(container), end(container));
    }
};

} // namespace container_input_adapter_factory_impl

// Container-based input: forwards to the container factory, which in turn
// uses begin(container) and end(container).
template<typename ContainerType>
typename container_input_adapter_factory_impl::container_input_adapter_factory<ContainerType>::adapter_type input_adapter(const ContainerType& container)
{
    using factory_type = container_input_adapter_factory_impl::container_input_adapter_factory<ContainerType>;
    return factory_type::create(container);
}

// Special cases with fast paths
// C file handle: wrap the given std::FILE* in a file_input_adapter.
inline file_input_adapter input_adapter(std::FILE* file)
{
    return file_input_adapter(file);
}

// Input stream (lvalue): wrap the given stream in an input_stream_adapter.
inline input_stream_adapter input_adapter(std::istream& stream)
{
    return input_stream_adapter(stream);
}

// Input stream (rvalue): the named parameter is an lvalue inside the function,
// so the same input_stream_adapter constructor as for lvalue streams is used.
inline input_stream_adapter input_adapter(std::istream&& stream)
{
    return input_stream_adapter(stream);
}

// The adapter type that input_adapter yields for a pair of const char* iterators.
using contiguous_bytes_input_adapter = decltype(input_adapter(std::declval<const char*>(), std::declval<const char*>()));

// Null-delimited strings, and the like: accepts any non-array pointer to a
// one-byte integral type and reads up to (not including) the terminating null.
template < typename CharT,
           typename std::enable_if <
               std::is_pointer<CharT>::value&&
               !std::is_array<CharT>::value&&
               std::is_integral<typename std::remove_pointer<CharT>::type>::value&&
               sizeof(typename std::remove_pointer<CharT>::type) == 1,
               int >::type = 0 >
contiguous_bytes_input_adapter input_adapter(CharT b)
{
    // view the bytes as plain chars once, then locate the end via strlen
    const auto* first = reinterpret_cast<const char*>(b);
    const auto* last = first + std::strlen(first);
    return input_adapter(first, last);
}

// Fixed-size C arrays: all N elements of the array are used as input, i.e. the
// range [array, array + N) is handed to the iterator-based input_adapter.
template<typename T, std::size_t N>
auto input_adapter(T (&array)[N]) -> decltype(input_adapter(array, array + N)) // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
{
    return input_adapter(array, array + N);
}

// This class only handles inputs that end up in a
// contiguous_bytes_input_adapter. It's required so that expressions like
// {ptr, len} can be implicitly converted to the correct adapter.
class span_input_adapter
{
  public:
    // Pointer plus length: accepts pointers to one-byte integral types and
    // reads the l bytes starting at b.
    template < typename CharT,
               typename std::enable_if <
                   std::is_pointer<CharT>::value&&
                   std::is_integral<typename std::remove_pointer<CharT>::type>::value&&
                   sizeof(typename std::remove_pointer<CharT>::type) == 1,
                   int >::type = 0 >
    span_input_adapter(CharT b, std::size_t l)
        : ia(reinterpret_cast<const char*>(b), reinterpret_cast<const char*>(b) + l) {}

    // Iterator pair: only enabled for iterators whose category is exactly
    // std::random_access_iterator_tag.
    template<class IteratorType,
             typename std::enable_if<
                 std::is_same<typename iterator_traits<IteratorType>::iterator_category, std::random_access_iterator_tag>::value,
                 int>::type = 0>
    span_input_adapter(IteratorType first, IteratorType last)
        : ia(input_adapter(first, last)) {}

    // Returns an rvalue reference to the stored adapter so that the caller
    // can move it out.
    contiguous_bytes_input_adapter&& get()
    {
        return std::move(ia); // NOLINT(hicpp-move-const-arg,performance-move-const-arg)
    }

  private:
    // the wrapped adapter
    contiguous_bytes_input_adapter ia;
};
}  // namespace detail
}  // namespace nlohmann

// #include <nlohmann/detail/input/json_sax.hpp>


#include <cstddef>
#include <string> // string
#include <utility> // move
#include <vector> // vector

// #include <nlohmann/detail/exceptions.hpp>

// #include <nlohmann/detail/macro_scope.hpp>


namespace nlohmann
{

/*!
@brief SAX interface

This class describes the SAX interface used by @ref nlohmann::json::sax_parse.
Each function is called in different situations while the input is parsed. The
boolean return value informs the parser whether to continue processing the
input.
*/
template<typename BasicJsonType>
struct json_sax
{
    // value types of the JSON type whose events are reported
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using binary_t = typename BasicJsonType::binary_t;

    /*!
    @brief a null value was read
    @return whether parsing should proceed
    */
    virtual bool null() = 0;

    /*!
    @brief a boolean value was read
    @param[in] val  boolean value
    @return whether parsing should proceed
    */
    virtual bool boolean(bool val) = 0;

    /*!
    @brief an integer number was read
    @param[in] val  integer value
    @return whether parsing should proceed
    */
    virtual bool number_integer(number_integer_t val) = 0;

    /*!
    @brief an unsigned integer number was read
    @param[in] val  unsigned integer value
    @return whether parsing should proceed
    */
    virtual bool number_unsigned(number_unsigned_t val) = 0;

    /*!
    @brief a floating-point number was read
    @param[in] val  floating-point value
    @param[in] s    raw token value
    @return whether parsing should proceed
    */
    virtual bool number_float(number_float_t val, const string_t& s) = 0;

    /*!
    @brief a string was read
    @param[in] val  string value
    @return whether parsing should proceed
    @note It is safe to move the passed string.
    */
    virtual bool string(string_t& val) = 0;

    /*!
    @brief a binary string was read
    @param[in] val  binary value
    @return whether parsing should proceed
    @note It is safe to move the passed binary.
    */
    virtual bool binary(binary_t& val) = 0;

    /*!
    @brief the beginning of an object was read
    @param[in] elements  number of object elements or -1 (that is,
                         std::size_t(-1)) if unknown
    @return whether parsing should proceed
    @note binary formats may report the number of elements
    */
    virtual bool start_object(std::size_t elements) = 0;

    /*!
    @brief an object key was read
    @param[in] val  object key
    @return whether parsing should proceed
    @note It is safe to move the passed string.
    */
    virtual bool key(string_t& val) = 0;

    /*!
    @brief the end of an object was read
    @return whether parsing should proceed
    */
    virtual bool end_object() = 0;

    /*!
    @brief the beginning of an array was read
    @param[in] elements  number of array elements or -1 (that is,
                         std::size_t(-1)) if unknown
    @return whether parsing should proceed
    @note binary formats may report the number of elements
    */
    virtual bool start_array(std::size_t elements) = 0;

    /*!
    @brief the end of an array was read
    @return whether parsing should proceed
    */
    virtual bool end_array() = 0;

    /*!
    @brief a parse error occurred
    @param[in] position    the position in the input where the error occurs
    @param[in] last_token  the last read token
    @param[in] ex          an exception object describing the error
    @return whether parsing should proceed (must return false)
    */
    virtual bool parse_error(std::size_t position,
                             const std::string& last_token,
                             const detail::exception& ex) = 0;

    // defaulted special members; the destructor is virtual because this
    // struct is meant to be used as a polymorphic base
    json_sax() = default;
    json_sax(const json_sax&) = default;
    json_sax(json_sax&&) noexcept = default;
    json_sax& operator=(const json_sax&) = default;
    json_sax& operator=(json_sax&&) noexcept = default;
    virtual ~json_sax() = default;
};


namespace detail
{
/*!
@brief SAX implementation to create a JSON value from SAX events

This class implements the @ref json_sax interface and processes the SAX events
to create a JSON value which makes it basically a DOM parser. The structure or
hierarchy of the JSON value is managed by the stack `ref_stack` which contains
a pointer to the respective array or object for each recursion depth.

After successful parsing, the value that is passed by reference to the
constructor contains the parsed value.

@tparam BasicJsonType  the JSON type
*/
template<typename BasicJsonType>
class json_sax_dom_parser
{
  public:
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using binary_t = typename BasicJsonType::binary_t;

    /*!
    @param[in,out] r  reference to a JSON value that is manipulated while
                       parsing
    @param[in] allow_exceptions_  whether parse errors yield exceptions
    */
    explicit json_sax_dom_parser(BasicJsonType& r, const bool allow_exceptions_ = true)
        : root(r), allow_exceptions(allow_exceptions_)
    {}

    // make class move-only
    json_sax_dom_parser(const json_sax_dom_parser&) = delete;
    json_sax_dom_parser(json_sax_dom_parser&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
    json_sax_dom_parser& operator=(const json_sax_dom_parser&) = delete;
    json_sax_dom_parser& operator=(json_sax_dom_parser&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
    ~json_sax_dom_parser() = default;

    bool null()
    {
        handle_value(nullptr);
        return true;
    }

    bool boolean(bool val)
    {
        handle_value(val);
        return true;
    }

    bool number_integer(number_integer_t val)
    {
        handle_value(val);
        return true;
    }

    bool number_unsigned(number_unsigned_t val)
    {
        handle_value(val);
        return true;
    }

    bool number_float(number_float_t val, const string_t& /*unused*/)
    {
        handle_value(val);
        return true;
    }

    bool string(string_t& val)
    {
        handle_value(val);
        return true;
    }

    bool binary(binary_t& val)
    {
        handle_value(std::move(val));
        return true;
    }

    bool start_object(std::size_t len)
    {
        ref_stack.push_back(handle_value(BasicJsonType::value_t::object));

        if (JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size()))
        {
            JSON_THROW(out_of_range::create(408, "excessive object size: " + std::to_string(len), *ref_stack.back()));
        }

        return true;
    }

    bool key(string_t& val)
    {
        // add null at given key and store the reference for later
        object_element = &(ref_stack.back()->m_value.object->operator[](val));
        return true;
    }

    bool end_object()
    {
        ref_stack.back()->set_parents();
        ref_stack.pop_back();
        return true;
    }

    bool start_array(std::size_t len)
    {
        ref_stack.push_back(handle_value(BasicJsonType::value_t::array));

        if (JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size()))
        {
            JSON_THROW(out_of_range::create(408, "excessive array size: " + std::to_string(len), *ref_stack.back()));
        }

        return true;
    }

    bool end_array()
    {
        ref_stack.back()->set_parents();
        ref_stack.pop_back();
        return true;
    }

    template<class Exception>
    bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/,
                     const Exception& ex)
    {
        errored = true;
        static_cast<void>(ex);
        if (allow_exceptions)
        {
            JSON_THROW(ex);
        }
        return false;
    }

    constexpr bool is_errored() const
    {
        return errored;
    }

  private:
    /*!
    @invariant If the ref stack is empty, then the passed value will be the new
               root.
    @invariant If the ref stack contains a value, then it is an array or an
               object to which we can add elements
    */
    template<typename Value>
    JSON_HEDLEY_RETURNS_NON_NULL
    BasicJsonType* handle_value(Value&& v)
    {
        if (ref_stack.empty())
        {
            root = BasicJsonType(std::forward<Value>(v));
            return &root;
        }

        JSON_ASSERT(ref_stack.back()->is_array() || ref_stack.back()->is_object());

        if (ref_stack.back()->is_array())
        {
            ref_stack.back()->m_value.array->emplace_back(std::forward<Value>(v));
            return &(ref_stack.back()->m_value.array->back());
        }

        JSON_ASSERT(ref_stack.back()->is_object());
        JSON_ASSERT(object_element);
        *object_element = BasicJsonType(std::forward<Value>(v));
        return object_element;
    }

    /// the parsed JSON value
    BasicJsonType& root;
    /// stack to model hierarchy of values
    std::vector<BasicJsonType*> ref_stack {};
    /// helper to hold the reference for the next object element
    BasicJsonType* object_element = nullptr;
    /// whether a syntax error occurred
    bool errored = false;
    /// whether to throw exceptions in case of errors
    const bool allow_exceptions = true;
};

/*!
@brief SAX implementation to create a JSON value from SAX events, filtered by
       a user-provided callback

The callback is invoked for the start and end of every object and array, for
every object key, and for every other value. Whenever it returns false, the
respective value is not stored (or is replaced by a discarded value and removed
again later). Three stacks track the state per nesting level: `ref_stack`
(pointer to the container being filled, or nullptr if it is not kept),
`keep_stack` (callback verdict for each opened container) and `key_keep_stack`
(callback verdict for each pending object key).

@tparam BasicJsonType  the JSON type
*/
template<typename BasicJsonType>
class json_sax_dom_callback_parser
{
  public:
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using binary_t = typename BasicJsonType::binary_t;
    using parser_callback_t = typename BasicJsonType::parser_callback_t;
    using parse_event_t = typename BasicJsonType::parse_event_t;

    /*!
    @param[in,out] r  reference to a JSON value that is manipulated while
                       parsing
    @param[in] cb  callback consulted for every parse event
    @param[in] allow_exceptions_  whether parse errors yield exceptions
    */
    json_sax_dom_callback_parser(BasicJsonType& r,
                                 const parser_callback_t cb,
                                 const bool allow_exceptions_ = true)
        : root(r), callback(cb), allow_exceptions(allow_exceptions_)
    {
        // the top level starts out as "kept"
        keep_stack.push_back(true);
    }

    // make class move-only
    json_sax_dom_callback_parser(const json_sax_dom_callback_parser&) = delete;
    json_sax_dom_callback_parser(json_sax_dom_callback_parser&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
    json_sax_dom_callback_parser& operator=(const json_sax_dom_callback_parser&) = delete;
    json_sax_dom_callback_parser& operator=(json_sax_dom_callback_parser&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
    ~json_sax_dom_callback_parser() = default;

    // The scalar events below all forward to handle_value(), which consults
    // the callback, and always ask the parser to continue.

    bool null()
    {
        handle_value(nullptr);
        return true;
    }

    bool boolean(bool val)
    {
        handle_value(val);
        return true;
    }

    bool number_integer(number_integer_t val)
    {
        handle_value(val);
        return true;
    }

    bool number_unsigned(number_unsigned_t val)
    {
        handle_value(val);
        return true;
    }

    bool number_float(number_float_t val, const string_t& /*unused*/)
    {
        handle_value(val);
        return true;
    }

    bool string(string_t& val)
    {
        handle_value(val);
        return true;
    }

    bool binary(binary_t& val)
    {
        handle_value(std::move(val));
        return true;
    }

    /// open an object: ask the callback (object_start), record its verdict,
    /// and push the new object (or nullptr if it is not kept) onto ref_stack
    bool start_object(std::size_t len)
    {
        // check callback for object start
        const bool keep = callback(static_cast<int>(ref_stack.size()), parse_event_t::object_start, discarded);
        keep_stack.push_back(keep);

        // the callback was already consulted above, hence skip_callback=true
        auto val = handle_value(BasicJsonType::value_t::object, true);
        ref_stack.push_back(val.second);

        // check object limit
        if (ref_stack.back() && JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size()))
        {
            JSON_THROW(out_of_range::create(408, "excessive object size: " + std::to_string(len), *ref_stack.back()));
        }

        return true;
    }

    /// handle an object key: ask the callback (key), record its verdict for
    /// the value that follows, and reserve a slot for that value if it is kept
    bool key(string_t& val)
    {
        // the callback receives the key wrapped in a JSON value
        BasicJsonType k = BasicJsonType(val);

        // check callback for key
        const bool keep = callback(static_cast<int>(ref_stack.size()), parse_event_t::key, k);
        key_keep_stack.push_back(keep);

        // add discarded value at given key and store the reference for later
        if (keep && ref_stack.back())
        {
            object_element = &(ref_stack.back()->m_value.object->operator[](val) = discarded);
        }

        return true;
    }

    /// close an object: ask the callback (object_end) if the object was kept,
    /// pop both stacks, and erase a discarded element from the parent
    bool end_object()
    {
        if (ref_stack.back())
        {
            if (!callback(static_cast<int>(ref_stack.size()) - 1, parse_event_t::object_end, *ref_stack.back()))
            {
                // discard object
                *ref_stack.back() = discarded;
            }
            else
            {
                ref_stack.back()->set_parents();
            }
        }

        JSON_ASSERT(!ref_stack.empty());
        JSON_ASSERT(!keep_stack.empty());
        ref_stack.pop_back();
        keep_stack.pop_back();

        // ref_stack.back() now refers to the parent of the closed object
        if (!ref_stack.empty() && ref_stack.back() && ref_stack.back()->is_structured())
        {
            // remove discarded value
            for (auto it = ref_stack.back()->begin(); it != ref_stack.back()->end(); ++it)
            {
                if (it->is_discarded())
                {
                    // only the first discarded element found is erased; the
                    // loop stops right after the erase
                    ref_stack.back()->erase(it);
                    break;
                }
            }
        }

        return true;
    }

    /// open an array: ask the callback (array_start), record its verdict,
    /// and push the new array (or nullptr if it is not kept) onto ref_stack
    bool start_array(std::size_t len)
    {
        const bool keep = callback(static_cast<int>(ref_stack.size()), parse_event_t::array_start, discarded);
        keep_stack.push_back(keep);

        // the callback was already consulted above, hence skip_callback=true
        auto val = handle_value(BasicJsonType::value_t::array, true);
        ref_stack.push_back(val.second);

        // check array limit
        if (ref_stack.back() && JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size()))
        {
            JSON_THROW(out_of_range::create(408, "excessive array size: " + std::to_string(len), *ref_stack.back()));
        }

        return true;
    }

    /// close an array: ask the callback (array_end) if the array was kept,
    /// pop both stacks, and drop the array from a parent array if the
    /// callback rejected it
    bool end_array()
    {
        // stays true when the array was never stored (nullptr on ref_stack)
        bool keep = true;

        if (ref_stack.back())
        {
            keep = callback(static_cast<int>(ref_stack.size()) - 1, parse_event_t::array_end, *ref_stack.back());
            if (keep)
            {
                ref_stack.back()->set_parents();
            }
            else
            {
                // discard array
                *ref_stack.back() = discarded;
            }
        }

        JSON_ASSERT(!ref_stack.empty());
        JSON_ASSERT(!keep_stack.empty());
        ref_stack.pop_back();
        keep_stack.pop_back();

        // remove discarded value
        if (!keep && !ref_stack.empty() && ref_stack.back()->is_array())
        {
            // the rejected array is the last element of its parent array
            ref_stack.back()->m_value.array->pop_back();
        }

        return true;
    }

    /// record the error and rethrow it via JSON_THROW if exceptions are allowed
    template<class Exception>
    bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/,
                     const Exception& ex)
    {
        errored = true;
        static_cast<void>(ex);
        if (allow_exceptions)
        {
            JSON_THROW(ex);
        }
        return false;
    }

    /// whether parse_error() has been called
    constexpr bool is_errored() const
    {
        return errored;
    }

  private:
    /*!
    @param[in] v  value to add to the JSON value we build during parsing
    @param[in] skip_callback  whether we should skip calling the callback
               function; this is required after start_array() and
               start_object() SAX events, because otherwise we would call the
               callback function with an empty array or object, respectively.

    @invariant If the ref stack is empty, then the passed value will be the new
               root.
    @invariant If the ref stack contains a value, then it is an array or an
               object to which we can add elements

    @return pair of boolean (whether value should be kept) and pointer (to the
            passed value in the ref_stack hierarchy; nullptr if not kept)
    */
    template<typename Value>
    std::pair<bool, BasicJsonType*> handle_value(Value&& v, const bool skip_callback = false)
    {
        JSON_ASSERT(!keep_stack.empty());

        // do not handle this value if we know it would be added to a discarded
        // container
        if (!keep_stack.back())
        {
            return {false, nullptr};
        }

        // create value
        auto value = BasicJsonType(std::forward<Value>(v));

        // check callback
        const bool keep = skip_callback || callback(static_cast<int>(ref_stack.size()), parse_event_t::value, value);

        // do not handle this value if we just learnt it shall be discarded
        if (!keep)
        {
            return {false, nullptr};
        }

        // no open container: the value becomes the root
        if (ref_stack.empty())
        {
            root = std::move(value);
            return {true, &root};
        }

        // skip this value if we already decided to skip the parent
        // (https://github.com/nlohmann/json/issues/971#issuecomment-413678360)
        if (!ref_stack.back())
        {
            return {false, nullptr};
        }

        // we now only expect arrays and objects
        JSON_ASSERT(ref_stack.back()->is_array() || ref_stack.back()->is_object());

        // array
        if (ref_stack.back()->is_array())
        {
            ref_stack.back()->m_value.array->emplace_back(std::move(value));
            return {true, &(ref_stack.back()->m_value.array->back())};
        }

        // object
        JSON_ASSERT(ref_stack.back()->is_object());
        // check if we should store an element for the current key
        JSON_ASSERT(!key_keep_stack.empty());
        const bool store_element = key_keep_stack.back();
        key_keep_stack.pop_back();

        if (!store_element)
        {
            return {false, nullptr};
        }

        // overwrite the placeholder that key() stored for this element
        JSON_ASSERT(object_element);
        *object_element = std::move(value);
        return {true, object_element};
    }

    /// the parsed JSON value
    BasicJsonType& root;
    /// stack to model hierarchy of values
    std::vector<BasicJsonType*> ref_stack {};
    /// stack to manage which values to keep
    std::vector<bool> keep_stack {};
    /// stack to manage which object keys to keep
    std::vector<bool> key_keep_stack {};
    /// helper to hold the reference for the next object element
    BasicJsonType* object_element = nullptr;
    /// whether a syntax error occurred
    bool errored = false;
    /// callback function
    const parser_callback_t callback = nullptr;
    /// whether to throw exceptions in case of errors
    const bool allow_exceptions = true;
    /// a discarded value for the callback
    BasicJsonType discarded = BasicJsonType::value_t::discarded;
};

/*!
@brief SAX implementation that stores nothing

Every event handler asks the parser to continue (returns true), except
parse_error(), which returns false.

@tparam BasicJsonType  the JSON type
*/
template<typename BasicJsonType>
class json_sax_acceptor
{
  public:
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using binary_t = typename BasicJsonType::binary_t;

    // scalar values
    bool null() { return true; }
    bool boolean(bool /*unused*/) { return true; }
    bool number_integer(number_integer_t /*unused*/) { return true; }
    bool number_unsigned(number_unsigned_t /*unused*/) { return true; }
    bool number_float(number_float_t /*unused*/, const string_t& /*unused*/) { return true; }
    bool string(string_t& /*unused*/) { return true; }
    bool binary(binary_t& /*unused*/) { return true; }

    // objects
    bool start_object(std::size_t /*unused*/ = std::size_t(-1)) { return true; }
    bool key(string_t& /*unused*/) { return true; }
    bool end_object() { return true; }

    // arrays
    bool start_array(std::size_t /*unused*/ = std::size_t(-1)) { return true; }
    bool end_array() { return true; }

    // errors
    bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, const detail::exception& /*unused*/) { return false; }
};
}  // namespace detail

}  // namespace nlohmann

// #include <nlohmann/detail/input/lexer.hpp>


#include <array> // array
#include <clocale> // localeconv
#include <cstddef> // size_t
#include <cstdio> // snprintf
#include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
#include <initializer_list> // initializer_list
#include <string> // char_traits, string
#include <utility> // move
#include <vector> // vector

// #include <nlohmann/detail/input/input_adapters.hpp>

// #include <nlohmann/detail/input/position_t.hpp>

// #include <nlohmann/detail/macro_scope.hpp>


namespace nlohmann
{
namespace detail
{
///////////
// lexer //
///////////

/// base class of the lexer holding the token types and their display names
template<typename BasicJsonType>
class lexer_base
{
  public:
    /// token types for the parser
    enum class token_type
    {
        uninitialized,    ///< indicating the scanner is uninitialized
        literal_true,     ///< the `true` literal
        literal_false,    ///< the `false` literal
        literal_null,     ///< the `null` literal
        value_string,     ///< a string -- use get_string() for actual value
        value_unsigned,   ///< an unsigned integer -- use get_number_unsigned() for actual value
        value_integer,    ///< a signed integer -- use get_number_integer() for actual value
        value_float,      ///< a floating-point number -- use get_number_float() for actual value
        begin_array,      ///< the character for array begin `[`
        begin_object,     ///< the character for object begin `{`
        end_array,        ///< the character for array end `]`
        end_object,       ///< the character for object end `}`
        name_separator,   ///< the name separator `:`
        value_separator,  ///< the value separator `,`
        parse_error,      ///< indicating a parse error
        end_of_input,     ///< indicating the end of the input buffer
        literal_or_value  ///< a literal or the begin of a value (only for diagnostics)
    };

    /// return name of values of type token_type (only used for errors)
    JSON_HEDLEY_RETURNS_NON_NULL
    JSON_HEDLEY_CONST
    static const char* token_type_name(const token_type t) noexcept
    {
        switch (t)
        {
            case token_type::uninitialized:    return "<uninitialized>";
            case token_type::literal_true:     return "true literal";
            case token_type::literal_false:    return "false literal";
            case token_type::literal_null:     return "null literal";
            case token_type::value_string:     return "string literal";

            // all three number kinds share one display name
            case token_type::value_unsigned:
            case token_type::value_integer:
            case token_type::value_float:      return "number literal";

            case token_type::begin_array:      return "'['";
            case token_type::begin_object:     return "'{'";
            case token_type::end_array:        return "']'";
            case token_type::end_object:       return "'}'";
            case token_type::name_separator:   return "':'";
            case token_type::value_separator:  return "','";
            case token_type::parse_error:      return "<parse error>";
            case token_type::end_of_input:     return "end of input";
            case token_type::literal_or_value: return "'[', '{', or a literal";

            // LCOV_EXCL_START
            default: // catch non-enum values
                return "unknown token";
                // LCOV_EXCL_STOP
        }
    }
};
/*!
@brief lexical analysis

This class organizes the lexical analysis during JSON deserialization.
*/
template<typename BasicJsonType, typename InputAdapterType>
class lexer : public lexer_base<BasicJsonType>
{
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using char_type = typename InputAdapterType::char_type;
    using char_int_type = typename std::char_traits<char_type>::int_type;

  public:
    using token_type = typename lexer_base<BasicJsonType>::token_type;

    /// @brief construct a lexer from an input adapter
    /// @param[in] adapter           input adapter; moved into member `ia`
    /// @param[in] ignore_comments_  value stored in member `ignore_comments`
    ///                              (defaults to false)
    explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) noexcept
        : ia(std::move(adapter))
        , ignore_comments(ignore_comments_)
          // the decimal point reported by localeconv() is queried once, here,
          // and cached as a char_int_type
        , decimal_point_char(static_cast<char_int_type>(get_decimal_point()))
    {}

    // delete because of pointer members
    // copy construction and copy assignment are disabled, whereas move
    // construction, move assignment and destruction use the
    // compiler-generated defaults
    lexer(const lexer&) = delete;
    lexer(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
    // note: declared with a non-const reference parameter; this is still a
    // user-declared copy assignment operator, so no implicit one is generated
    lexer& operator=(lexer&) = delete;
    lexer& operator=(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
    ~lexer() = default;

  private:
    /////////////////////
    // locales
    /////////////////////

    /// return the locale-dependent decimal point
    JSON_HEDLEY_PURE
    static char get_decimal_point() noexcept
    {
        // ask the C runtime for the numeric formatting of the current locale
        const auto* locale_info = localeconv();
        JSON_ASSERT(locale_info != nullptr);

        // fall back to '.' if the locale does not provide a decimal point
        if (locale_info->decimal_point == nullptr)
        {
            return '.';
        }

        // only the first character of the decimal point string is used
        return *(locale_info->decimal_point);
    }

    /////////////////////
    // scan functions
    /////////////////////

    /*!
    @brief get codepoint from 4 hex characters following `\u`

    For input "\u c1 c2 c3 c4" the codepoint is:
      (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4
    = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0)

    Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f'
    must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The
    conversion is done by subtracting the offset (0x30, 0x37, and 0x57)
    between the ASCII value of the character and the desired integer value.

    @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or
            non-hex character)
    */
    int get_codepoint()
    {
        // the caller must have just consumed the 'u' of a `\u` escape
        JSON_ASSERT(current == 'u');

        int result = 0;

        // bit positions of the four hex digits, most significant digit first
        const auto shifts = { 12u, 8u, 4u, 0u };
        for (const auto shift : shifts)
        {
            get();

            // ASCII offset that maps the digit character onto its value
            unsigned int offset = 0u;
            if (current >= '0' && current <= '9')
            {
                offset = 0x30u;
            }
            else if (current >= 'A' && current <= 'F')
            {
                offset = 0x37u;
            }
            else if (current >= 'a' && current <= 'f')
            {
                offset = 0x57u;
            }
            else
            {
                // anything else (including EOF) is not a hex digit
                return -1;
            }

            result += static_cast<int>((static_cast<unsigned int>(current) - offset) << shift);
        }

        // four hex digits can never exceed 16 bits
        JSON_ASSERT(0x0000 <= result && result <= 0xFFFF);
        return result;
    }

    /*!
    @brief check if the next byte(s) are inside a given range

    Adds the current byte and, for each passed range, reads a new byte and
    checks if it is inside the range. If a violation was detected, set up an
    error message and return false. Otherwise, return true.

    @param[in] ranges  list of integers; interpreted as list of pairs of
                       inclusive lower and upper bound, respectively

    @pre The passed list @a ranges must have 2, 4, or 6 elements; that is,
         1, 2, or 3 pairs. This precondition is enforced by an assertion.

    @return true if and only if no range violation was detected
    */
    bool next_byte_in_range(std::initializer_list<char_int_type> ranges)
    {
        JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6);
        add(current);

        for (auto range = ranges.begin(); range != ranges.end(); ++range)
        {
            get();
            if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range)))
            {
                add(current);
            }
            else
            {
                error_message = "invalid string: ill-formed UTF-8 byte";
                return false;
            }
        }

        return true;
    }

    /*!
    @brief scan a string literal

    This function scans a string according to Sect. 7 of RFC 7159. While
    scanning, bytes are unescaped and copied into buffer token_buffer. When the
    function returns successfully, token_buffer is *not* null-terminated (as it
    may contain \0 bytes), and token_buffer.size() is the number of bytes in the
    string.

    @return token_type::value_string if string could be successfully scanned,
            token_type::parse_error otherwise

    @note In case of errors, variable error_message contains a textual
          description.
    */
    token_type scan_string()
    {
        // reset token_buffer (ignore opening quote)
        reset();

        // we entered the function by reading an open quote
        JSON_ASSERT(current == '\"');

        // read byte by byte; every path either appends to the token buffer and
        // continues, or leaves the function by returning a token type
        while (true)
        {
            // get next character
            switch (get())
            {
                // end of file while parsing string
                case std::char_traits<char_type>::eof():
                {
                    error_message = "invalid string: missing closing quote";
                    return token_type::parse_error;
                }

                // closing quote
                case '\"':
                {
                    return token_type::value_string;
                }

                // escapes
                case '\\':
                {
                    // the character after the backslash selects the escape
                    switch (get())
                    {
                        // quotation mark
                        case '\"':
                            add('\"');
                            break;
                        // reverse solidus
                        case '\\':
                            add('\\');
                            break;
                        // solidus
                        case '/':
                            add('/');
                            break;
                        // backspace
                        case 'b':
                            add('\b');
                            break;
                        // form feed
                        case 'f':
                            add('\f');
                            break;
                        // line feed
                        case 'n':
                            add('\n');
                            break;
                        // carriage return
                        case 'r':
                            add('\r');
                            break;
                        // tab
                        case 't':
                            add('\t');
                            break;

                        // unicode escapes
                        case 'u':
                        {
                            const int codepoint1 = get_codepoint();
                            int codepoint = codepoint1; // start with codepoint1

                            // get_codepoint() signals a malformed escape with -1
                            if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
                            {
                                error_message = "invalid string: '\\u' must be followed by 4 hex digits";
                                return token_type::parse_error;
                            }

                            // check if code point is a high surrogate
                            if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF)
                            {
                                // expect next \uxxxx entry
                                if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u'))
                                {
                                    const int codepoint2 = get_codepoint();

                                    if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
                                    {
                                        error_message = "invalid string: '\\u' must be followed by 4 hex digits";
                                        return token_type::parse_error;
                                    }

                                    // check if codepoint2 is a low surrogate
                                    if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF))
                                    {
                                        // overwrite codepoint
                                        codepoint = static_cast<int>(
                                                        // high surrogate is shifted left by 10 bits
                                                        (static_cast<unsigned int>(codepoint1) << 10u)
                                                        // low surrogate is added unshifted
                                                        + static_cast<unsigned int>(codepoint2)
                                                        // there is still the 0xD800, 0xDC00 and 0x10000 noise
                                                        // in the result so we have to subtract with:
                                                        // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
                                                        - 0x35FDC00u);
                                    }
                                    else
                                    {
                                        error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
                                        return token_type::parse_error;
                                    }
                                }
                                else
                                {
                                    error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
                                    return token_type::parse_error;
                                }
                            }
                            else
                            {
                                // a low surrogate without a preceding high surrogate
                                if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF))
                                {
                                    error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
                                    return token_type::parse_error;
                                }
                            }

                            // result of the above calculation yields a proper codepoint
                            JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF);

                            // translate codepoint into bytes
                            if (codepoint < 0x80)
                            {
                                // 1-byte characters: 0xxxxxxx (ASCII)
                                add(static_cast<char_int_type>(codepoint));
                            }
                            else if (codepoint <= 0x7FF)
                            {
                                // 2-byte characters: 110xxxxx 10xxxxxx
                                add(static_cast<char_int_type>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u)));
                                add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
                            }
                            else if (codepoint <= 0xFFFF)
                            {
                                // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
                                add(static_cast<char_int_type>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u)));
                                add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
                                add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
                            }
                            else
                            {
                                // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                                add(static_cast<char_int_type>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u)));
                                add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
                                add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
                                add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
                            }

                            break;
                        }

                        // other characters after escape
                        default:
                            error_message = "invalid string: forbidden character after backslash";
                            return token_type::parse_error;
                    }

                    break;
                }

                // invalid control characters: each of U+0000..U+001F is
                // rejected with its own message naming the required escape
                case 0x00:
                {
                    error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
                    return token_type::parse_error;
                }

                case 0x01:
                {
                    error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
                    return token_type::parse_error;
                }

                case 0x02:
                {
                    error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
                    return token_type::parse_error;
                }

                case 0x03:
                {
                    error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
                    return token_type::parse_error;
                }

                case 0x04:
                {
                    error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
                    return token_type::parse_error;
                }

                case 0x05:
                {
                    error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
                    return token_type::parse_error;
                }

                case 0x06:
                {
                    error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
                    return token_type::parse_error;
                }

                case 0x07:
                {
                    error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
                    return token_type::parse_error;
                }

                case 0x08:
                {
                    error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
                    return token_type::parse_error;
                }

                case 0x09:
                {
                    error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
                    return token_type::parse_error;
                }

                case 0x0A:
                {
                    error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
                    return token_type::parse_error;
                }

                case 0x0B:
                {
                    error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
                    return token_type::parse_error;
                }

                case 0x0C:
                {
                    error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
                    return token_type::parse_error;
                }

                case 0x0D:
                {
                    error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
                    return token_type::parse_error;
                }

                case 0x0E:
                {
                    error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
                    return token_type::parse_error;
                }

                case 0x0F:
                {
                    error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
                    return token_type::parse_error;
                }

                case 0x10:
                {
                    error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
                    return token_type::parse_error;
                }

                case 0x11:
                {
                    error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
                    return token_type::parse_error;
                }

                case 0x12:
                {
                    error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
                    return token_type::parse_error;
                }

                case 0x13:
                {
                    error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
                    return token_type::parse_error;
                }

                case 0x14:
                {
                    error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
                    return token_type::parse_error;
                }

                case 0x15:
                {
                    error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
                    return token_type::parse_error;
                }

                case 0x16:
                {
                    error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
                    return token_type::parse_error;
                }

                case 0x17:
                {
                    error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
                    return token_type::parse_error;
                }

                case 0x18:
                {
                    error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
                    return token_type::parse_error;
                }

                case 0x19:
                {
                    error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
                    return token_type::parse_error;
                }

                case 0x1A:
                {
                    error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
                    return token_type::parse_error;
                }

                case 0x1B:
                {
                    error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
                    return token_type::parse_error;
                }

                case 0x1C:
                {
                    error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
                    return token_type::parse_error;
                }

                case 0x1D:
                {
                    error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
                    return token_type::parse_error;
                }

                case 0x1E:
                {
                    error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
                    return token_type::parse_error;
                }

                case 0x1F:
                {
                    error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
                    return token_type::parse_error;
                }

                // U+0020..U+007F (except U+0022 (quote) and U+005C (backslash))
                case 0x20:
                case 0x21:
                case 0x23:
                case 0x24:
                case 0x25:
                case 0x26:
                case 0x27:
                case 0x28:
                case 0x29:
                case 0x2A:
                case 0x2B:
                case 0x2C:
                case 0x2D:
                case 0x2E:
                case 0x2F:
                case 0x30:
                case 0x31:
                case 0x32:
                case 0x33:
                case 0x34:
                case 0x35:
                case 0x36:
                case 0x37:
                case 0x38:
                case 0x39:
                case 0x3A:
                case 0x3B:
                case 0x3C:
                case 0x3D:
                case 0x3E:
                case 0x3F:
                case 0x40:
                case 0x41:
                case 0x42:
                case 0x43:
                case 0x44:
                case 0x45:
                case 0x46:
                case 0x47:
                case 0x48:
                case 0x49:
                case 0x4A:
                case 0x4B:
                case 0x4C:
                case 0x4D:
                case 0x4E:
                case 0x4F:
                case 0x50:
                case 0x51:
                case 0x52:
                case 0x53:
                case 0x54:
                case 0x55:
                case 0x56:
                case 0x57:
                case 0x58:
                case 0x59:
                case 0x5A:
                case 0x5B:
                case 0x5D:
                case 0x5E:
                case 0x5F:
                case 0x60:
                case 0x61:
                case 0x62:
                case 0x63:
                case 0x64:
                case 0x65:
                case 0x66:
                case 0x67:
                case 0x68:
                case 0x69:
                case 0x6A:
                case 0x6B:
                case 0x6C:
                case 0x6D:
                case 0x6E:
                case 0x6F:
                case 0x70:
                case 0x71:
                case 0x72:
                case 0x73:
                case 0x74:
                case 0x75:
                case 0x76:
                case 0x77:
                case 0x78:
                case 0x79:
                case 0x7A:
                case 0x7B:
                case 0x7C:
                case 0x7D:
                case 0x7E:
                case 0x7F:
                {
                    // single-byte characters are copied unchanged
                    add(current);
                    break;
                }

                // multi-byte sequences: next_byte_in_range() stores the lead
                // byte, then reads and range-checks each continuation byte

                // U+0080..U+07FF: bytes C2..DF 80..BF
                case 0xC2:
                case 0xC3:
                case 0xC4:
                case 0xC5:
                case 0xC6:
                case 0xC7:
                case 0xC8:
                case 0xC9:
                case 0xCA:
                case 0xCB:
                case 0xCC:
                case 0xCD:
                case 0xCE:
                case 0xCF:
                case 0xD0:
                case 0xD1:
                case 0xD2:
                case 0xD3:
                case 0xD4:
                case 0xD5:
                case 0xD6:
                case 0xD7:
                case 0xD8:
                case 0xD9:
                case 0xDA:
                case 0xDB:
                case 0xDC:
                case 0xDD:
                case 0xDE:
                case 0xDF:
                {
                    if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF})))
                    {
                        return token_type::parse_error;
                    }
                    break;
                }

                // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
                case 0xE0:
                {
                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
                    {
                        return token_type::parse_error;
                    }
                    break;
                }

                // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
                // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
                case 0xE1:
                case 0xE2:
                case 0xE3:
                case 0xE4:
                case 0xE5:
                case 0xE6:
                case 0xE7:
                case 0xE8:
                case 0xE9:
                case 0xEA:
                case 0xEB:
                case 0xEC:
                case 0xEE:
                case 0xEF:
                {
                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
                    {
                        return token_type::parse_error;
                    }
                    break;
                }

                // U+D000..U+D7FF: bytes ED 80..9F 80..BF
                case 0xED:
                {
                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
                    {
                        return token_type::parse_error;
                    }
                    break;
                }

                // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
                case 0xF0:
                {
                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
                    {
                        return token_type::parse_error;
                    }
                    break;
                }

                // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
                case 0xF1:
                case 0xF2:
                case 0xF3:
                {
                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
                    {
                        return token_type::parse_error;
                    }
                    break;
                }

                // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
                case 0xF4:
                {
                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
                    {
                        return token_type::parse_error;
                    }
                    break;
                }

                // remaining bytes (80..C1 and F5..FF) are ill-formed
                default:
                {
                    error_message = "invalid string: ill-formed UTF-8 byte";
                    return token_type::parse_error;
                }
            }
        }
    }

    /*!
     * @brief scan a comment
     * @return whether comment could be scanned successfully
     */
    bool scan_comment()
    {
        // the character following the initial '/' selects the comment style
        const auto marker = get();

        // single-line comment: consume everything up to a line break, EOF,
        // or a NUL character
        if (marker == '/')
        {
            while (true)
            {
                const auto ch = get();
                if (ch == '\n' || ch == '\r'
                        || ch == std::char_traits<char_type>::eof()
                        || ch == '\0')
                {
                    return true;
                }
            }
        }

        // multi-line comment: consume everything up to the closing "*/"
        if (marker == '*')
        {
            while (true)
            {
                const auto ch = get();

                // input ended before the comment was closed
                if (ch == std::char_traits<char_type>::eof() || ch == '\0')
                {
                    error_message = "invalid comment; missing closing '*/'";
                    return false;
                }

                if (ch == '*')
                {
                    if (get() == '/')
                    {
                        return true;
                    }

                    // not the end of the comment: put the character back so
                    // that it is examined again by the next iteration
                    unget();
                }
            }
        }

        // unexpected character after reading '/'
        error_message = "invalid comment; expecting '/' or '*' after '/'";
        return false;
    }

    // Overload set that selects the C library conversion function matching the
    // type of the output parameter, so that callers can use the single name
    // `strtof` for any of the three floating-point types.
    //
    // @param[out] f       receives the converted value
    // @param[in]  str     string to convert (annotated as parameter 2 by
    //                     JSON_HEDLEY_NON_NULL)
    // @param[out] endptr  forwarded unchanged to the C library function

    /// float overload: forwards to std::strtof
    JSON_HEDLEY_NON_NULL(2)
    static void strtof(float& f, const char* str, char** endptr) noexcept
    {
        f = std::strtof(str, endptr);
    }

    /// double overload: forwards to std::strtod
    JSON_HEDLEY_NON_NULL(2)
    static void strtof(double& f, const char* str, char** endptr) noexcept
    {
        f = std::strtod(str, endptr);
    }

    /// long double overload: forwards to std::strtold
    JSON_HEDLEY_NON_NULL(2)
    static void strtof(long double& f, const char* str, char** endptr) noexcept
    {
        f = std::strtold(str, endptr);
    }

    /*!
    @brief scan a number literal

    This function scans a string according to Sect. 6 of RFC 7159.

    The function is realized with a deterministic finite state machine derived
    from the grammar described in RFC 7159. Starting in state "init", the
    input is read and used to determine the next state. Only state "done"
    accepts the number. State "error" is a trap state to model errors. In the
    table below, "anything" means any character but the ones listed before.

    state    | 0        | 1-9      | e E      | +       | -       | .        | anything
    ---------|----------|----------|----------|---------|---------|----------|-----------
    init     | zero     | any1     | [error]  | [error] | minus   | [error]  | [error]
    minus    | zero     | any1     | [error]  | [error] | [error] | [error]  | [error]
    zero     | done     | done     | exponent | done    | done    | decimal1 | done
    any1     | any1     | any1     | exponent | done    | done    | decimal1 | done
    decimal1 | decimal2 | decimal2 | [error]  | [error] | [error] | [error]  | [error]
    decimal2 | decimal2 | decimal2 | exponent | done    | done    | done     | done
    exponent | any2     | any2     | [error]  | sign    | sign    | [error]  | [error]
    sign     | any2     | any2     | [error]  | [error] | [error] | [error]  | [error]
    any2     | any2     | any2     | done     | done    | done    | done     | done

    The state machine is realized with one label per state (prefixed with
    "scan_number_") and `goto` statements between them. The state machine
    contains cycles, but any cycle can be left when EOF is read. Therefore,
    the function is guaranteed to terminate.

    During scanning, the read bytes are stored in token_buffer. This string is
    then converted to a signed integer, an unsigned integer, or a
    floating-point number.

    @return token_type::value_unsigned, token_type::value_integer, or
            token_type::value_float if number could be successfully scanned,
            token_type::parse_error otherwise

    @note The scanner is independent of the current locale. Internally, the
          locale's decimal point is used instead of `.` to work with the
          locale-dependent converters.
    */
    token_type scan_number()  // lgtm [cpp/use-of-goto]
    {
        // reset token_buffer to store the number's bytes
        reset();

        // the type of the parsed number; initially set to unsigned; will be
        // changed if minus sign, decimal point or exponent is read
        token_type number_type = token_type::value_unsigned;

        // state (init): we just found out we need to scan a number; `current`
        // already holds the first character, so no get() is needed here
        switch (current)
        {
            case '-':
            {
                add(current);
                goto scan_number_minus;
            }

            case '0':
            {
                add(current);
                goto scan_number_zero;
            }

            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any1;
            }

            // all other characters are rejected outside scan_number()
            // NOTE(review): if JSON_ASSERT compiles to nothing, control would
            // continue at scan_number_minus below - confirm this is unreachable
            default:            // LCOV_EXCL_LINE
                JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
        }

scan_number_minus:
        // state (minus): we just parsed a leading minus sign; the number can
        // no longer be unsigned
        number_type = token_type::value_integer;
        switch (get())
        {
            case '0':
            {
                add(current);
                goto scan_number_zero;
            }

            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any1;
            }

            default:
            {
                error_message = "invalid number; expected digit after '-'";
                return token_type::parse_error;
            }
        }

scan_number_zero:
        // state (zero): we just parsed a zero (maybe with a leading minus
        // sign); further digits are not accepted here, only '.', 'e' or 'E'
        switch (get())
        {
            case '.':
            {
                // store decimal_point_char rather than the '.' that was read,
                // so the buffer suits the converter used at scan_number_done
                add(decimal_point_char);
                goto scan_number_decimal1;
            }

            case 'e':
            case 'E':
            {
                add(current);
                goto scan_number_exponent;
            }

            default:
                goto scan_number_done;
        }

scan_number_any1:
        // state (any1): we just parsed a number 0-9 (maybe with a leading
        // minus sign); stay here while more integer digits follow
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any1;
            }

            case '.':
            {
                // as in state zero: decimal_point_char replaces the '.' read
                add(decimal_point_char);
                goto scan_number_decimal1;
            }

            case 'e':
            case 'E':
            {
                add(current);
                goto scan_number_exponent;
            }

            default:
                goto scan_number_done;
        }

scan_number_decimal1:
        // state (decimal1): we just parsed a decimal point; at least one
        // digit must follow and the number is now a float
        number_type = token_type::value_float;
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_decimal2;
            }

            default:
            {
                error_message = "invalid number; expected digit after '.'";
                return token_type::parse_error;
            }
        }

scan_number_decimal2:
        // state (decimal2): we just parsed at least one number after a
        // decimal point
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_decimal2;
            }

            case 'e':
            case 'E':
            {
                add(current);
                goto scan_number_exponent;
            }

            default:
                goto scan_number_done;
        }

scan_number_exponent:
        // state (exponent): we just parsed an exponent marker ('e' or 'E');
        // the number is now a float even if no decimal point was read
        number_type = token_type::value_float;
        switch (get())
        {
            case '+':
            case '-':
            {
                add(current);
                goto scan_number_sign;
            }

            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any2;
            }

            default:
            {
                error_message =
                    "invalid number; expected '+', '-', or digit after exponent";
                return token_type::parse_error;
            }
        }

scan_number_sign:
        // state (sign): we just parsed an exponent sign; a digit must follow
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any2;
            }

            default:
            {
                error_message = "invalid number; expected digit after exponent sign";
                return token_type::parse_error;
            }
        }

scan_number_any2:
        // state (any2): we just parsed a number after the exponent or
        // exponent sign; stay here while more exponent digits follow
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any2;
            }

            default:
                goto scan_number_done;
        }

scan_number_done:
        // state (done): unget the character after the number (we only read
        // it to know that we are done scanning a number)
        unget();

        char* endptr = nullptr; // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
        // clear errno so that a nonzero value after the conversion below can
        // be attributed to that conversion
        errno = 0;

        // try to parse integers first and fall back to floats
        if (number_type == token_type::value_unsigned)
        {
            const auto x = std::strtoull(token_buffer.data(), &endptr, 10);

            // we checked the number format before
            JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());

            if (errno == 0)
            {
                // accept the value only if it survives the conversion to
                // number_unsigned_t unchanged
                value_unsigned = static_cast<number_unsigned_t>(x);
                if (value_unsigned == x)
                {
                    return token_type::value_unsigned;
                }
            }
        }
        else if (number_type == token_type::value_integer)
        {
            const auto x = std::strtoll(token_buffer.data(), &endptr, 10);

            // we checked the number format before
            JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());

            if (errno == 0)
            {
                // accept the value only if it survives the conversion to
                // number_integer_t unchanged
                value_integer = static_cast<number_integer_t>(x);
                if (value_integer == x)
                {
                    return token_type::value_integer;
                }
            }
        }

        // this code is reached if we parse a floating-point number or if an
        // integer conversion above failed
        strtof(value_float, token_buffer.data(), &endptr);

        // we checked the number format before
        JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());

        return token_type::value_float;
    }

    /*!
    @brief match the remaining characters of a fixed literal

    The first character of the literal is expected to be in `current`
    already (this is asserted). The other `length - 1` characters are read
    with get() and compared against @a literal_text one at a time; scanning
    stops at the first mismatch.

    @param[in] literal_text  the literal text to expect
    @param[in] length        the length of the passed literal text
    @param[in] return_type   the token type to return on success

    @return @a return_type if every character matched; otherwise
            token_type::parse_error, with error_message set
    */
    JSON_HEDLEY_NON_NULL(2)
    token_type scan_literal(const char_type* literal_text, const std::size_t length,
                            token_type return_type)
    {
        JSON_ASSERT(std::char_traits<char_type>::to_char_type(current) == literal_text[0]);

        std::size_t index = 1;
        while (index < length)
        {
            const auto next_char = std::char_traits<char_type>::to_char_type(get());
            if (JSON_HEDLEY_UNLIKELY(next_char != literal_text[index]))
            {
                error_message = "invalid literal";
                return token_type::parse_error;
            }

            ++index;
        }

        return return_type;
    }

    /////////////////////
    // input management
    /////////////////////

    /// start a new token: empty token_buffer and restart token_string with
    /// the current character, which is the beginning of the token
    void reset() noexcept
    {
        const auto first_char = std::char_traits<char_type>::to_char_type(current);

        token_buffer.clear();

        token_string.clear();
        token_string.push_back(first_char);
    }

    /*!
    @brief get next character from the input

    This is the interface to the input adapter. Reaching the end of the
    input is not treated as an error here: if the adapter yields
    `std::char_traits<char_type>::eof()`, that value is stored in `current`
    and returned like any other. Every character other than EOF is appended
    to token_string for use in error messages, and the position counters
    are advanced.

    @return character read from the input
    */
    char_int_type get()
    {
        // the counters advance whether the character is new or replayed
        ++position.chars_read_total;
        ++position.chars_read_current_line;

        if (!next_unget)
        {
            current = ia.get_character();
        }
        else
        {
            // an unget() is pending: keep `current` and clear the flag
            next_unget = false;
        }

        if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
        {
            const auto as_char = std::char_traits<char_type>::to_char_type(current);
            token_string.push_back(as_char);
        }

        if (current == '\n')
        {
            // a new line begins: restart the column count
            position.chars_read_current_line = 0;
            ++position.lines_read;
        }

        return current;
    }

    /*!
    @brief unget current character (read it again on next get)

    Nothing is pushed back into the input. Setting next_unget makes the
    following get() return `current` again, and the bookkeeping of the last
    get() is rolled back: chars_read_total, chars_read_current_line (or
    lines_read), and the last entry of token_string.
    */
    void unget()
    {
        next_unget = true;

        --position.chars_read_total;

        if (position.chars_read_current_line != 0)
        {
            --position.chars_read_current_line;
        }
        else if (position.lines_read > 0)
        {
            // get() resets the column to 0 after a newline, so in that case
            // the line counter is the one to roll back
            --position.lines_read;
        }

        if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
        {
            // get() never stores EOF in token_string, so there is only
            // something to remove for real characters
            JSON_ASSERT(!token_string.empty());
            token_string.pop_back();
        }
    }

    /// append a character to token_buffer, converted to the buffer's
    /// character type
    void add(char_int_type c)
    {
        using buffer_char_t = typename string_t::value_type;
        token_buffer.push_back(static_cast<buffer_char_t>(c));
    }

  public:
    /////////////////////
    // value getters
    /////////////////////

    /// return integer value; scan_number() assigns value_integer right
    /// before it returns token_type::value_integer
    constexpr number_integer_t get_number_integer() const noexcept
    {
        return value_integer;
    }

    /// return unsigned integer value; scan_number() assigns value_unsigned
    /// right before it returns token_type::value_unsigned
    constexpr number_unsigned_t get_number_unsigned() const noexcept
    {
        return value_unsigned;
    }

    /// return floating-point value; scan_number() fills value_float right
    /// before it returns token_type::value_float
    constexpr number_float_t get_number_float() const noexcept
    {
        return value_float;
    }

    /// return current string value as a mutable reference to token_buffer
    /// (implicitly resets the token; useful only once)
    /// NOTE(review): the "resets/useful only once" part presumably refers to
    /// callers moving from the returned reference - confirm against callers
    string_t& get_string()
    {
        return token_buffer;
    }

    /////////////////////
    // diagnostics
    /////////////////////

    /// return position of last read token: a copy of the counters
    /// (chars_read_total, chars_read_current_line, lines_read) that get()
    /// and unget() maintain
    constexpr position_t get_position() const noexcept
    {
        return position;
    }

    /// return the last read token (for errors only). EOF is never part of
    /// the result because get() does not record it in token_string.
    /// Characters up to and including 0x1F (this covers NUL) are rendered as
    /// `<U+XXXX>`; all other characters are copied unchanged.
    std::string get_token_string() const
    {
        std::string escaped;
        for (const auto ch : token_string)
        {
            const auto byte = static_cast<unsigned char>(ch);
            if (byte > '\x1F')
            {
                // not a control character: copy as is
                escaped.push_back(static_cast<std::string::value_type>(ch));
                continue;
            }

            // control character: 8 visible characters plus the terminator
            std::array<char, 9> hex_buffer{{}};
            (std::snprintf)(hex_buffer.data(), hex_buffer.size(), "<U+%.4X>", byte); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
            escaped += hex_buffer.data();
        }

        return escaped;
    }

    /// return syntax error message: the string last stored in error_message
    /// by one of the scan functions, or its initial value, the empty string
    JSON_HEDLEY_RETURNS_NON_NULL
    constexpr const char* get_error_message() const noexcept
    {
        return error_message;
    }

    /////////////////////
    // actual scanner
    /////////////////////

    /*!
    @brief skip the UTF-8 byte order mark

    Reads the first character. If it is 0xEF, the next two characters must
    be 0xBB and 0xBF; reading stops at the first one that does not match.
    Any other first character is handed back with unget() so that it is
    processed by the regular scanner.

    @return true iff there is no BOM or the correct BOM has been skipped
    */
    bool skip_bom()
    {
        if (get() != 0xEF)
        {
            // not the beginning of a BOM; unget it to process it later
            unget();
            return true;
        }

        // a BOM was started, so it has to be completed
        if (get() != 0xBB)
        {
            return false;
        }

        return get() == 0xBF;
    }

    /// read characters until one is found that is not a space, tab, line
    /// feed, or carriage return; at least one character is always read, and
    /// the first non-whitespace character is left in `current`
    void skip_whitespace()
    {
        for (;;)
        {
            switch (get())
            {
                case ' ':
                case '\t':
                case '\n':
                case '\r':
                    // whitespace: keep reading
                    continue;

                default:
                    return;
            }
        }
    }

    /*!
    @brief scan the next token from the input

    Skips an optional BOM (only if nothing has been read yet), whitespace
    and, if ignore_comments is set, comments. The first remaining character
    then selects the token: structural characters are returned directly,
    while literals, strings and numbers are delegated to scan_literal(),
    scan_string() and scan_number().

    @return the type of the scanned token; token_type::parse_error (with
            error_message set) if the input is not a valid token
    */
    token_type scan()
    {
        // a BOM can only be present before anything else was read
        const bool nothing_read_yet = (position.chars_read_total == 0);
        if (nothing_read_yet && !skip_bom())
        {
            error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
            return token_type::parse_error;
        }

        // advance to the first character that is not whitespace
        skip_whitespace();

        // drop comments, each one followed by optional whitespace
        if (ignore_comments)
        {
            while (current == '/')
            {
                if (!scan_comment())
                {
                    return token_type::parse_error;
                }

                skip_whitespace();
            }
        }

        switch (current)
        {
            // number
            case '-':
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                return scan_number();

            // string
            case '\"':
                return scan_string();

            // literals
            case 't':
            {
                std::array<char_type, 4> expected =
                {
                    {char_type('t'), char_type('r'), char_type('u'), char_type('e')}
                };
                return scan_literal(expected.data(), expected.size(), token_type::literal_true);
            }
            case 'f':
            {
                std::array<char_type, 5> expected =
                {
                    {char_type('f'), char_type('a'), char_type('l'), char_type('s'), char_type('e')}
                };
                return scan_literal(expected.data(), expected.size(), token_type::literal_false);
            }
            case 'n':
            {
                std::array<char_type, 4> expected =
                {
                    {char_type('n'), char_type('u'), char_type('l'), char_type('l')}
                };
                return scan_literal(expected.data(), expected.size(), token_type::literal_null);
            }

            // structural characters
            case '[':
                return token_type::begin_array;
            case ']':
                return token_type::end_array;
            case '{':
                return token_type::begin_object;
            case '}':
                return token_type::end_object;
            case ':':
                return token_type::name_separator;
            case ',':
                return token_type::value_separator;

            // end of input (the null byte is needed when parsing from
            // string literals)
            case '\0':
            case std::char_traits<char_type>::eof():
                return token_type::end_of_input;

            default:
                break;
        }

        // no token starts with this character
        error_message = "invalid literal";
        return token_type::parse_error;
    }

  private:
    /// input adapter; get() reads from it one character at a time
    InputAdapterType ia;

    /// whether comments should be ignored (true) or signaled as errors (false);
    /// scan() only calls scan_comment() when this is true
    const bool ignore_comments = false;

    /// the current character (EOF until the first character has been read)
    char_int_type current = std::char_traits<char_type>::eof();

    /// whether the next get() call should just return current; set by
    /// unget() and cleared by get()
    bool next_unget = false;

    /// the start position of the current token
    /// NOTE(review): get() and unget() update these counters for every
    /// character, so this looks like the current read position - confirm
    position_t position {};

    /// raw input token string (for error messages); restarted by reset(),
    /// extended by get() and shortened by unget()
    std::vector<char_type> token_string {};

    /// buffer for variable-length tokens (numbers, strings)
    string_t token_buffer {};

    /// a description of occurred lexer errors; the empty string until one of
    /// the scan functions stores a message
    const char* error_message = "";

    // number values (results of scan_number(), read via get_number_*())
    number_integer_t value_integer = 0;
    number_unsigned_t value_unsigned = 0;
    number_float_t value_float = 0;

    /// the decimal point; scan_number() adds it to token_buffer in place of
    /// a '.' read from the input
    /// NOTE(review): presumably replaced by the locale's decimal point in a
    /// constructor outside this view - confirm
    const char_int_type decimal_point_char = '.';
};
}  // namespace detail
}  // namespace nlohmann

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/is_sax.hpp>


#include <cstdint> // size_t
#include <utility> // declval
#include <string> // string

// #include <nlohmann/detail/meta/detected.hpp>

// #include <nlohmann/detail/meta/type_traits.hpp>


namespace nlohmann
{
namespace detail
{
// Detection helpers for the SAX interface. Each alias names the type of the
// expression that calls one SAX member function on an object of type `T&`
// with the listed argument types. If such a call is not valid, the alias is
// ill-formed; is_sax and is_sax_static_asserts use this through
// is_detected_exact.

/// type of `sax.null()`
template<typename T>
using null_function_t = decltype(std::declval<T&>().null());

/// type of `sax.boolean(bool)`
template<typename T>
using boolean_function_t =
    decltype(std::declval<T&>().boolean(std::declval<bool>()));

/// type of `sax.number_integer(Integer)`
template<typename T, typename Integer>
using number_integer_function_t =
    decltype(std::declval<T&>().number_integer(std::declval<Integer>()));

/// type of `sax.number_unsigned(Unsigned)`
template<typename T, typename Unsigned>
using number_unsigned_function_t =
    decltype(std::declval<T&>().number_unsigned(std::declval<Unsigned>()));

/// type of `sax.number_float(Float, const String&)`
template<typename T, typename Float, typename String>
using number_float_function_t = decltype(std::declval<T&>().number_float(
                                    std::declval<Float>(), std::declval<const String&>()));

/// type of `sax.string(String&)`
template<typename T, typename String>
using string_function_t =
    decltype(std::declval<T&>().string(std::declval<String&>()));

/// type of `sax.binary(Binary&)`
template<typename T, typename Binary>
using binary_function_t =
    decltype(std::declval<T&>().binary(std::declval<Binary&>()));

/// type of `sax.start_object(std::size_t)`
template<typename T>
using start_object_function_t =
    decltype(std::declval<T&>().start_object(std::declval<std::size_t>()));

/// type of `sax.key(String&)`
template<typename T, typename String>
using key_function_t =
    decltype(std::declval<T&>().key(std::declval<String&>()));

/// type of `sax.end_object()`
template<typename T>
using end_object_function_t = decltype(std::declval<T&>().end_object());

/// type of `sax.start_array(std::size_t)`
template<typename T>
using start_array_function_t =
    decltype(std::declval<T&>().start_array(std::declval<std::size_t>()));

/// type of `sax.end_array()`
template<typename T>
using end_array_function_t = decltype(std::declval<T&>().end_array());

/// type of `sax.parse_error(std::size_t, const std::string&, const Exception&)`
template<typename T, typename Exception>
using parse_error_function_t = decltype(std::declval<T&>().parse_error(
        std::declval<std::size_t>(), std::declval<const std::string&>(),
        std::declval<const Exception&>()));

/*!
@brief type trait that checks whether SAX provides the SAX interface

`value` is the conjunction of one is_detected_exact check per SAX member
function (null, boolean, number_integer, number_unsigned, number_float,
string, binary, start_object, key, end_object, start_array, end_array,
parse_error), each with `bool` as the expected type. The argument types are
taken from BasicJsonType.

NOTE(review): is_detected_exact is defined outside this view; presumably it
requires the call to be valid and its type to be exactly `bool` - confirm.

@tparam SAX            the candidate SAX type
@tparam BasicJsonType  a basic_json specialization (enforced by static_assert)
*/
template<typename SAX, typename BasicJsonType>
struct is_sax
{
  private:
    static_assert(is_basic_json<BasicJsonType>::value,
                  "BasicJsonType must be of type basic_json<...>");

    // argument types the SAX functions are checked against
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using binary_t = typename BasicJsonType::binary_t;
    using exception_t = typename BasicJsonType::exception;

  public:
    /// true iff every check below succeeds
    static constexpr bool value =
        is_detected_exact<bool, null_function_t, SAX>::value &&
        is_detected_exact<bool, boolean_function_t, SAX>::value &&
        is_detected_exact<bool, number_integer_function_t, SAX, number_integer_t>::value &&
        is_detected_exact<bool, number_unsigned_function_t, SAX, number_unsigned_t>::value &&
        is_detected_exact<bool, number_float_function_t, SAX, number_float_t, string_t>::value &&
        is_detected_exact<bool, string_function_t, SAX, string_t>::value &&
        is_detected_exact<bool, binary_function_t, SAX, binary_t>::value &&
        is_detected_exact<bool, start_object_function_t, SAX>::value &&
        is_detected_exact<bool, key_function_t, SAX, string_t>::value &&
        is_detected_exact<bool, end_object_function_t, SAX>::value &&
        is_detected_exact<bool, start_array_function_t, SAX>::value &&
        is_detected_exact<bool, end_array_function_t, SAX>::value &&
        is_detected_exact<bool, parse_error_function_t, SAX, exception_t>::value;
};

/*!
@brief compile-time diagnostics for an incomplete SAX interface

Performs the same checks as is_sax, but as one static_assert per SAX member
function, so that instantiating this struct with an unsuitable SAX type
produces a message naming the missing or invalid function.

@tparam SAX            the candidate SAX type
@tparam BasicJsonType  a basic_json specialization (enforced by static_assert)
*/
template<typename SAX, typename BasicJsonType>
struct is_sax_static_asserts
{
  private:
    static_assert(is_basic_json<BasicJsonType>::value,
                  "BasicJsonType must be of type basic_json<...>");

    // argument types the SAX functions are checked against
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using binary_t = typename BasicJsonType::binary_t;
    using exception_t = typename BasicJsonType::exception;

  public:
    // one assertion per SAX member function, in the order used by is_sax
    static_assert(is_detected_exact<bool, null_function_t, SAX>::value,
                  "Missing/invalid function: bool null()");
    static_assert(is_detected_exact<bool, boolean_function_t, SAX>::value,
                  "Missing/invalid function: bool boolean(bool)");
    static_assert(
        is_detected_exact<bool, number_integer_function_t, SAX,
        number_integer_t>::value,
        "Missing/invalid function: bool number_integer(number_integer_t)");
    static_assert(
        is_detected_exact<bool, number_unsigned_function_t, SAX,
        number_unsigned_t>::value,
        "Missing/invalid function: bool number_unsigned(number_unsigned_t)");
    static_assert(is_detected_exact<bool, number_float_function_t, SAX,
                  number_float_t, string_t>::value,
                  "Missing/invalid function: bool number_float(number_float_t, const string_t&)");
    static_assert(
        is_detected_exact<bool, string_function_t, SAX, string_t>::value,
        "Missing/invalid function: bool string(string_t&)");
    static_assert(
        is_detected_exact<bool, binary_function_t, SAX, binary_t>::value,
        "Missing/invalid function: bool binary(binary_t&)");
    static_assert(is_detected_exact<bool, start_object_function_t, SAX>::value,
                  "Missing/invalid function: bool start_object(std::size_t)");
    static_assert(is_detected_exact<bool, key_function_t, SAX, string_t>::value,
                  "Missing/invalid function: bool key(string_t&)");
    static_assert(is_detected_exact<bool, end_object_function_t, SAX>::value,
                  "Missing/invalid function: bool end_object()");
    static_assert(is_detected_exact<bool, start_array_function_t, SAX>::value,
                  "Missing/invalid function: bool start_array(std::size_t)");
    static_assert(is_detected_exact<bool, end_array_function_t, SAX>::value,
                  "Missing/invalid function: bool end_array()");
    static_assert(
        is_detected_exact<bool, parse_error_function_t, SAX, exception_t>::value,
        "Missing/invalid function: bool parse_error(std::size_t, const "
        "std::string&, const exception&)");
};
}  // namespace detail
}  // namespace nlohmann

// #include <nlohmann/detail/value_t.hpp>


namespace nlohmann
{
namespace detail
{

/// how to treat CBOR tags; binary_reader::sax_parse() takes a value of this
/// type (defaulting to `error`) and passes it on to the CBOR parser
enum class cbor_tag_handler_t
{
    error,  ///< throw a parse_error exception in case of a tag
    ignore   ///< ignore tags
};

/*!
@brief determine system byte order

Inspects the byte at the lowest address of @a num. With the default argument
of 1, that byte is 1 exactly when the least significant byte is stored first.

@param[in] num  the probe value; callers are expected to keep the default

@return true if and only if system's byte order is little endian

@note from https://stackoverflow.com/a/1001328/266378
*/
static inline bool little_endianess(int num = 1) noexcept
{
    const char* const lowest_address_byte = reinterpret_cast<const char*>(&num);
    return lowest_address_byte[0] == 1;
}


///////////////////
// binary reader //
///////////////////

/*!
@brief deserialization of CBOR, MessagePack, and UBJSON values
*/
template<typename BasicJsonType, typename InputAdapterType, typename SAX = json_sax_dom_parser<BasicJsonType>>
class binary_reader
{
    // shorthands for the value types configured on BasicJsonType
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using binary_t = typename BasicJsonType::binary_t;
    // type of the SAX event consumer all parse results are reported to
    using json_sax_t = SAX;
    // character type delivered by the input adapter, and the matching
    // std::char_traits integer type (the one that eof() is expressed in)
    using char_type = typename InputAdapterType::char_type;
    using char_int_type = typename std::char_traits<char_type>::int_type;

  public:
    /*!
    @brief create a binary reader

    The adapter is moved into the reader. The constructor body additionally
    instantiates is_sax_static_asserts so that a SAX type missing one of the
    required member functions is rejected at compile time.

    @param[in] adapter  input adapter to read from
    */
    explicit binary_reader(InputAdapterType&& adapter) noexcept : ia(std::move(adapter))
    {
        // the temporary is created only to trigger the static_asserts on SAX
        (void)detail::is_sax_static_asserts<SAX, BasicJsonType> {};
    }

    // make class move-only: copying is deleted, moving and destruction use
    // the compiler-generated implementations
    binary_reader(const binary_reader&) = delete;
    binary_reader(binary_reader&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
    binary_reader& operator=(const binary_reader&) = delete;
    binary_reader& operator=(binary_reader&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
    ~binary_reader() = default;

    /*!
    @brief parse the input in the given binary format and report to a SAX handler

    Stores @a sax_ as the active handler, runs the format-specific parser and,
    in strict mode, additionally requires that the input ends right after the
    parsed value.

    @param[in] format  the binary format to parse
    @param[in] sax_    a SAX event processor
    @param[in] strict  whether to expect the input to be consumed completely
    @param[in] tag_handler  how to treat CBOR tags (only forwarded to the CBOR
                            parser)

    @return whether parsing was successful
    */
    JSON_HEDLEY_NON_NULL(3)
    bool sax_parse(const input_format_t format,
                   json_sax_t* sax_,
                   const bool strict = true,
                   const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
    {
        sax = sax_;

        bool parsed = false;
        switch (format)
        {
            case input_format_t::bson:
                parsed = parse_bson_internal();
                break;

            case input_format_t::cbor:
                parsed = parse_cbor_internal(true, tag_handler);
                break;

            case input_format_t::msgpack:
                parsed = parse_msgpack_internal();
                break;

            case input_format_t::ubjson:
                parsed = parse_ubjson_internal();
                break;

            default:            // LCOV_EXCL_LINE
                JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
        }

        // a failed parse or non-strict mode needs no trailing-input check
        if (!parsed || !strict)
        {
            return parsed;
        }

        // strict mode: next byte must be EOF (UBJSON fetches it through
        // get_ignore_noop() instead of get())
        if (format == input_format_t::ubjson)
        {
            get_ignore_noop();
        }
        else
        {
            get();
        }

        if (JSON_HEDLEY_UNLIKELY(current != std::char_traits<char_type>::eof()))
        {
            const auto last_token = get_token_string();
            return sax->parse_error(chars_read, last_token,
                                    parse_error::create(110, chars_read, exception_message(format, "expected end of input; last byte: 0x" + last_token, "value"), BasicJsonType()));
        }

        return parsed;
    }

  private:
    //////////
    // BSON //
    //////////

    /*!
    @brief Reads in a BSON-object and passes it to the SAX-parser.

    Consumes the leading 32-bit little-endian document size, then emits
    start_object (with an unknown element count), the element list, and
    end_object. The size value itself is not used for anything else.

    @return whether a valid BSON-value was passed to the SAX parser; `false`
            as soon as the size cannot be read or the SAX handler or element
            list parser signals failure
    */
    bool parse_bson_internal()
    {
        std::int32_t document_size{};
        // A failed read has already been reported by get_number; stop here so
        // that no start_object event follows the error.
        const bool size_read = get_number<std::int32_t, true>(input_format_t::bson, document_size);
        if (JSON_HEDLEY_UNLIKELY(!size_read))
        {
            return false;
        }

        if (JSON_HEDLEY_UNLIKELY(!sax->start_object(std::size_t(-1))))
        {
            return false;
        }

        if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_list(/*is_array*/false)))
        {
            return false;
        }

        return sax->end_object();
    }

    /*!
    @brief Parses a C-style string from the BSON input.

    Bytes are appended to @a result one at a time until a \x00-byte is read;
    the terminator itself is consumed but not stored. Existing content of
    @a result is kept, so callers must clear it when reusing the variable.

    @param[in,out] result  A reference to the string variable where the read
                            string is to be stored.
    @return `true` if the \x00-byte indicating the end of the string was
             encountered before the EOF; `false` indicates an unexpected EOF.
    */
    bool get_bson_cstr(string_t& result)
    {
        for (;;)
        {
            get();

            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::bson, "cstring")))
            {
                return false;
            }

            if (current == 0x00)
            {
                return true;
            }

            result.push_back(static_cast<typename string_t::value_type>(current));
        }
    }

    /*!
    @brief Parses a zero-terminated string of length @a len from the BSON
           input.
    @param[in] len  The length (including the zero-byte at the end) of the
                    string to be read.
    @param[in,out] result  A reference to the string variable where the read
                            string is to be stored.
    @tparam NumberType The type of the length @a len
    @pre len >= 1
    @return `true` if the string was successfully parsed
    */
    template<typename NumberType>
    bool get_bson_string(const NumberType len, string_t& result)
    {
        if (JSON_HEDLEY_UNLIKELY(len < 1))
        {
            auto last_token = get_token_string();
            return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::bson, "string length must be at least 1, is " + std::to_string(len), "string"), BasicJsonType()));
        }

        return get_string(input_format_t::bson, len - static_cast<NumberType>(1), result) && get() != std::char_traits<char_type>::eof();
    }

    /*!
    @brief Parses a byte array input of length @a len from the BSON input.
    @param[in] len  The length of the byte array to be read.
    @param[in,out] result  A reference to the binary variable where the read
                            array is to be stored.
    @tparam NumberType The type of the length @a len
    @pre len >= 0
    @return `true` if the byte array was successfully parsed
    */
    template<typename NumberType>
    bool get_bson_binary(const NumberType len, binary_t& result)
    {
        if (JSON_HEDLEY_UNLIKELY(len < 0))
        {
            auto last_token = get_token_string();
            return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::bson, "byte array length cannot be negative, is " + std::to_string(len), "binary"), BasicJsonType()));
        }

        // All BSON binary values have a subtype
        std::uint8_t subtype{};
        get_number<std::uint8_t>(input_format_t::bson, subtype);
        result.set_subtype(subtype);

        return get_binary(input_format_t::bson, len, result);
    }

    /*!
    @brief Read a BSON document element of the given @a element_type.
    @param[in] element_type The BSON element type, c.f. http://bsonspec.org/spec.html
    @param[in] element_type_parse_position The position in the input stream,
               where the `element_type` was read.
    @warning Not all BSON element types are supported yet. An unsupported
             @a element_type will give rise to a parse_error.114:
             Unsupported BSON record type 0x...
    @return whether a valid BSON-object/array was passed to the SAX parser
    */
    bool parse_bson_element_internal(const char_int_type element_type,
                                     const std::size_t element_type_parse_position)
    {
        switch (element_type)
        {
            case 0x01: // double
            {
                double number{};
                return get_number<double, true>(input_format_t::bson, number) && sax->number_float(static_cast<number_float_t>(number), "");
            }

            case 0x02: // string
            {
                std::int32_t len{};
                string_t value;
                return get_number<std::int32_t, true>(input_format_t::bson, len) && get_bson_string(len, value) && sax->string(value);
            }

            case 0x03: // object
            {
                return parse_bson_internal();
            }

            case 0x04: // array
            {
                return parse_bson_array();
            }

            case 0x05: // binary
            {
                std::int32_t len{};
                binary_t value;
                return get_number<std::int32_t, true>(input_format_t::bson, len) && get_bson_binary(len, value) && sax->binary(value);
            }

            case 0x08: // boolean
            {
                return sax->boolean(get() != 0);
            }

            case 0x0A: // null
            {
                return sax->null();
            }

            case 0x10: // int32
            {
                std::int32_t value{};
                return get_number<std::int32_t, true>(input_format_t::bson, value) && sax->number_integer(value);
            }

            case 0x12: // int64
            {
                std::int64_t value{};
                return get_number<std::int64_t, true>(input_format_t::bson, value) && sax->number_integer(value);
            }

            default: // anything else not supported (yet)
            {
                std::array<char, 3> cr{{}};
                (std::snprintf)(cr.data(), cr.size(), "%.2hhX", static_cast<unsigned char>(element_type)); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
                return sax->parse_error(element_type_parse_position, std::string(cr.data()), parse_error::create(114, element_type_parse_position, "Unsupported BSON record type 0x" + std::string(cr.data()), BasicJsonType()));
            }
        }
    }

    /*!
    @brief Read a BSON element list (as specified in the BSON-spec)

    The same binary layout is used for objects and arrays, hence it must be
    indicated with the argument @a is_array which one is expected
    (true --> array, false --> object).

    @param[in] is_array Determines if the element list being read is to be
                        treated as an object (@a is_array == false), or as an
                        array (@a is_array == true).
    @return whether a valid BSON-object/array was passed to the SAX parser
    */
    bool parse_bson_element_list(const bool is_array)
    {
        string_t key;

        while (auto element_type = get())
        {
            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::bson, "element list")))
            {
                return false;
            }

            const std::size_t element_type_parse_position = chars_read;
            if (JSON_HEDLEY_UNLIKELY(!get_bson_cstr(key)))
            {
                return false;
            }

            if (!is_array && !sax->key(key))
            {
                return false;
            }

            if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_internal(element_type, element_type_parse_position)))
            {
                return false;
            }

            // get_bson_cstr only appends
            key.clear();
        }

        return true;
    }

    /*!
    @brief Reads an array from the BSON input and passes it to the SAX-parser.

    Consumes the leading 32-bit little-endian document size, then emits
    start_array (with an unknown element count), the element list, and
    end_array. The size value itself is not used for anything else.

    @return whether a valid BSON-array was passed to the SAX parser; `false`
            as soon as the size cannot be read or the SAX handler or element
            list parser signals failure
    */
    bool parse_bson_array()
    {
        std::int32_t document_size{};
        // A failed read has already been reported by get_number; stop here so
        // that no start_array event follows the error.
        const bool size_read = get_number<std::int32_t, true>(input_format_t::bson, document_size);
        if (JSON_HEDLEY_UNLIKELY(!size_read))
        {
            return false;
        }

        if (JSON_HEDLEY_UNLIKELY(!sax->start_array(std::size_t(-1))))
        {
            return false;
        }

        if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_list(/*is_array*/true)))
        {
            return false;
        }

        return sax->end_array();
    }

    //////////
    // CBOR //
    //////////

    /*!
    @param[in] get_char  whether a new character should be retrieved from the
                         input (true) or whether the last read character should
                         be considered instead (false)
    @param[in] tag_handler how CBOR tags should be treated

    @return whether a valid CBOR value was passed to the SAX parser
    */
    bool parse_cbor_internal(const bool get_char,
                             const cbor_tag_handler_t tag_handler)
    {
        switch (get_char ? get() : current)
        {
            // EOF
            case std::char_traits<char_type>::eof():
                return unexpect_eof(input_format_t::cbor, "value");

            // Integer 0x00..0x17 (0..23)
            case 0x00:
            case 0x01:
            case 0x02:
            case 0x03:
            case 0x04:
            case 0x05:
            case 0x06:
            case 0x07:
            case 0x08:
            case 0x09:
            case 0x0A:
            case 0x0B:
            case 0x0C:
            case 0x0D:
            case 0x0E:
            case 0x0F:
            case 0x10:
            case 0x11:
            case 0x12:
            case 0x13:
            case 0x14:
            case 0x15:
            case 0x16:
            case 0x17:
                return sax->number_unsigned(static_cast<number_unsigned_t>(current));

            case 0x18: // Unsigned integer (one-byte uint8_t follows)
            {
                std::uint8_t number{};
                return get_number(input_format_t::cbor, number) && sax->number_unsigned(number);
            }

            case 0x19: // Unsigned integer (two-byte uint16_t follows)
            {
                std::uint16_t number{};
                return get_number(input_format_t::cbor, number) && sax->number_unsigned(number);
            }

            case 0x1A: // Unsigned integer (four-byte uint32_t follows)
            {
                std::uint32_t number{};
                return get_number(input_format_t::cbor, number) && sax->number_unsigned(number);
            }

            case 0x1B: // Unsigned integer (eight-byte uint64_t follows)
            {
                std::uint64_t number{};
                return get_number(input_format_t::cbor, number) && sax->number_unsigned(number);
            }

            // Negative integer -1-0x00..-1-0x17 (-1..-24)
            case 0x20:
            case 0x21:
            case 0x22:
            case 0x23:
            case 0x24:
            case 0x25:
            case 0x26:
            case 0x27:
            case 0x28:
            case 0x29:
            case 0x2A:
            case 0x2B:
            case 0x2C:
            case 0x2D:
            case 0x2E:
            case 0x2F:
            case 0x30:
            case 0x31:
            case 0x32:
            case 0x33:
            case 0x34:
            case 0x35:
            case 0x36:
            case 0x37:
                return sax->number_integer(static_cast<std::int8_t>(0x20 - 1 - current));

            case 0x38: // Negative integer (one-byte uint8_t follows)
            {
                std::uint8_t number{};
                return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast<number_integer_t>(-1) - number);
            }

            case 0x39: // Negative integer -1-n (two-byte uint16_t follows)
            {
                std::uint16_t number{};
                return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast<number_integer_t>(-1) - number);
            }

            case 0x3A: // Negative integer -1-n (four-byte uint32_t follows)
            {
                std::uint32_t number{};
                return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast<number_integer_t>(-1) - number);
            }

            case 0x3B: // Negative integer -1-n (eight-byte uint64_t follows)
            {
                std::uint64_t number{};
                return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast<number_integer_t>(-1)
                        - static_cast<number_integer_t>(number));
            }

            // Binary data (0x00..0x17 bytes follow)
            case 0x40:
            case 0x41:
            case 0x42:
            case 0x43:
            case 0x44:
            case 0x45:
            case 0x46:
            case 0x47:
            case 0x48:
            case 0x49:
            case 0x4A:
            case 0x4B:
            case 0x4C:
            case 0x4D:
            case 0x4E:
            case 0x4F:
            case 0x50:
            case 0x51:
            case 0x52:
            case 0x53:
            case 0x54:
            case 0x55:
            case 0x56:
            case 0x57:
            case 0x58: // Binary data (one-byte uint8_t for n follows)
            case 0x59: // Binary data (two-byte uint16_t for n follow)
            case 0x5A: // Binary data (four-byte uint32_t for n follow)
            case 0x5B: // Binary data (eight-byte uint64_t for n follow)
            case 0x5F: // Binary data (indefinite length)
            {
                binary_t b;
                return get_cbor_binary(b) && sax->binary(b);
            }

            // UTF-8 string (0x00..0x17 bytes follow)
            case 0x60:
            case 0x61:
            case 0x62:
            case 0x63:
            case 0x64:
            case 0x65:
            case 0x66:
            case 0x67:
            case 0x68:
            case 0x69:
            case 0x6A:
            case 0x6B:
            case 0x6C:
            case 0x6D:
            case 0x6E:
            case 0x6F:
            case 0x70:
            case 0x71:
            case 0x72:
            case 0x73:
            case 0x74:
            case 0x75:
            case 0x76:
            case 0x77:
            case 0x78: // UTF-8 string (one-byte uint8_t for n follows)
            case 0x79: // UTF-8 string (two-byte uint16_t for n follow)
            case 0x7A: // UTF-8 string (four-byte uint32_t for n follow)
            case 0x7B: // UTF-8 string (eight-byte uint64_t for n follow)
            case 0x7F: // UTF-8 string (indefinite length)
            {
                string_t s;
                return get_cbor_string(s) && sax->string(s);
            }

            // array (0x00..0x17 data items follow)
            case 0x80:
            case 0x81:
            case 0x82:
            case 0x83:
            case 0x84:
            case 0x85:
            case 0x86:
            case 0x87:
            case 0x88:
            case 0x89:
            case 0x8A:
            case 0x8B:
            case 0x8C:
            case 0x8D:
            case 0x8E:
            case 0x8F:
            case 0x90:
            case 0x91:
            case 0x92:
            case 0x93:
            case 0x94:
            case 0x95:
            case 0x96:
            case 0x97:
                return get_cbor_array(static_cast<std::size_t>(static_cast<unsigned int>(current) & 0x1Fu), tag_handler);

            case 0x98: // array (one-byte uint8_t for n follows)
            {
                std::uint8_t len{};
                return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast<std::size_t>(len), tag_handler);
            }

            case 0x99: // array (two-byte uint16_t for n follow)
            {
                std::uint16_t len{};
                return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast<std::size_t>(len), tag_handler);
            }

            case 0x9A: // array (four-byte uint32_t for n follow)
            {
                std::uint32_t len{};
                return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast<std::size_t>(len), tag_handler);
            }

            case 0x9B: // array (eight-byte uint64_t for n follow)
            {
                std::uint64_t len{};
                return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast<std::size_t>(len), tag_handler);
            }

            case 0x9F: // array (indefinite length)
                return get_cbor_array(std::size_t(-1), tag_handler);

            // map (0x00..0x17 pairs of data items follow)
            case 0xA0:
            case 0xA1:
            case 0xA2:
            case 0xA3:
            case 0xA4:
            case 0xA5:
            case 0xA6:
            case 0xA7:
            case 0xA8:
            case 0xA9:
            case 0xAA:
            case 0xAB:
            case 0xAC:
            case 0xAD:
            case 0xAE:
            case 0xAF:
            case 0xB0:
            case 0xB1:
            case 0xB2:
            case 0xB3:
            case 0xB4:
            case 0xB5:
            case 0xB6:
            case 0xB7:
                return get_cbor_object(static_cast<std::size_t>(static_cast<unsigned int>(current) & 0x1Fu), tag_handler);

            case 0xB8: // map (one-byte uint8_t for n follows)
            {
                std::uint8_t len{};
                return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast<std::size_t>(len), tag_handler);
            }

            case 0xB9: // map (two-byte uint16_t for n follow)
            {
                std::uint16_t len{};
                return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast<std::size_t>(len), tag_handler);
            }

            case 0xBA: // map (four-byte uint32_t for n follow)
            {
                std::uint32_t len{};
                return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast<std::size_t>(len), tag_handler);
            }

            case 0xBB: // map (eight-byte uint64_t for n follow)
            {
                std::uint64_t len{};
                return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast<std::size_t>(len), tag_handler);
            }

            case 0xBF: // map (indefinite length)
                return get_cbor_object(std::size_t(-1), tag_handler);

            case 0xC6: // tagged item
            case 0xC7:
            case 0xC8:
            case 0xC9:
            case 0xCA:
            case 0xCB:
            case 0xCC:
            case 0xCD:
            case 0xCE:
            case 0xCF:
            case 0xD0:
            case 0xD1:
            case 0xD2:
            case 0xD3:
            case 0xD4:
            case 0xD8: // tagged item (1 bytes follow)
            case 0xD9: // tagged item (2 bytes follow)
            case 0xDA: // tagged item (4 bytes follow)
            case 0xDB: // tagged item (8 bytes follow)
            {
                switch (tag_handler)
                {
                    case cbor_tag_handler_t::error:
                    {
                        auto last_token = get_token_string();
                        return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::cbor, "invalid byte: 0x" + last_token, "value"), BasicJsonType()));
                    }

                    case cbor_tag_handler_t::ignore:
                    {
                        switch (current)
                        {
                            case 0xD8:
                            {
                                std::uint8_t len{};
                                get_number(input_format_t::cbor, len);
                                break;
                            }
                            case 0xD9:
                            {
                                std::uint16_t len{};
                                get_number(input_format_t::cbor, len);
                                break;
                            }
                            case 0xDA:
                            {
                                std::uint32_t len{};
                                get_number(input_format_t::cbor, len);
                                break;
                            }
                            case 0xDB:
                            {
                                std::uint64_t len{};
                                get_number(input_format_t::cbor, len);
                                break;
                            }
                            default:
                                break;
                        }
                        return parse_cbor_internal(true, tag_handler);
                    }

                    default:                 // LCOV_EXCL_LINE
                        JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
                        return false;        // LCOV_EXCL_LINE
                }
            }

            case 0xF4: // false
                return sax->boolean(false);

            case 0xF5: // true
                return sax->boolean(true);

            case 0xF6: // null
                return sax->null();

            case 0xF9: // Half-Precision Float (two-byte IEEE 754)
            {
                const auto byte1_raw = get();
                if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "number")))
                {
                    return false;
                }
                const auto byte2_raw = get();
                if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "number")))
                {
                    return false;
                }

                const auto byte1 = static_cast<unsigned char>(byte1_raw);
                const auto byte2 = static_cast<unsigned char>(byte2_raw);

                // code from RFC 7049, Appendix D, Figure 3:
                // As half-precision floating-point numbers were only added
                // to IEEE 754 in 2008, today's programming platforms often
                // still only have limited support for them. It is very
                // easy to include at least decoding support for them even
                // without such support. An example of a small decoder for
                // half-precision floating-point numbers in the C language
                // is shown in Fig. 3.
                const auto half = static_cast<unsigned int>((byte1 << 8u) + byte2);
                const double val = [&half]
                {
                    const int exp = (half >> 10u) & 0x1Fu;
                    const unsigned int mant = half & 0x3FFu;
                    JSON_ASSERT(0 <= exp&& exp <= 32);
                    JSON_ASSERT(mant <= 1024);
                    switch (exp)
                    {
                        case 0:
                            return std::ldexp(mant, -24);
                        case 31:
                            return (mant == 0)
                            ? std::numeric_limits<double>::infinity()
                            : std::numeric_limits<double>::quiet_NaN();
                        default:
                            return std::ldexp(mant + 1024, exp - 25);
                    }
                }();
                return sax->number_float((half & 0x8000u) != 0
                                         ? static_cast<number_float_t>(-val)
                                         : static_cast<number_float_t>(val), "");
            }

            case 0xFA: // Single-Precision Float (four-byte IEEE 754)
            {
                float number{};
                return get_number(input_format_t::cbor, number) && sax->number_float(static_cast<number_float_t>(number), "");
            }

            case 0xFB: // Double-Precision Float (eight-byte IEEE 754)
            {
                double number{};
                return get_number(input_format_t::cbor, number) && sax->number_float(static_cast<number_float_t>(number), "");
            }

            default: // anything else (0xFF is handled inside the other types)
            {
                auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::cbor, "invalid byte: 0x" + last_token, "value"), BasicJsonType()));
            }
        }
    }

    /*!
    @brief reads a CBOR string

    Interprets the current byte as the string's initial byte, derives the
    expected length from it (directly, or from the length field that follows)
    and appends that many bytes to @a result. An indefinite-length string
    (0x7F) is read chunk by chunk until the 0xFF break byte is seen.

    @param[out] result  created string

    @return whether string creation completed
    */
    bool get_cbor_string(string_t& result)
    {
        if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "string")))
        {
            return false;
        }

        switch (current)
        {
            // UTF-8 string (0x00..0x17 bytes follow)
            case 0x60: case 0x61: case 0x62: case 0x63:
            case 0x64: case 0x65: case 0x66: case 0x67:
            case 0x68: case 0x69: case 0x6A: case 0x6B:
            case 0x6C: case 0x6D: case 0x6E: case 0x6F:
            case 0x70: case 0x71: case 0x72: case 0x73:
            case 0x74: case 0x75: case 0x76: case 0x77:
            {
                // length is stored in the low five bits of the initial byte
                const unsigned int short_length = static_cast<unsigned int>(current) & 0x1Fu;
                return get_string(input_format_t::cbor, short_length, result);
            }

            case 0x78: // UTF-8 string (one-byte uint8_t for n follows)
            {
                std::uint8_t length{};
                if (!get_number(input_format_t::cbor, length))
                {
                    return false;
                }
                return get_string(input_format_t::cbor, length, result);
            }

            case 0x79: // UTF-8 string (two-byte uint16_t for n follow)
            {
                std::uint16_t length{};
                if (!get_number(input_format_t::cbor, length))
                {
                    return false;
                }
                return get_string(input_format_t::cbor, length, result);
            }

            case 0x7A: // UTF-8 string (four-byte uint32_t for n follow)
            {
                std::uint32_t length{};
                if (!get_number(input_format_t::cbor, length))
                {
                    return false;
                }
                return get_string(input_format_t::cbor, length, result);
            }

            case 0x7B: // UTF-8 string (eight-byte uint64_t for n follow)
            {
                std::uint64_t length{};
                if (!get_number(input_format_t::cbor, length))
                {
                    return false;
                }
                return get_string(input_format_t::cbor, length, result);
            }

            case 0x7F: // UTF-8 string (indefinite length)
            {
                for (;;)
                {
                    // 0xFF is the break byte that closes the chunk sequence
                    if (get() == 0xFF)
                    {
                        return true;
                    }

                    string_t piece;
                    if (!get_cbor_string(piece))
                    {
                        return false;
                    }
                    result.append(piece);
                }
            }

            default:
            {
                const auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::cbor, "expected length specification (0x60-0x7B) or indefinite string type (0x7F); last byte: 0x" + last_token, "string"), BasicJsonType()));
            }
        }
    }

    /*!
    @brief reads a CBOR byte array

    This function first reads starting bytes to determine the expected
    byte array length and then copies this number of bytes into the byte array.
    Additionally, CBOR's byte arrays with indefinite lengths are supported.

    @param[out] result  created byte array

    @return whether byte array creation completed
    */
    bool get_cbor_binary(binary_t& result)
    {
        // The initial byte of the byte string is expected in `current`;
        // report an error if the input ended instead.
        if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "binary")))
        {
            return false;
        }

        switch (current)
        {
            // short form: the length is stored in the low five bits of the
            // initial byte
            case 0x40:
            case 0x41:
            case 0x42:
            case 0x43:
            case 0x44:
            case 0x45:
            case 0x46:
            case 0x47:
            case 0x48:
            case 0x49:
            case 0x4A:
            case 0x4B:
            case 0x4C:
            case 0x4D:
            case 0x4E:
            case 0x4F:
            case 0x50:
            case 0x51:
            case 0x52:
            case 0x53:
            case 0x54:
            case 0x55:
            case 0x56:
            case 0x57:
            {
                const auto embedded_len = static_cast<unsigned int>(current) & 0x1Fu;
                return get_binary(input_format_t::cbor, embedded_len, result);
            }

            // length follows as a one-byte unsigned integer
            case 0x58:
            {
                std::uint8_t len{};
                if (!get_number(input_format_t::cbor, len))
                {
                    return false;
                }
                return get_binary(input_format_t::cbor, len, result);
            }

            // length follows as a two-byte unsigned integer
            case 0x59:
            {
                std::uint16_t len{};
                if (!get_number(input_format_t::cbor, len))
                {
                    return false;
                }
                return get_binary(input_format_t::cbor, len, result);
            }

            // length follows as a four-byte unsigned integer
            case 0x5A:
            {
                std::uint32_t len{};
                if (!get_number(input_format_t::cbor, len))
                {
                    return false;
                }
                return get_binary(input_format_t::cbor, len, result);
            }

            // length follows as an eight-byte unsigned integer
            case 0x5B:
            {
                std::uint64_t len{};
                if (!get_number(input_format_t::cbor, len))
                {
                    return false;
                }
                return get_binary(input_format_t::cbor, len, result);
            }

            // indefinite length: a sequence of chunks closed by a 0xFF byte
            case 0x5F:
            {
                for (;;)
                {
                    if (get() == 0xFF)
                    {
                        return true;
                    }

                    // each chunk is read into its own buffer and then
                    // appended to the overall result
                    binary_t piece;
                    if (!get_cbor_binary(piece))
                    {
                        return false;
                    }
                    result.insert(result.end(), piece.begin(), piece.end());
                }
            }

            // any other initial byte does not introduce a byte string
            default:
            {
                const auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::cbor, "expected length specification (0x40-0x5B) or indefinite binary array type (0x5F); last byte: 0x" + last_token, "binary"), BasicJsonType()));
            }
        }
    }

    /*!
    @param[in] len  the length of the array or std::size_t(-1) for an
                    array of indefinite size
    @param[in] tag_handler how CBOR tags should be treated
    @return whether array creation completed
    */
    bool get_cbor_array(const std::size_t len,
                        const cbor_tag_handler_t tag_handler)
    {
        // announce the array (and its size, if known) to the SAX consumer
        if (JSON_HEDLEY_UNLIKELY(!sax->start_array(len)))
        {
            return false;
        }

        if (len == std::size_t(-1))
        {
            // indefinite size: the loop condition consumes the next byte, so
            // the element parser is told not to read another one
            while (get() != 0xFF)
            {
                if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(false, tag_handler)))
                {
                    return false;
                }
            }
        }
        else
        {
            // definite size: parse exactly len elements
            for (std::size_t remaining = len; remaining > 0; --remaining)
            {
                if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(true, tag_handler)))
                {
                    return false;
                }
            }
        }

        return sax->end_array();
    }

    /*!
    @param[in] len  the length of the object or std::size_t(-1) for an
                    object of indefinite size
    @param[in] tag_handler how CBOR tags should be treated
    @return whether object creation completed
    */
    bool get_cbor_object(const std::size_t len,
                         const cbor_tag_handler_t tag_handler)
    {
        // announce the object (and its size, if known) to the SAX consumer
        if (JSON_HEDLEY_UNLIKELY(!sax->start_object(len)))
        {
            return false;
        }

        // one buffer is reused for every key and emptied after each member
        string_t key;

        if (len == std::size_t(-1))
        {
            // indefinite size: the loop condition consumes the first byte of
            // each key until the 0xFF terminator shows up
            while (get() != 0xFF)
            {
                if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key)))
                {
                    return false;
                }
                if (JSON_HEDLEY_UNLIKELY(!sax->key(key)))
                {
                    return false;
                }
                if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(true, tag_handler)))
                {
                    return false;
                }
                key.clear();
            }
        }
        else
        {
            // definite size: read exactly len key/value pairs
            for (std::size_t remaining = len; remaining > 0; --remaining)
            {
                // fetch the first byte of the key for get_cbor_string
                get();
                if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key)))
                {
                    return false;
                }
                if (JSON_HEDLEY_UNLIKELY(!sax->key(key)))
                {
                    return false;
                }
                if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(true, tag_handler)))
                {
                    return false;
                }
                key.clear();
            }
        }

        return sax->end_object();
    }

    /////////////
    // MsgPack //
    /////////////

    /*!
    @return whether a valid MessagePack value was passed to the SAX parser
    */
    bool parse_msgpack_internal()
    {
        // Dispatch on the byte returned by get(); the case bodies read that
        // same byte back through `current`. Every branch either forwards a
        // decoded value to the SAX consumer or reports a parse error, and the
        // boolean result of that call is returned unchanged.
        switch (get())
        {
            // EOF
            case std::char_traits<char_type>::eof():
                return unexpect_eof(input_format_t::msgpack, "value");

            // positive fixint
            case 0x00:
            case 0x01:
            case 0x02:
            case 0x03:
            case 0x04:
            case 0x05:
            case 0x06:
            case 0x07:
            case 0x08:
            case 0x09:
            case 0x0A:
            case 0x0B:
            case 0x0C:
            case 0x0D:
            case 0x0E:
            case 0x0F:
            case 0x10:
            case 0x11:
            case 0x12:
            case 0x13:
            case 0x14:
            case 0x15:
            case 0x16:
            case 0x17:
            case 0x18:
            case 0x19:
            case 0x1A:
            case 0x1B:
            case 0x1C:
            case 0x1D:
            case 0x1E:
            case 0x1F:
            case 0x20:
            case 0x21:
            case 0x22:
            case 0x23:
            case 0x24:
            case 0x25:
            case 0x26:
            case 0x27:
            case 0x28:
            case 0x29:
            case 0x2A:
            case 0x2B:
            case 0x2C:
            case 0x2D:
            case 0x2E:
            case 0x2F:
            case 0x30:
            case 0x31:
            case 0x32:
            case 0x33:
            case 0x34:
            case 0x35:
            case 0x36:
            case 0x37:
            case 0x38:
            case 0x39:
            case 0x3A:
            case 0x3B:
            case 0x3C:
            case 0x3D:
            case 0x3E:
            case 0x3F:
            case 0x40:
            case 0x41:
            case 0x42:
            case 0x43:
            case 0x44:
            case 0x45:
            case 0x46:
            case 0x47:
            case 0x48:
            case 0x49:
            case 0x4A:
            case 0x4B:
            case 0x4C:
            case 0x4D:
            case 0x4E:
            case 0x4F:
            case 0x50:
            case 0x51:
            case 0x52:
            case 0x53:
            case 0x54:
            case 0x55:
            case 0x56:
            case 0x57:
            case 0x58:
            case 0x59:
            case 0x5A:
            case 0x5B:
            case 0x5C:
            case 0x5D:
            case 0x5E:
            case 0x5F:
            case 0x60:
            case 0x61:
            case 0x62:
            case 0x63:
            case 0x64:
            case 0x65:
            case 0x66:
            case 0x67:
            case 0x68:
            case 0x69:
            case 0x6A:
            case 0x6B:
            case 0x6C:
            case 0x6D:
            case 0x6E:
            case 0x6F:
            case 0x70:
            case 0x71:
            case 0x72:
            case 0x73:
            case 0x74:
            case 0x75:
            case 0x76:
            case 0x77:
            case 0x78:
            case 0x79:
            case 0x7A:
            case 0x7B:
            case 0x7C:
            case 0x7D:
            case 0x7E:
            case 0x7F:
                // the byte itself is the value
                return sax->number_unsigned(static_cast<number_unsigned_t>(current));

            // fixmap
            case 0x80:
            case 0x81:
            case 0x82:
            case 0x83:
            case 0x84:
            case 0x85:
            case 0x86:
            case 0x87:
            case 0x88:
            case 0x89:
            case 0x8A:
            case 0x8B:
            case 0x8C:
            case 0x8D:
            case 0x8E:
            case 0x8F:
                // the low four bits are passed on as the object length
                return get_msgpack_object(static_cast<std::size_t>(static_cast<unsigned int>(current) & 0x0Fu));

            // fixarray
            case 0x90:
            case 0x91:
            case 0x92:
            case 0x93:
            case 0x94:
            case 0x95:
            case 0x96:
            case 0x97:
            case 0x98:
            case 0x99:
            case 0x9A:
            case 0x9B:
            case 0x9C:
            case 0x9D:
            case 0x9E:
            case 0x9F:
                // the low four bits are passed on as the array length
                return get_msgpack_array(static_cast<std::size_t>(static_cast<unsigned int>(current) & 0x0Fu));

            // fixstr
            case 0xA0:
            case 0xA1:
            case 0xA2:
            case 0xA3:
            case 0xA4:
            case 0xA5:
            case 0xA6:
            case 0xA7:
            case 0xA8:
            case 0xA9:
            case 0xAA:
            case 0xAB:
            case 0xAC:
            case 0xAD:
            case 0xAE:
            case 0xAF:
            case 0xB0:
            case 0xB1:
            case 0xB2:
            case 0xB3:
            case 0xB4:
            case 0xB5:
            case 0xB6:
            case 0xB7:
            case 0xB8:
            case 0xB9:
            case 0xBA:
            case 0xBB:
            case 0xBC:
            case 0xBD:
            case 0xBE:
            case 0xBF:
            case 0xD9: // str 8
            case 0xDA: // str 16
            case 0xDB: // str 32
            {
                // get_msgpack_string inspects `current` again to pick the
                // length encoding, so all string prefixes share this branch
                string_t s;
                return get_msgpack_string(s) && sax->string(s);
            }

            case 0xC0: // nil
                return sax->null();

            case 0xC2: // false
                return sax->boolean(false);

            case 0xC3: // true
                return sax->boolean(true);

            case 0xC4: // bin 8
            case 0xC5: // bin 16
            case 0xC6: // bin 32
            case 0xC7: // ext 8
            case 0xC8: // ext 16
            case 0xC9: // ext 32
            case 0xD4: // fixext 1
            case 0xD5: // fixext 2
            case 0xD6: // fixext 4
            case 0xD7: // fixext 8
            case 0xD8: // fixext 16
            {
                // get_msgpack_binary inspects `current` again to pick the
                // length encoding and, for ext types, the subtype
                binary_t b;
                return get_msgpack_binary(b) && sax->binary(b);
            }

            case 0xCA: // float 32
            {
                float number{};
                return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast<number_float_t>(number), "");
            }

            case 0xCB: // float 64
            {
                double number{};
                return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast<number_float_t>(number), "");
            }

            case 0xCC: // uint 8
            {
                std::uint8_t number{};
                return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number);
            }

            case 0xCD: // uint 16
            {
                std::uint16_t number{};
                return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number);
            }

            case 0xCE: // uint 32
            {
                std::uint32_t number{};
                return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number);
            }

            case 0xCF: // uint 64
            {
                std::uint64_t number{};
                return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number);
            }

            case 0xD0: // int 8
            {
                std::int8_t number{};
                return get_number(input_format_t::msgpack, number) && sax->number_integer(number);
            }

            case 0xD1: // int 16
            {
                std::int16_t number{};
                return get_number(input_format_t::msgpack, number) && sax->number_integer(number);
            }

            case 0xD2: // int 32
            {
                std::int32_t number{};
                return get_number(input_format_t::msgpack, number) && sax->number_integer(number);
            }

            case 0xD3: // int 64
            {
                std::int64_t number{};
                return get_number(input_format_t::msgpack, number) && sax->number_integer(number);
            }

            case 0xDC: // array 16
            {
                std::uint16_t len{};
                return get_number(input_format_t::msgpack, len) && get_msgpack_array(static_cast<std::size_t>(len));
            }

            case 0xDD: // array 32
            {
                std::uint32_t len{};
                return get_number(input_format_t::msgpack, len) && get_msgpack_array(static_cast<std::size_t>(len));
            }

            case 0xDE: // map 16
            {
                std::uint16_t len{};
                return get_number(input_format_t::msgpack, len) && get_msgpack_object(static_cast<std::size_t>(len));
            }

            case 0xDF: // map 32
            {
                std::uint32_t len{};
                return get_number(input_format_t::msgpack, len) && get_msgpack_object(static_cast<std::size_t>(len));
            }

            // negative fixint
            case 0xE0:
            case 0xE1:
            case 0xE2:
            case 0xE3:
            case 0xE4:
            case 0xE5:
            case 0xE6:
            case 0xE7:
            case 0xE8:
            case 0xE9:
            case 0xEA:
            case 0xEB:
            case 0xEC:
            case 0xED:
            case 0xEE:
            case 0xEF:
            case 0xF0:
            case 0xF1:
            case 0xF2:
            case 0xF3:
            case 0xF4:
            case 0xF5:
            case 0xF6:
            case 0xF7:
            case 0xF8:
            case 0xF9:
            case 0xFA:
            case 0xFB:
            case 0xFC:
            case 0xFD:
            case 0xFE:
            case 0xFF:
                // the byte is converted to a signed 8-bit integer before it
                // is reported
                return sax->number_integer(static_cast<std::int8_t>(current));

            // 0xC1 has no case label above, so it ends up here together with
            // any other value that is not listed
            default: // anything else
            {
                auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::msgpack, "invalid byte: 0x" + last_token, "value"), BasicJsonType()));
            }
        }
    }

    /*!
    @brief reads a MessagePack string

    This function first reads starting bytes to determine the expected
    string length and then copies this number of bytes into a string.

    @param[out] result  created string

    @return whether string creation completed
    */
    bool get_msgpack_string(string_t& result)
    {
        // The prefix byte of the string is expected in `current`; report an
        // error if the input ended instead.
        if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::msgpack, "string")))
        {
            return false;
        }

        switch (current)
        {
            // fixstr: the length is stored in the low five bits of the prefix
            case 0xA0:
            case 0xA1:
            case 0xA2:
            case 0xA3:
            case 0xA4:
            case 0xA5:
            case 0xA6:
            case 0xA7:
            case 0xA8:
            case 0xA9:
            case 0xAA:
            case 0xAB:
            case 0xAC:
            case 0xAD:
            case 0xAE:
            case 0xAF:
            case 0xB0:
            case 0xB1:
            case 0xB2:
            case 0xB3:
            case 0xB4:
            case 0xB5:
            case 0xB6:
            case 0xB7:
            case 0xB8:
            case 0xB9:
            case 0xBA:
            case 0xBB:
            case 0xBC:
            case 0xBD:
            case 0xBE:
            case 0xBF:
            {
                const auto embedded_len = static_cast<unsigned int>(current) & 0x1Fu;
                return get_string(input_format_t::msgpack, embedded_len, result);
            }

            // str 8: length follows as a one-byte unsigned integer
            case 0xD9:
            {
                std::uint8_t len{};
                if (!get_number(input_format_t::msgpack, len))
                {
                    return false;
                }
                return get_string(input_format_t::msgpack, len, result);
            }

            // str 16: length follows as a two-byte unsigned integer
            case 0xDA:
            {
                std::uint16_t len{};
                if (!get_number(input_format_t::msgpack, len))
                {
                    return false;
                }
                return get_string(input_format_t::msgpack, len, result);
            }

            // str 32: length follows as a four-byte unsigned integer
            case 0xDB:
            {
                std::uint32_t len{};
                if (!get_number(input_format_t::msgpack, len))
                {
                    return false;
                }
                return get_string(input_format_t::msgpack, len, result);
            }

            // any other prefix does not introduce a string
            default:
            {
                const auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::msgpack, "expected length specification (0xA0-0xBF, 0xD9-0xDB); last byte: 0x" + last_token, "string"), BasicJsonType()));
            }
        }
    }

    /*!
    @brief reads a MessagePack byte array

    This function first reads starting bytes to determine the expected
    byte array length and then copies this number of bytes into a byte array.

    @param[out] result  created byte array

    @return whether byte array creation completed
    */
    bool get_msgpack_binary(binary_t& result)
    {
        // `current` holds the prefix byte that selected this function. The
        // bin family carries only a length; the ext and fixext families also
        // carry a one-byte subtype, which is stored on the result once the
        // payload has been read successfully.
        switch (current)
        {
            case 0xC4: // bin 8
            {
                std::uint8_t len{};
                if (!get_number(input_format_t::msgpack, len))
                {
                    return false;
                }
                return get_binary(input_format_t::msgpack, len, result);
            }

            case 0xC5: // bin 16
            {
                std::uint16_t len{};
                if (!get_number(input_format_t::msgpack, len))
                {
                    return false;
                }
                return get_binary(input_format_t::msgpack, len, result);
            }

            case 0xC6: // bin 32
            {
                std::uint32_t len{};
                if (!get_number(input_format_t::msgpack, len))
                {
                    return false;
                }
                return get_binary(input_format_t::msgpack, len, result);
            }

            case 0xC7: // ext 8
            {
                std::uint8_t len{};
                std::int8_t subtype{};
                if (!get_number(input_format_t::msgpack, len) ||
                        !get_number(input_format_t::msgpack, subtype) ||
                        !get_binary(input_format_t::msgpack, len, result))
                {
                    return false;
                }
                result.set_subtype(static_cast<std::uint8_t>(subtype));
                return true;
            }

            case 0xC8: // ext 16
            {
                std::uint16_t len{};
                std::int8_t subtype{};
                if (!get_number(input_format_t::msgpack, len) ||
                        !get_number(input_format_t::msgpack, subtype) ||
                        !get_binary(input_format_t::msgpack, len, result))
                {
                    return false;
                }
                result.set_subtype(static_cast<std::uint8_t>(subtype));
                return true;
            }

            case 0xC9: // ext 32
            {
                std::uint32_t len{};
                std::int8_t subtype{};
                if (!get_number(input_format_t::msgpack, len) ||
                        !get_number(input_format_t::msgpack, subtype) ||
                        !get_binary(input_format_t::msgpack, len, result))
                {
                    return false;
                }
                result.set_subtype(static_cast<std::uint8_t>(subtype));
                return true;
            }

            case 0xD4: // fixext 1
            {
                std::int8_t subtype{};
                if (!get_number(input_format_t::msgpack, subtype) ||
                        !get_binary(input_format_t::msgpack, 1, result))
                {
                    return false;
                }
                result.set_subtype(static_cast<std::uint8_t>(subtype));
                return true;
            }

            case 0xD5: // fixext 2
            {
                std::int8_t subtype{};
                if (!get_number(input_format_t::msgpack, subtype) ||
                        !get_binary(input_format_t::msgpack, 2, result))
                {
                    return false;
                }
                result.set_subtype(static_cast<std::uint8_t>(subtype));
                return true;
            }

            case 0xD6: // fixext 4
            {
                std::int8_t subtype{};
                if (!get_number(input_format_t::msgpack, subtype) ||
                        !get_binary(input_format_t::msgpack, 4, result))
                {
                    return false;
                }
                result.set_subtype(static_cast<std::uint8_t>(subtype));
                return true;
            }

            case 0xD7: // fixext 8
            {
                std::int8_t subtype{};
                if (!get_number(input_format_t::msgpack, subtype) ||
                        !get_binary(input_format_t::msgpack, 8, result))
                {
                    return false;
                }
                result.set_subtype(static_cast<std::uint8_t>(subtype));
                return true;
            }

            case 0xD8: // fixext 16
            {
                std::int8_t subtype{};
                if (!get_number(input_format_t::msgpack, subtype) ||
                        !get_binary(input_format_t::msgpack, 16, result))
                {
                    return false;
                }
                result.set_subtype(static_cast<std::uint8_t>(subtype));
                return true;
            }

            default:           // LCOV_EXCL_LINE
                return false;  // LCOV_EXCL_LINE
        }
    }

    /*!
    @param[in] len  the length of the array
    @return whether array creation completed
    */
    bool get_msgpack_array(const std::size_t len)
    {
        // announce the array and its size to the SAX consumer
        if (JSON_HEDLEY_UNLIKELY(!sax->start_array(len)))
        {
            return false;
        }

        // parse exactly len elements, stopping at the first failure
        for (std::size_t remaining = len; remaining > 0; --remaining)
        {
            if (JSON_HEDLEY_UNLIKELY(!parse_msgpack_internal()))
            {
                return false;
            }
        }

        return sax->end_array();
    }

    /*!
    @param[in] len  the length of the object
    @return whether object creation completed
    */
    bool get_msgpack_object(const std::size_t len)
    {
        // announce the object and its size to the SAX consumer
        if (JSON_HEDLEY_UNLIKELY(!sax->start_object(len)))
        {
            return false;
        }

        // one buffer is reused for every key and emptied after each member
        string_t key;
        for (std::size_t remaining = len; remaining > 0; --remaining)
        {
            // fetch the prefix byte of the key for get_msgpack_string
            get();
            if (JSON_HEDLEY_UNLIKELY(!get_msgpack_string(key)))
            {
                return false;
            }
            if (JSON_HEDLEY_UNLIKELY(!sax->key(key)))
            {
                return false;
            }

            // the value belonging to the key
            if (JSON_HEDLEY_UNLIKELY(!parse_msgpack_internal()))
            {
                return false;
            }
            key.clear();
        }

        return sax->end_object();
    }

    ////////////
    // UBJSON //
    ////////////

    /*!
    @param[in] get_char  whether a new character should be retrieved from the
                         input (true, default) or whether the last read
                         character should be considered instead

    @return whether a valid UBJSON value was passed to the SAX parser
    */
    bool parse_ubjson_internal(const bool get_char = true)
    {
        // either fetch a fresh prefix via get_ignore_noop() or reuse the
        // character that was read last
        if (get_char)
        {
            return get_ubjson_value(get_ignore_noop());
        }
        return get_ubjson_value(current);
    }

    /*!
    @brief reads a UBJSON string

    This function is either called after reading the 'S' byte explicitly
    indicating a string, or in case of an object key where the 'S' byte can be
    left out.

    @param[out] result   created string
    @param[in] get_char  whether a new character should be retrieved from the
                         input (true, default) or whether the last read
                         character should be considered instead

    @return whether string creation completed
    */
    bool get_ubjson_string(string_t& result, const bool get_char = true)
    {
        // optionally fetch the length marker; otherwise the character read
        // last is taken as the marker
        if (get_char)
        {
            get();  // TODO(niels): may we ignore N here?
        }

        if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "value")))
        {
            return false;
        }

        // the marker selects the integer type in which the length is encoded
        switch (current)
        {
            case 'U':
            {
                std::uint8_t len{};
                if (!get_number(input_format_t::ubjson, len))
                {
                    return false;
                }
                return get_string(input_format_t::ubjson, len, result);
            }

            case 'i':
            {
                std::int8_t len{};
                if (!get_number(input_format_t::ubjson, len))
                {
                    return false;
                }
                return get_string(input_format_t::ubjson, len, result);
            }

            case 'I':
            {
                std::int16_t len{};
                if (!get_number(input_format_t::ubjson, len))
                {
                    return false;
                }
                return get_string(input_format_t::ubjson, len, result);
            }

            case 'l':
            {
                std::int32_t len{};
                if (!get_number(input_format_t::ubjson, len))
                {
                    return false;
                }
                return get_string(input_format_t::ubjson, len, result);
            }

            case 'L':
            {
                std::int64_t len{};
                if (!get_number(input_format_t::ubjson, len))
                {
                    return false;
                }
                return get_string(input_format_t::ubjson, len, result);
            }

            // any other marker is not a valid length type
            default:
            {
                const auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "expected length type specification (U, i, I, l, L); last byte: 0x" + last_token, "string"), BasicJsonType()));
            }
        }
    }

    /*!
    @param[out] result  determined size
    @return whether size determination completed
    */
    bool get_ubjson_size_value(std::size_t& result)
    {
        // Reads the count that follows a '#' marker: first the integer type
        // marker (skipping no-ops via get_ignore_noop()), then the integer.
        //
        // Counts encoded with a signed type are collected here so that the
        // sign check is written only once. A negative count used to be cast
        // straight to std::size_t; -1 then became string_t::npos, which
        // get_ubjson_size_type uses to mean "no size given", and any other
        // negative value became an enormous element count.
        std::int64_t signed_count = 0;

        switch (get_ignore_noop())
        {
            case 'U':
            {
                // unsigned, cannot be negative: done right away
                std::uint8_t number{};
                if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number)))
                {
                    return false;
                }
                result = static_cast<std::size_t>(number);
                return true;
            }

            case 'i':
            {
                std::int8_t number{};
                if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number)))
                {
                    return false;
                }
                signed_count = number; // NOLINT(bugprone-signed-char-misuse,cert-str34-c): number is not a char
                break;
            }

            case 'I':
            {
                std::int16_t number{};
                if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number)))
                {
                    return false;
                }
                signed_count = number;
                break;
            }

            case 'l':
            {
                std::int32_t number{};
                if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number)))
                {
                    return false;
                }
                signed_count = number;
                break;
            }

            case 'L':
            {
                std::int64_t number{};
                if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number)))
                {
                    return false;
                }
                signed_count = number;
                break;
            }

            default:
            {
                auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "expected length type specification (U, i, I, l, L) after '#'; last byte: 0x" + last_token, "size"), BasicJsonType()));
            }
        }

        // reject negative counts instead of letting them wrap around
        if (JSON_HEDLEY_UNLIKELY(signed_count < 0))
        {
            auto last_token = get_token_string();
            return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "count in an optimized container must be positive", "size"), BasicJsonType()));
        }

        // NOTE(review): as before, a count that does not fit into std::size_t
        // (possible for 'L' where std::size_t is narrower than 64 bits) is
        // truncated by this cast.
        result = static_cast<std::size_t>(signed_count);
        return true;
    }

    /*!
    @brief determine the type and size for a container

    In the optimized UBJSON format, a type and a size can be provided to allow
    for a more compact representation.

    @param[out] result  pair of the size and the type

    @return whether pair creation completed
    */
    bool get_ubjson_size_type(std::pair<std::size_t, char_int_type>& result)
    {
        result.first = string_t::npos; // size
        result.second = 0; // type

        get_ignore_noop();

        if (current == '$')
        {
            result.second = get();  // must not ignore 'N', because 'N' maybe the type
            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "type")))
            {
                return false;
            }

            get_ignore_noop();
            if (JSON_HEDLEY_UNLIKELY(current != '#'))
            {
                if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "value")))
                {
                    return false;
                }
                auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::ubjson, "expected '#' after type information; last byte: 0x" + last_token, "size"), BasicJsonType()));
            }

            return get_ubjson_size_value(result.first);
        }

        if (current == '#')
        {
            return get_ubjson_size_value(result.first);
        }

        return true;
    }

    /*!
    @param prefix  the previously read or set type prefix
    @return whether value creation completed
    */
    bool get_ubjson_value(const char_int_type prefix)
    {
        switch (prefix)
        {
            case std::char_traits<char_type>::eof():  // EOF
                return unexpect_eof(input_format_t::ubjson, "value");

            case 'T':  // true
                return sax->boolean(true);
            case 'F':  // false
                return sax->boolean(false);

            case 'Z':  // null
                return sax->null();

            case 'U':
            {
                std::uint8_t number{};
                return get_number(input_format_t::ubjson, number) && sax->number_unsigned(number);
            }

            case 'i':
            {
                std::int8_t number{};
                return get_number(input_format_t::ubjson, number) && sax->number_integer(number);
            }

            case 'I':
            {
                std::int16_t number{};
                return get_number(input_format_t::ubjson, number) && sax->number_integer(number);
            }

            case 'l':
            {
                std::int32_t number{};
                return get_number(input_format_t::ubjson, number) && sax->number_integer(number);
            }

            case 'L':
            {
                std::int64_t number{};
                return get_number(input_format_t::ubjson, number) && sax->number_integer(number);
            }

            case 'd':
            {
                float number{};
                return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast<number_float_t>(number), "");
            }

            case 'D':
            {
                double number{};
                return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast<number_float_t>(number), "");
            }

            case 'H':
            {
                return get_ubjson_high_precision_number();
            }

            case 'C':  // char
            {
                get();
                if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "char")))
                {
                    return false;
                }
                if (JSON_HEDLEY_UNLIKELY(current > 127))
                {
                    auto last_token = get_token_string();
                    return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "byte after 'C' must be in range 0x00..0x7F; last byte: 0x" + last_token, "char"), BasicJsonType()));
                }
                string_t s(1, static_cast<typename string_t::value_type>(current));
                return sax->string(s);
            }

            case 'S':  // string
            {
                string_t s;
                return get_ubjson_string(s) && sax->string(s);
            }

            case '[':  // array
                return get_ubjson_array();

            case '{':  // object
                return get_ubjson_object();

            default: // anything else
            {
                auto last_token = get_token_string();
                return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::ubjson, "invalid byte: 0x" + last_token, "value"), BasicJsonType()));
            }
        }
    }

    /*!
    @return whether array creation completed
    */
    bool get_ubjson_array()
    {
        std::pair<std::size_t, char_int_type> size_and_type;
        if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type)))
        {
            return false;
        }

        if (size_and_type.first != string_t::npos)
        {
            if (JSON_HEDLEY_UNLIKELY(!sax->start_array(size_and_type.first)))
            {
                return false;
            }

            if (size_and_type.second != 0)
            {
                if (size_and_type.second != 'N')
                {
                    for (std::size_t i = 0; i < size_and_type.first; ++i)
                    {
                        if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second)))
                        {
                            return false;
                        }
                    }
                }
            }
            else
            {
                for (std::size_t i = 0; i < size_and_type.first; ++i)
                {
                    if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal()))
                    {
                        return false;
                    }
                }
            }
        }
        else
        {
            if (JSON_HEDLEY_UNLIKELY(!sax->start_array(std::size_t(-1))))
            {
                return false;
            }

            while (current != ']')
            {
                if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal(false)))
                {
                    return false;
                }
                get_ignore_noop();
            }
        }

        return sax->end_array();
    }

    /*!
    @return whether object creation completed
    */
    bool get_ubjson_object()
    {
        // first: member count (string_t::npos when none is known),
        // second: common value type marker (0 when none is given)
        std::pair<std::size_t, char_int_type> size_and_type;
        if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type)))
        {
            return false;
        }

        // reused for every member; emptied again after each key/value pair
        string_t key;

        if (size_and_type.first == string_t::npos)
        {
            // unknown length: keep reading members until the closing '}'
            if (JSON_HEDLEY_UNLIKELY(!sax->start_object(static_cast<std::size_t>(-1))))
            {
                return false;
            }

            while (current != '}')
            {
                if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key, false) || !sax->key(key)))
                {
                    return false;
                }
                if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal()))
                {
                    return false;
                }
                get_ignore_noop();
                key.clear();
            }

            return sax->end_object();
        }

        // known length: exactly size_and_type.first members follow
        if (JSON_HEDLEY_UNLIKELY(!sax->start_object(size_and_type.first)))
        {
            return false;
        }

        const bool values_share_type = (size_and_type.second != 0);
        for (std::size_t remaining = size_and_type.first; remaining != 0; --remaining)
        {
            if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key)))
            {
                return false;
            }

            // with a shared type the value has no marker of its own
            const bool value_ok = values_share_type
                                  ? get_ubjson_value(size_and_type.second)
                                  : parse_ubjson_internal();
            if (JSON_HEDLEY_UNLIKELY(!value_ok))
            {
                return false;
            }
            key.clear();
        }

        return sax->end_object();
    }

    // Note, no reader for UBJSON binary types is implemented because they do
    // not exist

    bool get_ubjson_high_precision_number()
    {
        // get size of following number string
        std::size_t size{};
        auto res = get_ubjson_size_value(size);
        if (JSON_HEDLEY_UNLIKELY(!res))
        {
            return res;
        }

        // get number string
        std::vector<char> number_vector;
        for (std::size_t i = 0; i < size; ++i)
        {
            get();
            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "number")))
            {
                return false;
            }
            number_vector.push_back(static_cast<char>(current));
        }

        // parse number string
        using ia_type = decltype(detail::input_adapter(number_vector));
        auto number_lexer = detail::lexer<BasicJsonType, ia_type>(detail::input_adapter(number_vector), false);
        const auto result_number = number_lexer.scan();
        const auto number_string = number_lexer.get_token_string();
        const auto result_remainder = number_lexer.scan();

        using token_type = typename detail::lexer_base<BasicJsonType>::token_type;

        if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input))
        {
            return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"), BasicJsonType()));
        }

        switch (result_number)
        {
            case token_type::value_integer:
                return sax->number_integer(number_lexer.get_number_integer());
            case token_type::value_unsigned:
                return sax->number_unsigned(number_lexer.get_number_unsigned());
            case token_type::value_float:
                return sax->number_float(number_lexer.get_number_float(), std::move(number_string));
            default:
                return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"), BasicJsonType()));
        }
    }

    ///////////////////////
    // Utility functions //
    ///////////////////////

    /*!
    @brief get next character from the input

    This function provides the interface to the input adapter in use. It does
    not throw when the input has reached EOF, but returns the negative-valued
    `std::char_traits<char_type>::eof()` in that case.

    @return character read from the input
    */
    char_int_type get()
    {
        // count the read attempt, then remember what the adapter returned
        ++chars_read;
        current = ia.get_character();
        return current;
    }

    /*!
    @return character read from the input after ignoring all 'N' entries
    */
    char_int_type get_ignore_noop()
    {
        // always read at least one character, then skip any run of 'N'
        get();
        while (current == 'N')
        {
            get();
        }

        return current;
    }

    /*
    @brief read a number from the input

    @tparam NumberType the type of the number
    @param[in] format   the current format (for diagnostics)
    @param[out] result  number of type @a NumberType

    @return whether conversion completed

    @note This function needs to respect the system's endianness, because
          bytes in CBOR, MessagePack, and UBJSON are stored in network order
          (big endian) and therefore need reordering on little endian systems.
    */
    template<typename NumberType, bool InputIsLittleEndian = false>
    bool get_number(const input_format_t format, NumberType& result)
    {
        constexpr std::size_t width = sizeof(NumberType);

        // the bytes must be mirrored whenever the host byte order differs
        // from the byte order of the input
        const bool mirror_bytes = (is_little_endian != InputIsLittleEndian);

        // step 1: collect the bytes in the host's byte order
        std::array<std::uint8_t, width> bytes{};
        for (std::size_t n = 0; n < width; ++n)
        {
            get();
            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "number")))
            {
                return false;
            }

            const std::size_t slot = mirror_bytes ? (width - n - 1) : n;
            bytes[slot] = static_cast<std::uint8_t>(current);
        }

        // step 2: copy the raw bytes over the result object
        std::memcpy(&result, bytes.data(), width);
        return true;
    }

    /*!
    @brief create a string by reading characters from the input

    @tparam NumberType the type of the length @a len
    @param[in] format the current format (for diagnostics)
    @param[in] len number of characters to read
    @param[out] result string created by reading @a len bytes

    @return whether string creation completed

    @note We can not reserve @a len bytes for the result, because @a len
          may be too large. Usually, @ref unexpect_eof() detects the end of
          the input before we run out of string memory.
    */
    template<typename NumberType>
    bool get_string(const input_format_t format,
                    const NumberType len,
                    string_t& result)
    {
        // characters are appended to result one at a time; nothing is
        // reserved up front and result is not cleared first
        for (NumberType count = 0; count < len; ++count)
        {
            get();
            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "string")))
            {
                // input ended early; what was appended so far stays in result
                return false;
            }
            result.push_back(static_cast<typename string_t::value_type>(current));
        }
        return true;
    }

    /*!
    @brief create a byte array by reading bytes from the input

    @tparam NumberType the type of the length @a len
    @param[in] format the current format (for diagnostics)
    @param[in] len number of bytes to read
    @param[out] result byte array created by reading @a len bytes

    @return whether byte array creation completed

    @note We can not reserve @a len bytes for the result, because @a len
          may be too large. Usually, @ref unexpect_eof() detects the end of
          the input before we run out of memory.
    */
    template<typename NumberType>
    bool get_binary(const input_format_t format,
                    const NumberType len,
                    binary_t& result)
    {
        // bytes are appended to result one at a time; nothing is reserved
        // up front and result is not cleared first
        for (NumberType count = 0; count < len; ++count)
        {
            get();
            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "binary")))
            {
                // input ended early; what was appended so far stays in result
                return false;
            }
            result.push_back(static_cast<std::uint8_t>(current));
        }
        return true;
    }

    /*!
    @param[in] format   the current format (for diagnostics)
    @param[in] context  further context information (for diagnostics)
    @return whether the last read character is not EOF
    */
    JSON_HEDLEY_NON_NULL(3)
    bool unexpect_eof(const input_format_t format, const char* context) const
    {
        const bool at_eof = (current == std::char_traits<char_type>::eof());
        if (JSON_HEDLEY_UNLIKELY(at_eof))
        {
            // report parse error 110 to the SAX consumer and pass on its verdict
            const std::string message = exception_message(format, "unexpected end of input", context);
            return sax->parse_error(chars_read, "<end of file>",
                                    parse_error::create(110, chars_read, message, BasicJsonType()));
        }

        // the last read produced a regular character
        return true;
    }

    /*!
    @return a string representation of the last read byte
    */
    std::string get_token_string() const
    {
        // render the last read byte as two uppercase hexadecimal digits
        static const char hex_digits[] = "0123456789ABCDEF";
        const auto byte = static_cast<unsigned char>(current);

        std::string token(2, '0');
        token[0] = hex_digits[(byte >> 4) & 0x0F];
        token[1] = hex_digits[byte & 0x0F];
        return token;
    }

    /*!
    @param[in] format   the current format
    @param[in] detail   a detailed error message
    @param[in] context  further context information
    @return a message string to use in the parse_error exceptions
    */
    std::string exception_message(const input_format_t format,
                                  const std::string& detail,
                                  const std::string& context) const
    {
        // human-readable name of the binary format being parsed
        const char* format_name = "";
        switch (format)
        {
            case input_format_t::cbor:
                format_name = "CBOR";
                break;

            case input_format_t::msgpack:
                format_name = "MessagePack";
                break;

            case input_format_t::ubjson:
                format_name = "UBJSON";
                break;

            case input_format_t::bson:
                format_name = "BSON";
                break;

            default:            // LCOV_EXCL_LINE
                JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
        }

        // layout: "syntax error while parsing <format> <context>: <detail>"
        std::string message = "syntax error while parsing ";
        message += format_name;
        message += " ";
        message += context;
        message += ": ";
        message += detail;
        return message;
    }

  private:
    /// input adapter; get() pulls every character from it
    InputAdapterType ia;

    /// the most recently read character; holds the EOF value until the first
    /// call to get()
    char_int_type current = std::char_traits<char_type>::eof();

    /// the number of characters read so far (incremented by every get())
    std::size_t chars_read = 0;

    /// whether we can assume little endianness (result of little_endianess());
    /// get_number() uses it to decide whether input bytes must be reversed
    const bool is_little_endian = little_endianess();

    /// the SAX parser that receives the parsed values and parse errors;
    /// null by default
    json_sax_t* sax = nullptr;
};
}  // namespace detail
}  // namespace nlohmann

// #include <nlohmann/detail/input/input_adapters.hpp>

// #include <nlohmann/detail/input/lexer.hpp>

// #include <nlohmann/detail/input/parser.hpp>


#include <cmath> // isfinite
#include <cstdint> // uint8_t
#include <functional> // function
#include <string> // string
#include <utility> // move
#include <vector> // vector

// #include <nlohmann/detail/exceptions.hpp>

// #include <nlohmann/detail/input/input_adapters.hpp>

// #include <nlohmann/detail/input/json_sax.hpp>

// #include <nlohmann/detail/input/lexer.hpp>

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/is_sax.hpp>

// #include <nlohmann/detail/value_t.hpp>


namespace nlohmann
{
namespace detail
{
////////////
// parser //
////////////

/// Events that are passed as the second argument to a parser_callback_t.
/// The underlying type is spelled std::uint8_t because <cstdint> only
/// guarantees the name inside namespace std; the unqualified global name is
/// optional.
enum class parse_event_t : std::uint8_t
{
    /// the parser read `{` and started to process a JSON object
    object_start,
    /// the parser read `}` and finished processing a JSON object
    object_end,
    /// the parser read `[` and started to process a JSON array
    array_start,
    /// the parser read `]` and finished processing a JSON array
    array_end,
    /// the parser read a key of a value in an object
    key,
    /// the parser finished reading a JSON value
    value
};

/// Callable type of the optional parser callback: it is handed a depth, the
/// parse event, and a reference to the parsed value, and returns a bool.
template<typename BasicJsonType>
using parser_callback_t = std::function<bool(int /*depth*/,
                                             parse_event_t /*event*/,
                                             BasicJsonType& /*parsed*/)>;

/*!
@brief syntax analysis

This class implements a recursive descent parser.
*/
template<typename BasicJsonType, typename InputAdapterType>
class parser
{
    // The parser pulls tokens from a lexer (m_lexer) and forwards them as SAX
    // events. Nesting is tracked with an explicit stack inside
    // sax_parse_internal() rather than with recursive calls.
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using string_t = typename BasicJsonType::string_t;
    using lexer_t = lexer<BasicJsonType, InputAdapterType>;
    using token_type = typename lexer_t::token_type;

  public:
    /// a parser reading from an input adapter
    ///
    /// @param[in] adapter            input adapter, moved into the lexer
    /// @param[in] cb                 optional callback; when set, parse()
    ///                               uses the callback-aware DOM SAX parser
    /// @param[in] allow_exceptions_  forwarded to the DOM SAX parsers
    /// @param[in] skip_comments      forwarded to the lexer
    explicit parser(InputAdapterType&& adapter,
                    const parser_callback_t<BasicJsonType> cb = nullptr,
                    const bool allow_exceptions_ = true,
                    const bool skip_comments = false)
        : callback(cb)
        , m_lexer(std::move(adapter), skip_comments)
        , allow_exceptions(allow_exceptions_)
    {
        // read first token
        get_token();
    }

    /*!
    @brief public parser interface

    @param[in] strict      whether to expect the last token to be EOF
    @param[in,out] result  parsed JSON value

    @throw parse_error.101 in case of an unexpected token
    @throw parse_error.102 if to_unicode fails or surrogate error
    @throw parse_error.103 if to_unicode fails
    */
    void parse(const bool strict, BasicJsonType& result)
    {
        if (callback)
        {
            // a callback was supplied: build the DOM through the
            // callback-aware SAX parser
            json_sax_dom_callback_parser<BasicJsonType> sdp(result, callback, allow_exceptions);
            sax_parse_internal(&sdp);

            // in strict mode, input must be completely read
            if (strict && (get_token() != token_type::end_of_input))
            {
                sdp.parse_error(m_lexer.get_position(),
                                m_lexer.get_token_string(),
                                parse_error::create(101, m_lexer.get_position(),
                                                    exception_message(token_type::end_of_input, "value"), BasicJsonType()));
            }

            // in case of an error, return discarded value
            if (sdp.is_errored())
            {
                result = value_t::discarded;
                return;
            }

            // set top-level value to null if it was discarded by the callback
            // function
            if (result.is_discarded())
            {
                result = nullptr;
            }
        }
        else
        {
            // no callback: build the DOM through the plain SAX parser
            json_sax_dom_parser<BasicJsonType> sdp(result, allow_exceptions);
            sax_parse_internal(&sdp);

            // in strict mode, input must be completely read
            if (strict && (get_token() != token_type::end_of_input))
            {
                sdp.parse_error(m_lexer.get_position(),
                                m_lexer.get_token_string(),
                                parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_of_input, "value"), BasicJsonType()));
            }

            // in case of an error, return discarded value
            if (sdp.is_errored())
            {
                result = value_t::discarded;
                return;
            }
        }

        result.assert_invariant();
    }

    /*!
    @brief public accept interface

    @param[in] strict  whether to expect the last token to be EOF
    @return whether the input is a proper JSON text
    */
    bool accept(const bool strict = true)
    {
        json_sax_acceptor<BasicJsonType> sax_acceptor;
        return sax_parse(&sax_acceptor, strict);
    }

    /*!
    @brief parse the input and forward all events to a SAX consumer

    @param[in] sax     SAX consumer that receives the events (must not be null)
    @param[in] strict  whether to expect the last token to be EOF
    @return result of sax_parse_internal(); in strict mode, when parsing
            succeeded but further input follows, the return value of
            `sax->parse_error` instead
    */
    template<typename SAX>
    JSON_HEDLEY_NON_NULL(2)
    bool sax_parse(SAX* sax, const bool strict = true)
    {
        (void)detail::is_sax_static_asserts<SAX, BasicJsonType> {};
        const bool result = sax_parse_internal(sax);

        // strict mode: next byte must be EOF
        if (result && strict && (get_token() != token_type::end_of_input))
        {
            return sax->parse_error(m_lexer.get_position(),
                                    m_lexer.get_token_string(),
                                    parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_of_input, "value"), BasicJsonType()));
        }

        return result;
    }

  private:
    /*!
    @brief parse one JSON value (including all nested values) into SAX events

    Expects that get_token() has already been called, so that last_token
    holds the first token of the value.

    @param[in] sax  SAX consumer that receives the events (must not be null)
    @return true once a complete value was parsed; false as soon as a SAX
            event handler returns false; the return value of
            `sax->parse_error` when an unexpected token is encountered
    */
    template<typename SAX>
    JSON_HEDLEY_NON_NULL(2)
    bool sax_parse_internal(SAX* sax)
    {
        // stack to remember the hierarchy of structured values we are parsing
        // true = array; false = object
        std::vector<bool> states;
        // value to avoid a goto (see comment where set to true)
        bool skip_to_state_evaluation = false;

        while (true)
        {
            if (!skip_to_state_evaluation)
            {
                // invariant: get_token() was called before each iteration
                switch (last_token)
                {
                    case token_type::begin_object:
                    {
                        if (JSON_HEDLEY_UNLIKELY(!sax->start_object(std::size_t(-1))))
                        {
                            return false;
                        }

                        // closing } -> we are done
                        if (get_token() == token_type::end_object)
                        {
                            if (JSON_HEDLEY_UNLIKELY(!sax->end_object()))
                            {
                                return false;
                            }
                            break;
                        }

                        // parse key
                        if (JSON_HEDLEY_UNLIKELY(last_token != token_type::value_string))
                        {
                            return sax->parse_error(m_lexer.get_position(),
                                                    m_lexer.get_token_string(),
                                                    parse_error::create(101, m_lexer.get_position(), exception_message(token_type::value_string, "object key"), BasicJsonType()));
                        }
                        if (JSON_HEDLEY_UNLIKELY(!sax->key(m_lexer.get_string())))
                        {
                            return false;
                        }

                        // parse separator (:)
                        if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator))
                        {
                            return sax->parse_error(m_lexer.get_position(),
                                                    m_lexer.get_token_string(),
                                                    parse_error::create(101, m_lexer.get_position(), exception_message(token_type::name_separator, "object separator"), BasicJsonType()));
                        }

                        // remember we are now inside an object
                        states.push_back(false);

                        // parse values
                        get_token();
                        continue;
                    }

                    case token_type::begin_array:
                    {
                        if (JSON_HEDLEY_UNLIKELY(!sax->start_array(std::size_t(-1))))
                        {
                            return false;
                        }

                        // closing ] -> we are done
                        if (get_token() == token_type::end_array)
                        {
                            if (JSON_HEDLEY_UNLIKELY(!sax->end_array()))
                            {
                                return false;
                            }
                            break;
                        }

                        // remember we are now inside an array
                        states.push_back(true);

                        // parse values (no need to call get_token)
                        continue;
                    }

                    case token_type::value_float:
                    {
                        const auto res = m_lexer.get_number_float();

                        // non-finite results are reported as out_of_range 406
                        if (JSON_HEDLEY_UNLIKELY(!std::isfinite(res)))
                        {
                            return sax->parse_error(m_lexer.get_position(),
                                                    m_lexer.get_token_string(),
                                                    out_of_range::create(406, "number overflow parsing '" + m_lexer.get_token_string() + "'", BasicJsonType()));
                        }

                        if (JSON_HEDLEY_UNLIKELY(!sax->number_float(res, m_lexer.get_string())))
                        {
                            return false;
                        }

                        break;
                    }

                    case token_type::literal_false:
                    {
                        if (JSON_HEDLEY_UNLIKELY(!sax->boolean(false)))
                        {
                            return false;
                        }
                        break;
                    }

                    case token_type::literal_null:
                    {
                        if (JSON_HEDLEY_UNLIKELY(!sax->null()))
                        {
                            return false;
                        }
                        break;
                    }

                    case token_type::literal_true:
                    {
                        if (JSON_HEDLEY_UNLIKELY(!sax->boolean(true)))
                        {
                            return false;
                        }
                        break;
                    }

                    case token_type::value_integer:
                    {
                        if (JSON_HEDLEY_UNLIKELY(!sax->number_integer(m_lexer.get_number_integer())))
                        {
                            return false;
                        }
                        break;
                    }

                    case token_type::value_string:
                    {
                        if (JSON_HEDLEY_UNLIKELY(!sax->string(m_lexer.get_string())))
                        {
                            return false;
                        }
                        break;
                    }

                    case token_type::value_unsigned:
                    {
                        if (JSON_HEDLEY_UNLIKELY(!sax->number_unsigned(m_lexer.get_number_unsigned())))
                        {
                            return false;
                        }
                        break;
                    }

                    case token_type::parse_error:
                    {
                        // using "uninitialized" to avoid "expected" message
                        return sax->parse_error(m_lexer.get_position(),
                                                m_lexer.get_token_string(),
                                                parse_error::create(101, m_lexer.get_position(), exception_message(token_type::uninitialized, "value"), BasicJsonType()));
                    }

                    default: // the last token was unexpected
                    {
                        return sax->parse_error(m_lexer.get_position(),
                                                m_lexer.get_token_string(),
                                                parse_error::create(101, m_lexer.get_position(), exception_message(token_type::literal_or_value, "value"), BasicJsonType()));
                    }
                }
            }
            else
            {
                skip_to_state_evaluation = false;
            }

            // we reached this line after we successfully parsed a value
            if (states.empty())
            {
                // empty stack: we reached the end of the hierarchy: done
                return true;
            }

            if (states.back())  // array
            {
                // comma -> next value
                if (get_token() == token_type::value_separator)
                {
                    // parse a new value
                    get_token();
                    continue;
                }

                // closing ]
                if (JSON_HEDLEY_LIKELY(last_token == token_type::end_array))
                {
                    if (JSON_HEDLEY_UNLIKELY(!sax->end_array()))
                    {
                        return false;
                    }

                    // We are done with this array. Before we can parse a
                    // new value, we need to evaluate the new state first.
                    // By setting skip_to_state_evaluation to true, the next
                    // iteration skips the token switch and continues
                    // directly with the state evaluation.
                    JSON_ASSERT(!states.empty());
                    states.pop_back();
                    skip_to_state_evaluation = true;
                    continue;
                }

                return sax->parse_error(m_lexer.get_position(),
                                        m_lexer.get_token_string(),
                                        parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_array, "array"), BasicJsonType()));
            }

            // states.back() is false -> object

            // comma -> next value
            if (get_token() == token_type::value_separator)
            {
                // parse key
                if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::value_string))
                {
                    return sax->parse_error(m_lexer.get_position(),
                                            m_lexer.get_token_string(),
                                            parse_error::create(101, m_lexer.get_position(), exception_message(token_type::value_string, "object key"), BasicJsonType()));
                }

                if (JSON_HEDLEY_UNLIKELY(!sax->key(m_lexer.get_string())))
                {
                    return false;
                }

                // parse separator (:)
                if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator))
                {
                    return sax->parse_error(m_lexer.get_position(),
                                            m_lexer.get_token_string(),
                                            parse_error::create(101, m_lexer.get_position(), exception_message(token_type::name_separator, "object separator"), BasicJsonType()));
                }

                // parse values
                get_token();
                continue;
            }

            // closing }
            if (JSON_HEDLEY_LIKELY(last_token == token_type::end_object))
            {
                if (JSON_HEDLEY_UNLIKELY(!sax->end_object()))
                {
                    return false;
                }

                // We are done with this object. Before we can parse a
                // new value, we need to evaluate the new state first.
                // By setting skip_to_state_evaluation to true, the next
                // iteration skips the token switch and continues
                // directly with the state evaluation.
                JSON_ASSERT(!states.empty());
                states.pop_back();
                skip_to_state_evaluation = true;
                continue;
            }

            return sax->parse_error(m_lexer.get_position(),
                                    m_lexer.get_token_string(),
                                    parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_object, "object"), BasicJsonType()));
        }
    }

    /// get next token from lexer
    /// (the token is also stored in last_token)
    token_type get_token()
    {
        return last_token = m_lexer.scan();
    }

    /*!
    @brief build the message text for a syntax error

    The message has the form "syntax error [while parsing <context> ]- ..."
    and describes last_token: for a lexer parse error it contains the
    lexer's error message and the last read token string, otherwise
    "unexpected <token name>".

    @param[in] expected  the token type that was expected; "; expected ..."
                         is appended unless this is token_type::uninitialized
    @param[in] context   what was being parsed; omitted from the message
                         when empty
    @return the assembled error message
    */
    std::string exception_message(const token_type expected, const std::string& context)
    {
        std::string error_msg = "syntax error ";

        if (!context.empty())
        {
            error_msg += "while parsing " + context + " ";
        }

        error_msg += "- ";

        if (last_token == token_type::parse_error)
        {
            error_msg += std::string(m_lexer.get_error_message()) + "; last read: '" +
                         m_lexer.get_token_string() + "'";
        }
        else
        {
            error_msg += "unexpected " + std::string(lexer_t::token_type_name(last_token));
        }

        if (expected != token_type::uninitialized)
        {
            error_msg += "; expected " + std::string(lexer_t::token_type_name(expected));
        }

        return error_msg;
    }

  private:
    /// callback function
    const parser_callback_t<BasicJsonType> callback = nullptr;
    /// the type of the last read token
    token_type last_token = token_type::uninitialized;
    /// the lexer
    lexer_t m_lexer;
    /// whether to throw exceptions in case of errors
    const bool allow_exceptions = true;
};

}  // namespace detail
}  // namespace nlohmann

// #include <nlohmann/detail/iterators/internal_iterator.hpp>


// #include <nlohmann/detail/iterators/primitive_iterator.hpp>


#include <cstddef> // ptrdiff_t
#include <limits>  // numeric_limits

// #include <nlohmann/detail/macro_scope.hpp>


namespace nlohmann
{
namespace detail
{
/*
@brief an iterator for primitive JSON types

This class models an iterator for primitive JSON types (boolean, number,
string). Its only purpose is to allow the iterator/const_iterator classes
to "iterate" over primitive values. Internally, the iterator is modeled by
a `difference_type` variable. Value begin_value (`0`) models the begin,
end_value (`1`) models past the end.
*/
class primitive_iterator_t
{
  private:
    using difference_type = std::ptrdiff_t;
    /// position value that models "begin"
    static constexpr difference_type begin_value = 0;
    /// position value that models "past the end"
    static constexpr difference_type end_value = begin_value + 1;

  JSON_PRIVATE_UNLESS_TESTED:
    /// iterator as signed integer type; the default value is neither
    /// begin_value nor end_value
    difference_type m_it = (std::numeric_limits<std::ptrdiff_t>::min)();

  public:
    /// return the raw position value
    constexpr difference_type get_value() const noexcept
    {
        return m_it;
    }

    /// set iterator to a defined beginning
    void set_begin() noexcept
    {
        m_it = begin_value;
    }

    /// set iterator to a defined past the end
    void set_end() noexcept
    {
        m_it = end_value;
    }

    /// return whether the iterator can be dereferenced
    constexpr bool is_begin() const noexcept
    {
        return m_it == begin_value;
    }

    /// return whether the iterator is at end
    constexpr bool is_end() const noexcept
    {
        return m_it == end_value;
    }

    /// two primitive iterators are equal when their positions are equal
    friend constexpr bool operator==(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
    {
        return lhs.m_it == rhs.m_it;
    }

    /// order primitive iterators by their positions
    friend constexpr bool operator<(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
    {
        return lhs.m_it < rhs.m_it;
    }

    /// return a copy of this iterator moved by n positions; the operation
    /// does not modify *this, so it is const-qualified and usable on const
    /// iterators as well
    primitive_iterator_t operator+(difference_type n) const noexcept
    {
        auto result = *this;
        result += n;
        return result;
    }

    /// return the distance between two primitive iterators
    friend constexpr difference_type operator-(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
    {
        return lhs.m_it - rhs.m_it;
    }

    /// pre-increment (++it)
    primitive_iterator_t& operator++() noexcept
    {
        ++m_it;
        return *this;
    }

    /// post-increment (it++): returns the position before the increment
    primitive_iterator_t const operator++(int) noexcept // NOLINT(readability-const-return-type)
    {
        auto result = *this;
        ++m_it;
        return result;
    }

    /// pre-decrement (--it)
    primitive_iterator_t& operator--() noexcept
    {
        --m_it;
        return *this;
    }

    /// post-decrement (it--): returns the position before the decrement
    primitive_iterator_t const operator--(int) noexcept // NOLINT(readability-const-return-type)
    {
        auto result = *this;
        --m_it;
        return result;
    }

    /// move the iterator forward by n positions
    primitive_iterator_t& operator+=(difference_type n) noexcept
    {
        m_it += n;
        return *this;
    }

    /// move the iterator backward by n positions
    primitive_iterator_t& operator-=(difference_type n) noexcept
    {
        m_it -= n;
        return *this;
    }
};
}  // namespace detail
}  // namespace nlohmann


namespace nlohmann
{
namespace detail
{
/*!
@brief an iterator value

@note This structure could easily be a union, but MSVC currently does not allow
unions members with complex constructors, see https://github.com/nlohmann/json/pull/105.
*/
template<typename BasicJsonType> struct internal_iterator
{
    // All three members exist side by side (see the note above on why this
    // is not a union); iter_impl selects which one to use based on the type
    // of the JSON value it refers to.

    /// iterator for JSON objects
    typename BasicJsonType::object_t::iterator object_iterator {};
    /// iterator for JSON arrays
    typename BasicJsonType::array_t::iterator array_iterator {};
    /// generic iterator for all other types
    primitive_iterator_t primitive_iterator {};
};
}  // namespace detail
}  // namespace nlohmann

// #include <nlohmann/detail/iterators/iter_impl.hpp>


#include <iterator> // iterator, random_access_iterator_tag, bidirectional_iterator_tag, advance, next
#include <type_traits> // conditional, is_const, remove_const

// #include <nlohmann/detail/exceptions.hpp>

// #include <nlohmann/detail/iterators/internal_iterator.hpp>

// #include <nlohmann/detail/iterators/primitive_iterator.hpp>

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/cpp_future.hpp>

// #include <nlohmann/detail/meta/type_traits.hpp>

// #include <nlohmann/detail/value_t.hpp>


namespace nlohmann
{
namespace detail
{
// forward declare, to be able to friend it later on
template<typename IteratorType> class iteration_proxy;
template<typename IteratorType> class iteration_proxy_value;

/*!
@brief a template for a bidirectional iterator for the @ref basic_json class
This class implements both iterators (iterator and const_iterator) for the
@ref basic_json class.
@note An iterator is called *initialized* when a pointer to a JSON value has
      been set (e.g., by a constructor or a copy assignment). If the iterator is
      default-constructed, it is *uninitialized* and most methods are undefined.
      **The library uses assertions to detect calls on uninitialized iterators.**
@requirement The class satisfies the following concept requirements:
-
[BidirectionalIterator](https://en.cppreference.com/w/cpp/named_req/BidirectionalIterator):
  The iterator can be moved in both directions (i.e.
  incremented and decremented).
@since version 1.0.0, simplified in version 2.0.9, change to bidirectional
       iterators in version 3.0.0 (see https://github.com/nlohmann/json/issues/593)
*/
template<typename BasicJsonType>
class iter_impl
{
    /// the iterator with BasicJsonType of different const-ness
    using other_iter_impl = iter_impl<typename std::conditional<std::is_const<BasicJsonType>::value, typename std::remove_const<BasicJsonType>::type, const BasicJsonType>::type>;
    /// allow basic_json to access private members
    friend other_iter_impl;
    friend BasicJsonType;
    friend iteration_proxy<iter_impl>;
    friend iteration_proxy_value<iter_impl>;

    using object_t = typename BasicJsonType::object_t;
    using array_t = typename BasicJsonType::array_t;
    // make sure BasicJsonType is basic_json or const basic_json
    static_assert(is_basic_json<typename std::remove_const<BasicJsonType>::type>::value,
                  "iter_impl only accepts (const) basic_json");

  public:

    /// The std::iterator class template (used as a base class to provide typedefs) is deprecated in C++17.
    /// The C++ Standard has never required user-defined iterators to derive from std::iterator.
    /// A user-defined iterator should provide publicly accessible typedefs named
    /// iterator_category, value_type, difference_type, pointer, and reference.
    /// Note that value_type is required to be non-const, even for constant iterators.
    using iterator_category = std::bidirectional_iterator_tag;

    /// the type of the values when the iterator is dereferenced
    using value_type = typename BasicJsonType::value_type;
    /// a type to represent differences between iterators
    using difference_type = typename BasicJsonType::difference_type;
    /// defines a pointer to the type iterated over (value_type)
    using pointer = typename std::conditional<std::is_const<BasicJsonType>::value,
          typename BasicJsonType::const_pointer,
          typename BasicJsonType::pointer>::type;
    /// defines a reference to the type iterated over (value_type)
    using reference =
        typename std::conditional<std::is_const<BasicJsonType>::value,
        typename BasicJsonType::const_reference,
        typename BasicJsonType::reference>::type;

    iter_impl() = default;
    ~iter_impl() = default;
    iter_impl(iter_impl&&) noexcept = default;
    iter_impl& operator=(iter_impl&&) noexcept = default;

    /*!
    @brief constructor for a given JSON instance
    @param[in] object  pointer to a JSON object for this iterator
    @pre object != nullptr
    @post The iterator is initialized; i.e. `m_object != nullptr`.
    */
    explicit iter_impl(pointer object) noexcept : m_object(object)
    {
        JSON_ASSERT(m_object != nullptr);

        // Reset only the member of m_it that matches the type of the value.
        const auto kind = m_object->m_type;
        if (kind == value_t::object)
        {
            m_it.object_iterator = typename object_t::iterator();
        }
        else if (kind == value_t::array)
        {
            m_it.array_iterator = typename array_t::iterator();
        }
        else
        {
            m_it.primitive_iterator = primitive_iterator_t();
        }
    }

    /*!
    @note The conventional copy constructor and copy assignment are implicitly
          defined. Combined with the following converting constructor and
          assignment, they support: (1) copy from iterator to iterator, (2)
          copy from const iterator to const iterator, and (3) conversion from
          iterator to const iterator. However conversion from const iterator
          to iterator is not defined.
    */

    /*!
    @brief const copy constructor
    @param[in] other const iterator to copy from
    @note This copy constructor had to be defined explicitly to circumvent a bug
          occurring on msvc v19.0 compiler (VS 2015) debug build. For more
          information refer to: https://github.com/nlohmann/json/issues/1608
    */
    // Copies the pointer to the associated JSON instance and the internal
    // iterator state from other; nothing else is stored in an iter_impl.
    iter_impl(const iter_impl<const BasicJsonType>& other) noexcept
        : m_object(other.m_object), m_it(other.m_it)
    {}

    /*!
    @brief converting assignment
    @param[in] other const iterator to copy from
    @return const/non-const iterator
    @note It is not checked whether @a other is initialized.
    */
    iter_impl& operator=(const iter_impl<const BasicJsonType>& other) noexcept
    {
        // Self-assignment leaves the iterator untouched.
        if (&other == this)
        {
            return *this;
        }

        m_object = other.m_object;
        m_it = other.m_it;
        return *this;
    }

    /*!
    @brief converting constructor
    @param[in] other  non-const iterator to copy from
    @note It is not checked whether @a other is initialized.
    */
    // Takes over the JSON instance pointer and the internal iterator state
    // of an iterator over the non-const JSON type.
    iter_impl(const iter_impl<typename std::remove_const<BasicJsonType>::type>& other) noexcept
        : m_object(other.m_object), m_it(other.m_it)
    {}

    /*!
    @brief converting assignment
    @param[in] other  non-const iterator to copy from
    @return const/non-const iterator
    @note It is not checked whether @a other is initialized.
    */
    iter_impl& operator=(const iter_impl<typename std::remove_const<BasicJsonType>::type>& other) noexcept // NOLINT(cert-oop54-cpp)
    {
        // No self-assignment guard here (hence the NOLINT): both members are
        // overwritten unconditionally with the values from other.
        m_object = other.m_object;
        m_it = other.m_it;
        return *this;
    }

  JSON_PRIVATE_UNLESS_TESTED:
    /*!
    @brief set the iterator to the first value
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    void set_begin() noexcept
    {
        JSON_ASSERT(m_object != nullptr);

        const auto kind = m_object->m_type;
        if (kind == value_t::object)
        {
            m_it.object_iterator = m_object->m_value.object->begin();
        }
        else if (kind == value_t::array)
        {
            m_it.array_iterator = m_object->m_value.array->begin();
        }
        else if (kind == value_t::null)
        {
            // null is empty: position at end so that begin() == end()
            m_it.primitive_iterator.set_end();
        }
        else
        {
            // every other primitive behaves like a one-element range
            m_it.primitive_iterator.set_begin();
        }
    }

    /*!
    @brief set the iterator past the last value
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    void set_end() noexcept
    {
        JSON_ASSERT(m_object != nullptr);

        const auto kind = m_object->m_type;
        if (kind == value_t::object)
        {
            m_it.object_iterator = m_object->m_value.object->end();
        }
        else if (kind == value_t::array)
        {
            m_it.array_iterator = m_object->m_value.array->end();
        }
        else
        {
            // all primitives (including null) share the same end marker
            m_it.primitive_iterator.set_end();
        }
    }

  public:
    /*!
    @brief return a reference to the value pointed to by the iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    reference operator*() const
    {
        JSON_ASSERT(m_object != nullptr);

        const auto kind = m_object->m_type;

        if (kind == value_t::object)
        {
            JSON_ASSERT(m_it.object_iterator != m_object->m_value.object->end());
            return m_it.object_iterator->second;
        }

        if (kind == value_t::array)
        {
            JSON_ASSERT(m_it.array_iterator != m_object->m_value.array->end());
            return *m_it.array_iterator;
        }

        // A null value never yields anything; any other primitive yields the
        // JSON value itself, but only while the iterator sits at begin.
        if (kind != value_t::null && JSON_HEDLEY_LIKELY(m_it.primitive_iterator.is_begin()))
        {
            return *m_object;
        }

        JSON_THROW(invalid_iterator::create(214, "cannot get value", *m_object));
    }

    /*!
    @brief dereference the iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    pointer operator->() const
    {
        JSON_ASSERT(m_object != nullptr);

        const auto kind = m_object->m_type;

        if (kind == value_t::object)
        {
            JSON_ASSERT(m_it.object_iterator != m_object->m_value.object->end());
            return &(m_it.object_iterator->second);
        }

        if (kind == value_t::array)
        {
            JSON_ASSERT(m_it.array_iterator != m_object->m_value.array->end());
            return &*m_it.array_iterator;
        }

        // primitives: the JSON value itself is the only element, reachable
        // only while the iterator sits at begin
        if (JSON_HEDLEY_LIKELY(m_it.primitive_iterator.is_begin()))
        {
            return m_object;
        }

        JSON_THROW(invalid_iterator::create(214, "cannot get value", *m_object));
    }

    /*!
    @brief post-increment (it++)
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    iter_impl const operator++(int) // NOLINT(readability-const-return-type)
    {
        // snapshot the current position, then advance via the pre-increment
        auto previous = *this;
        operator++();
        return previous;
    }

    /*!
    @brief pre-increment (++it)
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    iter_impl& operator++()
    {
        JSON_ASSERT(m_object != nullptr);

        // advance whichever internal iterator matches the value's type
        const auto kind = m_object->m_type;
        if (kind == value_t::object)
        {
            std::advance(m_it.object_iterator, 1);
        }
        else if (kind == value_t::array)
        {
            std::advance(m_it.array_iterator, 1);
        }
        else
        {
            ++m_it.primitive_iterator;
        }

        return *this;
    }

    /*!
    @brief post-decrement (it--)
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    iter_impl const operator--(int) // NOLINT(readability-const-return-type)
    {
        // snapshot the current position, then step back via the pre-decrement
        auto previous = *this;
        operator--();
        return previous;
    }

    /*!
    @brief pre-decrement (--it)
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    iter_impl& operator--()
    {
        JSON_ASSERT(m_object != nullptr);

        // step back whichever internal iterator matches the value's type
        const auto kind = m_object->m_type;
        if (kind == value_t::object)
        {
            std::advance(m_it.object_iterator, -1);
        }
        else if (kind == value_t::array)
        {
            std::advance(m_it.array_iterator, -1);
        }
        else
        {
            --m_it.primitive_iterator;
        }

        return *this;
    }

    /*!
    @brief comparison: equal
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    template < typename IterImpl, detail::enable_if_t < (std::is_same<IterImpl, iter_impl>::value || std::is_same<IterImpl, other_iter_impl>::value), std::nullptr_t > = nullptr >
    bool operator==(const IterImpl& other) const
    {
        // if objects are not the same, the comparison is undefined
        if (JSON_HEDLEY_UNLIKELY(m_object != other.m_object))
        {
            // The pointers differ, so at least one of them is non-null. Use
            // the non-null one as exception context: dereferencing m_object
            // unconditionally would be a null pointer dereference when this
            // iterator is default-constructed and other is not.
            if (m_object != nullptr)
            {
                JSON_THROW(invalid_iterator::create(212, "cannot compare iterators of different containers", *m_object));
            }

            JSON_THROW(invalid_iterator::create(212, "cannot compare iterators of different containers", *other.m_object));
        }

        // Both pointers are equal here. If they are null, both iterators are
        // value-initialized; such iterators must compare equal to each other.
        if (m_object == nullptr)
        {
            return true;
        }

        // compare the internal iterator that matches the value's type
        switch (m_object->m_type)
        {
            case value_t::object:
                return (m_it.object_iterator == other.m_it.object_iterator);

            case value_t::array:
                return (m_it.array_iterator == other.m_it.array_iterator);

            default:
                return (m_it.primitive_iterator == other.m_it.primitive_iterator);
        }
    }

    /*!
    @brief comparison: not equal
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    template < typename IterImpl, detail::enable_if_t < (std::is_same<IterImpl, iter_impl>::value || std::is_same<IterImpl, other_iter_impl>::value), std::nullptr_t > = nullptr >
    bool operator!=(const IterImpl& other) const
    {
        // defined purely as the negation of operator==
        const bool same_position = operator==(other);
        return !same_position;
    }

    /*!
    @brief comparison: smaller
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    bool operator<(const iter_impl& other) const
    {
        // if objects are not the same, the comparison is undefined
        if (JSON_HEDLEY_UNLIKELY(m_object != other.m_object))
        {
            // The pointers differ, so at least one of them is non-null. Use
            // the non-null one as exception context: dereferencing m_object
            // unconditionally would be a null pointer dereference when this
            // iterator is default-constructed and other is not.
            if (m_object != nullptr)
            {
                JSON_THROW(invalid_iterator::create(212, "cannot compare iterators of different containers", *m_object));
            }

            JSON_THROW(invalid_iterator::create(212, "cannot compare iterators of different containers", *other.m_object));
        }

        // Both pointers are equal here. Two value-initialized iterators are
        // considered equal, hence neither is smaller than the other.
        if (m_object == nullptr)
        {
            return false;
        }

        switch (m_object->m_type)
        {
            case value_t::object:
                // object iterators have no defined order
                JSON_THROW(invalid_iterator::create(213, "cannot compare order of object iterators", *m_object));

            case value_t::array:
                return (m_it.array_iterator < other.m_it.array_iterator);

            default:
                return (m_it.primitive_iterator < other.m_it.primitive_iterator);
        }
    }

    /*!
    @brief comparison: less than or equal
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    bool operator<=(const iter_impl& other) const
    {
        // a <= b  <=>  !(b < a)
        const bool other_is_smaller = other.operator<(*this);
        return !other_is_smaller;
    }

    /*!
    @brief comparison: greater than
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    bool operator>(const iter_impl& other) const
    {
        // a > b  <=>  !(a <= b)
        const bool not_greater = operator<=(other);
        return !not_greater;
    }

    /*!
    @brief comparison: greater than or equal
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    bool operator>=(const iter_impl& other) const
    {
        // a >= b  <=>  !(a < b)
        const bool is_smaller = operator<(other);
        return !is_smaller;
    }

    /*!
    @brief add to iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    iter_impl& operator+=(difference_type i)
    {
        JSON_ASSERT(m_object != nullptr);

        const auto kind = m_object->m_type;

        // object iterators do not support offsets
        if (kind == value_t::object)
        {
            JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators", *m_object));
        }

        if (kind == value_t::array)
        {
            std::advance(m_it.array_iterator, i);
        }
        else
        {
            m_it.primitive_iterator += i;
        }

        return *this;
    }

    /*!
    @brief subtract from iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    iter_impl& operator-=(difference_type i)
    {
        // subtracting i is adding the negated offset
        const difference_type negated = -i;
        return operator+=(negated);
    }

    /*!
    @brief add to iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    iter_impl operator+(difference_type i) const
    {
        // work on a copy so that *this stays untouched
        auto shifted = *this;
        shifted.operator+=(i);
        return shifted;
    }

    /*!
    @brief addition of distance and iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    friend iter_impl operator+(difference_type i, const iter_impl& it)
    {
        // work on a copy of it and shift that copy by i
        iter_impl shifted = it;
        shifted.operator+=(i);
        return shifted;
    }

    /*!
    @brief subtract from iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    iter_impl operator-(difference_type i) const
    {
        // work on a copy so that *this stays untouched
        auto shifted = *this;
        shifted.operator-=(i);
        return shifted;
    }

    /*!
    @brief return difference
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    difference_type operator-(const iter_impl& other) const
    {
        JSON_ASSERT(m_object != nullptr);

        const auto kind = m_object->m_type;

        // object iterators do not support offsets
        if (kind == value_t::object)
        {
            JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators", *m_object));
        }

        if (kind == value_t::array)
        {
            return m_it.array_iterator - other.m_it.array_iterator;
        }

        return m_it.primitive_iterator - other.m_it.primitive_iterator;
    }

    /*!
    @brief access to successor
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    reference operator[](difference_type n) const
    {
        JSON_ASSERT(m_object != nullptr);

        const auto kind = m_object->m_type;

        if (kind == value_t::object)
        {
            JSON_THROW(invalid_iterator::create(208, "cannot use operator[] for object iterators", *m_object));
        }

        if (kind == value_t::array)
        {
            return *std::next(m_it.array_iterator, n);
        }

        // A null value never yields anything. Any other primitive yields the
        // JSON value itself, but only if the current position plus n is zero.
        if (kind != value_t::null && JSON_HEDLEY_LIKELY(m_it.primitive_iterator.get_value() == -n))
        {
            return *m_object;
        }

        JSON_THROW(invalid_iterator::create(214, "cannot get value", *m_object));
    }

    /*!
    @brief return the key of an object iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    const typename object_t::key_type& key() const
    {
        JSON_ASSERT(m_object != nullptr);

        if (JSON_HEDLEY_LIKELY(m_object->is_object()))
        {
            return m_it.object_iterator->first;
        }

        JSON_THROW(invalid_iterator::create(207, "cannot use key() for non-object iterators", *m_object));
    }

    /*!
    @brief return the value of an iterator
    @pre The iterator is initialized; i.e. `m_object != nullptr`.
    */
    reference value() const
    {
        // value() is a named alias for dereferencing the iterator
        return *(*this);
    }

  JSON_PRIVATE_UNLESS_TESTED:
    /// associated JSON instance
    pointer m_object = nullptr;
    /// the actual iterator of the associated instance
    internal_iterator<typename std::remove_const<BasicJsonType>::type> m_it {};
};
} // namespace detail
} // namespace nlohmann

// #include <nlohmann/detail/iterators/iteration_proxy.hpp>

// #include <nlohmann/detail/iterators/json_reverse_iterator.hpp>


#include <cstddef> // ptrdiff_t
#include <iterator> // reverse_iterator
#include <utility> // declval

namespace nlohmann
{
namespace detail
{
//////////////////////
// reverse_iterator //
//////////////////////

/*!
@brief a template for a reverse iterator class

@tparam Base the base iterator type to reverse. Valid types are @ref
iterator (to create @ref reverse_iterator) and @ref const_iterator (to
create @ref const_reverse_iterator).

@requirement The class satisfies the following concept requirements:
-
[BidirectionalIterator](https://en.cppreference.com/w/cpp/named_req/BidirectionalIterator):
  The iterator can be moved in both directions (i.e.
  incremented and decremented).
- [OutputIterator](https://en.cppreference.com/w/cpp/named_req/OutputIterator):
  It is possible to write to the pointed-to element (only if @a Base is
  @ref iterator).

@since version 1.0.0
*/
template<typename Base>
class json_reverse_iterator : public std::reverse_iterator<Base>
{
  public:
    using difference_type = std::ptrdiff_t;
    /// shortcut to the reverse iterator adapter
    using base_iterator = std::reverse_iterator<Base>;
    /// the reference type for the pointed-to element
    using reference = typename Base::reference;

    /// create reverse iterator from iterator
    explicit json_reverse_iterator(const typename base_iterator::iterator_type& it) noexcept
        : base_iterator(it) {}

    /// create reverse iterator from base class
    explicit json_reverse_iterator(const base_iterator& it) noexcept : base_iterator(it) {}

    /// post-increment (it++): returns the position before the increment
    json_reverse_iterator const operator++(int) // NOLINT(readability-const-return-type)
    {
        const base_iterator previous = base_iterator::operator++(1);
        return json_reverse_iterator(previous);
    }

    /// pre-increment (++it)
    json_reverse_iterator& operator++()
    {
        // the adapter does the work; hand back this object with its own type
        base_iterator::operator++();
        return *this;
    }

    /// post-decrement (it--): returns the position before the decrement
    json_reverse_iterator const operator--(int) // NOLINT(readability-const-return-type)
    {
        const base_iterator previous = base_iterator::operator--(1);
        return json_reverse_iterator(previous);
    }

    /// pre-decrement (--it)
    json_reverse_iterator& operator--()
    {
        base_iterator::operator--();
        return *this;
    }

    /// add to iterator (in place)
    json_reverse_iterator& operator+=(difference_type i)
    {
        base_iterator::operator+=(i);
        return *this;
    }

    /// add to iterator (returns a moved copy)
    json_reverse_iterator operator+(difference_type i) const
    {
        const base_iterator moved = base_iterator::operator+(i);
        return json_reverse_iterator(moved);
    }

    /// subtract from iterator (returns a moved copy)
    json_reverse_iterator operator-(difference_type i) const
    {
        const base_iterator moved = base_iterator::operator-(i);
        return json_reverse_iterator(moved);
    }

    /// return difference
    difference_type operator-(const json_reverse_iterator& other) const
    {
        // compare through the adapter type so that the difference operator
        // of std::reverse_iterator is selected
        const base_iterator& lhs = *this;
        const base_iterator& rhs = other;
        return lhs - rhs;
    }

    /// access to successor
    reference operator[](difference_type n) const
    {
        const json_reverse_iterator target = this->operator+(n);
        return *target;
    }

    /// return the key of an object iterator
    auto key() const -> decltype(std::declval<Base>().key())
    {
        // the element a reverse iterator refers to is the one before base()
        Base it = this->base();
        --it;
        return it.key();
    }

    /// return the value of an iterator
    reference value() const
    {
        // the element a reverse iterator refers to is the one before base()
        Base it = this->base();
        --it;
        return *it;
    }
};
}  // namespace detail
}  // namespace nlohmann

// #include <nlohmann/detail/iterators/primitive_iterator.hpp>

// #include <nlohmann/detail/json_pointer.hpp>


#include <algorithm> // all_of
#include <cctype> // isdigit
#include <limits> // max
#include <numeric> // accumulate
#include <string> // string
#include <utility> // move
#include <vector> // vector

// #include <nlohmann/detail/exceptions.hpp>

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/string_escape.hpp>

// #include <nlohmann/detail/value_t.hpp>


namespace nlohmann
{
template<typename BasicJsonType>
class json_pointer
{
    // allow basic_json to access private members
    NLOHMANN_BASIC_JSON_TPL_DECLARATION
    friend class basic_json;

  public:
    /*!
    @brief create JSON pointer

    Create a JSON pointer according to the syntax described in
    [Section 3 of RFC6901](https://tools.ietf.org/html/rfc6901#section-3).

    @param[in] s  string representing the JSON pointer; if omitted, the empty
                  string is assumed which references the whole JSON value

    @throw parse_error.107 if the given JSON pointer @a s is nonempty and does
                           not begin with a slash (`/`); see example below

    @throw parse_error.108 if a tilde (`~`) in the given JSON pointer @a s is
    not followed by `0` (representing `~`) or `1` (representing `/`); see
    example below

    @liveexample{The example shows the construction of several valid JSON pointers
    as well as the exceptional behavior.,json_pointer}

    @since version 2.0.0
    */
    // The string is parsed once, here: split() validates @a s and returns the
    // unescaped reference tokens, which are stored for all later operations.
    explicit json_pointer(const std::string& s = "")
        : reference_tokens(split(s))
    {}

    /*!
    @brief return a string representation of the JSON pointer

    @invariant For each JSON pointer `ptr`, it holds:
    @code {.cpp}
    ptr == json_pointer(ptr.to_string());
    @endcode

    @return a string representation of the JSON pointer

    @liveexample{The example shows the result of `to_string`.,json_pointer__to_string}

    @since version 2.0.0
    */
    std::string to_string() const
    {
        // every token contributes "/" followed by its escaped form; a pointer
        // without tokens therefore yields the empty string
        std::string text;
        for (const auto& token : reference_tokens)
        {
            text += "/";
            text += detail::escape(token);
        }
        return text;
    }

    /// @copydoc to_string()
    operator std::string() const
    {
        // the conversion is exactly the textual form produced by to_string()
        std::string text = this->to_string();
        return text;
    }

    /*!
    @brief append another JSON pointer at the end of this JSON pointer

    @param[in] ptr  JSON pointer to append
    @return JSON pointer with @a ptr appended

    @liveexample{The example shows the usage of `operator/=`.,json_pointer__operator_add}

    @complexity Linear in the length of @a ptr.

    @sa see @ref operator/=(std::string) to append a reference token
    @sa see @ref operator/=(std::size_t) to append an array index
    @sa see @ref operator/(const json_pointer&, const json_pointer&) for a binary operator

    @since version 3.6.0
    */
    json_pointer& operator/=(const json_pointer& ptr)
    {
        if (this == &ptr)
        {
            // self-append (p /= p): std::vector::insert must not be given a
            // source range that points into the destination vector, so the
            // tokens are duplicated from a snapshot instead
            const std::vector<std::string> snapshot = reference_tokens;
            reference_tokens.insert(reference_tokens.end(),
                                    snapshot.begin(),
                                    snapshot.end());
            return *this;
        }

        // regular case: copy all tokens of ptr behind our own tokens
        reference_tokens.insert(reference_tokens.end(),
                                ptr.reference_tokens.begin(),
                                ptr.reference_tokens.end());
        return *this;
    }

    /*!
    @brief append an unescaped reference token at the end of this JSON pointer

    @param[in] token  reference token to append
    @return JSON pointer with @a token appended without escaping @a token

    @liveexample{The example shows the usage of `operator/=`.,json_pointer__operator_add}

    @complexity Amortized constant.

    @sa see @ref operator/=(const json_pointer&) to append a JSON pointer
    @sa see @ref operator/=(std::size_t) to append an array index
    @sa see @ref operator/(const json_pointer&, std::size_t) for a binary operator

    @since version 3.6.0
    */
    json_pointer& operator/=(std::string token)
    {
        // token was taken by value, so it can be moved straight into the list
        reference_tokens.push_back(std::move(token));
        return *this;
    }

    /*!
    @brief append an array index at the end of this JSON pointer

    @param[in] array_idx  array index to append
    @return JSON pointer with @a array_idx appended

    @liveexample{The example shows the usage of `operator/=`.,json_pointer__operator_add}

    @complexity Amortized constant.

    @sa see @ref operator/=(const json_pointer&) to append a JSON pointer
    @sa see @ref operator/=(std::string) to append a reference token
    @sa see @ref operator/(const json_pointer&, std::string) for a binary operator

    @since version 3.6.0
    */
    json_pointer& operator/=(std::size_t array_idx)
    {
        // the index is stored as its decimal text, like any other token
        push_back(std::to_string(array_idx));
        return *this;
    }

    /*!
    @brief create a new JSON pointer by appending the right JSON pointer at the end of the left JSON pointer

    @param[in] lhs  JSON pointer
    @param[in] rhs  JSON pointer
    @return a new JSON pointer with @a rhs appended to @a lhs

    @liveexample{The example shows the usage of `operator/`.,json_pointer__operator_add_binary}

    @complexity Linear in the length of @a lhs and @a rhs.

    @sa see @ref operator/=(const json_pointer&) to append a JSON pointer

    @since version 3.6.0
    */
    friend json_pointer operator/(const json_pointer& lhs,
                                  const json_pointer& rhs)
    {
        // work on a copy of lhs so that neither operand is modified
        json_pointer joined = lhs;
        joined /= rhs;
        return joined;
    }

    /*!
    @brief create a new JSON pointer by appending the unescaped token at the end of the JSON pointer

    @param[in] ptr  JSON pointer
    @param[in] token  reference token
    @return a new JSON pointer with unescaped @a token appended to @a ptr

    @liveexample{The example shows the usage of `operator/`.,json_pointer__operator_add_binary}

    @complexity Linear in the length of @a ptr.

    @sa see @ref operator/=(std::string) to append a reference token

    @since version 3.6.0
    */
    friend json_pointer operator/(const json_pointer& ptr, std::string token) // NOLINT(performance-unnecessary-value-param)
    {
        // work on a copy of ptr and move the by-value token into it
        json_pointer extended = ptr;
        extended /= std::move(token);
        return extended;
    }

    /*!
    @brief create a new JSON pointer by appending the array-index-token at the end of the JSON pointer

    @param[in] ptr  JSON pointer
    @param[in] array_idx  array index
    @return a new JSON pointer with @a array_idx appended to @a ptr

    @liveexample{The example shows the usage of `operator/`.,json_pointer__operator_add_binary}

    @complexity Linear in the length of @a ptr.

    @sa see @ref operator/=(std::size_t) to append an array index

    @since version 3.6.0
    */
    friend json_pointer operator/(const json_pointer& ptr, std::size_t array_idx)
    {
        // work on a copy of ptr and append the index to that copy
        json_pointer extended = ptr;
        extended /= array_idx;
        return extended;
    }

    /*!
    @brief returns the parent of this JSON pointer

    @return parent of this JSON pointer; in case this JSON pointer is the root,
            the root itself is returned

    @complexity Linear in the length of the JSON pointer.

    @liveexample{The example shows the result of `parent_pointer` for different
    JSON Pointers.,json_pointer__parent_pointer}

    @since version 3.6.0
    */
    json_pointer parent_pointer() const
    {
        // start from a copy; the root has no tokens and is returned unchanged
        json_pointer parent = *this;
        if (!parent.empty())
        {
            // drop the last reference token to move one level up
            parent.reference_tokens.pop_back();
        }
        return parent;
    }

    /*!
    @brief remove last reference token

    @pre not `empty()`

    @liveexample{The example shows the usage of `pop_back`.,json_pointer__pop_back}

    @complexity Constant.

    @throw out_of_range.405 if JSON pointer has no parent

    @since version 3.6.0
    */
    void pop_back()
    {
        // the root pointer has no token that could be removed
        const bool is_root = empty();
        if (JSON_HEDLEY_UNLIKELY(is_root))
        {
            JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent", BasicJsonType()));
        }

        reference_tokens.pop_back();
    }

    /*!
    @brief return last reference token

    @pre not `empty()`
    @return last reference token

    @liveexample{The example shows the usage of `back`.,json_pointer__back}

    @complexity Constant.

    @throw out_of_range.405 if JSON pointer has no parent

    @since version 3.6.0
    */
    const std::string& back() const
    {
        // the root pointer has no last token
        const bool is_root = empty();
        if (JSON_HEDLEY_UNLIKELY(is_root))
        {
            JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent", BasicJsonType()));
        }

        const std::string& last_token = reference_tokens.back();
        return last_token;
    }

    /*!
    @brief append an unescaped token at the end of the reference pointer

    @param[in] token  token to add

    @complexity Amortized constant.

    @liveexample{The example shows the result of `push_back` for different
    JSON Pointers.,json_pointer__push_back}

    @since version 3.6.0
    */
    void push_back(const std::string& token)
    {
        // a copy of token becomes the new last reference token
        reference_tokens.emplace_back(token);
    }

    /// @copydoc push_back(const std::string&)
    void push_back(std::string&& token)
    {
        // token is moved in and becomes the new last reference token
        reference_tokens.emplace_back(std::move(token));
    }

    /*!
    @brief return whether pointer points to the root document

    @return true iff the JSON pointer points to the root document

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this function never throws exceptions.

    @liveexample{The example shows the result of `empty` for different JSON
    Pointers.,json_pointer__empty}

    @since version 3.6.0
    */
    bool empty() const noexcept
    {
        // a pointer without reference tokens denotes the root document
        return reference_tokens.size() == 0;
    }

  private:
    /*!
    @param[in] s  reference token to be converted into an array index

    @return integer representation of @a s

    @throw parse_error.106  if an array index begins with '0'
    @throw parse_error.109  if an array index does not begin with a digit
    @throw out_of_range.404 if string @a s could not be converted to an integer
    @throw out_of_range.410 if an array index exceeds size_type
    */
    static typename BasicJsonType::size_type array_index(const std::string& s)
    {
        using size_type = typename BasicJsonType::size_type;

        // error condition (cf. RFC 6901, Sect. 4): no leading zeros
        if (JSON_HEDLEY_UNLIKELY(s.size() > 1 && s[0] == '0'))
        {
            JSON_THROW(detail::parse_error::create(106, 0, "array index '" + s + "' must not begin with '0'", BasicJsonType()));
        }

        // error condition (cf. RFC 6901, Sect. 4): a token of more than one
        // character must start with a digit between '1' and '9'
        if (JSON_HEDLEY_UNLIKELY(s.size() > 1 && !(s[0] >= '1' && s[0] <= '9')))
        {
            JSON_THROW(detail::parse_error::create(109, 0, "array index '" + s + "' is not a number", BasicJsonType()));
        }

        std::size_t processed_chars = 0;
        unsigned long long res = 0;  // NOLINT(runtime/int)
        JSON_TRY
        {
            res = std::stoull(s, &processed_chars);
        }
        JSON_CATCH(std::out_of_range&)
        {
            // the digits do not fit into unsigned long long
            JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + s + "'", BasicJsonType()));
        }
        JSON_CATCH(std::invalid_argument&)
        {
            // std::stoull could not convert anything; this happens for tokens
            // the checks above let through, i.e. an empty token or a single
            // non-digit character. Report it as a library exception instead of
            // leaking std::invalid_argument to the caller.
            JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + s + "'", BasicJsonType()));
        }

        // check if the string was completely read
        if (JSON_HEDLEY_UNLIKELY(processed_chars != s.size()))
        {
            JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + s + "'", BasicJsonType()));
        }

        // only triggered on special platforms (like 32bit), see also
        // https://github.com/nlohmann/json/pull/2203
        if (res >= static_cast<unsigned long long>((std::numeric_limits<size_type>::max)()))  // NOLINT(runtime/int)
        {
            JSON_THROW(detail::out_of_range::create(410, "array index " + s + " exceeds size_type", BasicJsonType())); // LCOV_EXCL_LINE
        }

        return static_cast<size_type>(res);
    }

  JSON_PRIVATE_UNLESS_TESTED:
    json_pointer top() const
    {
        // the root pointer has no first token to return
        const bool is_root = empty();
        if (JSON_HEDLEY_UNLIKELY(is_root))
        {
            JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent", BasicJsonType()));
        }

        // build a pointer that consists of the first reference token only
        json_pointer first_only;
        first_only.reference_tokens.push_back(reference_tokens.front());
        return first_only;
    }

  private:
    /*!
    @brief create and return a reference to the pointed to value

    @complexity Linear in the number of reference tokens.

    @throw parse_error.109 if array index is not a number
    @throw type_error.313 if value cannot be unflattened
    */
    BasicJsonType& get_and_create(BasicJsonType& j) const
    {
        BasicJsonType* current = &j;

        // without reference tokens the loop is skipped and j itself is
        // returned, to be overwritten by the caller
        for (const auto& token : reference_tokens)
        {
            const auto kind = current->type();

            if (kind == detail::value_t::array)
            {
                // create an entry in the array
                current = &current->operator[](array_index(token));
            }
            else if (kind == detail::value_t::object)
            {
                // create an entry in the object
                current = &current->operator[](token);
            }
            else if (kind == detail::value_t::null)
            {
                if (token == "0")
                {
                    // token 0 on a null value starts a new array
                    current = &current->operator[](0);
                }
                else
                {
                    // any other token on a null value starts a new object
                    current = &current->operator[](token);
                }
            }
            else
            {
                // a token is left but the current value is primitive: primitive
                // values may only occur with an empty list of reference tokens
                JSON_THROW(detail::type_error::create(313, "invalid value to unflatten", j));
            }
        }

        return *current;
    }

    /*!
    @brief return a reference to the pointed to value

    @note This version does not throw if a value is not present, but tries to
          create nested values instead. For instance, calling this function
          with pointer `"/this/that"` on a null value is equivalent to calling
          `operator[]("this").operator[]("that")` on that value, effectively
          changing the null value to an object.

    @param[in] ptr  a JSON value

    @return reference to the JSON value pointed to by the JSON pointer

    @complexity Linear in the length of the JSON pointer.

    @throw parse_error.106   if an array index begins with '0'
    @throw parse_error.109   if an array index was not a number
    @throw out_of_range.404  if the JSON pointer can not be resolved
    */
    BasicJsonType& get_unchecked(BasicJsonType* ptr) const
    {
        for (const auto& token : reference_tokens)
        {
            // a null value is first converted to the container the token asks for
            if (ptr->is_null())
            {
                // the token counts as a number when all its characters are digits
                bool only_digits = true;
                for (const unsigned char ch : token)
                {
                    if (!std::isdigit(ch))
                    {
                        only_digits = false;
                        break;
                    }
                }

                // numbers and "-" select an array, everything else an object
                if (only_digits || token == "-")
                {
                    *ptr = detail::value_t::array;
                }
                else
                {
                    *ptr = detail::value_t::object;
                }
            }

            const auto kind = ptr->type();

            if (kind == detail::value_t::object)
            {
                // use unchecked object access
                ptr = &ptr->operator[](token);
            }
            else if (kind == detail::value_t::array)
            {
                if (token == "-")
                {
                    // explicitly treat "-" as index beyond the end
                    ptr = &ptr->operator[](ptr->m_value.array->size());
                }
                else
                {
                    // convert array index to number; unchecked access
                    ptr = &ptr->operator[](array_index(token));
                }
            }
            else
            {
                JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + token + "'", *ptr));
            }
        }

        return *ptr;
    }

    /*!
    @throw parse_error.106   if an array index begins with '0'
    @throw parse_error.109   if an array index was not a number
    @throw out_of_range.402  if the array index '-' is used
    @throw out_of_range.404  if the JSON pointer can not be resolved
    */
    BasicJsonType& get_checked(BasicJsonType* ptr) const
    {
        for (const auto& token : reference_tokens)
        {
            const auto kind = ptr->type();

            if (kind == detail::value_t::object)
            {
                // at() performs the range check for the key
                ptr = &ptr->at(token);
            }
            else if (kind == detail::value_t::array)
            {
                if (JSON_HEDLEY_UNLIKELY(token == "-"))
                {
                    // "-" always fails the range check
                    JSON_THROW(detail::out_of_range::create(402,
                                                            "array index '-' (" + std::to_string(ptr->m_value.array->size()) +
                                                            ") is out of range", *ptr));
                }

                // at() performs the range check for the index
                ptr = &ptr->at(array_index(token));
            }
            else
            {
                JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + token + "'", *ptr));
            }
        }

        return *ptr;
    }

    /*!
    @brief return a const reference to the pointed to value

    @param[in] ptr  a JSON value

    @return const reference to the JSON value pointed to by the JSON
    pointer

    @throw parse_error.106   if an array index begins with '0'
    @throw parse_error.109   if an array index was not a number
    @throw out_of_range.402  if the array index '-' is used
    @throw out_of_range.404  if the JSON pointer can not be resolved
    */
    const BasicJsonType& get_unchecked(const BasicJsonType* ptr) const
    {
        for (const auto& token : reference_tokens)
        {
            const auto kind = ptr->type();

            if (kind == detail::value_t::object)
            {
                // use unchecked object access
                ptr = &ptr->operator[](token);
            }
            else if (kind == detail::value_t::array)
            {
                if (JSON_HEDLEY_UNLIKELY(token == "-"))
                {
                    // "-" cannot be used for const access
                    JSON_THROW(detail::out_of_range::create(402, "array index '-' (" + std::to_string(ptr->m_value.array->size()) + ") is out of range", *ptr));
                }

                // use unchecked array access
                ptr = &ptr->operator[](array_index(token));
            }
            else
            {
                JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + token + "'", *ptr));
            }
        }

        return *ptr;
    }

    /*!
    @throw parse_error.106   if an array index begins with '0'
    @throw parse_error.109   if an array index was not a number
    @throw out_of_range.402  if the array index '-' is used
    @throw out_of_range.404  if the JSON pointer can not be resolved
    */
    const BasicJsonType& get_checked(const BasicJsonType* ptr) const
    {
        for (const auto& token : reference_tokens)
        {
            const auto kind = ptr->type();

            if (kind == detail::value_t::object)
            {
                // at() performs the range check for the key
                ptr = &ptr->at(token);
            }
            else if (kind == detail::value_t::array)
            {
                if (JSON_HEDLEY_UNLIKELY(token == "-"))
                {
                    // "-" always fails the range check
                    JSON_THROW(detail::out_of_range::create(402,
                                                            "array index '-' (" + std::to_string(ptr->m_value.array->size()) +
                                                            ") is out of range", *ptr));
                }

                // at() performs the range check for the index
                ptr = &ptr->at(array_index(token));
            }
            else
            {
                JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + token + "'", *ptr));
            }
        }

        return *ptr;
    }

    /*!
    @throw parse_error.106   if an array index begins with '0'
    @throw parse_error.109   if an array index was not a number
    */
    // Walks the value pointed to by ptr along the reference tokens and reports
    // whether every token can be resolved. Tokens that cannot be an array index
    // (including the empty token) yield false when applied to an array.
    bool contains(const BasicJsonType* ptr) const
    {
        for (const auto& reference_token : reference_tokens)
        {
            switch (ptr->type())
            {
                case detail::value_t::object:
                {
                    if (!ptr->contains(reference_token))
                    {
                        // we did not find the key in the object
                        return false;
                    }

                    ptr = &ptr->operator[](reference_token);
                    break;
                }

                case detail::value_t::array:
                {
                    if (JSON_HEDLEY_UNLIKELY(reference_token.empty()))
                    {
                        // an empty token is not an array index; without this
                        // guard it would reach std::stoull("") inside
                        // array_index, which throws std::invalid_argument
                        return false;
                    }
                    if (JSON_HEDLEY_UNLIKELY(reference_token == "-"))
                    {
                        // "-" always fails the range check
                        return false;
                    }
                    if (JSON_HEDLEY_UNLIKELY(reference_token.size() == 1 && !("0" <= reference_token && reference_token <= "9")))
                    {
                        // invalid char
                        return false;
                    }
                    if (JSON_HEDLEY_UNLIKELY(reference_token.size() > 1))
                    {
                        if (JSON_HEDLEY_UNLIKELY(!('1' <= reference_token[0] && reference_token[0] <= '9')))
                        {
                            // first char should be between '1' and '9'
                            return false;
                        }
                        for (std::size_t i = 1; i < reference_token.size(); i++)
                        {
                            if (JSON_HEDLEY_UNLIKELY(!('0' <= reference_token[i] && reference_token[i] <= '9')))
                            {
                                // other char should be between '0' and '9'
                                return false;
                            }
                        }
                    }

                    // the token is a well-formed decimal number at this point
                    const auto idx = array_index(reference_token);
                    if (idx >= ptr->size())
                    {
                        // index out of range
                        return false;
                    }

                    ptr = &ptr->operator[](idx);
                    break;
                }

                default:
                {
                    // we do not expect primitive values if there is still a
                    // reference token to process
                    return false;
                }
            }
        }

        // no reference token left: every token was resolved
        return true;
    }

    /*!
    @brief split the string input to reference tokens

    @note This function is only called by the json_pointer constructor.
          All exceptions below are documented there.

    @throw parse_error.107  if the pointer is nonempty and does not begin with '/'
    @throw parse_error.108  if character '~' is not followed by '0' or '1'
    */
    static std::vector<std::string> split(const std::string& reference_string)
    {
        std::vector<std::string> tokens;

        // special case: empty reference string -> no reference tokens
        if (reference_string.empty())
        {
            return tokens;
        }

        // check if nonempty reference string begins with slash
        if (JSON_HEDLEY_UNLIKELY(reference_string[0] != '/'))
        {
            JSON_THROW(detail::parse_error::create(107, 1, "JSON pointer must be empty or begin with '/' - was: '" + reference_string + "'", BasicJsonType()));
        }

        // walk the string token by token:
        // - start: position right after the previous slash
        // - slash: position of the next slash, or npos for the last token
        std::size_t start = 1;
        while (true)
        {
            const std::size_t slash = reference_string.find_first_of('/', start);

            // the token is the text between start and the next slash (or the
            // end of the string if there is no further slash)
            auto token = reference_string.substr(start, slash - start);

            // check that every '~' in the token starts a valid escape sequence
            std::size_t tilde = token.find_first_of('~');
            while (tilde != std::string::npos)
            {
                JSON_ASSERT(token[tilde] == '~');

                // ~ must be followed by 0 or 1
                if (JSON_HEDLEY_UNLIKELY(tilde == token.size() - 1 ||
                                         (token[tilde + 1] != '0' &&
                                          token[tilde + 1] != '1')))
                {
                    JSON_THROW(detail::parse_error::create(108, 0, "escape character '~' must be followed with '0' or '1'", BasicJsonType()));
                }

                tilde = token.find_first_of('~', tilde + 1);
            }

            // store the token in its unescaped form
            detail::unescape(token);
            tokens.push_back(token);

            if (slash == std::string::npos)
            {
                // that was the last token
                break;
            }
            start = slash + 1;
        }

        return tokens;
    }

  private:
    /*!
    @param[in] reference_string  the reference string to the current value
    @param[in] value             the value to consider
    @param[in,out] result        the result object to insert values to

    @note Empty objects or arrays are flattened to `null`.
    */
    static void flatten(const std::string& reference_string,
                        const BasicJsonType& value,
                        BasicJsonType& result)
    {
        const auto kind = value.type();

        if (kind == detail::value_t::array)
        {
            const auto& elements = *value.m_value.array;
            if (elements.empty())
            {
                // flatten empty array as null
                result[reference_string] = nullptr;
                return;
            }

            // recurse into each element, using its index as reference token
            for (std::size_t idx = 0; idx < elements.size(); ++idx)
            {
                flatten(reference_string + "/" + std::to_string(idx),
                        elements[idx], result);
            }
            return;
        }

        if (kind == detail::value_t::object)
        {
            const auto& members = *value.m_value.object;
            if (members.empty())
            {
                // flatten empty object as null
                result[reference_string] = nullptr;
                return;
            }

            // recurse into each member, using its escaped key as reference token
            for (const auto& member : members)
            {
                flatten(reference_string + "/" + detail::escape(member.first), member.second, result);
            }
            return;
        }

        // primitive value: store it under its reference string
        result[reference_string] = value;
    }

    /*!
    @param[in] value  flattened JSON

    @return unflattened JSON

    @throw parse_error.109 if array index is not a number
    @throw type_error.314  if value is not an object
    @throw type_error.315  if object values are not primitive
    @throw type_error.313  if value cannot be unflattened
    */
    static BasicJsonType
    unflatten(const BasicJsonType& value)
    {
        const bool is_flat_object = value.is_object();
        if (JSON_HEDLEY_UNLIKELY(!is_flat_object))
        {
            JSON_THROW(detail::type_error::create(314, "only objects can be unflattened", value));
        }

        BasicJsonType unflattened;

        // each member maps a JSON pointer string to a primitive value
        for (const auto& member : *value.m_value.object)
        {
            const auto& path = member.first;
            const auto& leaf = member.second;

            if (JSON_HEDLEY_UNLIKELY(!leaf.is_primitive()))
            {
                JSON_THROW(detail::type_error::create(315, "values in object must be primitive", leaf));
            }

            // write the leaf to the location named by the pointer; for the
            // pointer "" get_and_create returns unflattened itself, so the
            // assignment turns the whole result into that primitive value
            const json_pointer target(path);
            target.get_and_create(unflattened) = leaf;
        }

        return unflattened;
    }

    /*!
    @brief compares two JSON pointers for equality

    @param[in] lhs  JSON pointer to compare
    @param[in] rhs  JSON pointer to compare
    @return whether @a lhs is equal to @a rhs

    @complexity Linear in the length of the JSON pointer

    @exceptionsafety No-throw guarantee: this function never throws exceptions.
    */
    friend bool operator==(json_pointer const& lhs,
                           json_pointer const& rhs) noexcept
    {
        // two pointers are equal when their token sequences are equal
        const auto& left_tokens = lhs.reference_tokens;
        const auto& right_tokens = rhs.reference_tokens;
        return left_tokens == right_tokens;
    }

    /*!
    @brief compares two JSON pointers for inequality

    @param[in] lhs  JSON pointer to compare
    @param[in] rhs  JSON pointer to compare
    @return whether @a lhs is not equal @a rhs

    @complexity Linear in the length of the JSON pointer

    @exceptionsafety No-throw guarantee: this function never throws exceptions.
    */
    friend bool operator!=(json_pointer const& lhs,
                           json_pointer const& rhs) noexcept
    {
        // defined as the negation of operator==
        const bool same = (lhs == rhs);
        return !same;
    }

    /// the reference tokens: the unescaped segments of the JSON pointer in
    /// the order they appear; empty for a pointer to the root document
    std::vector<std::string> reference_tokens;
};
}  // namespace nlohmann

// #include <nlohmann/detail/json_ref.hpp>


#include <initializer_list>
#include <utility>

// #include <nlohmann/detail/meta/type_traits.hpp>


namespace nlohmann
{
namespace detail
{
/*!
@brief holder that either owns a JSON value or refers to an existing one

A json_ref constructed from an rvalue, an initializer list, or constructor
arguments stores the value itself (`owned_value`); one constructed from a
const lvalue only stores its address (`value_ref`). moved_or_copied() then
hands out the value by moving the owned one or copying the referenced one.
*/
template<typename BasicJsonType>
class json_ref
{
  public:
    using value_type = BasicJsonType;

    /// take ownership of an rvalue
    json_ref(value_type&& value)
        : owned_value(std::move(value))
    {}

    /// refer to an existing value without copying it
    json_ref(const value_type& value)
        : value_ref(&value)
    {}

    /// build an owned value from an initializer list
    json_ref(std::initializer_list<json_ref> init)
        : owned_value(init)
    {}

    /// build an owned value from any arguments value_type is constructible from
    template <
        class... Args,
        enable_if_t<std::is_constructible<value_type, Args...>::value, int> = 0 >
    json_ref(Args && ... args)
        : owned_value(std::forward<Args>(args)...)
    {}

    // class should be movable only
    json_ref(json_ref&&) noexcept = default;
    json_ref(const json_ref&) = delete;
    json_ref& operator=(const json_ref&) = delete;
    json_ref& operator=(json_ref&&) = delete;
    ~json_ref() = default;

    /// return the value: a copy of the referenced one, or the owned one moved out
    value_type moved_or_copied() const
    {
        if (value_ref != nullptr)
        {
            return *value_ref;
        }
        // owned_value is mutable so that it can be moved from a const json_ref
        return std::move(owned_value);
    }

    /// access the value without transferring it
    value_type const& operator*() const
    {
        if (value_ref)
        {
            return *value_ref;
        }
        return owned_value;
    }

    /// member access to the value returned by operator*
    value_type const* operator->() const
    {
        value_type const& target = **this;
        return &target;
    }

  private:
    /// the stored value; used when value_ref is null
    mutable value_type owned_value = nullptr;
    /// address of an external value, or null if the value is owned
    value_type const* value_ref = nullptr;
};
}  // namespace detail
}  // namespace nlohmann

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/string_escape.hpp>

// #include <nlohmann/detail/meta/cpp_future.hpp>

// #include <nlohmann/detail/meta/type_traits.hpp>

// #include <nlohmann/detail/output/binary_writer.hpp>


#include <algorithm> // reverse
#include <array> // array
#include <cmath> // isnan, isinf
#include <cstdint> // uint8_t, uint16_t, uint32_t, uint64_t
#include <cstring> // memcpy
#include <limits> // numeric_limits
#include <string> // string
#include <utility> // move

// #include <nlohmann/detail/input/binary_reader.hpp>

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/output/output_adapters.hpp>


#include <algorithm> // copy
#include <cstddef> // size_t
#include <ios> // streamsize
#include <iterator> // back_inserter
#include <memory> // shared_ptr, make_shared
#include <ostream> // basic_ostream
#include <string> // basic_string
#include <vector> // vector
// #include <nlohmann/detail/macro_scope.hpp>


namespace nlohmann
{
namespace detail
{
/// abstract output adapter interface
template<typename CharType>
struct output_adapter_protocol
{
    // construction, copying and moving are all compiler-generated
    output_adapter_protocol() = default;
    output_adapter_protocol(const output_adapter_protocol&) = default;
    output_adapter_protocol(output_adapter_protocol&&) noexcept = default;
    output_adapter_protocol& operator=(const output_adapter_protocol&) = default;
    output_adapter_protocol& operator=(output_adapter_protocol&&) noexcept = default;

    /// write a single character to the sink
    virtual void write_character(CharType c) = 0;

    /// write the @a length characters starting at @a s to the sink
    virtual void write_characters(const CharType* s, std::size_t length) = 0;

    /// virtual so adapters can be destroyed through a base pointer
    virtual ~output_adapter_protocol() = default;
};

/// a type to simplify interfaces: a shared pointer to any output adapter
/// implementing output_adapter_protocol for the given character type
template<typename CharType>
using output_adapter_t = std::shared_ptr<output_adapter_protocol<CharType>>;

/// output adapter for byte vectors
template<typename CharType>
class output_vector_adapter : public output_adapter_protocol<CharType>
{
  public:
    /// bind to @a vec; the vector is held by reference, not copied
    explicit output_vector_adapter(std::vector<CharType>& vec) noexcept
        : target(vec)
    {}

    /// append one character to the vector
    void write_character(CharType c) override
    {
        target.push_back(c);
    }

    /// append the @a length characters starting at @a s to the vector
    JSON_HEDLEY_NON_NULL(2)
    void write_characters(const CharType* s, std::size_t length) override
    {
        const CharType* const first = s;
        const CharType* const last = s + length;
        std::copy(first, last, std::back_inserter(target));
    }

  private:
    /// destination vector (owned by the caller)
    std::vector<CharType>& target;
};

/// output adapter for output streams
template<typename CharType>
class output_stream_adapter : public output_adapter_protocol<CharType>
{
  public:
    /// bind to @a s; the stream is held by reference
    explicit output_stream_adapter(std::basic_ostream<CharType>& s) noexcept
        : out(s)
    {}

    /// put one character on the stream
    void write_character(CharType c) override
    {
        out.put(c);
    }

    /// write the @a length characters starting at @a s to the stream
    JSON_HEDLEY_NON_NULL(2)
    void write_characters(const CharType* s, std::size_t length) override
    {
        // ostream::write takes a signed count
        const auto count = static_cast<std::streamsize>(length);
        out.write(s, count);
    }

  private:
    /// destination stream (owned by the caller)
    std::basic_ostream<CharType>& out;
};

/// output adapter for basic_string
template<typename CharType, typename StringType = std::basic_string<CharType>>
class output_string_adapter : public output_adapter_protocol<CharType>
{
  public:
    /// bind to @a s; the string is held by reference, not copied
    explicit output_string_adapter(StringType& s) noexcept
        : target(s)
    {}

    /// append one character to the string
    void write_character(CharType c) override
    {
        target.push_back(c);
    }

    /// append the @a length characters starting at @a s to the string
    JSON_HEDLEY_NON_NULL(2)
    void write_characters(const CharType* s, std::size_t length) override
    {
        target.append(s, length);
    }

  private:
    /// destination string (owned by the caller)
    StringType& target;
};

/// Factory-style wrapper: each constructor picks the concrete adapter that
/// matches the destination and stores it behind the shared adapter handle.
template<typename CharType, typename StringType = std::basic_string<CharType>>
class output_adapter
{
  public:
    /// write into a vector of characters
    output_adapter(std::vector<CharType>& destination)
        : oa(std::make_shared<output_vector_adapter<CharType>>(destination))
    {}

    /// write into an output stream
    output_adapter(std::basic_ostream<CharType>& destination)
        : oa(std::make_shared<output_stream_adapter<CharType>>(destination))
    {}

    /// write into a string
    output_adapter(StringType& destination)
        : oa(std::make_shared<output_string_adapter<CharType, StringType>>(destination))
    {}

    /// hand out the shared adapter handle
    operator output_adapter_t<CharType>()
    {
        return oa;
    }

  private:
    output_adapter_t<CharType> oa = nullptr;
};
}  // namespace detail
}  // namespace nlohmann


namespace nlohmann
{
namespace detail
{
///////////////////
// binary writer //
///////////////////

/*!
@brief serialization to CBOR, MessagePack, UBJSON, and BSON values
*/
template<typename BasicJsonType, typename CharType>
class binary_writer
{
    using string_t = typename BasicJsonType::string_t;
    using binary_t = typename BasicJsonType::binary_t;
    using number_float_t = typename BasicJsonType::number_float_t;

  public:
    /*!
    @brief create a binary writer

    @param[in] adapter  output adapter to write to
    */
    explicit binary_writer(output_adapter_t<CharType> adapter)
        : oa(std::move(adapter))
    {
        // every write_* function dereferences oa, so it must not be empty
        JSON_ASSERT(oa);
    }

    /*!
    @param[in] j  JSON value to serialize
    @pre       j.type() == value_t::object
    */
    void write_bson(const BasicJsonType& j)
    {
        // only an object is accepted at the top level; everything else is
        // reported as type_error 317
        if (j.type() == value_t::object)
        {
            write_bson_object(*j.m_value.object);
        }
        else
        {
            JSON_THROW(type_error::create(317, "to serialize to BSON, top-level type must be object, but is " + std::string(j.type_name()), j));
        }
    }

    /*!
    @param[in] j  JSON value to serialize
    */
    void write_cbor(const BasicJsonType& j)
    {
        // Serializes j to the output adapter as CBOR, recursing into arrays
        // and objects. The general pattern for sized items is: values up to
        // 0x17 are folded into the control byte itself; larger values get a
        // dedicated control byte followed by the smallest unsigned integer
        // width (8/16/32/64 bit) that can hold them.
        // NOTE(review): write_number is defined outside this view; it is
        // assumed to emit the integer in the byte order the format requires.
        switch (j.type())
        {
            case value_t::null:
            {
                // null is the single byte 0xF6
                oa->write_character(to_char_type(0xF6));
                break;
            }

            case value_t::boolean:
            {
                // true is 0xF5, false is 0xF4
                oa->write_character(j.m_value.boolean
                                    ? to_char_type(0xF5)
                                    : to_char_type(0xF4));
                break;
            }

            case value_t::number_integer:
            {
                if (j.m_value.number_integer >= 0)
                {
                    // CBOR does not differentiate between positive signed
                    // integers and unsigned integers. Therefore, we used the
                    // code from the value_t::number_unsigned case here.
                    if (j.m_value.number_integer <= 0x17)
                    {
                        // small values are their own control byte
                        write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
                    }
                    else if (j.m_value.number_integer <= (std::numeric_limits<std::uint8_t>::max)())
                    {
                        oa->write_character(to_char_type(0x18));
                        write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
                    }
                    else if (j.m_value.number_integer <= (std::numeric_limits<std::uint16_t>::max)())
                    {
                        oa->write_character(to_char_type(0x19));
                        write_number(static_cast<std::uint16_t>(j.m_value.number_integer));
                    }
                    else if (j.m_value.number_integer <= (std::numeric_limits<std::uint32_t>::max)())
                    {
                        oa->write_character(to_char_type(0x1A));
                        write_number(static_cast<std::uint32_t>(j.m_value.number_integer));
                    }
                    else
                    {
                        oa->write_character(to_char_type(0x1B));
                        write_number(static_cast<std::uint64_t>(j.m_value.number_integer));
                    }
                }
                else
                {
                    // The conversions below encode the sign in the first
                    // byte, and the value is converted to a positive number.
                    // (-1 - n maps -1 to 0, -2 to 1, ... and cannot overflow
                    // for negative n.)
                    const auto positive_number = -1 - j.m_value.number_integer;
                    if (j.m_value.number_integer >= -24)
                    {
                        // -1..-24 fit into the control byte (0x20..0x37)
                        write_number(static_cast<std::uint8_t>(0x20 + positive_number));
                    }
                    else if (positive_number <= (std::numeric_limits<std::uint8_t>::max)())
                    {
                        oa->write_character(to_char_type(0x38));
                        write_number(static_cast<std::uint8_t>(positive_number));
                    }
                    else if (positive_number <= (std::numeric_limits<std::uint16_t>::max)())
                    {
                        oa->write_character(to_char_type(0x39));
                        write_number(static_cast<std::uint16_t>(positive_number));
                    }
                    else if (positive_number <= (std::numeric_limits<std::uint32_t>::max)())
                    {
                        oa->write_character(to_char_type(0x3A));
                        write_number(static_cast<std::uint32_t>(positive_number));
                    }
                    else
                    {
                        oa->write_character(to_char_type(0x3B));
                        write_number(static_cast<std::uint64_t>(positive_number));
                    }
                }
                break;
            }

            case value_t::number_unsigned:
            {
                // same ladder as the non-negative number_integer branch above
                if (j.m_value.number_unsigned <= 0x17)
                {
                    write_number(static_cast<std::uint8_t>(j.m_value.number_unsigned));
                }
                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    oa->write_character(to_char_type(0x18));
                    write_number(static_cast<std::uint8_t>(j.m_value.number_unsigned));
                }
                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    oa->write_character(to_char_type(0x19));
                    write_number(static_cast<std::uint16_t>(j.m_value.number_unsigned));
                }
                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    oa->write_character(to_char_type(0x1A));
                    write_number(static_cast<std::uint32_t>(j.m_value.number_unsigned));
                }
                else
                {
                    oa->write_character(to_char_type(0x1B));
                    write_number(static_cast<std::uint64_t>(j.m_value.number_unsigned));
                }
                break;
            }

            case value_t::number_float:
            {
                // NaN and the infinities get fixed three-byte encodings; all
                // other values are delegated to write_compact_float
                if (std::isnan(j.m_value.number_float))
                {
                    // NaN is 0xf97e00 in CBOR
                    oa->write_character(to_char_type(0xF9));
                    oa->write_character(to_char_type(0x7E));
                    oa->write_character(to_char_type(0x00));
                }
                else if (std::isinf(j.m_value.number_float))
                {
                    // Infinity is 0xf97c00, -Infinity is 0xf9fc00
                    oa->write_character(to_char_type(0xf9));
                    oa->write_character(j.m_value.number_float > 0 ? to_char_type(0x7C) : to_char_type(0xFC));
                    oa->write_character(to_char_type(0x00));
                }
                else
                {
                    write_compact_float(j.m_value.number_float, detail::input_format_t::cbor);
                }
                break;
            }

            case value_t::string:
            {
                // step 1: write control byte and the string length
                // (control bytes 0x60..0x77 carry the length directly,
                // 0x78..0x7B announce an 8/16/32/64-bit length)
                const auto N = j.m_value.string->size();
                if (N <= 0x17)
                {
                    write_number(static_cast<std::uint8_t>(0x60 + N));
                }
                else if (N <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    oa->write_character(to_char_type(0x78));
                    write_number(static_cast<std::uint8_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    oa->write_character(to_char_type(0x79));
                    write_number(static_cast<std::uint16_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    oa->write_character(to_char_type(0x7A));
                    write_number(static_cast<std::uint32_t>(N));
                }
                // LCOV_EXCL_START
                else if (N <= (std::numeric_limits<std::uint64_t>::max)())
                {
                    oa->write_character(to_char_type(0x7B));
                    write_number(static_cast<std::uint64_t>(N));
                }
                // LCOV_EXCL_STOP

                // step 2: write the string
                oa->write_characters(
                    reinterpret_cast<const CharType*>(j.m_value.string->c_str()),
                    j.m_value.string->size());
                break;
            }

            case value_t::array:
            {
                // step 1: write control byte and the array size
                // (0x80..0x97 carry the size directly, 0x98..0x9B announce
                // an 8/16/32/64-bit size)
                const auto N = j.m_value.array->size();
                if (N <= 0x17)
                {
                    write_number(static_cast<std::uint8_t>(0x80 + N));
                }
                else if (N <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    oa->write_character(to_char_type(0x98));
                    write_number(static_cast<std::uint8_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    oa->write_character(to_char_type(0x99));
                    write_number(static_cast<std::uint16_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    oa->write_character(to_char_type(0x9A));
                    write_number(static_cast<std::uint32_t>(N));
                }
                // LCOV_EXCL_START
                else if (N <= (std::numeric_limits<std::uint64_t>::max)())
                {
                    oa->write_character(to_char_type(0x9B));
                    write_number(static_cast<std::uint64_t>(N));
                }
                // LCOV_EXCL_STOP

                // step 2: write each element
                for (const auto& el : *j.m_value.array)
                {
                    write_cbor(el);
                }
                break;
            }

            case value_t::binary:
            {
                // a subtype, when present, is emitted first as 0xd8 followed
                // by the subtype value
                if (j.m_value.binary->has_subtype())
                {
                    write_number(static_cast<std::uint8_t>(0xd8));
                    write_number(j.m_value.binary->subtype());
                }

                // step 1: write control byte and the binary array size
                // (0x40..0x57 carry the size directly, 0x58..0x5B announce
                // an 8/16/32/64-bit size)
                const auto N = j.m_value.binary->size();
                if (N <= 0x17)
                {
                    write_number(static_cast<std::uint8_t>(0x40 + N));
                }
                else if (N <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    oa->write_character(to_char_type(0x58));
                    write_number(static_cast<std::uint8_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    oa->write_character(to_char_type(0x59));
                    write_number(static_cast<std::uint16_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    oa->write_character(to_char_type(0x5A));
                    write_number(static_cast<std::uint32_t>(N));
                }
                // LCOV_EXCL_START
                else if (N <= (std::numeric_limits<std::uint64_t>::max)())
                {
                    oa->write_character(to_char_type(0x5B));
                    write_number(static_cast<std::uint64_t>(N));
                }
                // LCOV_EXCL_STOP

                // step 2: write each element
                // (the raw bytes are written in one call)
                oa->write_characters(
                    reinterpret_cast<const CharType*>(j.m_value.binary->data()),
                    N);

                break;
            }

            case value_t::object:
            {
                // step 1: write control byte and the object size
                // (0xA0..0xB7 carry the size directly, 0xB8..0xBB announce
                // an 8/16/32/64-bit size)
                const auto N = j.m_value.object->size();
                if (N <= 0x17)
                {
                    write_number(static_cast<std::uint8_t>(0xA0 + N));
                }
                else if (N <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    oa->write_character(to_char_type(0xB8));
                    write_number(static_cast<std::uint8_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    oa->write_character(to_char_type(0xB9));
                    write_number(static_cast<std::uint16_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    oa->write_character(to_char_type(0xBA));
                    write_number(static_cast<std::uint32_t>(N));
                }
                // LCOV_EXCL_START
                else if (N <= (std::numeric_limits<std::uint64_t>::max)())
                {
                    oa->write_character(to_char_type(0xBB));
                    write_number(static_cast<std::uint64_t>(N));
                }
                // LCOV_EXCL_STOP

                // step 2: write each element
                // (key first, then the mapped value; both go through
                // write_cbor, so the key is serialized like any other value)
                for (const auto& el : *j.m_value.object)
                {
                    write_cbor(el.first);
                    write_cbor(el.second);
                }
                break;
            }

            default:
                // any other value type produces no output
                break;
        }
    }

    /*!
    @param[in] j  JSON value to serialize
    */
    void write_msgpack(const BasicJsonType& j)
    {
        switch (j.type())
        {
            case value_t::null: // nil
            {
                oa->write_character(to_char_type(0xC0));
                break;
            }

            case value_t::boolean: // true and false
            {
                oa->write_character(j.m_value.boolean
                                    ? to_char_type(0xC3)
                                    : to_char_type(0xC2));
                break;
            }

            case value_t::number_integer:
            {
                if (j.m_value.number_integer >= 0)
                {
                    // MessagePack does not differentiate between positive
                    // signed integers and unsigned integers. Therefore, we used
                    // the code from the value_t::number_unsigned case here.
                    if (j.m_value.number_unsigned < 128)
                    {
                        // positive fixnum
                        write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
                    }
                    else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
                    {
                        // uint 8
                        oa->write_character(to_char_type(0xCC));
                        write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
                    }
                    else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
                    {
                        // uint 16
                        oa->write_character(to_char_type(0xCD));
                        write_number(static_cast<std::uint16_t>(j.m_value.number_integer));
                    }
                    else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
                    {
                        // uint 32
                        oa->write_character(to_char_type(0xCE));
                        write_number(static_cast<std::uint32_t>(j.m_value.number_integer));
                    }
                    else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint64_t>::max)())
                    {
                        // uint 64
                        oa->write_character(to_char_type(0xCF));
                        write_number(static_cast<std::uint64_t>(j.m_value.number_integer));
                    }
                }
                else
                {
                    if (j.m_value.number_integer >= -32)
                    {
                        // negative fixnum
                        write_number(static_cast<std::int8_t>(j.m_value.number_integer));
                    }
                    else if (j.m_value.number_integer >= (std::numeric_limits<std::int8_t>::min)() &&
                             j.m_value.number_integer <= (std::numeric_limits<std::int8_t>::max)())
                    {
                        // int 8
                        oa->write_character(to_char_type(0xD0));
                        write_number(static_cast<std::int8_t>(j.m_value.number_integer));
                    }
                    else if (j.m_value.number_integer >= (std::numeric_limits<std::int16_t>::min)() &&
                             j.m_value.number_integer <= (std::numeric_limits<std::int16_t>::max)())
                    {
                        // int 16
                        oa->write_character(to_char_type(0xD1));
                        write_number(static_cast<std::int16_t>(j.m_value.number_integer));
                    }
                    else if (j.m_value.number_integer >= (std::numeric_limits<std::int32_t>::min)() &&
                             j.m_value.number_integer <= (std::numeric_limits<std::int32_t>::max)())
                    {
                        // int 32
                        oa->write_character(to_char_type(0xD2));
                        write_number(static_cast<std::int32_t>(j.m_value.number_integer));
                    }
                    else if (j.m_value.number_integer >= (std::numeric_limits<std::int64_t>::min)() &&
                             j.m_value.number_integer <= (std::numeric_limits<std::int64_t>::max)())
                    {
                        // int 64
                        oa->write_character(to_char_type(0xD3));
                        write_number(static_cast<std::int64_t>(j.m_value.number_integer));
                    }
                }
                break;
            }

            case value_t::number_unsigned:
            {
                if (j.m_value.number_unsigned < 128)
                {
                    // positive fixnum
                    write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
                }
                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    // uint 8
                    oa->write_character(to_char_type(0xCC));
                    write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
                }
                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    // uint 16
                    oa->write_character(to_char_type(0xCD));
                    write_number(static_cast<std::uint16_t>(j.m_value.number_integer));
                }
                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    // uint 32
                    oa->write_character(to_char_type(0xCE));
                    write_number(static_cast<std::uint32_t>(j.m_value.number_integer));
                }
                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint64_t>::max)())
                {
                    // uint 64
                    oa->write_character(to_char_type(0xCF));
                    write_number(static_cast<std::uint64_t>(j.m_value.number_integer));
                }
                break;
            }

            case value_t::number_float:
            {
                write_compact_float(j.m_value.number_float, detail::input_format_t::msgpack);
                break;
            }

            case value_t::string:
            {
                // step 1: write control byte and the string length
                const auto N = j.m_value.string->size();
                if (N <= 31)
                {
                    // fixstr
                    write_number(static_cast<std::uint8_t>(0xA0 | N));
                }
                else if (N <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    // str 8
                    oa->write_character(to_char_type(0xD9));
                    write_number(static_cast<std::uint8_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    // str 16
                    oa->write_character(to_char_type(0xDA));
                    write_number(static_cast<std::uint16_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    // str 32
                    oa->write_character(to_char_type(0xDB));
                    write_number(static_cast<std::uint32_t>(N));
                }

                // step 2: write the string
                oa->write_characters(
                    reinterpret_cast<const CharType*>(j.m_value.string->c_str()),
                    j.m_value.string->size());
                break;
            }

            case value_t::array:
            {
                // step 1: write control byte and the array size
                const auto N = j.m_value.array->size();
                if (N <= 15)
                {
                    // fixarray
                    write_number(static_cast<std::uint8_t>(0x90 | N));
                }
                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    // array 16
                    oa->write_character(to_char_type(0xDC));
                    write_number(static_cast<std::uint16_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    // array 32
                    oa->write_character(to_char_type(0xDD));
                    write_number(static_cast<std::uint32_t>(N));
                }

                // step 2: write each element
                for (const auto& el : *j.m_value.array)
                {
                    write_msgpack(el);
                }
                break;
            }

            case value_t::binary:
            {
                // step 0: determine if the binary type has a set subtype to
                // determine whether or not to use the ext or fixext types
                const bool use_ext = j.m_value.binary->has_subtype();

                // step 1: write control byte and the byte string length
                const auto N = j.m_value.binary->size();
                if (N <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    std::uint8_t output_type{};
                    bool fixed = true;
                    if (use_ext)
                    {
                        switch (N)
                        {
                            case 1:
                                output_type = 0xD4; // fixext 1
                                break;
                            case 2:
                                output_type = 0xD5; // fixext 2
                                break;
                            case 4:
                                output_type = 0xD6; // fixext 4
                                break;
                            case 8:
                                output_type = 0xD7; // fixext 8
                                break;
                            case 16:
                                output_type = 0xD8; // fixext 16
                                break;
                            default:
                                output_type = 0xC7; // ext 8
                                fixed = false;
                                break;
                        }

                    }
                    else
                    {
                        output_type = 0xC4; // bin 8
                        fixed = false;
                    }

                    oa->write_character(to_char_type(output_type));
                    if (!fixed)
                    {
                        write_number(static_cast<std::uint8_t>(N));
                    }
                }
                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    std::uint8_t output_type = use_ext
                                               ? 0xC8 // ext 16
                                               : 0xC5; // bin 16

                    oa->write_character(to_char_type(output_type));
                    write_number(static_cast<std::uint16_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    std::uint8_t output_type = use_ext
                                               ? 0xC9 // ext 32
                                               : 0xC6; // bin 32

                    oa->write_character(to_char_type(output_type));
                    write_number(static_cast<std::uint32_t>(N));
                }

                // step 1.5: if this is an ext type, write the subtype
                if (use_ext)
                {
                    write_number(static_cast<std::int8_t>(j.m_value.binary->subtype()));
                }

                // step 2: write the byte string
                oa->write_characters(
                    reinterpret_cast<const CharType*>(j.m_value.binary->data()),
                    N);

                break;
            }

            case value_t::object:
            {
                // step 1: write control byte and the object size
                const auto N = j.m_value.object->size();
                if (N <= 15)
                {
                    // fixmap
                    write_number(static_cast<std::uint8_t>(0x80 | (N & 0xF)));
                }
                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
                {
                    // map 16
                    oa->write_character(to_char_type(0xDE));
                    write_number(static_cast<std::uint16_t>(N));
                }
                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
                {
                    // map 32
                    oa->write_character(to_char_type(0xDF));
                    write_number(static_cast<std::uint32_t>(N));
                }

                // step 2: write each element
                for (const auto& el : *j.m_value.object)
                {
                    write_msgpack(el.first);
                    write_msgpack(el.second);
                }
                break;
            }

            default:
                break;
        }
    }

    /*!
    @brief serialize a JSON value to UBJSON

    Scalars are written as an optional one-character type marker followed by
    their payload. Containers are written either in plain form (opening
    marker, elements, closing marker) or, when @a use_count is set, with a
    '#' element count after the opening marker and without a closing marker.
    When @a use_type is set as well and all elements share the same marker,
    a '$' type announcement precedes the count and the per-element markers
    are dropped.

    @param[in] j  JSON value to serialize
    @param[in] use_count   whether to use '#' prefixes (optimized format)
    @param[in] use_type    whether to use '$' prefixes (optimized format)
    @param[in] add_prefix  whether prefixes need to be used for this value

    @pre For non-empty containers, @a use_type requires @a use_count
         (checked with JSON_ASSERT).
    */
    void write_ubjson(const BasicJsonType& j, const bool use_count,
                      const bool use_type, const bool add_prefix = true)
    {
        switch (j.type())
        {
            case value_t::null:
            {
                // the marker is the whole value; nothing is written without it
                if (add_prefix)
                {
                    oa->write_character(to_char_type('Z'));
                }
                break;
            }

            case value_t::boolean:
            {
                // the marker encodes the value itself ('T' or 'F')
                if (add_prefix)
                {
                    oa->write_character(j.m_value.boolean
                                        ? to_char_type('T')
                                        : to_char_type('F'));
                }
                break;
            }

            case value_t::number_integer:
            {
                write_number_with_ubjson_prefix(j.m_value.number_integer, add_prefix);
                break;
            }

            case value_t::number_unsigned:
            {
                write_number_with_ubjson_prefix(j.m_value.number_unsigned, add_prefix);
                break;
            }

            case value_t::number_float:
            {
                write_number_with_ubjson_prefix(j.m_value.number_float, add_prefix);
                break;
            }

            case value_t::string:
            {
                if (add_prefix)
                {
                    oa->write_character(to_char_type('S'));
                }
                // the length is always written together with its own marker
                write_number_with_ubjson_prefix(j.m_value.string->size(), true);
                oa->write_characters(
                    reinterpret_cast<const CharType*>(j.m_value.string->c_str()),
                    j.m_value.string->size());
                break;
            }

            case value_t::array:
            {
                if (add_prefix)
                {
                    oa->write_character(to_char_type('['));
                }

                // element markers can only be dropped if all elements agree
                bool prefix_required = true;
                if (use_type && !j.m_value.array->empty())
                {
                    JSON_ASSERT(use_count);
                    const CharType first_prefix = ubjson_prefix(j.front());
                    const bool same_prefix = std::all_of(j.begin() + 1, j.end(),
                                                         [this, first_prefix](const BasicJsonType & v)
                    {
                        return ubjson_prefix(v) == first_prefix;
                    });

                    if (same_prefix)
                    {
                        prefix_required = false;
                        oa->write_character(to_char_type('$'));
                        oa->write_character(first_prefix);
                    }
                }

                if (use_count)
                {
                    oa->write_character(to_char_type('#'));
                    write_number_with_ubjson_prefix(j.m_value.array->size(), true);
                }

                for (const auto& el : *j.m_value.array)
                {
                    write_ubjson(el, use_count, use_type, prefix_required);
                }

                // a counted container has no closing marker
                if (!use_count)
                {
                    oa->write_character(to_char_type(']'));
                }

                break;
            }

            case value_t::binary:
            {
                // binary values are written as an array of uint8 ('U') values
                if (add_prefix)
                {
                    oa->write_character(to_char_type('['));
                }

                if (use_type && !j.m_value.binary->empty())
                {
                    JSON_ASSERT(use_count);
                    oa->write_character(to_char_type('$'));
                    // go through to_char_type like every other marker here
                    oa->write_character(to_char_type('U'));
                }

                if (use_count)
                {
                    oa->write_character(to_char_type('#'));
                    write_number_with_ubjson_prefix(j.m_value.binary->size(), true);
                }

                if (use_type)
                {
                    // type announced up front: the bytes follow back to back
                    oa->write_characters(
                        reinterpret_cast<const CharType*>(j.m_value.binary->data()),
                        j.m_value.binary->size());
                }
                else
                {
                    // no type announcement: each byte carries its own 'U' marker
                    for (size_t i = 0; i < j.m_value.binary->size(); ++i)
                    {
                        oa->write_character(to_char_type('U'));
                        oa->write_character(j.m_value.binary->data()[i]);
                    }
                }

                if (!use_count)
                {
                    oa->write_character(to_char_type(']'));
                }

                break;
            }

            case value_t::object:
            {
                if (add_prefix)
                {
                    oa->write_character(to_char_type('{'));
                }

                // value markers can only be dropped if all values agree
                bool prefix_required = true;
                if (use_type && !j.m_value.object->empty())
                {
                    JSON_ASSERT(use_count);
                    const CharType first_prefix = ubjson_prefix(j.front());
                    const bool same_prefix = std::all_of(j.begin(), j.end(),
                                                         [this, first_prefix](const BasicJsonType & v)
                    {
                        return ubjson_prefix(v) == first_prefix;
                    });

                    if (same_prefix)
                    {
                        prefix_required = false;
                        oa->write_character(to_char_type('$'));
                        oa->write_character(first_prefix);
                    }
                }

                if (use_count)
                {
                    oa->write_character(to_char_type('#'));
                    write_number_with_ubjson_prefix(j.m_value.object->size(), true);
                }

                for (const auto& el : *j.m_value.object)
                {
                    // keys are written without the 'S' marker: length, then characters
                    write_number_with_ubjson_prefix(el.first.size(), true);
                    oa->write_characters(
                        reinterpret_cast<const CharType*>(el.first.c_str()),
                        el.first.size());
                    write_ubjson(el.second, use_count, use_type, prefix_required);
                }

                // a counted container has no closing marker
                if (!use_count)
                {
                    oa->write_character(to_char_type('}'));
                }

                break;
            }

            default:
                // other value types (e.g. discarded) produce no output
                break;
        }
    }

  private:
    //////////
    // BSON //
    //////////

    /*!
    @return The size of a BSON document entry header, including the id marker
            and the entry name size (and its null-terminator).
    */
    static std::size_t calc_bson_entry_header_size(const string_t& name, const BasicJsonType& j)
    {
        // keys are written NUL-terminated, so a key containing NUL is rejected
        const auto nul_position = name.find(static_cast<typename string_t::value_type>(0));
        const bool has_embedded_nul = (nul_position != BasicJsonType::string_t::npos);
        if (JSON_HEDLEY_UNLIKELY(has_embedded_nul))
        {
            JSON_THROW(out_of_range::create(409, "BSON key cannot contain code point U+0000 (at byte " + std::to_string(nul_position) + ")", j));
        }

        // one byte for the type id, the key characters, one byte for the terminator
        return 1ul + name.size() + 1u;
    }

    /*!
    @brief Writes the given @a element_type and @a name to the output adapter
    */
    void write_bson_entry_header(const string_t& name,
                                 const std::uint8_t element_type)
    {
        oa->write_character(to_char_type(element_type)); // boolean
        oa->write_characters(
            reinterpret_cast<const CharType*>(name.c_str()),
            name.size() + 1u);
    }

    /*!
    @brief Writes a BSON element with key @a name and boolean value @a value
    */
    void write_bson_boolean(const string_t& name,
                            const bool value)
    {
        write_bson_entry_header(name, 0x08);
        oa->write_character(value ? to_char_type(0x01) : to_char_type(0x00));
    }

    /*!
    @brief Writes a BSON element with key @a name and double value @a value
    */
    void write_bson_double(const string_t& name,
                           const double value)
    {
        // 0x01 is the element type id used for doubles
        const std::uint8_t double_marker = 0x01;
        write_bson_entry_header(name, double_marker);

        // second template argument requests little-endian output
        write_number<double, true>(value);
    }

    /*!
    @return The size of the BSON-encoded string in @a value
    */
    static std::size_t calc_bson_string_size(const string_t& value)
    {
        // int32 length field, the characters, and the trailing NUL
        const std::size_t length_field = sizeof(std::int32_t);
        const std::size_t terminator = 1ul;
        return length_field + value.size() + terminator;
    }

    /*!
    @brief Writes a BSON element with key @a name and string value @a value
    */
    void write_bson_string(const string_t& name,
                           const string_t& value)
    {
        write_bson_entry_header(name, 0x02);

        write_number<std::int32_t, true>(static_cast<std::int32_t>(value.size() + 1ul));
        oa->write_characters(
            reinterpret_cast<const CharType*>(value.c_str()),
            value.size() + 1);
    }

    /*!
    @brief Writes a BSON element with key @a name and null value
    */
    void write_bson_null(const string_t& name)
    {
        // a null element consists of its header only; there is no payload
        const std::uint8_t null_marker = 0x0A;
        write_bson_entry_header(name, null_marker);
    }

    /*!
    @return The size of the BSON-encoded integer @a value
    */
    static std::size_t calc_bson_integer_size(const std::int64_t value)
    {
        // values inside the int32 range take the 4-byte encoding, all others 8 bytes
        const bool fits_int32 = value >= (std::numeric_limits<std::int32_t>::min)()
                                && value <= (std::numeric_limits<std::int32_t>::max)();
        if (fits_int32)
        {
            return sizeof(std::int32_t);
        }
        return sizeof(std::int64_t);
    }

    /*!
    @brief Writes a BSON element with key @a name and integer @a value
    */
    void write_bson_integer(const string_t& name,
                            const std::int64_t value)
    {
        const bool fits_int32 = value >= (std::numeric_limits<std::int32_t>::min)()
                                && value <= (std::numeric_limits<std::int32_t>::max)();

        if (!fits_int32)
        {
            write_bson_entry_header(name, 0x12); // int64
            write_number<std::int64_t, true>(static_cast<std::int64_t>(value));
            return;
        }

        // prefer the shorter encoding whenever the value allows it
        write_bson_entry_header(name, 0x10); // int32
        write_number<std::int32_t, true>(static_cast<std::int32_t>(value));
    }

    /*!
    @return The size of the BSON-encoded unsigned integer @a value: the size
            of an int32 if the value does not exceed the int32 maximum, the
            size of an int64 otherwise
    */
    static constexpr std::size_t calc_bson_unsigned_size(const std::uint64_t value) noexcept
    {
        return (static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()) < value)
               ? sizeof(std::int64_t)
               : sizeof(std::int32_t);
    }

    /*!
    @brief Writes a BSON element with key @a name and the unsigned value
           stored in @a j
    @param[in] name  key of the element
    @param[in] j     JSON value holding the unsigned number; also passed to
                     the exception raised for unrepresentable values
    @throw out_of_range.407 if the number exceeds the int64 maximum
    */
    void write_bson_unsigned(const string_t& name,
                             const BasicJsonType& j)
    {
        const auto& number = j.m_value.number_unsigned;

        if (number > static_cast<std::uint64_t>((std::numeric_limits<std::int64_t>::max)()))
        {
            // only signed integer elements are written here
            JSON_THROW(out_of_range::create(407, "integer number " + std::to_string(j.m_value.number_unsigned) + " cannot be represented by BSON as it does not fit int64", j));
        }
        else if (number > static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
        {
            write_bson_entry_header(name, 0x12 /* int64 */);
            write_number<std::int64_t, true>(static_cast<std::int64_t>(number));
        }
        else
        {
            write_bson_entry_header(name, 0x10 /* int32 */);
            write_number<std::int32_t, true>(static_cast<std::int32_t>(number));
        }
    }

    /*!
    @brief Writes a BSON element with key @a name and object @a value
    */
    void write_bson_object_entry(const string_t& name,
                                 const typename BasicJsonType::object_t& value)
    {
        // header first, then the nested document as the element's payload
        const std::uint8_t document_marker = 0x03; // object
        write_bson_entry_header(name, document_marker);
        write_bson_object(value);
    }

    /*!
    @return The size of the BSON-encoded array @a value
    */
    static std::size_t calc_bson_array_size(const typename BasicJsonType::array_t& value)
    {
        // each element is sized as a document entry keyed by its decimal index
        std::size_t embedded_document_size = 0ul;
        std::size_t array_index = 0ul;
        for (const auto& el : value)
        {
            embedded_document_size += calc_bson_element_size(std::to_string(array_index), el);
            ++array_index;
        }

        // int32 size field in front, zero terminator behind
        return sizeof(std::int32_t) + embedded_document_size + 1ul;
    }

    /*!
    @return The size of the BSON-encoded binary array @a value
    */
    static std::size_t calc_bson_binary_size(const typename BasicJsonType::binary_t& value)
    {
        // int32 length field, one subtype byte, then the payload itself
        const std::size_t length_field = sizeof(std::int32_t);
        const std::size_t subtype_byte = 1ul;
        return length_field + value.size() + subtype_byte;
    }

    /*!
    @brief Writes a BSON element with key @a name and array @a value
    */
    void write_bson_array(const string_t& name,
                          const typename BasicJsonType::array_t& value)
    {
        write_bson_entry_header(name, 0x04); // array

        // the array is written as a document, starting with its total size
        const auto document_size = calc_bson_array_size(value);
        write_number<std::int32_t, true>(static_cast<std::int32_t>(document_size));

        // elements are keyed by their decimal index: "0", "1", ...
        std::size_t position = 0ul;
        for (const auto& element : value)
        {
            write_bson_element(std::to_string(position), element);
            ++position;
        }

        // document terminator
        oa->write_character(to_char_type(0x00));
    }

    /*!
    @brief Writes a BSON element with key @a name and binary value @a value
    */
    void write_bson_binary(const string_t& name,
                           const binary_t& value)
    {
        write_bson_entry_header(name, 0x05);

        write_number<std::int32_t, true>(static_cast<std::int32_t>(value.size()));
        write_number(value.has_subtype() ? value.subtype() : std::uint8_t(0x00));

        oa->write_characters(reinterpret_cast<const CharType*>(value.data()), value.size());
    }

    /*!
    @brief Calculates the size necessary to serialize the JSON value @a j with its @a name
    @return The calculated size for the BSON document entry for @a j with the given @a name.
    */
    static std::size_t calc_bson_element_size(const string_t& name,
            const BasicJsonType& j)
    {
        // measured first; this also validates the key and may throw
        const auto header_size = calc_bson_entry_header_size(name, j);

        std::size_t payload_size = 0ul;
        switch (j.type())
        {
            case value_t::object:
                payload_size = calc_bson_object_size(*j.m_value.object);
                break;

            case value_t::array:
                payload_size = calc_bson_array_size(*j.m_value.array);
                break;

            case value_t::binary:
                payload_size = calc_bson_binary_size(*j.m_value.binary);
                break;

            case value_t::boolean:
                payload_size = 1ul;
                break;

            case value_t::number_float:
                payload_size = 8ul;
                break;

            case value_t::number_integer:
                payload_size = calc_bson_integer_size(j.m_value.number_integer);
                break;

            case value_t::number_unsigned:
                payload_size = calc_bson_unsigned_size(j.m_value.number_unsigned);
                break;

            case value_t::string:
                payload_size = calc_bson_string_size(*j.m_value.string);
                break;

            case value_t::null:
                // header only, no payload
                break;

            // LCOV_EXCL_START
            default:
                JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert)
                return 0ul;
                // LCOV_EXCL_STOP
        }

        return header_size + payload_size;
    }

    /*!
    @brief Serializes the JSON value @a j to BSON and associates it with the
           key @a name.
    @param name The name to associate with the JSON entity @a j within the
                current BSON document
    */
    void write_bson_element(const string_t& name,
                            const BasicJsonType& j)
    {
        // dispatch on the stored type; each writer emits header and payload
        switch (j.type())
        {
            case value_t::object:
                write_bson_object_entry(name, *j.m_value.object);
                break;

            case value_t::array:
                write_bson_array(name, *j.m_value.array);
                break;

            case value_t::binary:
                write_bson_binary(name, *j.m_value.binary);
                break;

            case value_t::boolean:
                write_bson_boolean(name, j.m_value.boolean);
                break;

            case value_t::number_float:
                write_bson_double(name, j.m_value.number_float);
                break;

            case value_t::number_integer:
                write_bson_integer(name, j.m_value.number_integer);
                break;

            case value_t::number_unsigned:
                // takes the whole value so it can be attached to a range error
                write_bson_unsigned(name, j);
                break;

            case value_t::string:
                write_bson_string(name, *j.m_value.string);
                break;

            case value_t::null:
                write_bson_null(name);
                break;

            // LCOV_EXCL_START
            default:
                JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert)
                break;
                // LCOV_EXCL_STOP
        }
    }

    /*!
    @brief Calculates the size of the BSON serialization of the given
           JSON object @a value.
    @param[in] value  JSON object whose members are measured
    @return size of the int32 size field, plus the sizes of all member
            entries, plus one byte for the terminator
    */
    static std::size_t calc_bson_object_size(const typename BasicJsonType::object_t& value)
    {
        std::size_t document_size = 0ul;
        for (const auto& member : value)
        {
            document_size += calc_bson_element_size(member.first, member.second);
        }

        return sizeof(std::int32_t) + document_size + 1ul;
    }

    /*!
    @brief Writes the members of @a value as a BSON document: the int32
           document size, every member as an element, and a zero terminator
    @param[in] value  JSON object to serialize
    */
    void write_bson_object(const typename BasicJsonType::object_t& value)
    {
        // the size is computed up front because it precedes the content
        const auto document_size = calc_bson_object_size(value);
        write_number<std::int32_t, true>(static_cast<std::int32_t>(document_size));

        for (const auto& member : value)
        {
            write_bson_element(member.first, member.second);
        }

        // document terminator
        oa->write_character(to_char_type(0x00));
    }

    //////////
    // CBOR //
    //////////

    /// @return the CBOR byte 0xFA that announces a single-precision float;
    ///         the argument is unused and only selects this overload
    static constexpr CharType get_cbor_float_prefix(float /*unused*/)
    {
        return to_char_type(0xFA);  // Single-Precision Float
    }

    /// @return the CBOR byte 0xFB that announces a double-precision float;
    ///         the argument is unused and only selects this overload
    static constexpr CharType get_cbor_float_prefix(double /*unused*/)
    {
        return to_char_type(0xFB);  // Double-Precision Float
    }

    /////////////
    // MsgPack //
    /////////////

    /// @return the MessagePack byte 0xCA that announces a float 32 value;
    ///         the argument is unused and only selects this overload
    static constexpr CharType get_msgpack_float_prefix(float /*unused*/)
    {
        return to_char_type(0xCA);  // float 32
    }

    /// @return the MessagePack byte 0xCB that announces a float 64 value;
    ///         the argument is unused and only selects this overload
    static constexpr CharType get_msgpack_float_prefix(double /*unused*/)
    {
        return to_char_type(0xCB);  // float 64
    }

    ////////////
    // UBJSON //
    ////////////

    // UBJSON: write number (floating point)
    template<typename NumberType, typename std::enable_if<
                 std::is_floating_point<NumberType>::value, int>::type = 0>
    void write_number_with_ubjson_prefix(const NumberType n,
                                         const bool add_prefix)
    {
        if (add_prefix)
        {
            oa->write_character(get_ubjson_float_prefix(n));
        }
        write_number(n);
    }

    // UBJSON: write number (unsigned integer)
    //
    // Picks the first integer type in the order int8, uint8, int16, int32,
    // int64 whose range holds n and writes n in that width, preceded by the
    // matching type marker when add_prefix is set. Values above the int64
    // maximum are written as a high-precision number: the decimal text of n
    // preceded by its length.
    template<typename NumberType, typename std::enable_if<
                 std::is_unsigned<NumberType>::value, int>::type = 0>
    void write_number_with_ubjson_prefix(const NumberType n,
                                         const bool add_prefix)
    {
        if (n <= static_cast<std::uint64_t>((std::numeric_limits<std::int8_t>::max)()))
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('i'));  // int8
            }
            // n is at most the int8 maximum here, so the uint8_t cast keeps its value
            write_number(static_cast<std::uint8_t>(n));
        }
        else if (n <= (std::numeric_limits<std::uint8_t>::max)())
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('U'));  // uint8
            }
            write_number(static_cast<std::uint8_t>(n));
        }
        else if (n <= static_cast<std::uint64_t>((std::numeric_limits<std::int16_t>::max)()))
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('I'));  // int16
            }
            write_number(static_cast<std::int16_t>(n));
        }
        else if (n <= static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('l'));  // int32
            }
            write_number(static_cast<std::int32_t>(n));
        }
        else if (n <= static_cast<std::uint64_t>((std::numeric_limits<std::int64_t>::max)()))
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('L'));  // int64
            }
            write_number(static_cast<std::int64_t>(n));
        }
        else
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('H'));  // high-precision number
            }

            // textual form of n; its length is written first, always with a marker
            const auto number = BasicJsonType(n).dump();
            write_number_with_ubjson_prefix(number.size(), true);
            for (std::size_t i = 0; i < number.size(); ++i)
            {
                oa->write_character(to_char_type(static_cast<std::uint8_t>(number[i])));
            }
        }
    }

    // UBJSON: write number (signed integer)
    //
    // Picks the first integer type in the order int8, uint8, int16, int32,
    // int64 whose range holds n and writes n in that width, preceded by the
    // matching type marker when add_prefix is set. The final branch writes
    // the decimal text of n as a high-precision number; it is excluded from
    // coverage measurement.
    template < typename NumberType, typename std::enable_if <
                   std::is_signed<NumberType>::value&&
                   !std::is_floating_point<NumberType>::value, int >::type = 0 >
    void write_number_with_ubjson_prefix(const NumberType n,
                                         const bool add_prefix)
    {
        if ((std::numeric_limits<std::int8_t>::min)() <= n && n <= (std::numeric_limits<std::int8_t>::max)())
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('i'));  // int8
            }
            write_number(static_cast<std::int8_t>(n));
        }
        // values in the int8 range were handled above; this catches the rest of the uint8 range
        else if (static_cast<std::int64_t>((std::numeric_limits<std::uint8_t>::min)()) <= n && n <= static_cast<std::int64_t>((std::numeric_limits<std::uint8_t>::max)()))
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('U'));  // uint8
            }
            write_number(static_cast<std::uint8_t>(n));
        }
        else if ((std::numeric_limits<std::int16_t>::min)() <= n && n <= (std::numeric_limits<std::int16_t>::max)())
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('I'));  // int16
            }
            write_number(static_cast<std::int16_t>(n));
        }
        else if ((std::numeric_limits<std::int32_t>::min)() <= n && n <= (std::numeric_limits<std::int32_t>::max)())
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('l'));  // int32
            }
            write_number(static_cast<std::int32_t>(n));
        }
        else if ((std::numeric_limits<std::int64_t>::min)() <= n && n <= (std::numeric_limits<std::int64_t>::max)())
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('L'));  // int64
            }
            write_number(static_cast<std::int64_t>(n));
        }
        // LCOV_EXCL_START
        else
        {
            if (add_prefix)
            {
                oa->write_character(to_char_type('H'));  // high-precision number
            }

            // textual form of n; its length is written first, always with a marker
            const auto number = BasicJsonType(n).dump();
            write_number_with_ubjson_prefix(number.size(), true);
            for (std::size_t i = 0; i < number.size(); ++i)
            {
                oa->write_character(to_char_type(static_cast<std::uint8_t>(number[i])));
            }
        }
        // LCOV_EXCL_STOP
    }

    /*!
    @brief determine the type prefix of container values

    Returns the UBJSON type marker for @a j. For integers, the ranges are
    tested in the order int8, uint8, int16, int32, int64 and the first match
    decides the marker.

    @param[in] j  JSON value to inspect
    @return the marker character; 'N' for value types not listed below
    */
    CharType ubjson_prefix(const BasicJsonType& j) const noexcept
    {
        switch (j.type())
        {
            case value_t::null:
                return 'Z';

            case value_t::boolean:
                // the marker of a boolean depends on its value
                return j.m_value.boolean ? 'T' : 'F';

            case value_t::number_integer:
            {
                if ((std::numeric_limits<std::int8_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::int8_t>::max)())
                {
                    return 'i';
                }
                if ((std::numeric_limits<std::uint8_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::uint8_t>::max)())
                {
                    return 'U';
                }
                if ((std::numeric_limits<std::int16_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::int16_t>::max)())
                {
                    return 'I';
                }
                if ((std::numeric_limits<std::int32_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::int32_t>::max)())
                {
                    return 'l';
                }
                if ((std::numeric_limits<std::int64_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::int64_t>::max)())
                {
                    return 'L';
                }
                // anything else is treated as high-precision number
                return 'H'; // LCOV_EXCL_LINE
            }

            case value_t::number_unsigned:
            {
                if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int8_t>::max)()))
                {
                    return 'i';
                }
                if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::uint8_t>::max)()))
                {
                    return 'U';
                }
                if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int16_t>::max)()))
                {
                    return 'I';
                }
                if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
                {
                    return 'l';
                }
                if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int64_t>::max)()))
                {
                    return 'L';
                }
                // anything else is treated as high-precision number
                return 'H'; // LCOV_EXCL_LINE
            }

            case value_t::number_float:
                return get_ubjson_float_prefix(j.m_value.number_float);

            case value_t::string:
                return 'S';

            // binary values share the array marker
            case value_t::array: // fallthrough
            case value_t::binary:
                return '[';

            case value_t::object:
                return '{';

            default:  // discarded values
                return 'N';
        }
    }

    /// @return the UBJSON marker 'd' for a 32-bit float; the argument is
    ///         unused and only selects this overload
    static constexpr CharType get_ubjson_float_prefix(float /*unused*/)
    {
        return 'd';  // float 32
    }

    /// @return the UBJSON marker 'D' for a 64-bit float; the argument is
    ///         unused and only selects this overload
    static constexpr CharType get_ubjson_float_prefix(double /*unused*/)
    {
        return 'D';  // float 64
    }

    ///////////////////////
    // Utility functions //
    ///////////////////////

    /*
    @brief write a number to output input
    @param[in] n number of type @a NumberType
    @tparam NumberType the type of the number
    @tparam OutputIsLittleEndian Set to true if output data is
                                 required to be little endian

    @note This function needs to respect the system's endianess, because bytes
          in CBOR, MessagePack, and UBJSON are stored in network order (big
          endian) and therefore need reordering on little endian systems.
    */
    template<typename NumberType, bool OutputIsLittleEndian = false>
    void write_number(const NumberType n)
    {
        // step 1: write number to array of length NumberType
        std::array<CharType, sizeof(NumberType)> vec{};
        std::memcpy(vec.data(), &n, sizeof(NumberType));

        // step 2: write array to output (with possible reordering)
        if (is_little_endian != OutputIsLittleEndian)
        {
            // reverse byte order prior to conversion if necessary
            std::reverse(vec.begin(), vec.end());
        }

        oa->write_characters(vec.data(), sizeof(NumberType));
    }

    void write_compact_float(const number_float_t n, detail::input_format_t format)
    {
        const bool is_cbor = (format == detail::input_format_t::cbor);

        // The range checks come first so that the narrowing cast in the last
        // operand is only evaluated for values inside the float range.
        const bool exact_as_float =
            static_cast<double>(n) >= static_cast<double>(std::numeric_limits<float>::lowest()) &&
            static_cast<double>(n) <= static_cast<double>((std::numeric_limits<float>::max)()) &&
            static_cast<double>(static_cast<float>(n)) == static_cast<double>(n);

        if (exact_as_float)
        {
            // no information is lost: use the shorter single-precision form
            oa->write_character(is_cbor
                                ? get_cbor_float_prefix(static_cast<float>(n))
                                : get_msgpack_float_prefix(static_cast<float>(n)));
            write_number(static_cast<float>(n));
            return;
        }

        oa->write_character(is_cbor
                            ? get_cbor_float_prefix(n)
                            : get_msgpack_float_prefix(n));
        write_number(n);
    }

  public:
    // The following to_char_type functions implement the conversion
    // between uint8_t and CharType. In case CharType is not unsigned,
    // such a conversion is required to allow values greater than 127.
    // See <https://github.com/nlohmann/json/issues/1286> for a discussion.
    // Overload for a signed CharType on platforms where plain char is signed:
    // the byte is reinterpreted as char, so the bit pattern is kept.
    template < typename C = CharType,
               enable_if_t < std::is_signed<C>::value && std::is_signed<char>::value > * = nullptr >
    static constexpr CharType to_char_type(std::uint8_t x) noexcept
    {
        return *reinterpret_cast<char*>(&x);
    }

    // Overload for a signed CharType on platforms where plain char is
    // unsigned: the byte is copied into a CharType object, so the bit
    // pattern is kept.
    template < typename C = CharType,
               enable_if_t < std::is_signed<C>::value && std::is_unsigned<char>::value > * = nullptr >
    static CharType to_char_type(std::uint8_t x) noexcept
    {
        // the byte-wise copy below requires equal size and a trivial type
        static_assert(sizeof(std::uint8_t) == sizeof(CharType), "size of CharType must be equal to std::uint8_t");
        static_assert(std::is_trivial<CharType>::value, "CharType must be trivial");
        CharType result;
        std::memcpy(&result, &x, sizeof(x));
        return result;
    }

    // Overload for an unsigned CharType: the value is returned as is.
    template<typename C = CharType,
             enable_if_t<std::is_unsigned<C>::value>* = nullptr>
    static constexpr CharType to_char_type(std::uint8_t x) noexcept
    {
        return x;
    }

    // Overload taking a plain char (e.g. a character literal) when both
    // CharType and char are signed: the character is returned as is.
    template < typename InputCharType, typename C = CharType,
               enable_if_t <
                   std::is_signed<C>::value &&
                   std::is_signed<char>::value &&
                   std::is_same<char, typename std::remove_cv<InputCharType>::type>::value
                   > * = nullptr >
    static constexpr CharType to_char_type(InputCharType x) noexcept
    {
        return x;
    }

  private:
    /// whether we can assume little endianness
    const bool is_little_endian = little_endianess();

    /// the output
    output_adapter_t<CharType> oa = nullptr;
};
}  // namespace detail
}  // namespace nlohmann

// #include <nlohmann/detail/output/output_adapters.hpp>

// #include <nlohmann/detail/output/serializer.hpp>


#include <algorithm> // reverse, remove, fill, find, none_of
#include <array> // array
#include <clocale> // localeconv, lconv
#include <cmath> // labs, isfinite, isnan, signbit
#include <cstddef> // size_t, ptrdiff_t
#include <cstdint> // uint8_t
#include <cstdio> // snprintf
#include <limits> // numeric_limits
#include <string> // string, char_traits
#include <type_traits> // is_same
#include <utility> // move

// #include <nlohmann/detail/conversions/to_chars.hpp>


#include <array> // array
#include <cmath>   // signbit, isfinite
#include <cstdint> // intN_t, uintN_t
#include <cstring> // memcpy, memmove
#include <limits> // numeric_limits
#include <type_traits> // conditional

// #include <nlohmann/detail/macro_scope.hpp>


namespace nlohmann
{
namespace detail
{

/*!
@brief implements the Grisu2 algorithm for binary to decimal floating-point
conversion.

This implementation is a slightly modified version of the reference
implementation which may be obtained from
http://florian.loitsch.com/publications (bench.tar.gz).

The code is distributed under the MIT license, Copyright (c) 2009 Florian Loitsch.

For a detailed description of the algorithm see:

[1] Loitsch, "Printing Floating-Point Numbers Quickly and Accurately with
    Integers", Proceedings of the ACM SIGPLAN 2010 Conference on Programming
    Language Design and Implementation, PLDI 2010
[2] Burger, Dybvig, "Printing Floating-Point Numbers Quickly and Accurately",
    Proceedings of the ACM SIGPLAN 1996 Conference on Programming Language
    Design and Implementation, PLDI 1996
*/
namespace dtoa_impl
{

/*!
@brief copies the object representation of @a source into a value of type Target

Both types must have the same size (checked at compile time). The copy is done
with std::memcpy, so no pointer of one type is dereferenced as the other.

@param source value whose bytes are copied
@return a Target holding exactly the bytes of @a source
*/
template<typename Target, typename Source>
Target reinterpret_bits(const Source source)
{
    static_assert(sizeof(Target) == sizeof(Source), "size mismatch");

    Target result;
    std::memcpy(&result, &source, sizeof(Target));
    return result;
}

// A value f * 2^e, stored as a 64-bit unsigned significand f and an int
// binary exponent e. All operations are static helpers returning new values.
struct diyfp // f * 2^e
{
    static constexpr int kPrecision = 64; // = q

    std::uint64_t f = 0;
    int e = 0;

    constexpr diyfp(std::uint64_t f_, int e_) noexcept : f(f_), e(e_) {}

    /*!
    @brief returns x - y
    @pre x.e == y.e and x.f >= y.f
    @note The significands are subtracted directly; the result keeps x's
          exponent and is not normalized.
    */
    static diyfp sub(const diyfp& x, const diyfp& y) noexcept
    {
        JSON_ASSERT(x.e == y.e);
        JSON_ASSERT(x.f >= y.f);

        return {x.f - y.f, x.e};
    }

    /*!
    @brief returns x * y
    @note The result is rounded. (Only the upper q bits are returned.)
    */
    static diyfp mul(const diyfp& x, const diyfp& y) noexcept
    {
        static_assert(kPrecision == 64, "internal error");

        // Computes:
        //  f = round((x.f * y.f) / 2^q)
        //  e = x.e + y.e + q

        // Emulate the 64-bit * 64-bit multiplication:
        //
        // p = u * v
        //   = (u_lo + 2^32 u_hi) (v_lo + 2^32 v_hi)
        //   = (u_lo v_lo         ) + 2^32 ((u_lo v_hi         ) + (u_hi v_lo         )) + 2^64 (u_hi v_hi         )
        //   = (p0                ) + 2^32 ((p1                ) + (p2                )) + 2^64 (p3                )
        //   = (p0_lo + 2^32 p0_hi) + 2^32 ((p1_lo + 2^32 p1_hi) + (p2_lo + 2^32 p2_hi)) + 2^64 (p3                )
        //   = (p0_lo             ) + 2^32 (p0_hi + p1_lo + p2_lo                      ) + 2^64 (p1_hi + p2_hi + p3)
        //   = (p0_lo             ) + 2^32 (Q                                          ) + 2^64 (H                 )
        //   = (p0_lo             ) + 2^32 (Q_lo + 2^32 Q_hi                           ) + 2^64 (H                 )
        //
        // (Since Q might be larger than 2^32 - 1)
        //
        //   = (p0_lo + 2^32 Q_lo) + 2^64 (Q_hi + H)
        //
        // (Q_hi + H does not overflow a 64-bit int)
        //
        //   = p_lo + 2^64 p_hi

        // Split both significands into 32-bit halves.
        const std::uint64_t u_lo = x.f & 0xFFFFFFFFu;
        const std::uint64_t u_hi = x.f >> 32u;
        const std::uint64_t v_lo = y.f & 0xFFFFFFFFu;
        const std::uint64_t v_hi = y.f >> 32u;

        // The four 32x32 -> 64-bit partial products.
        const std::uint64_t p0 = u_lo * v_lo;
        const std::uint64_t p1 = u_lo * v_hi;
        const std::uint64_t p2 = u_hi * v_lo;
        const std::uint64_t p3 = u_hi * v_hi;

        const std::uint64_t p0_hi = p0 >> 32u;
        const std::uint64_t p1_lo = p1 & 0xFFFFFFFFu;
        const std::uint64_t p1_hi = p1 >> 32u;
        const std::uint64_t p2_lo = p2 & 0xFFFFFFFFu;
        const std::uint64_t p2_hi = p2 >> 32u;

        // Middle column of the schoolbook multiplication (weight 2^32).
        std::uint64_t Q = p0_hi + p1_lo + p2_lo;

        // The full product might now be computed as
        //
        // p_hi = p3 + p2_hi + p1_hi + (Q >> 32)
        // p_lo = p0_lo + (Q << 32)
        //
        // But in this particular case here, the full p_lo is not required.
        // Effectively we only need to add the highest bit in p_lo to p_hi (and
        // Q_hi + 1 does not overflow).

        // Adding 2^31 to Q carries into bit 32 exactly when bit 31 of Q (the
        // top bit of p_lo) is set.
        Q += std::uint64_t{1} << (64u - 32u - 1u); // round, ties up

        const std::uint64_t h = p3 + p2_hi + p1_hi + (Q >> 32u);

        return {h, x.e + y.e + 64};
    }

    /*!
    @brief normalize x such that the significand is >= 2^(q-1)
    @pre x.f != 0
    @note The significand is shifted left one bit at a time until bit 63 is
          set; the exponent is decremented once per shift.
    */
    static diyfp normalize(diyfp x) noexcept
    {
        JSON_ASSERT(x.f != 0);

        while ((x.f >> 63u) == 0)
        {
            x.f <<= 1u;
            x.e--;
        }

        return x;
    }

    /*!
    @brief normalize x such that the result has the exponent target_exponent
    @pre x.e >= target_exponent and the upper (x.e - target_exponent) bits of
         x.f must be zero.
    */
    static diyfp normalize_to(const diyfp& x, const int target_exponent) noexcept
    {
        const int delta = x.e - target_exponent;

        JSON_ASSERT(delta >= 0);
        // The left shift below must not push any set bit out of the 64-bit word.
        JSON_ASSERT(((x.f << delta) >> delta) == x.f);

        return {x.f << delta, target_exponent};
    }
};

// Result of compute_boundaries(): a value together with its lower and upper
// boundary. The member order matters because the struct is brace-initialized
// as {w, minus, plus}.
struct boundaries
{
    diyfp w;     // the (normalized) input value
    diyfp minus; // lower boundary m-, brought to the same exponent as plus
    diyfp plus;  // upper boundary m+ (normalized)
};

/*!
Decompose the IEEE-754 number 'value' into a normalized diyfp and compute the
two boundaries m- and m+ that enclose all reals rounding to 'value'.

@param value the number to decompose
@return {normalized value, m- (same exponent as m+), normalized m+}

@pre value must be finite and positive
*/
template<typename FloatType>
boundaries compute_boundaries(FloatType value)
{
    static_assert(std::numeric_limits<FloatType>::is_iec559,
                  "internal error: dtoa_short requires an IEEE-754 floating-point implementation");

    JSON_ASSERT(std::isfinite(value));
    JSON_ASSERT(value > 0);

    // IEEE layout parameters. p is the precision including the hidden bit.
    //
    //      denormal:   value = 0.F * 2^(1 - bias) = (          F) * 2^(1 - bias - (p-1))
    //      normalized: value = 1.F * 2^(E - bias) = (2^(p-1) + F) * 2^(E - bias - (p-1))
    //
    // kBias below already includes the (p-1) term.
    constexpr int kPrecision = std::numeric_limits<FloatType>::digits;
    constexpr int kBias = std::numeric_limits<FloatType>::max_exponent - 1 + (kPrecision - 1);
    constexpr int kMinExp = 1 - kBias;
    constexpr std::uint64_t kHiddenBit = std::uint64_t{1} << (kPrecision - 1);

    // 32-bit carrier for p == 24 (single precision), 64-bit otherwise.
    using bits_type = typename std::conditional<kPrecision == 24, std::uint32_t, std::uint64_t>::type;

    const auto raw_bits = static_cast<std::uint64_t>(reinterpret_bits<bits_type>(value));
    const std::uint64_t biased_exponent = raw_bits >> (kPrecision - 1);
    const std::uint64_t fraction = raw_bits & (kHiddenBit - 1);

    // A zero exponent field marks a denormal: no hidden bit, minimum exponent.
    const diyfp v = (biased_exponent == 0)
                    ? diyfp(fraction, kMinExp)
                    : diyfp(fraction + kHiddenBit, static_cast<int>(biased_exponent) - kBias);

    // Let v- and v+ be the floating-point predecessor and successor of
    // v = f * 2^e:
    //
    //      v- = v - 2^e        if f != 2^(p-1) or e == e_min                (A)
    //         = v - 2^(e-1)    if f == 2^(p-1) and e > e_min                (B)
    //      v+ = v + 2^e
    //
    // With m- = (v- + v) / 2 and m+ = (v + v+) / 2, every real number
    // _strictly_ between m- and m+ rounds to v, regardless of how the input
    // rounding algorithm breaks ties.
    //
    //      ---+-------------+-------------+-------------+-------------+---  (A)
    //         v-            m-            v             m+            v+
    //
    //      -----------------+------+------+-------------+-------------+---  (B)
    //                       v-     m-     v             m+            v+
    const bool lower_boundary_is_closer = (fraction == 0) && (biased_exponent > 1);

    const diyfp upper(2 * v.f + 1, v.e - 1);
    const diyfp lower = lower_boundary_is_closer
                        ? diyfp(4 * v.f - 1, v.e - 2)  // (B)
                        : diyfp(2 * v.f - 1, v.e - 1); // (A)

    // Normalize m+ first, then bring m- to the very same exponent.
    const diyfp upper_normalized = diyfp::normalize(upper);
    const diyfp lower_aligned = diyfp::normalize_to(lower, upper_normalized.e);

    return {diyfp::normalize(v), lower_aligned, upper_normalized};
}

// Given normalized diyfp w, Grisu needs to find a (normalized) cached
// power-of-ten c, such that the exponent of the product c * w = f * 2^e lies
// within a certain range [alpha, gamma] (Definition 3.2 from [1])
//
//      alpha <= e = e_c + e_w + q <= gamma
//
// or
//
//      f_c * f_w * 2^alpha <= f_c 2^(e_c) * f_w 2^(e_w) * 2^q
//                          <= f_c * f_w * 2^gamma
//
// Since c and w are normalized, i.e. 2^(q-1) <= f < 2^q, this implies
//
//      2^(q-1) * 2^(q-1) * 2^alpha <= c * w * 2^q < 2^q * 2^q * 2^gamma
//
// or
//
//      2^(q - 2 + alpha) <= c * w < 2^(q + gamma)
//
// The choice of (alpha,gamma) determines the size of the table and the form of
// the digit generation procedure. Using (alpha,gamma)=(-60,-32) works out well
// in practice:
//
// The idea is to cut the number c * w = f * 2^e into two parts, which can be
// processed independently: An integral part p1, and a fractional part p2:
//
//      f * 2^e = ( (f div 2^-e) * 2^-e + (f mod 2^-e) ) * 2^e
//              = (f div 2^-e) + (f mod 2^-e) * 2^e
//              = p1 + p2 * 2^e
//
// The conversion of p1 into decimal form requires a series of divisions and
// modulos by (a power of) 10. These operations are faster for 32-bit than for
// 64-bit integers, so p1 should ideally fit into a 32-bit integer. This can be
// achieved by choosing
//
//      -e >= 32   or   e <= -32 := gamma
//
// In order to convert the fractional part
//
//      p2 * 2^e = p2 / 2^-e = d[-1] / 10^1 + d[-2] / 10^2 + ...
//
// into decimal form, the fraction is repeatedly multiplied by 10 and the digits
// d[-i] are extracted in order:
//
//      (10 * p2) div 2^-e = d[-1]
//      (10 * p2) mod 2^-e = d[-2] / 10^1 + ...
//
// The multiplication by 10 must not overflow. It is sufficient to choose
//
//      10 * p2 < 16 * p2 = 2^4 * p2 <= 2^64.
//
// Since p2 = f mod 2^-e < 2^-e,
//
//      -e <= 60   or   e >= -60 := alpha

// Inclusive bounds [alpha, gamma] for the binary exponent of the scaled
// product c * w, as derived in the comment above.
constexpr int kAlpha = -60;
constexpr int kGamma = -32;

// One precomputed power of ten in binary form. The member order matters
// because table entries are brace-initialized as { f, e, k }.
struct cached_power // c = f * 2^e ~= 10^k
{
    std::uint64_t f; // 64-bit significand
    int e;           // binary exponent
    int k;           // decimal exponent of the power of ten being approximated
};

/*!
For a normalized diyfp w = f * 2^e, this function returns a (normalized) cached
power-of-ten c = f_c * 2^e_c, such that the exponent of the product w * c
satisfies (Definition 3.2 from [1])

     alpha <= e_c + e + q <= gamma.

@param e binary exponent of w; asserted to lie in [-1500, 1500]
@return the table entry { f_c, e_c, k } selected for e
*/
inline cached_power get_cached_power_for_binary_exponent(int e)
{
    // Now
    //
    //      alpha <= e_c + e + q <= gamma                                    (1)
    //      ==> f_c * 2^alpha <= c * 2^e * 2^q
    //
    // and since the c's are normalized, 2^(q-1) <= f_c,
    //
    //      ==> 2^(q - 1 + alpha) <= c * 2^(e + q)
    //      ==> 2^(alpha - e - 1) <= c
    //
    // If c were an exact power of ten, i.e. c = 10^k, one may determine k as
    //
    //      k = ceil( log_10( 2^(alpha - e - 1) ) )
    //        = ceil( (alpha - e - 1) * log_10(2) )
    //
    // From the paper:
    // "In theory the result of the procedure could be wrong since c is rounded,
    //  and the computation itself is approximated [...]. In practice, however,
    //  this simple function is sufficient."
    //
    // For IEEE double precision floating-point numbers converted into
    // normalized diyfp's w = f * 2^e, with q = 64,
    //
    //      e >= -1022      (min IEEE exponent)
    //           -52        (p - 1)
    //           -52        (p - 1, possibly normalize denormal IEEE numbers)
    //           -11        (normalize the diyfp)
    //         = -1137
    //
    // and
    //
    //      e <= +1023      (max IEEE exponent)
    //           -52        (p - 1)
    //           -11        (normalize the diyfp)
    //         = 960
    //
    // This binary exponent range [-1137,960] results in a decimal exponent
    // range [-307,324]. One does not need to store a cached power for each
    // k in this range. For each such k it suffices to find a cached power
    // such that the exponent of the product lies in [alpha,gamma].
    // This implies that the difference of the decimal exponents of adjacent
    // table entries must be less than or equal to
    //
    //      floor( (gamma - alpha) * log_10(2) ) = 8.
    //
    // (A smaller distance gamma-alpha would require a larger table.)

    // NB:
    // Actually this function returns c, such that -60 <= e_c + e + 64 <= -34.

    // Decimal exponent of the first table entry and the spacing between
    // the decimal exponents of adjacent entries.
    constexpr int kCachedPowersMinDecExp = -300;
    constexpr int kCachedPowersDecStep = 8;

    // Entries are { f, e, k }; k runs from -300 to 324 in steps of 8.
    static constexpr std::array<cached_power, 79> kCachedPowers =
    {
        {
            { 0xAB70FE17C79AC6CA, -1060, -300 },
            { 0xFF77B1FCBEBCDC4F, -1034, -292 },
            { 0xBE5691EF416BD60C, -1007, -284 },
            { 0x8DD01FAD907FFC3C,  -980, -276 },
            { 0xD3515C2831559A83,  -954, -268 },
            { 0x9D71AC8FADA6C9B5,  -927, -260 },
            { 0xEA9C227723EE8BCB,  -901, -252 },
            { 0xAECC49914078536D,  -874, -244 },
            { 0x823C12795DB6CE57,  -847, -236 },
            { 0xC21094364DFB5637,  -821, -228 },
            { 0x9096EA6F3848984F,  -794, -220 },
            { 0xD77485CB25823AC7,  -768, -212 },
            { 0xA086CFCD97BF97F4,  -741, -204 },
            { 0xEF340A98172AACE5,  -715, -196 },
            { 0xB23867FB2A35B28E,  -688, -188 },
            { 0x84C8D4DFD2C63F3B,  -661, -180 },
            { 0xC5DD44271AD3CDBA,  -635, -172 },
            { 0x936B9FCEBB25C996,  -608, -164 },
            { 0xDBAC6C247D62A584,  -582, -156 },
            { 0xA3AB66580D5FDAF6,  -555, -148 },
            { 0xF3E2F893DEC3F126,  -529, -140 },
            { 0xB5B5ADA8AAFF80B8,  -502, -132 },
            { 0x87625F056C7C4A8B,  -475, -124 },
            { 0xC9BCFF6034C13053,  -449, -116 },
            { 0x964E858C91BA2655,  -422, -108 },
            { 0xDFF9772470297EBD,  -396, -100 },
            { 0xA6DFBD9FB8E5B88F,  -369,  -92 },
            { 0xF8A95FCF88747D94,  -343,  -84 },
            { 0xB94470938FA89BCF,  -316,  -76 },
            { 0x8A08F0F8BF0F156B,  -289,  -68 },
            { 0xCDB02555653131B6,  -263,  -60 },
            { 0x993FE2C6D07B7FAC,  -236,  -52 },
            { 0xE45C10C42A2B3B06,  -210,  -44 },
            { 0xAA242499697392D3,  -183,  -36 },
            { 0xFD87B5F28300CA0E,  -157,  -28 },
            { 0xBCE5086492111AEB,  -130,  -20 },
            { 0x8CBCCC096F5088CC,  -103,  -12 },
            { 0xD1B71758E219652C,   -77,   -4 },
            { 0x9C40000000000000,   -50,    4 },
            { 0xE8D4A51000000000,   -24,   12 },
            { 0xAD78EBC5AC620000,     3,   20 },
            { 0x813F3978F8940984,    30,   28 },
            { 0xC097CE7BC90715B3,    56,   36 },
            { 0x8F7E32CE7BEA5C70,    83,   44 },
            { 0xD5D238A4ABE98068,   109,   52 },
            { 0x9F4F2726179A2245,   136,   60 },
            { 0xED63A231D4C4FB27,   162,   68 },
            { 0xB0DE65388CC8ADA8,   189,   76 },
            { 0x83C7088E1AAB65DB,   216,   84 },
            { 0xC45D1DF942711D9A,   242,   92 },
            { 0x924D692CA61BE758,   269,  100 },
            { 0xDA01EE641A708DEA,   295,  108 },
            { 0xA26DA3999AEF774A,   322,  116 },
            { 0xF209787BB47D6B85,   348,  124 },
            { 0xB454E4A179DD1877,   375,  132 },
            { 0x865B86925B9BC5C2,   402,  140 },
            { 0xC83553C5C8965D3D,   428,  148 },
            { 0x952AB45CFA97A0B3,   455,  156 },
            { 0xDE469FBD99A05FE3,   481,  164 },
            { 0xA59BC234DB398C25,   508,  172 },
            { 0xF6C69A72A3989F5C,   534,  180 },
            { 0xB7DCBF5354E9BECE,   561,  188 },
            { 0x88FCF317F22241E2,   588,  196 },
            { 0xCC20CE9BD35C78A5,   614,  204 },
            { 0x98165AF37B2153DF,   641,  212 },
            { 0xE2A0B5DC971F303A,   667,  220 },
            { 0xA8D9D1535CE3B396,   694,  228 },
            { 0xFB9B7CD9A4A7443C,   720,  236 },
            { 0xBB764C4CA7A44410,   747,  244 },
            { 0x8BAB8EEFB6409C1A,   774,  252 },
            { 0xD01FEF10A657842C,   800,  260 },
            { 0x9B10A4E5E9913129,   827,  268 },
            { 0xE7109BFBA19C0C9D,   853,  276 },
            { 0xAC2820D9623BF429,   880,  284 },
            { 0x80444B5E7AA7CF85,   907,  292 },
            { 0xBF21E44003ACDD2D,   933,  300 },
            { 0x8E679C2F5E44FF8F,   960,  308 },
            { 0xD433179D9C8CB841,   986,  316 },
            { 0x9E19DB92B4E31BA9,  1013,  324 },
        }
    };

    // This computation gives exactly the same results for k as
    //      k = ceil((kAlpha - e - 1) * 0.30102999566398114)
    // for |e| <= 1500, but doesn't require floating-point operations.
    // NB: log_10(2) ~= 78913 / 2^18
    JSON_ASSERT(e >= -1500);
    JSON_ASSERT(e <=  1500);
    const int f = kAlpha - e - 1;
    // Integer division truncates toward zero, so 1 is added for positive f
    // to obtain the ceiling.
    const int k = (f * 78913) / (1 << 18) + static_cast<int>(f > 0);

    // Offset of k from the first table entry, divided by the step and rounded
    // up (the "+ step - 1" term), gives the table position.
    const int index = (-kCachedPowersMinDecExp + k + (kCachedPowersDecStep - 1)) / kCachedPowersDecStep;
    JSON_ASSERT(index >= 0);
    JSON_ASSERT(static_cast<std::size_t>(index) < kCachedPowers.size());

    const cached_power cached = kCachedPowers[static_cast<std::size_t>(index)];
    // Verify that the selected entry puts the product exponent into [alpha, gamma].
    JSON_ASSERT(kAlpha <= cached.e + e + 64);
    JSON_ASSERT(kGamma >= cached.e + e + 64);

    return cached;
}

/*!
Counts the decimal digits of n and reports the matching power of ten.

For n != 0, returns k, such that pow10 := 10^(k-1) <= n < 10^k.
For n == 0, returns 1 and sets pow10 := 1.

@param[in]  n      the number to inspect
@param[out] pow10  receives 10^(k-1)
@return k, a value in [1, 10]
*/
inline int find_largest_pow10(const std::uint32_t n, std::uint32_t& pow10)
{
    // Start at the largest power of ten representable in 32 bits and walk
    // downwards until it no longer exceeds n (or until 10^0 is reached).
    std::uint32_t candidate = 1000000000;
    int digits = 10;

    while (digits > 1 && n < candidate)
    {
        candidate /= 10;
        --digits;
    }

    pow10 = candidate;
    return digits;
}

/*!
Repeatedly decrements the last digit in buf while doing so moves the
represented number V closer to w without leaving the interval [M-, M+].

All quantities are significands sharing one implicit binary exponent.

@param buf    digit buffer; only buf[len - 1] is modified
@param len    number of digits in buf (>= 1)
@param dist   M+ - w
@param delta  M+ - M-
@param rest   M+ - V
@param ten_k  one unit in the last decimal place of buf (> 0)
*/
inline void grisu2_round(char* buf, int len, std::uint64_t dist, std::uint64_t delta,
                         std::uint64_t rest, std::uint64_t ten_k)
{
    JSON_ASSERT(len >= 1);
    JSON_ASSERT(dist <= delta);
    JSON_ASSERT(rest <= delta);
    JSON_ASSERT(ten_k > 0);

    //               <--------------------------- delta ---->
    //                                  <---- dist --------->
    // --------------[------------------+-------------------]--------------
    //               M-                 w                   M+
    //
    //                                  ten_k
    //                                <------>
    //                                       <---- rest ---->
    // --------------[------------------+----+--------------]--------------
    //                                  w    V
    //                                       = buf * 10^k

    // The checks are performed in exactly this order so that no unsigned
    // subtraction or addition can wrap around.
    for (;;)
    {
        // V is already at or below w: nothing to gain.
        if (rest >= dist)
        {
            break;
        }

        // Another step down would leave the interval at M-.
        if (delta - rest < ten_k)
        {
            break;
        }

        // Step only if the new V is still above w, or if it undershoots w
        // by less than the current V overshoots it.
        const bool still_above_w = rest + ten_k < dist;
        if (!still_above_w && dist - rest <= rest + ten_k - dist)
        {
            break;
        }

        JSON_ASSERT(buf[len - 1] != '0');
        buf[len - 1]--;
        rest += ten_k;
    }
}

/*!
Generates V = buffer * 10^decimal_exponent, such that M- <= V <= M+.
M- and M+ must be normalized and share the same exponent -60 <= e <= -32.

@param[out]    buffer            receives the decimal digits as characters
                                 '0'..'9' (no terminator is written here)
@param[in,out] length            write index into buffer; incremented once per
                                 generated digit. It is not reset here.
@param[in,out] decimal_exponent  adjusted by the number of integral digits that
                                 were dropped (+n) or fractional digits that
                                 were generated (-m)
@param[in]     M_minus           lower bound of the interval
@param[in]     w                 the value to approximate
@param[in]     M_plus            upper bound of the interval
*/
inline void grisu2_digit_gen(char* buffer, int& length, int& decimal_exponent,
                             diyfp M_minus, diyfp w, diyfp M_plus)
{
    static_assert(kAlpha >= -60, "internal error");
    static_assert(kGamma <= -32, "internal error");

    // Generates the digits (and the exponent) of a decimal floating-point
    // number V = buffer * 10^decimal_exponent in the range [M-, M+]. The diyfp's
    // w, M- and M+ share the same exponent e, which satisfies alpha <= e <= gamma.
    //
    //               <--------------------------- delta ---->
    //                                  <---- dist --------->
    // --------------[------------------+-------------------]--------------
    //               M-                 w                   M+
    //
    // Grisu2 generates the digits of M+ from left to right and stops as soon as
    // V is in [M-,M+].

    JSON_ASSERT(M_plus.e >= kAlpha);
    JSON_ASSERT(M_plus.e <= kGamma);

    std::uint64_t delta = diyfp::sub(M_plus, M_minus).f; // (significand of (M+ - M-), implicit exponent is e)
    std::uint64_t dist  = diyfp::sub(M_plus, w      ).f; // (significand of (M+ - w ), implicit exponent is e)

    // Split M+ = f * 2^e into two parts p1 and p2 (note: e < 0):
    //
    //      M+ = f * 2^e
    //         = ((f div 2^-e) * 2^-e + (f mod 2^-e)) * 2^e
    //         = ((p1        ) * 2^-e + (p2        )) * 2^e
    //         = p1 + p2 * 2^e

    // The number 1 expressed with exponent e; one.f - 1 is the mask for the
    // fractional bits.
    const diyfp one(std::uint64_t{1} << -M_plus.e, M_plus.e);

    auto p1 = static_cast<std::uint32_t>(M_plus.f >> -one.e); // p1 = f div 2^-e (Since -e >= 32, p1 fits into a 32-bit int.)
    std::uint64_t p2 = M_plus.f & (one.f - 1);                    // p2 = f mod 2^-e

    // 1)
    //
    // Generate the digits of the integral part p1 = d[n-1]...d[1]d[0]

    JSON_ASSERT(p1 > 0);

    std::uint32_t pow10{};
    const int k = find_largest_pow10(p1, pow10);

    //      10^(k-1) <= p1 < 10^k, pow10 = 10^(k-1)
    //
    //      p1 = (p1 div 10^(k-1)) * 10^(k-1) + (p1 mod 10^(k-1))
    //         = (d[k-1]         ) * 10^(k-1) + (p1 mod 10^(k-1))
    //
    //      M+ = p1                                             + p2 * 2^e
    //         = d[k-1] * 10^(k-1) + (p1 mod 10^(k-1))          + p2 * 2^e
    //         = d[k-1] * 10^(k-1) + ((p1 mod 10^(k-1)) * 2^-e + p2) * 2^e
    //         = d[k-1] * 10^(k-1) + (                         rest) * 2^e
    //
    // Now generate the digits d[n] of p1 from left to right (n = k-1,...,0)
    //
    //      p1 = d[k-1]...d[n] * 10^n + d[n-1]...d[0]
    //
    // but stop as soon as
    //
    //      rest * 2^e = (d[n-1]...d[0] * 2^-e + p2) * 2^e <= delta * 2^e

    int n = k;
    while (n > 0)
    {
        // Invariants:
        //      M+ = buffer * 10^n + (p1 + p2 * 2^e)    (buffer = 0 for n = k)
        //      pow10 = 10^(n-1) <= p1 < 10^n
        //
        const std::uint32_t d = p1 / pow10;  // d = p1 div 10^(n-1)
        const std::uint32_t r = p1 % pow10;  // r = p1 mod 10^(n-1)
        //
        //      M+ = buffer * 10^n + (d * 10^(n-1) + r) + p2 * 2^e
        //         = (buffer * 10 + d) * 10^(n-1) + (r + p2 * 2^e)
        //
        JSON_ASSERT(d <= 9);
        buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
        //
        //      M+ = buffer * 10^(n-1) + (r + p2 * 2^e)
        //
        p1 = r;
        n--;
        //
        //      M+ = buffer * 10^n + (p1 + p2 * 2^e)
        //      pow10 = 10^n
        //

        // Now check if enough digits have been generated.
        // Compute
        //
        //      p1 + p2 * 2^e = (p1 * 2^-e + p2) * 2^e = rest * 2^e
        //
        // Note:
        // Since rest and delta share the same exponent e, it suffices to
        // compare the significands.
        const std::uint64_t rest = (std::uint64_t{p1} << -one.e) + p2;
        if (rest <= delta)
        {
            // V = buffer * 10^n, with M- <= V <= M+.

            decimal_exponent += n;

            // We may now just stop. But instead look if the buffer could be
            // decremented to bring V closer to w.
            //
            // pow10 = 10^n is now 1 ulp in the decimal representation V.
            // The rounding procedure works with diyfp's with an implicit
            // exponent of e.
            //
            //      10^n = (10^n * 2^-e) * 2^e = ulp * 2^e
            //
            const std::uint64_t ten_n = std::uint64_t{pow10} << -one.e;
            grisu2_round(buffer, length, dist, delta, rest, ten_n);

            return;
        }

        pow10 /= 10;
        //
        //      pow10 = 10^(n-1) <= p1 < 10^n
        // Invariants restored.
    }

    // 2)
    //
    // The digits of the integral part have been generated:
    //
    //      M+ = d[k-1]...d[1]d[0] + p2 * 2^e
    //         = buffer            + p2 * 2^e
    //
    // Now generate the digits of the fractional part p2 * 2^e.
    //
    // Note:
    // No decimal point is generated: the exponent is adjusted instead.
    //
    // p2 actually represents the fraction
    //
    //      p2 * 2^e
    //          = p2 / 2^-e
    //          = d[-1] / 10^1 + d[-2] / 10^2 + ...
    //
    // Now generate the digits d[-m] of p2 from left to right (m = 1,2,...)
    //
    //      p2 * 2^e = d[-1]d[-2]...d[-m] * 10^-m
    //                      + 10^-m * (d[-m-1] / 10^1 + d[-m-2] / 10^2 + ...)
    //
    // using
    //
    //      10^m * p2 = ((10^m * p2) div 2^-e) * 2^-e + ((10^m * p2) mod 2^-e)
    //                = (                   d) * 2^-e + (                   r)
    //
    // or
    //      10^m * p2 * 2^e = d + r * 2^e
    //
    // i.e.
    //
    //      M+ = buffer + p2 * 2^e
    //         = buffer + 10^-m * (d + r * 2^e)
    //         = (buffer * 10^m + d) * 10^-m + 10^-m * r * 2^e
    //
    // and stop as soon as 10^-m * r * 2^e <= delta * 2^e

    // Had p2 <= delta held, the loop above would have returned after the last
    // integral digit (where rest == p2).
    JSON_ASSERT(p2 > delta);

    int m = 0;
    for (;;)
    {
        // Invariant:
        //      M+ = buffer * 10^-m + 10^-m * (d[-m-1] / 10 + d[-m-2] / 10^2 + ...) * 2^e
        //         = buffer * 10^-m + 10^-m * (p2                                 ) * 2^e
        //         = buffer * 10^-m + 10^-m * (1/10 * (10 * p2)                   ) * 2^e
        //         = buffer * 10^-m + 10^-m * (1/10 * ((10*p2 div 2^-e) * 2^-e + (10*p2 mod 2^-e)) * 2^e
        //
        JSON_ASSERT(p2 <= (std::numeric_limits<std::uint64_t>::max)() / 10);
        p2 *= 10;
        const std::uint64_t d = p2 >> -one.e;     // d = (10 * p2) div 2^-e
        const std::uint64_t r = p2 & (one.f - 1); // r = (10 * p2) mod 2^-e
        //
        //      M+ = buffer * 10^-m + 10^-m * (1/10 * (d * 2^-e + r) * 2^e
        //         = buffer * 10^-m + 10^-m * (1/10 * (d + r * 2^e))
        //         = (buffer * 10 + d) * 10^(-m-1) + 10^(-m-1) * r * 2^e
        //
        JSON_ASSERT(d <= 9);
        buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
        //
        //      M+ = buffer * 10^(-m-1) + 10^(-m-1) * r * 2^e
        //
        p2 = r;
        m++;
        //
        //      M+ = buffer * 10^-m + 10^-m * p2 * 2^e
        // Invariant restored.

        // Check if enough digits have been generated.
        //
        //      10^-m * p2 * 2^e <= delta * 2^e
        //              p2 * 2^e <= 10^m * delta * 2^e
        //                    p2 <= 10^m * delta
        delta *= 10;
        dist  *= 10;
        if (p2 <= delta)
        {
            break;
        }
    }

    // V = buffer * 10^-m, with M- <= V <= M+.

    decimal_exponent -= m;

    // 1 ulp in the decimal representation is now 10^-m.
    // Since delta and dist are now scaled by 10^m, we need to do the
    // same with ulp in order to keep the units in sync.
    //
    //      10^m * 10^-m = 1 = 2^-e * 2^e = ten_m * 2^e
    //
    const std::uint64_t ten_m = one.f;
    grisu2_round(buffer, length, dist, delta, p2, ten_m);

    // By construction this algorithm generates the shortest possible decimal
    // number (Loitsch, Theorem 6.2) which rounds back to w.
    // For an input number of precision p, at least
    //
    //      N = 1 + ceil(p * log_10(2))
    //
    // decimal digits are sufficient to identify all binary floating-point
    // numbers (Matula, "In-and-Out conversions").
    // This implies that the algorithm does not produce more than N decimal
    // digits.
    //
    //      N = 17 for p = 53 (IEEE double precision)
    //      N = 9  for p = 24 (IEEE single precision)
}

/*!
Scales v and its boundaries by a cached power of ten, shrinks the resulting
interval by one unit on each side and hands it to the digit generator.

v = buf * 10^decimal_exponent
len is the length of the buffer (number of decimal digits)
The buffer must be large enough, i.e. >= max_digits10.

@pre m_minus, v and m_plus share the same binary exponent
*/
JSON_HEDLEY_NON_NULL(1)
inline void grisu2(char* buf, int& len, int& decimal_exponent,
                   diyfp m_minus, diyfp v, diyfp m_plus)
{
    JSON_ASSERT(m_plus.e == m_minus.e);
    JSON_ASSERT(m_plus.e == v.e);

    //  --------(-----------------------+-----------------------)--------    (A)
    //          m-                      v                       m+
    //
    //  --------------------(-----------+-----------------------)--------    (B)
    //                      m-          v                       m+
    //
    // Step 1: pick c ~= 10^-k so that the exponents of the products below
    // fall into [alpha, gamma].
    const cached_power power = get_cached_power_for_binary_exponent(m_plus.e);
    const diyfp c_minus_k(power.f, power.e);

    // Step 2: scale all three numbers. The exponent of each product is
    // v.e + c_minus_k.e + q.
    const diyfp scaled_v     = diyfp::mul(v,       c_minus_k);
    const diyfp scaled_minus = diyfp::mul(m_minus, c_minus_k);
    const diyfp scaled_plus  = diyfp::mul(m_plus,  c_minus_k);

    //  ----(---+---)---------------(---+---)---------------(---+---)----
    //          w-                      w                       w+
    //          = c*m-                  = c*v                   = c*m+
    //
    // Both the multiplication and c itself are rounded, so each product is
    // off by less than 1 ulp. Step 3 compensates by moving the lower bound
    // up and the upper bound down by 1 ulp:
    //
    //  --------+---[---------------(---+---)---------------]---+--------
    //          w-  M-                  w                   M+  w+
    //
    // Every number in [M-, M+] (bounds included) then rounds to w when read
    // back, regardless of how the input rounding algorithm breaks ties. The
    // digit generator produces the shortest such number in [M-, M+], which
    // is not necessarily the shortest number in (m-, m+).
    decimal_exponent = -power.k; // = -(-k) = k

    grisu2_digit_gen(buf, len, decimal_exponent,
                     diyfp(scaled_minus.f + 1, scaled_minus.e),
                     scaled_v,
                     diyfp(scaled_plus.f - 1, scaled_plus.e));
}

/*!
Entry point for a floating-point input: computes the boundaries of value and
forwards them to the diyfp overload.

v = buf * 10^decimal_exponent
len is the length of the buffer (number of decimal digits)
The buffer must be large enough, i.e. >= max_digits10.

@pre value is finite and positive
*/
template<typename FloatType>
JSON_HEDLEY_NON_NULL(1)
void grisu2(char* buf, int& len, int& decimal_exponent, FloatType value)
{
    static_assert(diyfp::kPrecision >= std::numeric_limits<FloatType>::digits + 3,
                  "internal error: not enough precision");

    JSON_ASSERT(std::isfinite(value));
    JSON_ASSERT(value > 0);

    // Design note: the boundaries are computed in the precision of FloatType
    // itself, not after promoting the value to double.
    //
    // Computing them for double-precision numbers in all cases would make
    // every float recoverable with strtod (and strtof), but the resulting
    // decimal representations would not be exactly "short".
    //
    // The documentation for 'std::to_chars' (https://en.cppreference.com/w/cpp/utility/to_chars)
    // says "value is converted to a string as if by std::sprintf in the default ("C") locale",
    // and sprintf promotes float's to double's. It also requires that "parsing the
    // representation using the corresponding std::from_chars function recovers value exactly",
    // which indicates that single precision numbers should be recovered using 'std::strtof'.
    //
    // NB: With single-precision boundaries there is a single float
    //     (7.0385307e-26f) which can't be recovered using strtod. The resulting
    //     double precision value is off by 1 ulp.
    const boundaries bounds = compute_boundaries(value);

    grisu2(buf, len, decimal_exponent, bounds.minus, bounds.w, bounds.plus);
}

/*!
@brief writes a signed decimal exponent to buf

The sign is always written ('+' or '-'), followed by at least two digits.

@param[in,out] buf  position to write to
@param[in]     e    exponent to write
@return a pointer to the element following the exponent.
@pre -1000 < e < 1000
*/
JSON_HEDLEY_NON_NULL(1)
JSON_HEDLEY_RETURNS_NON_NULL
inline char* append_exponent(char* buf, int e)
{
    JSON_ASSERT(e > -1000);
    JSON_ASSERT(e <  1000);

    const bool negative = e < 0;
    *buf++ = negative ? '-' : '+';

    const auto magnitude = static_cast<std::uint32_t>(negative ? -e : e);

    // A leading hundreds digit is only present for three-digit exponents.
    if (magnitude >= 100)
    {
        *buf++ = static_cast<char>('0' + magnitude / 100);
    }

    // The tens digit is always printed (it is '0' for magnitudes below 10).
    // This is for compatibility with printf("%g").
    *buf++ = static_cast<char>('0' + (magnitude / 10) % 10);
    *buf++ = static_cast<char>('0' + magnitude % 10);

    return buf;
}

/*!
@brief rewrites the digits in buf, in place, as a readable number

On entry buf holds len decimal digits and v = buf * 10^decimal_exponent.
With n = len + decimal_exponent (the position of the decimal point relative
to the first digit), one of four layouts is chosen:

- len <= n <= max_exp:    digits[000].0
- 0 < n <= max_exp:       dig.its
- min_exp < n <= 0:       0.[000]digits
- otherwise:              d[.igits]e+XX (exponential notation)

In other words, if v is in the range [10^min_exp, 10^max_exp) it is printed
in fixed-point notation, otherwise in exponential notation.

@return a pointer to the element following the formatted number
@pre min_exp < 0
@pre max_exp > 0
*/
JSON_HEDLEY_NON_NULL(1)
JSON_HEDLEY_RETURNS_NON_NULL
inline char* format_buffer(char* buf, int len, int decimal_exponent,
                           int min_exp, int max_exp)
{
    JSON_ASSERT(min_exp < 0);
    JSON_ASSERT(max_exp > 0);

    // position of the decimal point relative to the start of the digits
    const int point = len + decimal_exponent;
    const bool within_max = point <= max_exp;

    if (within_max && len <= point)
    {
        // digits[000]
        // written length <= max_exp + 2

        const auto int_width = static_cast<size_t>(point);
        std::memset(buf + len, '0', int_width - static_cast<size_t>(len));
        // Make it look like a floating-point number (#362, #378)
        buf[point] = '.';
        buf[point + 1] = '0';
        return buf + (int_width + 2);
    }

    if (within_max && point > 0)
    {
        // dig.its
        // written length <= max_digits10 + 1

        JSON_ASSERT(len > point);

        // open a one-character gap for the decimal point
        char* const fraction = buf + point;
        std::memmove(fraction + 1, fraction, static_cast<size_t>(len) - static_cast<size_t>(point));
        *fraction = '.';
        return buf + (static_cast<size_t>(len) + 1U);
    }

    if (point <= 0 && min_exp < point)
    {
        // 0.[000]digits
        // written length <= 2 + (-min_exp - 1) + max_digits10

        const auto zeros = static_cast<size_t>(-point);
        const auto digits = static_cast<size_t>(len);

        std::memmove(buf + (2 + zeros), buf, digits);
        buf[0] = '0';
        buf[1] = '.';
        std::memset(buf + 2, '0', zeros);
        return buf + (2U + zeros + digits);
    }

    // exponential notation
    char* cursor = buf + 1;
    if (len != 1)
    {
        // d.igitsE+123
        // written length <= max_digits10 + 1 + 5

        std::memmove(buf + 2, buf + 1, static_cast<size_t>(len) - 1);
        buf[1] = '.';
        cursor = buf + (1 + static_cast<size_t>(len));
    }
    // else: dE+123, written length <= 1 + 5

    *cursor++ = 'e';
    return append_exponent(cursor, point - 1);
}

} // namespace dtoa_impl

/*!
@brief generates a decimal representation of the floating-point number value in [first, last).

A leading '-' is written when the sign bit of value is set (this includes
negative zero). Zero is written as "0.0". Any other value is converted to
decimal digits by dtoa_impl::grisu2 and laid out by dtoa_impl::format_buffer,
giving a result similar to printf's %g format.

@param[in,out] first  start of the output buffer
@param[in]     last   end of the output buffer (only used for assertions)
@param[in]     value  number to convert
@return an iterator pointing past-the-end of the decimal representation.

@note The input number must be finite, i.e. NaN's and Inf's are not supported.
@note The buffer must be large enough.
@note The result is NOT null-terminated.
*/
template<typename FloatType>
JSON_HEDLEY_NON_NULL(1, 2)
JSON_HEDLEY_RETURNS_NON_NULL
char* to_chars(char* first, const char* last, FloatType value)
{
    using limits = std::numeric_limits<FloatType>;

    static_cast<void>(last); // maybe unused - fix warning
    JSON_ASSERT(std::isfinite(value));

    // signbit(value) is used instead of (value < 0) because it also works for -0.
    if (std::signbit(value))
    {
        *first = '-';
        ++first;
        value = -value;
    }

    if (value == 0) // +-0
    {
        // "0.0" rather than "0": make it look like a floating-point number (#362, #378)
        first[0] = '0';
        first[1] = '.';
        first[2] = '0';
        return first + 3;
    }

    JSON_ASSERT(last - first >= limits::max_digits10);

    // Compute value = digits * 10^exponent10. The digits are stored at first
    // and are to be read as an unsigned decimal integer of digit_count digits.
    int digit_count = 0;
    int exponent10 = 0;
    dtoa_impl::grisu2(first, digit_count, exponent10, value);

    JSON_ASSERT(digit_count <= limits::max_digits10);

    // Format the buffer like printf("%.*g", prec, value)
    constexpr int kMinExp = -4;
    // Use digits10 here to increase compatibility with version 2.
    constexpr int kMaxExp = limits::digits10;

    // one assertion per layout format_buffer may produce
    JSON_ASSERT(last - first >= kMaxExp + 2);
    JSON_ASSERT(last - first >= 2 + (-kMinExp - 1) + limits::max_digits10);
    JSON_ASSERT(last - first >= limits::max_digits10 + 6);

    return dtoa_impl::format_buffer(first, digit_count, exponent10, kMinExp, kMaxExp);
}

} // namespace detail
} // namespace nlohmann

// #include <nlohmann/detail/exceptions.hpp>

// #include <nlohmann/detail/macro_scope.hpp>

// #include <nlohmann/detail/meta/cpp_future.hpp>

// #include <nlohmann/detail/output/binary_writer.hpp>

// #include <nlohmann/detail/output/output_adapters.hpp>

// #include <nlohmann/detail/value_t.hpp>


namespace nlohmann
{
namespace detail
{
///////////////////
// serialization //
///////////////////

/*!
@brief how to treat decoding errors

Selects what the serializer does when a string it has to escape contains
bytes that cannot be decoded as UTF-8. The value is handed to the serializer
constructor, which uses `strict` when nothing else is given.
*/
enum class error_handler_t
{
    strict,  ///< throw a type_error exception in case of invalid UTF-8
    replace, ///< replace invalid UTF-8 sequences with U+FFFD
    ignore   ///< ignore invalid UTF-8 sequences
};

template<typename BasicJsonType>
class serializer
{
    using string_t = typename BasicJsonType::string_t;
    using number_float_t = typename BasicJsonType::number_float_t;
    using number_integer_t = typename BasicJsonType::number_integer_t;
    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
    using binary_char_t = typename BasicJsonType::binary_t::value_type;
    static constexpr std::uint8_t UTF8_ACCEPT = 0;
    static constexpr std::uint8_t UTF8_REJECT = 1;

  public:
    /*!
    @brief create a serializer that writes to the given output

    The locale data returned by std::localeconv() is looked up once here. Of
    its thousands separator and decimal point strings only the first
    character is kept; '\0' is stored if the respective pointer is null.
    The indentation buffer initially holds 512 copies of @a ichar.

    @param[in] s  output stream to serialize to
    @param[in] ichar  indentation character to use
    @param[in] error_handler_  how to react on decoding errors
                               (defaults to error_handler_t::strict)
    */
    serializer(output_adapter_t<char> s, const char ichar,
               error_handler_t error_handler_ = error_handler_t::strict)
        : o(std::move(s))
        // loc is read by the two initializers that follow
        , loc(std::localeconv())
        , thousands_sep(loc->thousands_sep == nullptr ? '\0' : std::char_traits<char>::to_char_type(* (loc->thousands_sep)))
        , decimal_point(loc->decimal_point == nullptr ? '\0' : std::char_traits<char>::to_char_type(* (loc->decimal_point)))
        , indent_char(ichar)
        // pre-filled indentation; dump() writes prefixes of this string
        , indent_string(512, indent_char)
        , error_handler(error_handler_)
    {}

    // Copying and moving are disabled because of pointer members (for
    // example the locale data pointer obtained in the constructor); the
    // destructor is the compiler-generated one.
    serializer(const serializer&) = delete;
    serializer& operator=(const serializer&) = delete;
    serializer(serializer&&) = delete;
    serializer& operator=(serializer&&) = delete;
    ~serializer() = default;

    /*!
    @brief internal implementation of the serialization function

    This function is called by the public member function dump and organizes
    the serialization internally. The indentation level is propagated as
    additional parameter. In case of arrays and objects, the function is
    called recursively.

    - strings and object keys are escaped using `escape_string()`
    - integer numbers are converted implicitly via `operator<<`
    - floating-point numbers are converted to a string using `"%g"` format
    - binary values are serialized as objects containing the subtype and the
      byte array

    @param[in] val               value to serialize
    @param[in] pretty_print      whether the output shall be pretty-printed
    @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters
    in the output are escaped with `\uXXXX` sequences, and the result consists
    of ASCII characters only.
    @param[in] indent_step       the indent level
    @param[in] current_indent    the current indent level (only used internally)
    */
    void dump(const BasicJsonType& val,
              const bool pretty_print,
              const bool ensure_ascii,
              const unsigned int indent_step,
              const unsigned int current_indent = 0)
    {
        switch (val.m_type)
        {
            case value_t::object:
            {
                if (val.m_value.object->empty())
                {
                    o->write_characters("{}", 2);
                    return;
                }

                if (pretty_print)
                {
                    o->write_characters("{\n", 2);

                    // variable to hold indentation for recursive calls
                    const auto new_indent = current_indent + indent_step;
                    if (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent))
                    {
                        indent_string.resize(indent_string.size() * 2, ' ');
                    }

                    // first n-1 elements
                    auto i = val.m_value.object->cbegin();
                    for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i)
                    {
                        o->write_characters(indent_string.c_str(), new_indent);
                        o->write_character('\"');
                        dump_escaped(i->first, ensure_ascii);
                        o->write_characters("\": ", 3);
                        dump(i->second, true, ensure_ascii, indent_step, new_indent);
                        o->write_characters(",\n", 2);
                    }

                    // last element
                    JSON_ASSERT(i != val.m_value.object->cend());
                    JSON_ASSERT(std::next(i) == val.m_value.object->cend());
                    o->write_characters(indent_string.c_str(), new_indent);
                    o->write_character('\"');
                    dump_escaped(i->first, ensure_ascii);
                    o->write_characters("\": ", 3);
                    dump(i->second, true, ensure_ascii, indent_step, new_indent);

                    o->write_character('\n');
                    o->write_characters(indent_string.c_str(), current_indent);
                    o->write_character('}');
                }
                else
                {
                    o->write_character('{');

                    // first n-1 elements
                    auto i = val.m_value.object->cbegin();
                    for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i)
                    {
                        o->write_character('\"');
                        dump_escaped(i->first, ensure_ascii);
                        o->write_characters("\":", 2);
                        dump(i->second, false, ensure_ascii, indent_step, current_indent);
                        o->write_character(',');
                    }

                    // last element
                    JSON_ASSERT(i != val.m_value.object->cend());
                    JSON_ASSERT(std::next(i) == val.m_value.object->cend());
                    o->write_character('\"');
                    dump_escaped(i->first, ensure_ascii);
                    o->write_characters("\":", 2);
                    dump(i->second, false, ensure_ascii, indent_step, current_indent);

                    o->write_character('}');
                }

                return;
            }

            case value_t::array:
            {
                if (val.m_value.array->empty())
                {
                    o->write_characters("[]", 2);
                    return;
                }

                if (pretty_print)
                {
                    o->write_characters("[\n", 2);

                    // variable to hold indentation for recursive calls
                    const auto new_indent = current_indent + indent_step;
                    if (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent))
                    {
                        indent_string.resize(indent_string.size() * 2, ' ');
                    }

                    // first n-1 elements
                    for (auto i = val.m_value.array->cbegin();
                            i != val.m_value.array->cend() - 1; ++i)
                    {
                        o->write_characters(indent_string.c_str(), new_indent);
                        dump(*i, true, ensure_ascii, indent_step, new_indent);
                        o->write_characters(",\n", 2);
                    }

                    // last element
                    JSON_ASSERT(!val.m_value.array->empty());
                    o->write_characters(indent_string.c_str(), new_indent);
                    dump(val.m_value.array->back(), true, ensure_ascii, indent_step, new_indent);

                    o->write_character('\n');
                    o->write_characters(indent_string.c_str(), current_indent);
                    o->write_character(']');
                }
                else
                {
                    o->write_character('[');

                    // first n-1 elements
                    for (auto i = val.m_value.array->cbegin();
                            i != val.m_value.array->cend() - 1; ++i)
                    {
                        dump(*i, false, ensure_ascii, indent_step, current_indent);
                        o->write_character(',');
                    }

                    // last element
                    JSON_ASSERT(!val.m_value.array->empty());
                    dump(val.m_value.array->back(), false, ensure_ascii, indent_step, current_indent);

                    o->write_character(']');
                }

                return;
            }

            case value_t::string:
            {
                o->write_character('\"');
                dump_escaped(*val.m_value.string, ensure_ascii);
                o->write_character('\"');
                return;
            }

            case value_t::binary:
            {
                if (pretty_print)
                {
                    o->write_characters("{\n", 2);

                    // variable to hold indentation for recursive calls
                    const auto new_indent = current_indent + indent_step;
                    if (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent))
                    {
                        indent_string.resize(indent_string.size() * 2, ' ');
                    }

                    o->write_characters(indent_string.c_str(), new_indent);

                    o->write_characters("\"bytes\": [", 10);

                    if (!val.m_value.binary->empty())
                    {
                        for (auto i = val.m_value.binary->cbegin();
                                i != val.m_value.binary->cend() - 1; ++i)
                        {
                            dump_integer(*i);
                            o->write_characters(", ", 2);
                        }
                        dump_integer(val.m_value.binary->back());
                    }

                    o->write_characters("],\n", 3);
                    o->write_characters(indent_string.c_str(), new_indent);

                    o->write_characters("\"subtype\": ", 11);
                    if (val.m_value.binary->has_subtype())
                    {
                        dump_integer(val.m_value.binary->subtype());
                    }
                    else
                    {
                        o->write_characters("null", 4);
                    }
                    o->write_character('\n');
                    o->write_characters(indent_string.c_str(), current_indent);
                    o->write_character('}');
                }
                else
                {
                    o->write_characters("{\"bytes\":[", 10);

                    if (!val.m_value.binary->empty())
                    {
                        for (auto i = val.m_value.binary->cbegin();
                                i != val.m_value.binary->cend() - 1; ++i)
                        {
                            dump_integer(*i);
                            o->write_character(',');
                        }
                        dump_integer(val.m_value.binary->back());
                    }

                    o->write_characters("],\"subtype\":", 12);
                    if (val.m_value.binary->has_subtype())
                    {
                        dump_integer(val.m_value.binary->subtype());
                        o->write_character('}');
                    }
                    else
                    {
                        o->write_characters("null}", 5);
                    }
                }
                return;
            }

            case value_t::boolean:
            {
                if (val.m_value.boolean)
                {
                    o->write_characters("true", 4);
                }
                else
                {
                    o->write_characters("false", 5);
                }
                return;
            }

            case value_t::number_integer:
            {
                dump_integer(val.m_value.number_integer);
                return;
            }

            case value_t::number_unsigned:
            {
                dump_integer(val.m_value.number_unsigned);
                return;
            }

            case value_t::number_float:
            {
                dump_float(val.m_value.number_float);
                return;
            }

            case value_t::discarded:
            {
                o->write_characters("<discarded>", 11);
                return;
            }

            case value_t::null:
            {
                o->write_characters("null", 4);
                return;
            }

            default:            // LCOV_EXCL_LINE
                JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
        }
    }

  JSON_PRIVATE_UNLESS_TESTED:
    /*!
    @brief dump escaped string

    Escape a string by replacing certain special characters by a sequence of an
    escape character (backslash) and another character and other control
    characters by a sequence of "\u" followed by a four-digit hex
    representation. The escaped string is written to output stream @a o.

    The input is decoded byte by byte. Output is staged in @a string_buffer
    and flushed to @a o whenever fewer than 13 bytes of the buffer remain
    free, and once more after the last byte. Bytes that cannot be decoded are
    treated according to @a error_handler: `strict` throws, `ignore` drops
    them, and `replace` writes U+FFFD instead (as `\ufffd` if @a ensure_ascii
    is set, as the bytes EF BF BD otherwise).

    @param[in] s  the string to escape
    @param[in] ensure_ascii  whether to escape non-ASCII characters with
                             \uXXXX sequences (code points >= 0x7F)

    @throw type_error.316 if @a error_handler is `strict` and @a s contains an
           invalid UTF-8 byte or ends within a multi-byte sequence

    @complexity Linear in the length of string @a s.
    */
    void dump_escaped(const string_t& s, const bool ensure_ascii)
    {
        std::uint32_t codepoint{};
        std::uint8_t state = UTF8_ACCEPT;
        std::size_t bytes = 0;  // number of bytes written to string_buffer

        // number of bytes written at the point of the last valid byte
        std::size_t bytes_after_last_accept = 0;
        // number of bytes consumed since the last completed code point
        std::size_t undumped_chars = 0;

        for (std::size_t i = 0; i < s.size(); ++i)
        {
            const auto byte = static_cast<uint8_t>(s[i]);

            switch (decode(state, codepoint, byte))
            {
                case UTF8_ACCEPT:  // decode found a new code point
                {
                    switch (codepoint)
                    {
                        case 0x08: // backspace
                        {
                            string_buffer[bytes++] = '\\';
                            string_buffer[bytes++] = 'b';
                            break;
                        }

                        case 0x09: // horizontal tab
                        {
                            string_buffer[bytes++] = '\\';
                            string_buffer[bytes++] = 't';
                            break;
                        }

                        case 0x0A: // newline
                        {
                            string_buffer[bytes++] = '\\';
                            string_buffer[bytes++] = 'n';
                            break;
                        }

                        case 0x0C: // formfeed
                        {
                            string_buffer[bytes++] = '\\';
                            string_buffer[bytes++] = 'f';
                            break;
                        }

                        case 0x0D: // carriage return
                        {
                            string_buffer[bytes++] = '\\';
                            string_buffer[bytes++] = 'r';
                            break;
                        }

                        case 0x22: // quotation mark
                        {
                            string_buffer[bytes++] = '\\';
                            string_buffer[bytes++] = '\"';
                            break;
                        }

                        case 0x5C: // reverse solidus
                        {
                            string_buffer[bytes++] = '\\';
                            string_buffer[bytes++] = '\\';
                            break;
                        }

                        default:
                        {
                            // escape control characters (0x00..0x1F) or, if
                            // ensure_ascii parameter is used, non-ASCII characters
                            if ((codepoint <= 0x1F) || (ensure_ascii && (codepoint >= 0x7F)))
                            {
                                if (codepoint <= 0xFFFF)
                                {
                                    // a single \uXXXX escape; snprintf also writes a
                                    // terminating '\0' which is not counted in bytes
                                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
                                    (std::snprintf)(string_buffer.data() + bytes, 7, "\\u%04x",
                                                    static_cast<std::uint16_t>(codepoint));
                                    bytes += 6;
                                }
                                else
                                {
                                    // code points above 0xFFFF are written as a
                                    // UTF-16 surrogate pair; 0xD7C0 is 0xD800 - (0x10000 >> 10)
                                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
                                    (std::snprintf)(string_buffer.data() + bytes, 13, "\\u%04x\\u%04x",
                                                    static_cast<std::uint16_t>(0xD7C0u + (codepoint >> 10u)),
                                                    static_cast<std::uint16_t>(0xDC00u + (codepoint & 0x3FFu)));
                                    bytes += 12;
                                }
                            }
                            else
                            {
                                // copy the byte to the buffer unchanged; when
                                // ensure_ascii is false, the leading bytes of a
                                // multi-byte code point were already copied by
                                // the incomplete-sequence branch further below
                                string_buffer[bytes++] = s[i];
                            }
                            break;
                        }
                    }

                    // write buffer and reset index; there must be 13 bytes
                    // left, as this is the maximal number of bytes to be
                    // written ("\uxxxx\uxxxx\0") for one code point
                    if (string_buffer.size() - bytes < 13)
                    {
                        o->write_characters(string_buffer.data(), bytes);
                        bytes = 0;
                    }

                    // remember the byte position of this accept
                    bytes_after_last_accept = bytes;
                    undumped_chars = 0;
                    break;
                }

                case UTF8_REJECT:  // decode found invalid UTF-8 byte
                {
                    switch (error_handler)
                    {
                        case error_handler_t::strict:
                        {
                            // report the offending byte as two hex digits
                            std::string sn(3, '\0');
                            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
                            (std::snprintf)(&sn[0], sn.size(), "%.2X", byte);
                            JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn, BasicJsonType()));
                        }

                        case error_handler_t::ignore:
                        case error_handler_t::replace:
                        {
                            // in case we saw this character the first time, we
                            // would like to read it again, because the byte
                            // may be OK for itself, but just not OK for the
                            // previous sequence
                            if (undumped_chars > 0)
                            {
                                --i;
                            }

                            // reset length buffer to the last accepted index;
                            // thus removing/ignoring the invalid characters
                            bytes = bytes_after_last_accept;

                            if (error_handler == error_handler_t::replace)
                            {
                                // add a replacement character
                                if (ensure_ascii)
                                {
                                    string_buffer[bytes++] = '\\';
                                    string_buffer[bytes++] = 'u';
                                    string_buffer[bytes++] = 'f';
                                    string_buffer[bytes++] = 'f';
                                    string_buffer[bytes++] = 'f';
                                    string_buffer[bytes++] = 'd';
                                }
                                else
                                {
                                    // U+FFFD encoded as UTF-8
                                    string_buffer[bytes++] = detail::binary_writer<BasicJsonType, char>::to_char_type('\xEF');
                                    string_buffer[bytes++] = detail::binary_writer<BasicJsonType, char>::to_char_type('\xBF');
                                    string_buffer[bytes++] = detail::binary_writer<BasicJsonType, char>::to_char_type('\xBD');
                                }

                                // write buffer and reset index; there must be 13 bytes
                                // left, as this is the maximal number of bytes to be
                                // written ("\uxxxx\uxxxx\0") for one code point
                                if (string_buffer.size() - bytes < 13)
                                {
                                    o->write_characters(string_buffer.data(), bytes);
                                    bytes = 0;
                                }

                                bytes_after_last_accept = bytes;
                            }

                            undumped_chars = 0;

                            // continue processing the string
                            state = UTF8_ACCEPT;
                            break;
                        }

                        default:            // LCOV_EXCL_LINE
                            JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
                    }
                    break;
                }

                default:  // decode found yet incomplete multi-byte code point
                {
                    if (!ensure_ascii)
                    {
                        // code point will not be escaped - copy byte to buffer
                        string_buffer[bytes++] = s[i];
                    }
                    ++undumped_chars;
                    break;
                }
            }
        }

        // we finished processing the string
        if (JSON_HEDLEY_LIKELY(state == UTF8_ACCEPT))
        {
            // write buffer
            if (bytes > 0)
            {
                o->write_characters(string_buffer.data(), bytes);
            }
        }
        else
        {
            // we finish reading, but do not accept: string was incomplete
            switch (error_handler)
            {
                case error_handler_t::strict:
                {
                    // report the last byte of the string as two hex digits
                    std::string sn(3, '\0');
                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
                    (std::snprintf)(&sn[0], sn.size(), "%.2X", static_cast<std::uint8_t>(s.back()));
                    JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn, BasicJsonType()));
                }

                case error_handler_t::ignore:
                {
                    // write all accepted bytes
                    o->write_characters(string_buffer.data(), bytes_after_last_accept);
                    break;
                }

                case error_handler_t::replace:
                {
                    // write all accepted bytes
                    o->write_characters(string_buffer.data(), bytes_after_last_accept);
                    // add a replacement character
                    if (ensure_ascii)
                    {
                        o->write_characters("\\ufffd", 6);
                    }
                    else
                    {
                        o->write_characters("\xEF\xBF\xBD", 3);
                    }
                    break;
                }

                default:            // LCOV_EXCL_LINE
                    JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
            }
        }
    }

  private:
    /*!
    @brief count digits

    Count the number of decimal (base 10) digits of an unsigned integer.
    Four digits are stripped per iteration for as long as the value has more
    than four digits; the remaining one to four digits are then classified
    directly.

    @param[in] x  unsigned integer number to count its digits
    @return    number of decimal digits (1 for x == 0)
    */
    inline unsigned int count_digits(number_unsigned_t x) noexcept
    {
        // digits accounted for by the groups of four already stripped
        unsigned int stripped = 0;
        while (x >= 10000)
        {
            x = x / 10000u;
            stripped += 4;
        }

        // x is now below 10000, i.e. it has between one and four digits
        if (x >= 1000)
        {
            return stripped + 4;
        }
        if (x >= 100)
        {
            return stripped + 3;
        }
        if (x >= 10)
        {
            return stripped + 2;
        }
        return stripped + 1;
    }

    /*!
    @brief dump an integer

    Dump a given integer to output stream @a o. Works internally with
    @a number_buffer.

    @param[in] x  integer number (signed or unsigned) to dump
    @tparam NumberType either @a number_integer_t or @a number_unsigned_t
    */
    template < typename NumberType, detail::enable_if_t <
                   std::is_same<NumberType, number_unsigned_t>::value ||
                   std::is_same<NumberType, number_integer_t>::value ||
                   std::is_same<NumberType, binary_char_t>::value,
                   int > = 0 >
    void dump_integer(NumberType x)
    {
        static constexpr std::array<std::array<char, 2>, 100> digits_to_99
        {
            {
                {{'0', '0'}}, {{'0', '1'}}, {{'0', '2'}}, {{'0', '3'}}, {{'0', '4'}}, {{'0', '5'}}, {{'0', '6'}}, {{'0', '7'}}, {{'0', '8'}}, {{'0', '9'}},
                {{'1', '0'}}, {{'1', '1'}}, {{'1', '2'}}, {{'1', '3'}}, {{'1', '4'}}, {{'1', '5'}}, {{'1', '6'}}, {{'1', '7'}}, {{'1', '8'}}, {{'1', '9'}},
                {{'2', '0'}}, {{'2', '1'}}, {{'2', '2'}}, {{'2', '3'}}, {{'2', '4'}}, {{'2', '5'}}, {{'2', '6'}}, {{'2', '7'}}, {{'2', '8'}}, {{'2', '9'}},
                {{'3', '0'}}, {{'3', '1'}}, {{'3', '2'}}, {{'3', '3'}}, {{'3', '4'}}, {{'3', '5'}}, {{'3', '6'}}, {{'3', '7'}}, {{'3', '8'}}, {{'3', '9'}},
                {{'4', '0'}}, {{'4', '1'}}, {{'4', '2'}}, {{'4', '3'}}, {{'4', '4'}}, {{'4', '5'}}, {{'4', '6'}}, {{'4', '7'}}, {{'4', '8'}}, {{'4', '9'}},
                {{'5', '0'}}, {{'5', '1'}}, {{'5', '2'}}, {{'5', '3'}}, {{'5', '4'}}, {{'5', '5'}}, {{'5', '6'}}, {{'5', '7'}}, {{'5', '8'}}, {{'5', '9'}},
                {{'6', '0'}}, {{'6', '1'}}, {{'6', '2'}}, {{'6', '3'}}, {{'6', '4'}}, {{'6', '5'}}, {{'6', '6'}}, {{'6', '7'}}, {{'6', '8'}}, {{'6', '9'}},
                {{'7', '0'}}, {{'7', '1'}}, {{'7', '2'}}, {{'7', '3'}}, {{'7', '4'}}, {{'7', '5'}}, {{'7', '6'}}, {{'7', '7'}}, {{'7', '8'}}, {{'7', '9'}},
                {{'8', '0'}}, {{'8', '1'}}, {{'8', '2'}}, {{'8', '3'}}, {{'8', '4'}}, {{'8', '5'}}, {{'8', '6'}}, {{'8', '7'}}, {{'8', '8'}}, {{'8', '9'}},
                {{'9', '0'}}, {{'9', '1'}}, {{'9', '2'}}, {{'9', '3'}}, {{'9', '4'}}, {{'9', '5'}}, {{'9', '6'}}, {{'9', '7'}}, {{'9', '8'}}, {{'9', '9'}},
            }
        };

        // special case for "0"
        if (x == 0)
        {
            o->write_character('0');
            return;
        }

        // use a pointer to fill the buffer
        auto buffer_ptr = number_buffer.begin(); // NOLINT(llvm-qualified-auto,readability-qualified-auto,cppcoreguidelines-pro-type-vararg,hicpp-vararg)

        const bool is_negative = std::is_same<NumberType, number_integer_t>::value && !(x >= 0); // see issue #755
        number_unsigned_t abs_value;

        unsigned int n_chars{};

        if (is_negative)
        {
            *buffer_ptr = '-';
            abs_value = remove_sign(static_cast<number_integer_t>(x));

            // account one more byte for the minus sign
            n_chars = 1 + count_digits(abs_value);
        }
        else
        {
            abs_value = static_cast<number_unsigned_t>(x);
            n_chars = count_digits(abs_value);
        }

        // spare 1 byte for '\0'
        JSON_ASSERT(n_chars < number_buffer.size() - 1);

        // jump to the end to generate the string from backward
        // so we later avoid reversing the result
        buffer_ptr += n_chars;

        // Fast int2ascii implementation inspired by "Fastware" talk by Andrei Alexandrescu
        // See: https://www.youtube.com/watch?v=o4-CwDo2zpg
        while (abs_value >= 100)
        {
            const auto digits_index = static_cast<unsigned>((abs_value % 100));
            abs_value /= 100;
            *(--buffer_ptr) = digits_to_99[digits_index][1];
            *(--buffer_ptr) = digits_to_99[digits_index][0];
        }

        if (abs_value >= 10)
        {
            const auto digits_index = static_cast<unsigned>(abs_value);
            *(--buffer_ptr) = digits_to_99[digits_index][1];
            *(--buffer_ptr) = digits_to_99[digits_index][0];
        }
        else
        {
            *(--buffer_ptr) = static_cast<char>('0' + abs_value);
        }

        o->write_characters(number_buffer.data(), n_chars);
    }

    /*!
    @brief dump a floating-point number

    Writes @a x to output stream @a o. Non-finite values (NaN, infinity) are
    written as the literal `null`; every other value is handed to one of the
    two tag-dispatched overloads, which work with @a number_buffer.

    @param[in] x  floating-point number to dump
    */
    void dump_float(number_float_t x)
    {
        using float_limits = std::numeric_limits<number_float_t>;

        // If number_float_t is an IEEE-754 single or double precision number,
        // the Grisu2 algorithm is used to produce short numbers which are
        // guaranteed to round-trip, using strtof and strtod, resp.
        //
        // NB: The tests below work if <long double> == <double>.
        static constexpr bool is_ieee_single =
            float_limits::is_iec559 && float_limits::digits == 24 && float_limits::max_exponent == 128;
        static constexpr bool is_ieee_double =
            float_limits::is_iec559 && float_limits::digits == 53 && float_limits::max_exponent == 1024;

        if (std::isfinite(x))
        {
            dump_float(x, std::integral_constant < bool, is_ieee_single || is_ieee_double > ());
            return;
        }

        // NaN / inf
        o->write_characters("null", 4);
    }

    /// dump an IEEE-754 single/double: format into number_buffer with
    /// detail::to_chars and write the produced characters to the output
    void dump_float(number_float_t x, std::true_type /*is_ieee_single_or_double*/)
    {
        char* const first = number_buffer.data();
        char* const last = ::nlohmann::detail::to_chars(first, first + number_buffer.size(), x);
        const auto written = static_cast<size_t>(last - first);

        o->write_characters(first, written);
    }

    /*!
    @brief dump a floating-point number using std::snprintf

    Overload selected when @a number_float_t is not detected as an IEEE-754
    single or double. The value is printed into @a number_buffer with `%.*g`
    and `max_digits10` significant digits. Afterwards the locale's thousands
    separator is removed, the locale's decimal point is replaced by '.', the
    text is written to @a o, and ".0" is appended if the text contains
    neither '.' nor 'e'.

    @param[in] x  floating-point number to dump
    */
    void dump_float(number_float_t x, std::false_type /*is_ieee_single_or_double*/)
    {
        // get number of digits for a float -> text -> float round-trip
        static constexpr auto d = std::numeric_limits<number_float_t>::max_digits10;

        // the actual conversion
        // NOTE(review): "%.*g" expects a double argument; confirm behavior if
        // number_float_t is ever long double
        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
        std::ptrdiff_t len = (std::snprintf)(number_buffer.data(), number_buffer.size(), "%.*g", d, x);

        // negative value indicates an error
        JSON_ASSERT(len > 0);
        // check if buffer was large enough
        JSON_ASSERT(static_cast<std::size_t>(len) < number_buffer.size());

        // erase thousands separator
        if (thousands_sep != '\0')
        {
            // std::remove returns an iterator of number_buffer; the iterator
            // type of std::array is not required to be a raw pointer, so it
            // must be deduced with plain auto rather than auto*
            // NOLINTNEXTLINE(readability-qualified-auto,llvm-qualified-auto)
            const auto end = std::remove(number_buffer.begin(),
                                         number_buffer.begin() + len, thousands_sep);
            // clear the tail that std::remove left behind
            std::fill(end, number_buffer.end(), '\0');
            JSON_ASSERT((end - number_buffer.begin()) <= len);
            len = (end - number_buffer.begin());
        }

        // convert decimal point to '.'
        if (decimal_point != '\0' && decimal_point != '.')
        {
            // std::find returns an iterator as well, see the note above
            // NOLINTNEXTLINE(readability-qualified-auto,llvm-qualified-auto)
            const auto dec_pos = std::find(number_buffer.begin(), number_buffer.end(), decimal_point);
            if (dec_pos != number_buffer.end())
            {
                *dec_pos = '.';
            }
        }

        o->write_characters(number_buffer.data(), static_cast<std::size_t>(len));

        // determine if need to append ".0"
        const bool value_is_int_like =
            std::none_of(number_buffer.begin(), number_buffer.begin() + len + 1,
                         [](char c)
        {
            return c == '.' || c == 'e';
        });

        if (value_is_int_like)
        {
            o->write_characters(".0", 2);
        }
    }

    /*!
    @brief check whether a string is UTF-8 encoded

    The function checks each byte of a string whether it is UTF-8 encoded. The
    result of the check is stored in the @a state parameter. The function must
    be called initially with state 0 (accept). State 1 means the string must
    be rejected, because the current byte is not allowed. If the string is
    completely processed, but the state is non-zero, the string ended
    prematurely; that is, the last byte indicated more bytes should have
    followed.

    @param[in,out] state  the state of the decoding
    @param[in,out] codep  codepoint (valid only if resulting state is UTF8_ACCEPT)
    @param[in] byte       next byte to decode
    @return               new state

    @note The function has been edited: a std::array is used.

    @copyright Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
    @sa http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
    */
    static std::uint8_t decode(std::uint8_t& state, std::uint32_t& codep, const std::uint8_t byte) noexcept
    {
        // entries 0..255 map a byte to its character class; the entries from
        // 256 on are the transition table, 16 entries per state
        static const std::array<std::uint8_t, 400> utf8d =
        {
            {
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1F
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3F
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5F
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7F
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9F
                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // A0..BF
                8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0..DF
                0xA, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // E0..EF
                0xB, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // F0..FF
                0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2
                1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4
                1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6
                1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // s7..s8
            }
        };

        JSON_ASSERT(byte < utf8d.size());
        const std::uint8_t char_class = utf8d[byte];

        if (state != UTF8_ACCEPT)
        {
            // inside a sequence: append the low six bits of the byte
            codep = (byte & 0x3fu) | (codep << 6u);
        }
        else
        {
            // start of a sequence: mask the byte according to its class
            codep = (0xFFu >> char_class) & (byte);
        }

        // look up the transition for (state, char_class)
        const std::size_t row_offset = 256u + static_cast<size_t>(state) * 16u;
        const std::size_t index = row_offset + static_cast<size_t>(char_class);
        JSON_ASSERT(index < 400);
        state = utf8d[index];
        return state;
    }

    /*
     * Overload to make the compiler happy while it is instantiating
     * dump_integer for number_unsigned_t.
     * Must never be called: dump_integer only reaches remove_sign on its
     * negative branch, which is guarded by NumberType being number_integer_t.
     * If it is executed anyway, the assertion fires; otherwise x is returned
     * unchanged.
     */
    number_unsigned_t remove_sign(number_unsigned_t x)
    {
        JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
        return x; // LCOV_EXCL_LINE
    }

    /*
     * Helper function for dump_integer
     *
     * Converts a negative signed integer into its absolute value, returned as
     * an unsigned integer. Negating x directly is not possible for every
     * value because the absolute values of INT_MIN and INT_MAX are usually
     * not the same, so the value is first moved one step towards zero,
     * negated, converted, and then corrected by one. See #1708 for details.
     */
    inline number_unsigned_t remove_sign(number_integer_t x) noexcept
    {
        JSON_ASSERT(x < 0 && x < (std::numeric_limits<number_integer_t>::max)()); // NOLINT(misc-redundant-expression)

        // -(x + 1) is non-negative and always representable in the signed type
        const number_integer_t one_less_than_abs = -(x + 1);
        return static_cast<number_unsigned_t>(one_less_than_abs) + 1;
    }

  private:
    /// the output of the serializer; all dump functions write through it
    output_adapter_t<char> o = nullptr;

    /// a (hopefully) large enough character buffer; dump_integer and
    /// dump_float build the text of a number here before writing it to @a o
    std::array<char, 64> number_buffer{{}};

    /// the locale
    const std::lconv* loc = nullptr;
    /// the locale's thousand separator character ('\0' disables its removal
    /// in the snprintf-based dump_float)
    const char thousands_sep = '\0';
    /// the locale's decimal point character (replaced by '.' in the
    /// snprintf-based dump_float unless it is '\0' or already '.')
    const char decimal_point = '\0';

    /// string buffer
    std::array<char, 512> string_buffer{{}};

    /// the indentation character
    const char indent_char;
    /// the indentation string
    string_t indent_string;

    /// error_handler how to react on decoding errors
    const error_handler_t error_handler;
};
}  // namespace detail
}  // namespace nlohmann

// #include <nlohmann/detail/value_t.hpp>

// #include <nlohmann/json_fwd.hpp>

// #include <nlohmann/ordered_map.hpp>


#include <functional> // less
#include <initializer_list> // initializer_list
#include <iterator> // input_iterator_tag, iterator_traits
#include <memory> // allocator
#include <stdexcept> // for out_of_range
#include <type_traits> // enable_if, is_convertible
#include <utility> // pair
#include <vector> // vector

// #include <nlohmann/detail/macro_scope.hpp>


namespace nlohmann
{

/// ordered_map: a minimal map-like container that preserves insertion order
/// for use within nlohmann::basic_json<ordered_map>
///
/// Entries are stored in a std::vector of key/value pairs. Every key lookup
/// is a linear scan that compares keys with operator==. IgnoredLess is
/// accepted as a template parameter but is not used anywhere in this class.
template <class Key, class T, class IgnoredLess = std::less<Key>,
          class Allocator = std::allocator<std::pair<const Key, T>>>
struct ordered_map : std::vector<std::pair<const Key, T>, Allocator>
{
    using key_type = Key;
    using mapped_type = T;
    using Container = std::vector<std::pair<const Key, T>, Allocator>;
    using typename Container::iterator;
    using typename Container::const_iterator;
    using typename Container::size_type;
    using typename Container::value_type;

    // Explicit constructors instead of `using Container::Container`
    // otherwise older compilers choke on it (GCC <= 5.5, xcode <= 9.4)
    ordered_map(const Allocator& alloc = Allocator()) : Container{alloc} {}
    template <class It>
    ordered_map(It first, It last, const Allocator& alloc = Allocator())
        : Container{first, last, alloc} {}
    // the list holds key/value pairs (value_type), matching what the
    // underlying vector can be constructed from
    ordered_map(std::initializer_list<value_type> init, const Allocator& alloc = Allocator() )
        : Container{init, alloc} {}

    /// Append (key, t) unless an entry with an equal key already exists.
    /// @return iterator to the existing or new entry, and whether an
    ///         insertion took place; @a t is only moved from on insertion
    std::pair<iterator, bool> emplace(const key_type& key, T&& t)
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (it->first == key)
            {
                return {it, false};
            }
        }
        // t is a named rvalue reference and therefore an lvalue here; forward
        // it so the mapped value is moved into the container, not copied
        Container::emplace_back(key, std::forward<T>(t));
        // std::prev also works when the iterator type is a raw pointer
        return {std::prev(this->end()), true};
    }

    /// Access the value stored for @a key, inserting a default-constructed
    /// value first if the key is not present.
    T& operator[](const Key& key)
    {
        return emplace(key, T{}).first->second;
    }

    /// Const access; behaves like at() because nothing can be inserted.
    const T& operator[](const Key& key) const
    {
        return at(key);
    }

    /// Access the value stored for @a key.
    /// Raises std::out_of_range via JSON_THROW if the key is not present.
    T& at(const Key& key)
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (it->first == key)
            {
                return it->second;
            }
        }

        JSON_THROW(std::out_of_range("key not found"));
    }

    /// Const overload of at().
    const T& at(const Key& key) const
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (it->first == key)
            {
                return it->second;
            }
        }

        JSON_THROW(std::out_of_range("key not found"));
    }

    /// Remove the entry with the given key, keeping the order of the
    /// remaining entries.
    /// @return number of removed entries (0 or 1)
    size_type erase(const Key& key)
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (it->first == key)
            {
                // Since we cannot move const Keys, re-construct them in place
                for (auto next = it; ++next != this->end(); ++it)
                {
                    it->~value_type(); // Destroy but keep allocation
                    new (&*it) value_type{std::move(*next)};
                }
                Container::pop_back();
                return 1;
            }
        }
        return 0;
    }

    /// Remove the entry at @a pos, keeping the order of the remaining
    /// entries.
    /// @return @a pos, which now refers to the entry that followed the
    ///         removed one
    iterator erase(iterator pos)
    {
        auto it = pos;

        // Since we cannot move const Keys, re-construct them in place
        for (auto next = it; ++next != this->end(); ++it)
        {
            it->~value_type(); // Destroy but keep allocation
            new (&*it) value_type{std::move(*next)};
        }
        Container::pop_back();
        return pos;
    }

    /// @return 1 if an entry with the given key exists, 0 otherwise
    size_type count(const Key& key) const
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (it->first == key)
            {
                return 1;
            }
        }
        return 0;
    }

    /// @return iterator to the entry with the given key, or end() if there
    ///         is none
    iterator find(const Key& key)
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (it->first == key)
            {
                return it;
            }
        }
        return Container::end();
    }

    /// Const overload of find().
    const_iterator find(const Key& key) const
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (it->first == key)
            {
                return it;
            }
        }
        return Container::end();
    }

    /// Insert a key/value pair, moving the mapped value; see emplace().
    std::pair<iterator, bool> insert( value_type&& value )
    {
        return emplace(value.first, std::move(value.second));
    }

    /// Insert a copy of a key/value pair unless the key already exists.
    std::pair<iterator, bool> insert( const value_type& value )
    {
        for (auto it = this->begin(); it != this->end(); ++it)
        {
            if (it->first == value.first)
            {
                return {it, false};
            }
        }
        Container::push_back(value);
        return {std::prev(this->end()), true};
    }

    template<typename InputIt>
    using require_input_iter = typename std::enable_if<std::is_convertible<typename std::iterator_traits<InputIt>::iterator_category,
            std::input_iterator_tag>::value>::type;

    /// Insert every element of the range [first, last) in order; elements
    /// whose key is already present are skipped.
    template<typename InputIt, typename = require_input_iter<InputIt>>
    void insert(InputIt first, InputIt last)
    {
        for (auto it = first; it != last; ++it)
        {
            insert(*it);
        }
    }
};

}  // namespace nlohmann


#if defined(JSON_HAS_CPP_17)
    #include <string_view>
#endif

/*!
@brief namespace for Niels Lohmann
@see https://github.com/nlohmann
@since version 1.0.0
*/
namespace nlohmann
{

/*!
@brief a class to store JSON values

@tparam ObjectType type for JSON objects (`std::map` by default; will be used
in @ref object_t)
@tparam ArrayType type for JSON arrays (`std::vector` by default; will be used
in @ref array_t)
@tparam StringType type for JSON strings and object keys (`std::string` by
default; will be used in @ref string_t)
@tparam BooleanType type for JSON booleans (`bool` by default; will be used
in @ref boolean_t)
@tparam NumberIntegerType type for JSON integer numbers (`int64_t` by
default; will be used in @ref number_integer_t)
@tparam NumberUnsignedType type for JSON unsigned integer numbers (@c
`uint64_t` by default; will be used in @ref number_unsigned_t)
@tparam NumberFloatType type for JSON floating-point numbers (`double` by
default; will be used in @ref number_float_t)
@tparam BinaryType type for packed binary data for compatibility with binary
serialization formats (`std::vector<std::uint8_t>` by default; will be used in
@ref binary_t)
@tparam AllocatorType type of the allocator to use (`std::allocator` by
default)
@tparam JSONSerializer the serializer to resolve internal calls to `to_json()`
and `from_json()` (@ref adl_serializer by default)

@requirement The class satisfies the following concept requirements:
- Basic
 - [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible):
   JSON values can be default constructed. The result will be a JSON null
   value.
 - [MoveConstructible](https://en.cppreference.com/w/cpp/named_req/MoveConstructible):
   A JSON value can be constructed from an rvalue argument.
 - [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible):
   A JSON value can be copy-constructed from an lvalue expression.
 - [MoveAssignable](https://en.cppreference.com/w/cpp/named_req/MoveAssignable):
   A JSON value can be assigned from an rvalue argument.
 - [CopyAssignable](https://en.cppreference.com/w/cpp/named_req/CopyAssignable):
   A JSON value can be copy-assigned from an lvalue expression.
 - [Destructible](https://en.cppreference.com/w/cpp/named_req/Destructible):
   JSON values can be destructed.
- Layout
 - [StandardLayoutType](https://en.cppreference.com/w/cpp/named_req/StandardLayoutType):
   JSON values have
   [standard layout](https://en.cppreference.com/w/cpp/language/data_members#Standard_layout):
   All non-static data members are private and standard layout types, the
   class has no virtual functions or (virtual) base classes.
- Library-wide
 - [EqualityComparable](https://en.cppreference.com/w/cpp/named_req/EqualityComparable):
   JSON values can be compared with `==`, see @ref
   operator==(const_reference,const_reference).
 - [LessThanComparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable):
   JSON values can be compared with `<`, see @ref
   operator<(const_reference,const_reference).
 - [Swappable](https://en.cppreference.com/w/cpp/named_req/Swappable):
   Any JSON lvalue or rvalue can be swapped with any lvalue or rvalue of
   other compatible types, using unqualified function call @ref swap().
 - [NullablePointer](https://en.cppreference.com/w/cpp/named_req/NullablePointer):
   JSON values can be compared against `std::nullptr_t` objects which are used
   to model the `null` value.
- Container
 - [Container](https://en.cppreference.com/w/cpp/named_req/Container):
   JSON values can be used like STL containers and provide iterator access.
 - [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer);
   JSON values can be used like STL containers and provide reverse iterator
   access.

@invariant The member variables @a m_value and @a m_type have the following
relationship:
- If `m_type == value_t::object`, then `m_value.object != nullptr`.
- If `m_type == value_t::array`, then `m_value.array != nullptr`.
- If `m_type == value_t::string`, then `m_value.string != nullptr`.
The invariants are checked by member function assert_invariant().

@internal
@note ObjectType trick from https://stackoverflow.com/a/9860911
@endinternal

@see [RFC 7159: The JavaScript Object Notation (JSON) Data Interchange
Format](http://rfc7159.net/rfc7159)

@since version 1.0.0

@nosubgrouping
*/
NLOHMANN_BASIC_JSON_TPL_DECLARATION
class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-special-member-functions)
{
  private:
    template<detail::value_t> friend struct detail::external_constructor;
    friend ::nlohmann::json_pointer<basic_json>;

    template<typename BasicJsonType, typename InputType>
    friend class ::nlohmann::detail::parser;
    friend ::nlohmann::detail::serializer<basic_json>;
    template<typename BasicJsonType>
    friend class ::nlohmann::detail::iter_impl;
    template<typename BasicJsonType, typename CharType>
    friend class ::nlohmann::detail::binary_writer;
    template<typename BasicJsonType, typename InputType, typename SAX>
    friend class ::nlohmann::detail::binary_reader;
    template<typename BasicJsonType>
    friend class ::nlohmann::detail::json_sax_dom_parser;
    template<typename BasicJsonType>
    friend class ::nlohmann::detail::json_sax_dom_callback_parser;
    friend class ::nlohmann::detail::exception;

    /// workaround type for MSVC
    using basic_json_t = NLOHMANN_BASIC_JSON_TPL;

  JSON_PRIVATE_UNLESS_TESTED:
    // convenience aliases for types residing in namespace detail;
    using lexer = ::nlohmann::detail::lexer_base<basic_json>;

    /// factory for a detail::parser bound to this basic_json type; the
    /// adapter and callback are moved into the parser, the two flags are
    /// passed through unchanged
    template<typename InputAdapterType>
    static ::nlohmann::detail::parser<basic_json, InputAdapterType> parser(
        InputAdapterType adapter,
        detail::parser_callback_t<basic_json>cb = nullptr,
        const bool allow_exceptions = true,
        const bool ignore_comments = false
                                 )
    {
        using parser_type = ::nlohmann::detail::parser<basic_json, InputAdapterType>;
        return parser_type(std::move(adapter), std::move(cb), allow_exceptions, ignore_comments);
    }

  private:
    using primitive_iterator_t = ::nlohmann::detail::primitive_iterator_t;
    template<typename BasicJsonType>
    using internal_iterator = ::nlohmann::detail::internal_iterator<BasicJsonType>;
    template<typename BasicJsonType>
    using iter_impl = ::nlohmann::detail::iter_impl<BasicJsonType>;
    template<typename Iterator>
    using iteration_proxy = ::nlohmann::detail::iteration_proxy<Iterator>;
    template<typename Base> using json_reverse_iterator = ::nlohmann::detail::json_reverse_iterator<Base>;

    template<typename CharType>
    using output_adapter_t = ::nlohmann::detail::output_adapter_t<CharType>;

    template<typename InputType>
    using binary_reader = ::nlohmann::detail::binary_reader<basic_json, InputType>;
    template<typename CharType> using binary_writer = ::nlohmann::detail::binary_writer<basic_json, CharType>;

  JSON_PRIVATE_UNLESS_TESTED:
    using serializer = ::nlohmann::detail::serializer<basic_json>;

  public:
    using value_t = detail::value_t;
    /// JSON Pointer, see @ref nlohmann::json_pointer
    using json_pointer = ::nlohmann::json_pointer<basic_json>;
    template<typename T, typename SFINAE>
    using json_serializer = JSONSerializer<T, SFINAE>;
    /// how to treat decoding errors
    using error_handler_t = detail::error_handler_t;
    /// how to treat CBOR tags
    using cbor_tag_handler_t = detail::cbor_tag_handler_t;
    /// helper type for initializer lists of basic_json values
    using initializer_list_t = std::initializer_list<detail::json_ref<basic_json>>;

    using input_format_t = detail::input_format_t;
    /// SAX interface type, see @ref nlohmann::json_sax
    using json_sax_t = json_sax<basic_json>;

    ////////////////
    // exceptions //
    ////////////////

    /// @name exceptions
    /// Classes to implement user-defined exceptions.
    /// @{

    /// @copydoc detail::exception
    using exception = detail::exception;
    /// @copydoc detail::parse_error
    using parse_error = detail::parse_error;
    /// @copydoc detail::invalid_iterator
    using invalid_iterator = detail::invalid_iterator;
    /// @copydoc detail::type_error
    using type_error = detail::type_error;
    /// @copydoc detail::out_of_range
    using out_of_range = detail::out_of_range;
    /// @copydoc detail::other_error
    using other_error = detail::other_error;

    /// @}


    /////////////////////
    // container types //
    /////////////////////

    /// @name container types
    /// The canonical container types to use @ref basic_json like any other STL
    /// container.
    /// @{

    /// the type of elements in a basic_json container
    using value_type = basic_json;

    /// the type of an element reference
    using reference = value_type&;
    /// the type of an element const reference
    using const_reference = const value_type&;

    /// a type to represent differences between iterators
    using difference_type = std::ptrdiff_t;
    /// a type to represent container sizes
    using size_type = std::size_t;

    /// the allocator type
    using allocator_type = AllocatorType<basic_json>;

    /// the type of an element pointer
    using pointer = typename std::allocator_traits<allocator_type>::pointer;
    /// the type of an element const pointer
    using const_pointer = typename std::allocator_traits<allocator_type>::const_pointer;

    /// an iterator for a basic_json container
    using iterator = iter_impl<basic_json>;
    /// a const iterator for a basic_json container
    using const_iterator = iter_impl<const basic_json>;
    /// a reverse iterator for a basic_json container
    using reverse_iterator = json_reverse_iterator<typename basic_json::iterator>;
    /// a const reverse iterator for a basic_json container
    using const_reverse_iterator = json_reverse_iterator<typename basic_json::const_iterator>;

    /// @}


    /*!
    @brief returns the allocator associated with the container

    @return a newly constructed object of type @ref allocator_type
    */
    static allocator_type get_allocator()
    {
        allocator_type alloc = allocator_type();
        return alloc;
    }

    /*!
    @brief returns version information on the library

    This function returns a JSON object with information about the library,
    including the version number and information on the platform and compiler.

    @return JSON object holding version information
    key         | description
    ----------- | ---------------
    `compiler`  | Information on the used compiler. It is an object with the following keys: `c++` (the used C++ standard), `family` (the compiler family; possible values are `clang`, `icc`, `gcc`, `hp`, `ilecpp`, `msvc`, `pgcpp`, `sunpro`, and `unknown`), and `version` (the compiler version).
    `copyright` | The copyright line for the library as string.
    `name`      | The name of the library as string.
    `platform`  | The used platform as string. Possible values are `win32`, `linux`, `apple`, `unix`, and `unknown`.
    `url`       | The URL of the project as string.
    `version`   | The version of the library. It is an object with the following keys: `major`, `minor`, and `patch` as defined by [Semantic Versioning](http://semver.org), and `string` (the version string).

    @liveexample{The following code shows an example output of the `meta()`
    function.,meta}

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes to any JSON value.

    @complexity Constant.

    @since 2.1.0
    */
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json meta()
    {
        basic_json result;

        result["copyright"] = "(C) 2013-2021 Niels Lohmann";
        result["name"] = "JSON for Modern C++";
        result["url"] = "https://github.com/nlohmann/json";
        result["version"]["string"] =
            std::to_string(NLOHMANN_JSON_VERSION_MAJOR) + "." +
            std::to_string(NLOHMANN_JSON_VERSION_MINOR) + "." +
            std::to_string(NLOHMANN_JSON_VERSION_PATCH);
        result["version"]["major"] = NLOHMANN_JSON_VERSION_MAJOR;
        result["version"]["minor"] = NLOHMANN_JSON_VERSION_MINOR;
        result["version"]["patch"] = NLOHMANN_JSON_VERSION_PATCH;

        // platform: the first matching predefined macro wins
#ifdef _WIN32
        result["platform"] = "win32";
#elif defined __linux__
        result["platform"] = "linux";
#elif defined __APPLE__
        result["platform"] = "apple";
#elif defined __unix__
        result["platform"] = "unix";
#else
        result["platform"] = "unknown";
#endif

        // compiler: every branch must produce an object, because the "c++"
        // key is added to result["compiler"] below
#if defined(__ICC) || defined(__INTEL_COMPILER)
        result["compiler"] = {{"family", "icc"}, {"version", __INTEL_COMPILER}};
#elif defined(__clang__)
        result["compiler"] = {{"family", "clang"}, {"version", __clang_version__}};
#elif defined(__GNUC__) || defined(__GNUG__)
        result["compiler"] = {{"family", "gcc"}, {"version", std::to_string(__GNUC__) + "." + std::to_string(__GNUC_MINOR__) + "." + std::to_string(__GNUC_PATCHLEVEL__)}};
#elif defined(__HP_cc) || defined(__HP_aCC)
        result["compiler"] = {{"family", "hp"}, {"version", "unknown"}};
#elif defined(__IBMCPP__)
        result["compiler"] = {{"family", "ilecpp"}, {"version", __IBMCPP__}};
#elif defined(_MSC_VER)
        result["compiler"] = {{"family", "msvc"}, {"version", _MSC_VER}};
#elif defined(__PGI)
        result["compiler"] = {{"family", "pgcpp"}, {"version", __PGI}};
#elif defined(__SUNPRO_CC)
        result["compiler"] = {{"family", "sunpro"}, {"version", __SUNPRO_CC}};
#else
        result["compiler"] = {{"family", "unknown"}, {"version", "unknown"}};
#endif

#ifdef __cplusplus
        result["compiler"]["c++"] = std::to_string(__cplusplus);
#else
        result["compiler"]["c++"] = "unknown";
#endif
        return result;
    }


    ///////////////////////////
    // JSON value data types //
    ///////////////////////////

    /// @name JSON value data types
    /// The data types to store a JSON value. These types are derived from
    /// the template arguments passed to class @ref basic_json.
    /// @{

#if defined(JSON_HAS_CPP_14)
    // Use a transparent comparator if possible; combined with perfect
    // forwarding on find() and count() calls, this prevents unnecessary
    // string construction.
    using object_comparator_t = std::less<>;
#else
    using object_comparator_t = std::less<StringType>;
#endif

    /*!
    @brief a type for an object

    [RFC 7159](http://rfc7159.net/rfc7159) describes JSON objects as follows:
    > An object is an unordered collection of zero or more name/value pairs,
    > where a name is a string and a value is a string, number, boolean, null,
    > object, or array.

    To store objects in C++, a type is defined by the template parameters
    described below.

    @tparam ObjectType  the container to store objects (e.g., `std::map` or
    `std::unordered_map`)
    @tparam StringType the type of the keys or names (e.g., `std::string`).
    The comparison function `std::less<StringType>` (or the transparent
    `std::less<>` when C++14 is available) is used to order elements inside
    the container.
    @tparam AllocatorType the allocator to use for objects (e.g.,
    `std::allocator`)

    #### Default type

    With the default values for @a ObjectType (`std::map`), @a StringType
    (`std::string`), and @a AllocatorType (`std::allocator`), the default
    value for @a object_t is:

    @code {.cpp}
    std::map<
      std::string, // key_type
      basic_json, // value_type
      std::less<std::string>, // key_compare
      std::allocator<std::pair<const std::string, basic_json>> // allocator_type
    >
    @endcode

    #### Behavior

    The choice of @a object_t influences the behavior of the JSON class. With
    the default type, objects have the following behavior:

    - When all names are unique, objects will be interoperable in the sense
      that all software implementations receiving that object will agree on
      the name-value mappings.
    - When the names within an object are not unique, it is unspecified which
      one of the values for a given key will be chosen. For instance,
      `{"key": 2, "key": 1}` could be equal to either `{"key": 1}` or
      `{"key": 2}`.
    - Internally, name/value pairs are stored in lexicographical order of the
      names. Objects will also be serialized (see @ref dump) in this order.
      For instance, `{"b": 1, "a": 2}` and `{"a": 2, "b": 1}` will be stored
      and serialized as `{"a": 2, "b": 1}`.
    - When comparing objects, the order of the name/value pairs is irrelevant.
      This makes objects interoperable in the sense that they will not be
      affected by these differences. For instance, `{"b": 1, "a": 2}` and
      `{"a": 2, "b": 1}` will be treated as equal.

    #### Limits

    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
    > An implementation may set limits on the maximum depth of nesting.

    In this class, the object's limit of nesting is not explicitly constrained.
    However, a maximum depth of nesting may be introduced by the compiler or
    runtime environment. A theoretical limit can be queried by calling the
    @ref max_size function of a JSON object.

    #### Storage

    Objects are stored as pointers in a @ref basic_json type. That is, for any
    access to object values, a pointer of type `object_t*` must be
    dereferenced.

    @sa see @ref array_t -- type for an array value

    @since version 1.0.0

    @note The order name/value pairs are added to the object is *not*
    preserved by the library. Therefore, iterating an object may return
    name/value pairs in a different order than they were originally stored. In
    fact, keys will be traversed in alphabetical order as `std::map` with
    `std::less` is used by default. Please note this behavior conforms to [RFC
    7159](http://rfc7159.net/rfc7159), because any order implements the
    specified "unordered" nature of JSON objects.
    */
    using object_t = ObjectType<StringType,
          basic_json,
          object_comparator_t,
          AllocatorType<std::pair<const StringType,
          basic_json>>>;

    /*!
    @brief a type for an array

    [RFC 7159](http://rfc7159.net/rfc7159) describes JSON arrays as follows:
    > An array is an ordered sequence of zero or more values.

    To store arrays in C++, a type is defined by the template parameters
    explained below.

    @tparam ArrayType  container type to store arrays (e.g., `std::vector` or
    `std::list`)
    @tparam AllocatorType allocator to use for arrays (e.g., `std::allocator`)

    #### Default type

    With the default values for @a ArrayType (`std::vector`) and @a
    AllocatorType (`std::allocator`), the default value for @a array_t is:

    @code {.cpp}
    std::vector<
      basic_json, // value_type
      std::allocator<basic_json> // allocator_type
    >
    @endcode

    #### Limits

    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
    > An implementation may set limits on the maximum depth of nesting.

    In this class, the array's limit of nesting is not explicitly constrained.
    However, a maximum depth of nesting may be introduced by the compiler or
    runtime environment. A theoretical limit can be queried by calling the
    @ref max_size function of a JSON array.

    #### Storage

    Arrays are stored as pointers in a @ref basic_json type. That is, for any
    access to array values, a pointer of type `array_t*` must be dereferenced.

    @sa see @ref object_t -- type for an object value

    @since version 1.0.0
    */
    using array_t = ArrayType<basic_json, AllocatorType<basic_json>>;

    /*!
    @brief a type for a string

    [RFC 7159](http://rfc7159.net/rfc7159) describes JSON strings as follows:
    > A string is a sequence of zero or more Unicode characters.

    To store strings in C++, a type is defined by the template parameter
    described below. Unicode values are split by the JSON class into
    byte-sized characters during deserialization.

    @tparam StringType  the container to store strings (e.g., `std::string`).
    Note this container is used for keys/names in objects, see @ref object_t.

    #### Default type

    With the default values for @a StringType (`std::string`), the default
    value for @a string_t is:

    @code {.cpp}
    std::string
    @endcode

    #### Encoding

    Strings are stored in UTF-8 encoding. Therefore, functions like
    `std::string::size()` or `std::string::length()` return the number of
    bytes in the string rather than the number of characters or glyphs.

    #### String comparison

    [RFC 7159](http://rfc7159.net/rfc7159) states:
    > Software implementations are typically required to test names of object
    > members for equality. Implementations that transform the textual
    > representation into sequences of Unicode code units and then perform the
    > comparison numerically, code unit by code unit, are interoperable in the
    > sense that implementations will agree in all cases on equality or
    > inequality of two strings. For example, implementations that compare
    > strings with escaped characters unconverted may incorrectly find that
    > `"a\\b"` and `"a\u005Cb"` are not equal.

    This implementation is interoperable as it does compare strings code unit
    by code unit.

    #### Storage

    String values are stored as pointers in a @ref basic_json type. That is,
    for any access to string values, a pointer of type `string_t*` must be
    dereferenced.

    @since version 1.0.0
    */
    using string_t = StringType;

    /*!
    @brief a type for a boolean

    [RFC 7159](http://rfc7159.net/rfc7159) implicitly describes a boolean as a
    type which differentiates the two literals `true` and `false`.

    To store booleans in C++, a type is defined by the template parameter @a
    BooleanType which chooses the type to use.

    #### Default type

    With the default values for @a BooleanType (`bool`), the default value for
    @a boolean_t is:

    @code {.cpp}
    bool
    @endcode

    #### Storage

    Boolean values are stored directly inside a @ref basic_json type.

    @since version 1.0.0
    */
    using boolean_t = BooleanType;

    /*!
    @brief a type for a number (integer)

    [RFC 7159](http://rfc7159.net/rfc7159) describes numbers as follows:
    > The representation of numbers is similar to that used in most
    > programming languages. A number is represented in base 10 using decimal
    > digits. It contains an integer component that may be prefixed with an
    > optional minus sign, which may be followed by a fraction part and/or an
    > exponent part. Leading zeros are not allowed. (...) Numeric values that
    > cannot be represented in the grammar below (such as Infinity and NaN)
    > are not permitted.

    This description includes both integer and floating-point numbers.
    However, C++ allows more precise storage if it is known whether the number
    is a signed integer, an unsigned integer or a floating-point number.
    Therefore, three different types, @ref number_integer_t, @ref
    number_unsigned_t and @ref number_float_t are used.

    To store integer numbers in C++, a type is defined by the template
    parameter @a NumberIntegerType which chooses the type to use.

    #### Default type

    With the default values for @a NumberIntegerType (`int64_t`), the default
    value for @a number_integer_t is:

    @code {.cpp}
    int64_t
    @endcode

    #### Default behavior

    - The restriction on leading zeros is not enforced in C++. Instead,
      leading zeros in integer literals lead to an interpretation as octal
      number. Internally, the value will be stored as decimal number. For
      instance, the C++ integer literal `010` will be serialized to `8`.
      During deserialization, leading zeros yield an error.
    - Not-a-number (NaN) values will be serialized to `null`.

    #### Limits

    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
    > An implementation may set limits on the range and precision of numbers.

    When the default type is used, the maximal integer number that can be
    stored is `9223372036854775807` (INT64_MAX) and the minimal integer number
    that can be stored is `-9223372036854775808` (INT64_MIN). Integer numbers
    that are out of range will yield over/underflow when used in a
    constructor. During deserialization, too large or small integer numbers
    will automatically be stored as @ref number_unsigned_t or @ref
    number_float_t.

    [RFC 7159](http://rfc7159.net/rfc7159) further states:
    > Note that when such software is used, numbers that are integers and are
    > in the range \f$[-2^{53}+1, 2^{53}-1]\f$ are interoperable in the sense
    > that implementations will agree exactly on their numeric values.

    As this range is a subrange of the exactly supported range [INT64_MIN,
    INT64_MAX], this class's integer type is interoperable.

    #### Storage

    Integer number values are stored directly inside a @ref basic_json type.

    @sa see @ref number_float_t -- type for number values (floating-point)

    @sa see @ref number_unsigned_t -- type for number values (unsigned integer)

    @since version 1.0.0
    */
    using number_integer_t = NumberIntegerType;

    /*!
    @brief a type for a number (unsigned)

    [RFC 7159](http://rfc7159.net/rfc7159) describes numbers as follows:
    > The representation of numbers is similar to that used in most
    > programming languages. A number is represented in base 10 using decimal
    > digits. It contains an integer component that may be prefixed with an
    > optional minus sign, which may be followed by a fraction part and/or an
    > exponent part. Leading zeros are not allowed. (...) Numeric values that
    > cannot be represented in the grammar below (such as Infinity and NaN)
    > are not permitted.

    This description includes both integer and floating-point numbers.
    However, C++ allows more precise storage if it is known whether the number
    is a signed integer, an unsigned integer or a floating-point number.
    Therefore, three different types, @ref number_integer_t, @ref
    number_unsigned_t and @ref number_float_t are used.

    To store unsigned integer numbers in C++, a type is defined by the
    template parameter @a NumberUnsignedType which chooses the type to use.

    #### Default type

    With the default values for @a NumberUnsignedType (`uint64_t`), the
    default value for @a number_unsigned_t is:

    @code {.cpp}
    uint64_t
    @endcode

    #### Default behavior

    - The restriction on leading zeros is not enforced in C++. Instead,
      leading zeros in integer literals lead to an interpretation as octal
      number. Internally, the value will be stored as decimal number. For
      instance, the C++ integer literal `010` will be serialized to `8`.
      During deserialization, leading zeros yield an error.
    - Not-a-number (NaN) values will be serialized to `null`.

    #### Limits

    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
    > An implementation may set limits on the range and precision of numbers.

    When the default type is used, the maximal integer number that can be
    stored is `18446744073709551615` (UINT64_MAX) and the minimal integer
    number that can be stored is `0`. Integer numbers that are out of range
    will yield over/underflow when used in a constructor. During
    deserialization, too large or small integer numbers will automatically
    be stored as @ref number_integer_t or @ref number_float_t.

    [RFC 7159](http://rfc7159.net/rfc7159) further states:
    > Note that when such software is used, numbers that are integers and are
    > in the range \f$[-2^{53}+1, 2^{53}-1]\f$ are interoperable in the sense
    > that implementations will agree exactly on their numeric values.

    As this range is a subrange (when considered in conjunction with the
    number_integer_t type) of the exactly supported range [0, UINT64_MAX],
    this class's integer type is interoperable.

    #### Storage

    Integer number values are stored directly inside a @ref basic_json type.

    @sa see @ref number_float_t -- type for number values (floating-point)
    @sa see @ref number_integer_t -- type for number values (integer)

    @since version 2.0.0
    */
    using number_unsigned_t = NumberUnsignedType;

    /*!
    @brief a type for a number (floating-point)

    [RFC 7159](http://rfc7159.net/rfc7159) describes numbers as follows:
    > The representation of numbers is similar to that used in most
    > programming languages. A number is represented in base 10 using decimal
    > digits. It contains an integer component that may be prefixed with an
    > optional minus sign, which may be followed by a fraction part and/or an
    > exponent part. Leading zeros are not allowed. (...) Numeric values that
    > cannot be represented in the grammar below (such as Infinity and NaN)
    > are not permitted.

    This description includes both integer and floating-point numbers.
    However, C++ allows more precise storage if it is known whether the number
    is a signed integer, an unsigned integer or a floating-point number.
    Therefore, three different types, @ref number_integer_t, @ref
    number_unsigned_t and @ref number_float_t are used.

    To store floating-point numbers in C++, a type is defined by the template
    parameter @a NumberFloatType which chooses the type to use.

    #### Default type

    With the default values for @a NumberFloatType (`double`), the default
    value for @a number_float_t is:

    @code {.cpp}
    double
    @endcode

    #### Default behavior

    - The restriction on leading zeros is not enforced in C++. Instead,
      leading zeros in floating-point literals will be ignored. Internally,
      the value will be stored as decimal number. For instance, the C++
      floating-point literal `01.2` will be serialized to `1.2`. During
      deserialization, leading zeros yield an error.
    - Not-a-number (NaN) values will be serialized to `null`.

    #### Limits

    [RFC 7159](http://rfc7159.net/rfc7159) states:
    > This specification allows implementations to set limits on the range and
    > precision of numbers accepted. Since software that implements IEEE
    > 754-2008 binary64 (double precision) numbers is generally available and
    > widely used, good interoperability can be achieved by implementations
    > that expect no more precision or range than these provide, in the sense
    > that implementations will approximate JSON numbers within the expected
    > precision.

    This implementation does exactly follow this approach, as it uses double
    precision floating-point numbers. Note values smaller than
    `-1.79769313486232e+308` and values greater than `1.79769313486232e+308`
    will be stored as NaN internally and be serialized to `null`.

    #### Storage

    Floating-point number values are stored directly inside a @ref basic_json
    type.

    @sa see @ref number_integer_t -- type for number values (integer)

    @sa see @ref number_unsigned_t -- type for number values (unsigned integer)

    @since version 1.0.0
    */
    using number_float_t = NumberFloatType;

    /*!
    @brief a type for a packed binary type

    This type is a type designed to carry binary data that appears in various
    serialized formats, such as CBOR's Major Type 2, MessagePack's bin, and
    BSON's generic binary subtype. This type is NOT a part of standard JSON and
    exists solely for compatibility with these binary types. As such, it is
    simply defined as an ordered sequence of zero or more byte values.

    Additionally, as an implementation detail, the subtype of the binary data is
    carried around as a `std::uint8_t`, which is compatible with both of the
    binary data formats that use binary subtyping, (though the specific
    numbering is incompatible with each other, and it is up to the user to
    translate between them).

    [CBOR's RFC 7049](https://tools.ietf.org/html/rfc7049) describes this type
    as:
    > Major type 2: a byte string. The string's length in bytes is represented
    > following the rules for positive integers (major type 0).

    [MessagePack's documentation on the bin type
    family](https://github.com/msgpack/msgpack/blob/master/spec.md#bin-format-family)
    describes this type as:
    > Bin format family stores an byte array in 2, 3, or 5 bytes of extra bytes
    > in addition to the size of the byte array.

    [BSON's specifications](http://bsonspec.org/spec.html) describe several
    binary types; however, this type is intended to represent the generic binary
    type which has the description:
    > Generic binary subtype - This is the most commonly used binary subtype and
    > should be the 'default' for drivers and tools.

    None of these impose any limitations on the internal representation other
    than the basic unit of storage be some type of array whose parts are
    decomposable into bytes.

    The default representation of this binary format is a
    `std::vector<std::uint8_t>`, which is a very common way to represent a byte
    array in modern C++.

    #### Default type

    The default values for @a BinaryType is `std::vector<std::uint8_t>`

    #### Storage

    Binary Arrays are stored as pointers in a @ref basic_json type. That is,
    for any access to array values, a pointer of the type `binary_t*` must be
    dereferenced.

    #### Notes on subtypes

    - CBOR
       - Binary values are represented as byte strings. No subtypes are
         supported and will be ignored when CBOR is written.
    - MessagePack
       - If a subtype is given and the binary array contains exactly 1, 2, 4, 8,
         or 16 elements, the fixext family (fixext1, fixext2, fixext4, fixext8,
         fixext16) is used. For other sizes, the ext family (ext8, ext16, ext32)
         is used. The subtype is then added as signed 8-bit integer.
       - If no subtype is given, the bin family (bin8, bin16, bin32) is used.
    - BSON
       - If a subtype is given, it is used and added as unsigned 8-bit integer.
       - If no subtype is given, the generic binary subtype 0x00 is used.

    @sa see @ref binary -- create a binary array

    @since version 3.8.0
    */
    using binary_t = nlohmann::byte_container_with_subtype<BinaryType>;
    /// @}

  private:

    /// @brief allocate and construct a single @a T with the configured allocator
    ///
    /// The raw storage is held by a scoped guard while the constructor runs:
    /// if construction throws, the guard hands the storage back to the
    /// allocator; on success, ownership is released to the caller.
    ///
    /// @tparam T     type of the object to create
    /// @tparam Args  types of the constructor arguments
    /// @param[in] args  arguments forwarded to the constructor of @a T
    /// @return pointer to the newly constructed object (owned by the caller)
    template<typename T, typename... Args>
    JSON_HEDLEY_RETURNS_NON_NULL
    static T* create(Args&& ... args)
    {
        using AllocatorTraits = std::allocator_traits<AllocatorType<T>>;
        AllocatorType<T> allocator;

        // only gives the memory back; nothing has been constructed yet if this runs
        auto release_storage = [&allocator](T * storage)
        {
            AllocatorTraits::deallocate(allocator, storage, 1);
        };

        std::unique_ptr<T, decltype(release_storage)> guard(
            AllocatorTraits::allocate(allocator, 1),
            release_storage);

        AllocatorTraits::construct(allocator, guard.get(), std::forward<Args>(args)...);
        JSON_ASSERT(guard != nullptr);

        // construction succeeded: disarm the guard and pass ownership on
        return guard.release();
    }

    ////////////////////////
    // JSON value storage //
    ////////////////////////

  JSON_PRIVATE_UNLESS_TESTED:
    /*!
    @brief a JSON value

    The actual storage for a JSON value of the @ref basic_json class. This
    union combines the different storage types for the JSON value types
    defined in @ref value_t.

    JSON type | value_t type    | used type
    --------- | --------------- | ------------------------
    object    | object          | pointer to @ref object_t
    array     | array           | pointer to @ref array_t
    string    | string          | pointer to @ref string_t
    boolean   | boolean         | @ref boolean_t
    number    | number_integer  | @ref number_integer_t
    number    | number_unsigned | @ref number_unsigned_t
    number    | number_float    | @ref number_float_t
    binary    | binary          | pointer to @ref binary_t
    null      | null            | *no value is stored*

    @note Variable-length types (objects, arrays, strings, and binary values)
    are stored as pointers. The size of the union should not exceed 64 bits if
    the default value types are used.

    @since version 1.0.0
    */
    union json_value
    {
        // Exactly one of the following members is meaningful at a time. The
        // union itself does not record which one; functions that need to know
        // (see destroy) receive the value_t tag from the caller.

        /// object (stored with pointer to save storage)
        object_t* object;
        /// array (stored with pointer to save storage)
        array_t* array;
        /// string (stored with pointer to save storage)
        string_t* string;
        /// binary (stored with pointer to save storage)
        binary_t* binary;
        /// boolean
        boolean_t boolean;
        /// number (integer)
        number_integer_t number_integer;
        /// number (unsigned integer)
        number_unsigned_t number_unsigned;
        /// number (floating-point)
        number_float_t number_float;

        /// default constructor (for null values)
        json_value() = default;
        /// constructor for booleans
        json_value(boolean_t v) noexcept : boolean(v) {}
        /// constructor for numbers (integer)
        json_value(number_integer_t v) noexcept : number_integer(v) {}
        /// constructor for numbers (unsigned)
        json_value(number_unsigned_t v) noexcept : number_unsigned(v) {}
        /// constructor for numbers (floating-point)
        json_value(number_float_t v) noexcept : number_float(v) {}
        /// constructor for empty values of a given type
        ///
        /// Pointer-backed types (object, array, string, binary) get a newly
        /// created empty instance; scalar types are set to false / 0 / 0.0;
        /// null stores a null object pointer.
        json_value(value_t t)
        {
            switch (t)
            {
                case value_t::object:
                {
                    object = create<object_t>();
                    break;
                }

                case value_t::array:
                {
                    array = create<array_t>();
                    break;
                }

                case value_t::string:
                {
                    // constructed from an empty string literal
                    string = create<string_t>("");
                    break;
                }

                case value_t::binary:
                {
                    binary = create<binary_t>();
                    break;
                }

                case value_t::boolean:
                {
                    boolean = boolean_t(false);
                    break;
                }

                case value_t::number_integer:
                {
                    number_integer = number_integer_t(0);
                    break;
                }

                case value_t::number_unsigned:
                {
                    number_unsigned = number_unsigned_t(0);
                    break;
                }

                case value_t::number_float:
                {
                    number_float = number_float_t(0.0);
                    break;
                }

                case value_t::null:
                {
                    object = nullptr;  // silence warning, see #821
                    break;
                }

                default:
                {
                    // reached only for tags without a dedicated case above
                    object = nullptr;  // silence warning, see #821
                    // value_t::null has its own case above, so this condition
                    // cannot hold when control arrives here through the switch
                    if (JSON_HEDLEY_UNLIKELY(t == value_t::null))
                    {
                        JSON_THROW(other_error::create(500, "961c151d2e87f2686a955a9be24d316f1362bf21 3.9.1", basic_json())); // LCOV_EXCL_LINE
                    }
                    break;
                }
            }
        }

        // The constructors below each create a new heap instance via create()
        // that is copy- or move-constructed from the argument.

        /// constructor for strings
        json_value(const string_t& value)
        {
            string = create<string_t>(value);
        }

        /// constructor for rvalue strings
        json_value(string_t&& value)
        {
            string = create<string_t>(std::move(value));
        }

        /// constructor for objects
        json_value(const object_t& value)
        {
            object = create<object_t>(value);
        }

        /// constructor for rvalue objects
        json_value(object_t&& value)
        {
            object = create<object_t>(std::move(value));
        }

        /// constructor for arrays
        json_value(const array_t& value)
        {
            array = create<array_t>(value);
        }

        /// constructor for rvalue arrays
        json_value(array_t&& value)
        {
            array = create<array_t>(std::move(value));
        }

        /// constructor for binary arrays
        json_value(const typename binary_t::container_type& value)
        {
            binary = create<binary_t>(value);
        }

        /// constructor for rvalue binary arrays
        json_value(typename binary_t::container_type&& value)
        {
            binary = create<binary_t>(std::move(value));
        }

        /// constructor for binary arrays (internal type)
        json_value(const binary_t& value)
        {
            binary = create<binary_t>(value);
        }

        /// constructor for rvalue binary arrays (internal type)
        json_value(binary_t&& value)
        {
            binary = create<binary_t>(std::move(value));
        }

        /// release the storage held for a value of type @a t
        ///
        /// For arrays and objects, all nested values are first moved into a
        /// local vector and processed iteratively, so that each container is
        /// already empty by the time it is destructed. Afterwards the
        /// pointer-backed member selected by @a t is destroyed and deallocated;
        /// scalar and null values need no cleanup.
        ///
        /// @param[in] t  tag selecting which union member is in use
        ///               (NOTE(review): presumably the owning value's m_type --
        ///               confirm against callers)
        void destroy(value_t t) noexcept
        {
            // flatten the current json_value to a heap-allocated stack
            std::vector<basic_json> stack;

            // move the top-level items to stack
            if (t == value_t::array)
            {
                stack.reserve(array->size());
                std::move(array->begin(), array->end(), std::back_inserter(stack));
            }
            else if (t == value_t::object)
            {
                stack.reserve(object->size());
                for (auto&& it : *object)
                {
                    stack.push_back(std::move(it.second));
                }
            }

            // drain the stack; each step may push the children of the item it removed
            while (!stack.empty())
            {
                // move the last item to local variable to be processed
                basic_json current_item(std::move(stack.back()));
                stack.pop_back();

                // if current_item is array/object, move
                // its children to the stack to be processed later
                if (current_item.is_array())
                {
                    std::move(current_item.m_value.array->begin(), current_item.m_value.array->end(),
                              std::back_inserter(stack));

                    // drop the moved-from elements so the container is empty
                    current_item.m_value.array->clear();
                }
                else if (current_item.is_object())
                {
                    for (auto&& it : *current_item.m_value.object)
                    {
                        stack.push_back(std::move(it.second));
                    }

                    // drop the moved-from entries so the container is empty
                    current_item.m_value.object->clear();
                }

                // it's now safe that current_item get destructed
                // since it doesn't have any children
            }

            // finally destroy and deallocate the top-level pointer-backed
            // member, using an allocator rebound to the member's type
            switch (t)
            {
                case value_t::object:
                {
                    AllocatorType<object_t> alloc;
                    std::allocator_traits<decltype(alloc)>::destroy(alloc, object);
                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, object, 1);
                    break;
                }

                case value_t::array:
                {
                    AllocatorType<array_t> alloc;
                    std::allocator_traits<decltype(alloc)>::destroy(alloc, array);
                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, array, 1);
                    break;
                }

                case value_t::string:
                {
                    AllocatorType<string_t> alloc;
                    std::allocator_traits<decltype(alloc)>::destroy(alloc, string);
                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, string, 1);
                    break;
                }

                case value_t::binary:
                {
                    AllocatorType<binary_t> alloc;
                    std::allocator_traits<decltype(alloc)>::destroy(alloc, binary);
                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, binary, 1);
                    break;
                }

                // all remaining types are stored inline; nothing to free
                default:
                {
                    break;
                }
            }
        }
    };

  private:
    /*!
    @brief checks the class invariants

    This function asserts the class invariants. It needs to be called at the
    end of every constructor to make sure that created objects respect the
    invariant. Furthermore, it has to be called each time the type of a JSON
    value is changed, because the invariant expresses a relationship between
    @a m_type and @a m_value.

    Furthermore, the parent relation is checked for arrays and objects (only
    when JSON_DIAGNOSTICS is enabled): If @a check_parents is true and the
    value is an array or object, then the container's elements must have the
    current value as parent.

    @param[in] check_parents  whether the parent relation should be checked.
               The value is true by default and should only be set to false
               during destruction of objects when the invariant does not
               need to hold.
    */
    void assert_invariant(bool check_parents = true) const noexcept
    {
        // each assertion reads: "the type is not X, or the pointer for X is set"
        JSON_ASSERT(m_type != value_t::object || m_value.object != nullptr);
        JSON_ASSERT(m_type != value_t::array || m_value.array != nullptr);
        JSON_ASSERT(m_type != value_t::string || m_value.string != nullptr);
        JSON_ASSERT(m_type != value_t::binary || m_value.binary != nullptr);

#if JSON_DIAGNOSTICS
        // parent check: for structured values, every element iterated by
        // begin()/end() must point back to this value; anything thrown while
        // evaluating the check is discarded by the empty handler below
        JSON_TRY
        {
            // cppcheck-suppress assertWithSideEffect
            JSON_ASSERT(!check_parents || !is_structured() || std::all_of(begin(), end(), [this](const basic_json & j)
            {
                return j.m_parent == this;
            }));
        }
        JSON_CATCH(...) {} // LCOV_EXCL_LINE
#else
        // the parameter is not otherwise used in this configuration
        static_cast<void>(check_parents);
#endif
    }

    void set_parents()
    {
#if JSON_DIAGNOSTICS
        // Only containers have children; for every other type there is
        // nothing to update.
        if (m_type == value_t::array)
        {
            // each array element records this value as its parent
            for (auto& child : *m_value.array)
            {
                child.m_parent = this;
            }
        }
        else if (m_type == value_t::object)
        {
            // for objects the child is the mapped value of each key/value entry
            for (auto& entry : *m_value.object)
            {
                entry.second.m_parent = this;
            }
        }
#endif
    }

    iterator set_parents(iterator it, typename iterator::difference_type count)
    {
#if JSON_DIAGNOSTICS
        // Walk the `count` elements starting at `it` by offset and make each
        // of them point back to this value.
        using offset_t = typename iterator::difference_type;
        offset_t offset = 0;
        while (offset < count)
        {
            (it + offset)->m_parent = this;
            ++offset;
        }
#else
        // parent tracking is compiled out; `count` is intentionally unused
        static_cast<void>(count);
#endif
        // the iterator is handed back unchanged so calls can be chained
        return it;
    }

    reference set_parent(reference j)
    {
#if JSON_DIAGNOSTICS
        // diagnostics build: record this value as the parent of j
        j.m_parent = this;
        return j;
#else
        // parent tracking is compiled out; j is returned untouched
        return j;
#endif
    }

  public:
    //////////////////////////
    // JSON parser callback //
    //////////////////////////

    /*!
    @brief parser event types

    The parser callback distinguishes the following events:
    - `object_start`: the parser read `{` and started to process a JSON object
    - `key`: the parser read a key of a value in an object
    - `object_end`: the parser read `}` and finished processing a JSON object
    - `array_start`: the parser read `[` and started to process a JSON array
    - `array_end`: the parser read `]` and finished processing a JSON array
    - `value`: the parser finished reading a JSON value

    @image html callback_events.png "Example when certain parse events are triggered"

    @sa see @ref parser_callback_t for more information and examples
    */
    // public member alias for detail::parse_event_t
    using parse_event_t = detail::parse_event_t;

    /*!
    @brief per-element parser callback type

    With a parser callback function, the result of parsing a JSON text can be
    influenced. When passed to @ref parse, it is called on certain events
    (passed as @ref parse_event_t via parameter @a event) with a set recursion
    depth @a depth and context JSON value @a parsed. The return value of the
    callback function is a boolean indicating whether the element that emitted
    the callback shall be kept or not.

    We distinguish six scenarios (determined by the event type) in which the
    callback function can be called. The following table describes the values
    of the parameters @a depth, @a event, and @a parsed.

    parameter @a event | description | parameter @a depth | parameter @a parsed
    ------------------ | ----------- | ------------------ | -------------------
    parse_event_t::object_start | the parser read `{` and started to process a JSON object | depth of the parent of the JSON object | a JSON value with type discarded
    parse_event_t::key | the parser read a key of a value in an object | depth of the currently parsed JSON object | a JSON string containing the key
    parse_event_t::object_end | the parser read `}` and finished processing a JSON object | depth of the parent of the JSON object | the parsed JSON object
    parse_event_t::array_start | the parser read `[` and started to process a JSON array | depth of the parent of the JSON array | a JSON value with type discarded
    parse_event_t::array_end | the parser read `]` and finished processing a JSON array | depth of the parent of the JSON array | the parsed JSON array
    parse_event_t::value | the parser finished reading a JSON value | depth of the value | the parsed JSON value

    @image html callback_events.png "Example when certain parse events are triggered"

    Discarding a value (i.e., returning `false`) has different effects
    depending on the context in which the function was called:

    - Discarded values in structured types are skipped. That is, the parser
      will behave as if the discarded value was never read.
    - In case a value outside a structured type is skipped, it is replaced
      with `null`. This case happens if the top-level element is skipped.

    @param[in] depth  the depth of the recursion during parsing

    @param[in] event  an event of type parse_event_t indicating the context in
    which the callback function has been called

    @param[in,out] parsed  the current intermediate parse result; note that
    writing to this value has no effect for parse_event_t::key events

    @return Whether the JSON value which called the function during parsing
    should be kept (`true`) or not (`false`). In the latter case, it is either
    skipped completely or replaced by an empty discarded object.

    @sa see @ref parse for examples

    @since version 1.0.0
    */
    // public member alias: detail::parser_callback_t instantiated for this basic_json type
    using parser_callback_t = detail::parser_callback_t<basic_json>;

    //////////////////
    // constructors //
    //////////////////

    /// @name constructors and destructors
    /// Constructors of class @ref basic_json, copy/move constructor, copy
    /// assignment, static functions creating objects, and the destructor.
    /// @{

    /*!
    @brief create an empty value with a given type

    Create an empty JSON value with a given type. The value will be default
    initialized with an empty value which depends on the type:

    Value type  | initial value
    ----------- | -------------
    null        | `null`
    boolean     | `false`
    string      | `""`
    number      | `0`
    object      | `{}`
    array       | `[]`
    binary      | empty array

    @param[in] v  the type of the value to create

    @complexity Constant.

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes to any JSON value.

    @liveexample{The following code shows the constructor for different @ref
    value_t values,basic_json__value_t}

    @sa see @ref clear() -- restores the postcondition of this constructor

    @since version 1.0.0
    */
    basic_json(const value_t v)
        : m_type(v), m_value(v)
    {
        // type tag and payload were both initialized from the same value_t;
        // verify that the pair satisfies the class invariant
        assert_invariant();
    }

    /*!
    @brief create a null object

    Create a `null` JSON value. It either takes a null pointer as parameter
    (explicitly creating `null`) or no parameter (implicitly creating `null`).
    The passed null pointer itself is not read -- it is only used to choose
    the right constructor.

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this constructor never throws
    exceptions.

    @liveexample{The following code shows the constructor with and without a
    null pointer parameter.,basic_json__nullptr_t}

    @since version 1.0.0
    */
    // The parameter is unnamed and never read; it only selects this overload.
    // Construction is delegated to the value_t constructor with value_t::null.
    basic_json(std::nullptr_t = nullptr) noexcept
        : basic_json(value_t::null)
    {
        assert_invariant();
    }

    /*!
    @brief create a JSON value

    This is a "catch all" constructor for all compatible JSON types; that is,
    types for which a `to_json()` method exists. The constructor forwards the
    parameter @a val to that method (to `json_serializer<U>::to_json` method
    with `U = uncvref_t<CompatibleType>`, to be exact).

    Template type @a CompatibleType includes, but is not limited to, the
    following types:
    - **arrays**: @ref array_t and all kinds of compatible containers such as
      `std::vector`, `std::deque`, `std::list`, `std::forward_list`,
      `std::array`, `std::valarray`, `std::set`, `std::unordered_set`,
      `std::multiset`, and `std::unordered_multiset` with a `value_type` from
      which a @ref basic_json value can be constructed.
    - **objects**: @ref object_t and all kinds of compatible associative
      containers such as `std::map`, `std::unordered_map`, `std::multimap`,
      and `std::unordered_multimap` with a `key_type` compatible to
      @ref string_t and a `value_type` from which a @ref basic_json value can
      be constructed.
    - **strings**: @ref string_t, string literals, and all compatible string
      containers can be used.
    - **numbers**: @ref number_integer_t, @ref number_unsigned_t,
      @ref number_float_t, and all convertible number types such as `int`,
      `size_t`, `int64_t`, `float` or `double` can be used.
    - **boolean**: @ref boolean_t / `bool` can be used.
    - **binary**: @ref binary_t / `std::vector<uint8_t>` may be used,
      unfortunately because string literals cannot be distinguished from binary
      character arrays by the C++ type system, all types compatible with `const
      char*` will be directed to the string constructor instead.  This is both
      for backwards compatibility, and due to the fact that a binary type is not
      a standard JSON type.

    See the examples below.

    @tparam CompatibleType a type such that:
    - @a CompatibleType is not derived from `std::istream`,
    - @a CompatibleType is not @ref basic_json (to avoid hijacking copy/move
         constructors),
    - @a CompatibleType is not a different @ref basic_json type (i.e. with different template arguments)
    - @a CompatibleType is not a @ref basic_json nested type (e.g.,
         @ref json_pointer, @ref iterator, etc ...)
    - `json_serializer<U>` has a `to_json(basic_json_t&, CompatibleType&&)` method

    @tparam U = `uncvref_t<CompatibleType>`

    @param[in] val the value to be forwarded to the respective constructor

    @complexity Usually linear in the size of the passed @a val, also
                depending on the implementation of the called `to_json()`
                method.

    @exceptionsafety Depends on the called constructor. For types directly
    supported by the library (i.e., all types for which no `to_json()` function
    was provided), strong guarantee holds: if an exception is thrown, there are
    no changes to any JSON value.

    @liveexample{The following code shows the constructor with several
    compatible types.,basic_json__CompatibleType}

    @since version 2.1.0
    */
    // Participates in overload resolution only when U (CompatibleType without
    // cv/ref qualifiers) is not itself a basic_json and is_compatible_type
    // holds for it. The constructor is noexcept exactly when the selected
    // JSONSerializer<U>::to_json call is noexcept.
    template < typename CompatibleType,
               typename U = detail::uncvref_t<CompatibleType>,
               detail::enable_if_t <
                   !detail::is_basic_json<U>::value && detail::is_compatible_type<basic_json_t, U>::value, int > = 0 >
    basic_json(CompatibleType && val) noexcept(noexcept( // NOLINT(bugprone-forwarding-reference-overload,bugprone-exception-escape)
                JSONSerializer<U>::to_json(std::declval<basic_json_t&>(),
                                           std::forward<CompatibleType>(val))))
    {
        // let the serializer populate *this from the perfectly forwarded value
        JSONSerializer<U>::to_json(*this, std::forward<CompatibleType>(val));
        // diagnostics builds: point the freshly created children back to *this
        set_parents();
        assert_invariant();
    }

    /*!
    @brief create a JSON value from an existing one

    This is a constructor for existing @ref basic_json types.
    It does not hijack copy/move constructors, since the parameter has different
    template arguments than the current ones.

    The constructor tries to convert the internal @ref m_value of the parameter.

    @tparam BasicJsonType a type such that:
    - @a BasicJsonType is a @ref basic_json type.
    - @a BasicJsonType has different template arguments than @ref basic_json_t.

    @param[in] val the @ref basic_json value to be converted.

    @complexity Usually linear in the size of the passed @a val, also
                depending on the implementation of the called `to_json()`
                method.

    @exceptionsafety Depends on the called constructor. For types directly
    supported by the library (i.e., all types for which no `to_json()` function
    was provided), strong guarantee holds: if an exception is thrown, there are
    no changes to any JSON value.

    @since version 3.2.0
    */
    template < typename BasicJsonType,
               detail::enable_if_t <
                   detail::is_basic_json<BasicJsonType>::value&& !std::is_same<basic_json, BasicJsonType>::value, int > = 0 >
    basic_json(const BasicJsonType& val)
    {
        // storage types of the foreign basic_json specialization
        using src_boolean_t = typename BasicJsonType::boolean_t;
        using src_number_integer_t = typename BasicJsonType::number_integer_t;
        using src_number_unsigned_t = typename BasicJsonType::number_unsigned_t;
        using src_number_float_t = typename BasicJsonType::number_float_t;
        using src_string_t = typename BasicJsonType::string_t;
        using src_binary_t = typename BasicJsonType::binary_t;
        using src_array_t = typename BasicJsonType::array_t;
        using src_object_t = typename BasicJsonType::object_t;

        // Dispatch on the runtime type of the source and let the serializer
        // registered for the corresponding foreign storage type fill *this.
        switch (val.type())
        {
            // values without payload
            case value_t::null:
            {
                *this = nullptr;
                break;
            }
            case value_t::discarded:
            {
                m_type = value_t::discarded;
                break;
            }

            // scalars are fetched by value
            case value_t::boolean:
            {
                JSONSerializer<src_boolean_t>::to_json(*this, val.template get<src_boolean_t>());
                break;
            }
            case value_t::number_integer:
            {
                JSONSerializer<src_number_integer_t>::to_json(*this, val.template get<src_number_integer_t>());
                break;
            }
            case value_t::number_unsigned:
            {
                JSONSerializer<src_number_unsigned_t>::to_json(*this, val.template get<src_number_unsigned_t>());
                break;
            }
            case value_t::number_float:
            {
                JSONSerializer<src_number_float_t>::to_json(*this, val.template get<src_number_float_t>());
                break;
            }

            // strings, binaries and containers are passed by const reference
            case value_t::string:
            {
                JSONSerializer<src_string_t>::to_json(*this, val.template get_ref<const src_string_t&>());
                break;
            }
            case value_t::binary:
            {
                JSONSerializer<src_binary_t>::to_json(*this, val.template get_ref<const src_binary_t&>());
                break;
            }
            case value_t::array:
            {
                JSONSerializer<src_array_t>::to_json(*this, val.template get_ref<const src_array_t&>());
                break;
            }
            case value_t::object:
            {
                JSONSerializer<src_object_t>::to_json(*this, val.template get_ref<const src_object_t&>());
                break;
            }

            default:            // LCOV_EXCL_LINE
                JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
        }

        // diagnostics builds: point converted children back to *this
        set_parents();
        assert_invariant();
    }

    /*!
    @brief create a container (array or object) from an initializer list

    Creates a JSON value of type array or object from the passed initializer
    list @a init. In case @a type_deduction is `true` (default), the type of
    the JSON value to be created is deduced from the initializer list @a init
    according to the following rules:

    1. If the list is empty, an empty JSON object value `{}` is created.
    2. If the list consists of pairs whose first element is a string, a JSON
       object value is created where the first elements of the pairs are
       treated as keys and the second elements are treated as values.
    3. In all other cases, an array is created.

    The rules aim to create the best fit between a C++ initializer list and
    JSON values. The rationale is as follows:

    1. The empty initializer list is written as `{}` which is exactly an empty
       JSON object.
    2. C++ has no way of describing mapped types other than to list a list of
       pairs. As JSON requires that keys must be of type string, rule 2 is the
       weakest constraint one can pose on initializer lists to interpret them
       as an object.
    3. In all other cases, the initializer list could not be interpreted as
       JSON object type, so interpreting it as JSON array type is safe.

    With the rules described above, the following JSON values cannot be
    expressed by an initializer list:

    - the empty array (`[]`): use @ref array(initializer_list_t)
      with an empty initializer list in this case
    - arrays whose elements satisfy rule 2: use @ref
      array(initializer_list_t) with the same initializer list
      in this case

    @note When used without parentheses around an empty initializer list, @ref
    basic_json() is called instead of this function, yielding the JSON null
    value.

    @param[in] init  initializer list with JSON values

    @param[in] type_deduction internal parameter; when set to `true`, the type
    of the JSON value is deduced from the initializer list @a init; when set
    to `false`, the type provided via @a manual_type is forced. This mode is
    used by the functions @ref array(initializer_list_t) and
    @ref object(initializer_list_t).

    @param[in] manual_type internal parameter; when @a type_deduction is set
    to `false`, the created JSON value will use the provided type (only @ref
    value_t::array and @ref value_t::object are valid); when @a type_deduction
    is set to `true`, this parameter has no effect

    @throw type_error.301 if @a type_deduction is `false`, @a manual_type is
    `value_t::object`, but @a init contains an element which is not a pair
    whose first element is a string. In this case, the constructor could not
    create an object. If @a type_deduction had been `true`, an array
    would have been created. See @ref object(initializer_list_t)
    for an example.

    @complexity Linear in the size of the initializer list @a init.

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes to any JSON value.

    @liveexample{The example below shows how JSON values are created from
    initializer lists.,basic_json__list_init_t}

    @sa see @ref array(initializer_list_t) -- create a JSON array
    value from an initializer list
    @sa see @ref object(initializer_list_t) -- create a JSON object
    value from an initializer list

    @since version 1.0.0
    */
    basic_json(initializer_list_t init,
               bool type_deduction = true,
               value_t manual_type = value_t::array)
    {
        // An element can serve as a key/value pair if it is a two-element
        // array whose first element is a string.
        const auto is_key_value_pair = [](const detail::json_ref<basic_json>& candidate)
        {
            return candidate->is_array() && candidate->size() == 2 && (*candidate)[0].is_string();
        };

        // the list may only become an object if every element is such a pair
        bool build_object = std::all_of(init.begin(), init.end(), is_key_value_pair);

        // honor an explicitly requested type
        if (!type_deduction)
        {
            if (manual_type == value_t::array)
            {
                // an array was asked for: never build an object, even if possible
                build_object = false;
            }
            else if (JSON_HEDLEY_UNLIKELY(manual_type == value_t::object && !build_object))
            {
                // an object was asked for, but the list does not consist of pairs
                JSON_THROW(type_error::create(301, "cannot create object from initializer list", basic_json()));
            }
        }

        if (!build_object)
        {
            // the initializer list describes an array
            m_type = value_t::array;
            m_value.array = create<array_t>(init.begin(), init.end());
        }
        else
        {
            // the initializer list is a list of pairs: start with an empty object
            m_type = value_t::object;
            m_value = value_t::object;

            for (auto& pair_ref : init)
            {
                // obtain an owned two-element array, then move the key string
                // and the mapped value out of it into the object
                auto pair_json = pair_ref.moved_or_copied();
                auto& pair_items = *pair_json.m_value.array;
                m_value.object->emplace(
                    std::move(*(pair_items[0].m_value.string)),
                    std::move(pair_items[1]));
            }
        }

        // diagnostics builds: point the new children back to *this
        set_parents();
        assert_invariant();
    }

    /*!
    @brief explicitly create a binary array (without subtype)

    Creates a JSON binary array value from a given binary container. Binary
    values are part of various binary formats, such as CBOR, MessagePack, and
    BSON. This constructor is used to create a value for serialization to those
    formats.

    @note Note, this function exists because of the difficulty in correctly
    specifying the correct template overload in the standard value ctor, as both
    JSON arrays and JSON binary arrays are backed with some form of a
    `std::vector`. Because JSON binary arrays are a non-standard extension it
    was decided that it would be best to prevent automatic initialization of a
    binary array type, for backwards compatibility and so it does not happen by
    accident.

    @param[in] init container containing bytes to use as binary type

    @return JSON binary array value

    @complexity Linear in the size of @a init.

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes to any JSON value.

    @since version 3.8.0
    */
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json binary(const typename binary_t::container_type& init)
    {
        // start from a null value
        auto res = basic_json();

        // Install the payload BEFORE switching the type tag. Building the
        // payload copies @a init and may therefore throw. If the tag already
        // said "binary" at that point, `res` would be destroyed with a binary
        // type but a null binary pointer, breaking the class invariant. With
        // this order a failure leaves `res` as a plain null value.
        res.m_value = init;
        res.m_type = value_t::binary;
        return res;
    }

    /*!
    @brief explicitly create a binary array (with subtype)

    Creates a JSON binary array value from a given binary container. Binary
    values are part of various binary formats, such as CBOR, MessagePack, and
    BSON. This constructor is used to create a value for serialization to those
    formats.

    @note Note, this function exists because of the difficulty in correctly
    specifying the correct template overload in the standard value ctor, as both
    JSON arrays and JSON binary arrays are backed with some form of a
    `std::vector`. Because JSON binary arrays are a non-standard extension it
    was decided that it would be best to prevent automatic initialization of a
    binary array type, for backwards compatibility and so it does not happen by
    accident.

    @param[in] init container containing bytes to use as binary type
    @param[in] subtype subtype to use in MessagePack and BSON

    @return JSON binary array value

    @complexity Linear in the size of @a init.

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes to any JSON value.

    @since version 3.8.0
    */
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json binary(const typename binary_t::container_type& init, std::uint8_t subtype)
    {
        // start from a null value
        auto res = basic_json();

        // Install the payload BEFORE switching the type tag. Constructing the
        // binary_t copies @a init and may throw. If the tag already said
        // "binary" at that point, `res` would be destroyed with a binary type
        // but a null binary pointer, breaking the class invariant. With this
        // order a failure leaves `res` as a plain null value.
        res.m_value = binary_t(init, subtype);
        res.m_type = value_t::binary;
        return res;
    }

    /// @copydoc binary(const typename binary_t::container_type&)
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json binary(typename binary_t::container_type&& init)
    {
        // start from a null value
        auto res = basic_json();

        // Install the payload BEFORE switching the type tag. Even though
        // @a init is moved from, storing the payload may still throw. If the
        // tag already said "binary" at that point, `res` would be destroyed
        // with a binary type but a null binary pointer, breaking the class
        // invariant. With this order a failure leaves `res` as a null value.
        res.m_value = std::move(init);
        res.m_type = value_t::binary;
        return res;
    }

    /// @copydoc binary(const typename binary_t::container_type&, std::uint8_t)
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json binary(typename binary_t::container_type&& init, std::uint8_t subtype)
    {
        // start from a null value
        auto res = basic_json();

        // Install the payload BEFORE switching the type tag. Even though
        // @a init is moved from, storing the payload may still throw. If the
        // tag already said "binary" at that point, `res` would be destroyed
        // with a binary type but a null binary pointer, breaking the class
        // invariant. With this order a failure leaves `res` as a null value.
        res.m_value = binary_t(std::move(init), subtype);
        res.m_type = value_t::binary;
        return res;
    }

    /*!
    @brief explicitly create an array from an initializer list

    Creates a JSON array value from a given initializer list. That is, given a
    list of values `a, b, c`, creates the JSON value `[a, b, c]`. If the
    initializer list is empty, the empty array `[]` is created.

    @note This function is only needed to express two edge cases that cannot
    be realized with the initializer list constructor (@ref
    basic_json(initializer_list_t, bool, value_t)). These cases
    are:
    1. creating an array whose elements are all pairs whose first element is a
    string -- in this case, the initializer list constructor would create an
    object, taking the first elements as keys
    2. creating an empty array -- passing the empty initializer list to the
    initializer list constructor yields an empty object

    @param[in] init  initializer list with JSON values to create an array from
    (optional)

    @return JSON array value

    @complexity Linear in the size of @a init.

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes to any JSON value.

    @liveexample{The following code shows an example for the `array`
    function.,array}

    @sa see @ref basic_json(initializer_list_t, bool, value_t) --
    create a JSON value from an initializer list
    @sa see @ref object(initializer_list_t) -- create a JSON object
    value from an initializer list

    @since version 1.0.0
    */
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json array(initializer_list_t init = {})
    {
        // type_deduction = false with manual_type = array: the initializer
        // list constructor then builds an array even if every element of
        // init looks like a key/value pair
        return basic_json(init, false, value_t::array);
    }

    /*!
    @brief explicitly create an object from an initializer list

    Creates a JSON object value from a given initializer list. The initializer
    list's elements must be pairs, and their first elements must be strings. If
    the initializer list is empty, the empty object `{}` is created.

    @note This function is only added for symmetry reasons. In contrast to the
    related function @ref array(initializer_list_t), there are
    no cases which can only be expressed by this function. That is, any
    initializer list @a init can also be passed to the initializer list
    constructor @ref basic_json(initializer_list_t, bool, value_t).

    @param[in] init  initializer list to create an object from (optional)

    @return JSON object value

    @throw type_error.301 if @a init is not a list of pairs whose first
    elements are strings. In this case, no object can be created. When such a
    value is passed to @ref basic_json(initializer_list_t, bool, value_t),
    an array would have been created from the passed initializer list @a init.
    See example below.

    @complexity Linear in the size of @a init.

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes to any JSON value.

    @liveexample{The following code shows an example for the `object`
    function.,object}

    @sa see @ref basic_json(initializer_list_t, bool, value_t) --
    create a JSON value from an initializer list
    @sa see @ref array(initializer_list_t) -- create a JSON array
    value from an initializer list

    @since version 1.0.0
    */
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json object(initializer_list_t init = {})
    {
        // type_deduction = false with manual_type = object: the initializer
        // list constructor throws type_error 301 unless every element of
        // init is a two-element array whose first element is a string
        return basic_json(init, false, value_t::object);
    }

    /*!
    @brief construct an array with count copies of given value

    Constructs a JSON array value by creating @a cnt copies of a passed value.
    In case @a cnt is `0`, an empty array is created.

    @param[in] cnt  the number of JSON copies of @a val to create
    @param[in] val  the JSON value to copy

    @post `std::distance(begin(),end()) == cnt` holds.

    @complexity Linear in @a cnt.

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes to any JSON value.

    @liveexample{The following code shows examples for the @ref
    basic_json(size_type\, const basic_json&)
    constructor.,basic_json__size_type_basic_json}

    @since version 1.0.0
    */
    basic_json(size_type cnt, const basic_json& val)
        : m_type(value_t::array)
    {
        // allocate the backing array_t, constructed from (cnt, val)
        m_value.array = create<array_t>(cnt, val);
        // diagnostics builds: point the new elements back to *this
        set_parents();
        assert_invariant();
    }

    /*!
    @brief construct a JSON container given an iterator range

    Constructs the JSON value with the contents of the range `[first, last)`.
    The semantics depends on the different types a JSON value can have:
    - In case of a null type, invalid_iterator.206 is thrown.
    - In case of other primitive types (number, boolean, or string), @a first
      must be `begin()` and @a last must be `end()`. In this case, the value is
      copied. Otherwise, invalid_iterator.204 is thrown.
    - In case of structured types (array, object), the constructor behaves as
      similar versions for `std::vector` or `std::map`; that is, a JSON array
      or object is constructed from the values in the range.

    @tparam InputIT an input iterator type (@ref iterator or @ref
    const_iterator)

    @param[in] first begin of the range to copy from (included)
    @param[in] last end of the range to copy from (excluded)

    @pre Iterators @a first and @a last must be initialized. **This
         precondition is enforced with an assertion (see warning).** If
         assertions are switched off, a violation of this precondition yields
         undefined behavior.

    @pre Range `[first, last)` is valid. Usually, this precondition cannot be
         checked efficiently. Only certain edge cases are detected; see the
         description of the exceptions below. A violation of this precondition
         yields undefined behavior.

    @warning A precondition is enforced with a runtime assertion that will
             result in calling `std::abort` if this precondition is not met.
             Assertions can be disabled by defining `NDEBUG` at compile time.
             See https://en.cppreference.com/w/cpp/error/assert for more
             information.

    @throw invalid_iterator.201 if iterators @a first and @a last are not
    compatible (i.e., do not belong to the same JSON value). In this case,
    the range `[first, last)` is undefined.
    @throw invalid_iterator.204 if iterators @a first and @a last belong to a
    primitive type (number, boolean, or string), but @a first does not point
    to the first element any more. In this case, the range `[first, last)` is
    undefined. See example code below.
    @throw invalid_iterator.206 if iterators @a first and @a last belong to a
    null value. In this case, the range `[first, last)` is undefined.

    @complexity Linear in distance between @a first and @a last.

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes to any JSON value.

    @liveexample{The example below shows several ways to create JSON values by
    specifying a subrange with iterators.,basic_json__InputIt_InputIt}

    @since version 1.0.0
    */
    template < class InputIT, typename std::enable_if <
                   std::is_same<InputIT, typename basic_json_t::iterator>::value ||
                   std::is_same<InputIT, typename basic_json_t::const_iterator>::value, int >::type = 0 >
    basic_json(InputIT first, InputIT last)
    {
        JSON_ASSERT(first.m_object != nullptr);
        JSON_ASSERT(last.m_object != nullptr);

        // make sure iterator fits the current value
        if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
        {
            JSON_THROW(invalid_iterator::create(201, "iterators are not compatible", basic_json()));
        }

        // copy type from first iterator
        m_type = first.m_object->m_type;

        // check if iterator range is complete for primitive values
        switch (m_type)
        {
            case value_t::boolean:
            case value_t::number_float:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::string:
            {
                if (JSON_HEDLEY_UNLIKELY(!first.m_it.primitive_iterator.is_begin()
                                         || !last.m_it.primitive_iterator.is_end()))
                {
                    JSON_THROW(invalid_iterator::create(204, "iterators out of range", *first.m_object));
                }
                break;
            }

            default:
                break;
        }

        switch (m_type)
        {
            case value_t::number_integer:
            {
                m_value.number_integer = first.m_object->m_value.number_integer;
                break;
            }

            case value_t::number_unsigned:
            {
                m_value.number_unsigned = first.m_object->m_value.number_unsigned;
                break;
            }

            case value_t::number_float:
            {
                m_value.number_float = first.m_object->m_value.number_float;
                break;
            }

            case value_t::boolean:
            {
                m_value.boolean = first.m_object->m_value.boolean;
                break;
            }

            case value_t::string:
            {
                m_value = *first.m_object->m_value.string;
                break;
            }

            case value_t::object:
            {
                m_value.object = create<object_t>(first.m_it.object_iterator,
                                                  last.m_it.object_iterator);
                break;
            }

            case value_t::array:
            {
                m_value.array = create<array_t>(first.m_it.array_iterator,
                                                last.m_it.array_iterator);
                break;
            }

            case value_t::binary:
            {
                m_value = *first.m_object->m_value.binary;
                break;
            }

            default:
                JSON_THROW(invalid_iterator::create(206, "cannot construct with iterators from " + std::string(first.m_object->type_name()), *first.m_object));
        }

        set_parents();
        assert_invariant();
    }


    ///////////////////////////////////////
    // other constructors and destructor //
    ///////////////////////////////////////

    // Construct from a json_ref wrapper: enabled only when JsonRef satisfies
    // detail::is_json_ref and its value_type is this basic_json. The work is
    // delegated to another basic_json constructor, fed with whatever
    // ref.moved_or_copied() returns.
    template<typename JsonRef,
             detail::enable_if_t<detail::conjunction<detail::is_json_ref<JsonRef>,
                                 std::is_same<typename JsonRef::value_type, basic_json>>::value, int> = 0 >
    basic_json(const JsonRef& ref) : basic_json(ref.moved_or_copied()) {}

    /*!
    @brief copy constructor

    Creates a copy of a given JSON value.

    @param[in] other  the JSON value to copy

    @post `*this == other`

    @complexity Linear in the size of @a other.

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes to any JSON value.

    @requirement This function helps `basic_json` satisfying the
    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
    requirements:
    - The complexity is linear.
    - As postcondition, it holds: `other == basic_json(other)`.

    @liveexample{The following code shows an example for the copy
    constructor.,basic_json__basic_json}

    @since version 1.0.0
    */
    basic_json(const basic_json& other)
        : m_type(other.m_type)
    {
        // check that the passed value is valid before reading from it
        other.assert_invariant();

        // duplicate the payload that matches the copied type tag
        switch (m_type)
        {
            // scalar payloads are stored in place and copied by value
            case value_t::boolean:
            {
                m_value = other.m_value.boolean;
                break;
            }

            case value_t::number_integer:
            {
                m_value = other.m_value.number_integer;
                break;
            }

            case value_t::number_unsigned:
            {
                m_value = other.m_value.number_unsigned;
                break;
            }

            case value_t::number_float:
            {
                m_value = other.m_value.number_float;
                break;
            }

            // the remaining payloads are held through pointers; the pointee is
            // handed over so that the content, not the pointer, gets copied
            case value_t::string:
            {
                m_value = *other.m_value.string;
                break;
            }

            case value_t::binary:
            {
                m_value = *other.m_value.binary;
                break;
            }

            case value_t::array:
            {
                m_value = *other.m_value.array;
                break;
            }

            case value_t::object:
            {
                m_value = *other.m_value.object;
                break;
            }

            // all other types: nothing is copied besides the type tag
            default:
                break;
        }

        set_parents();
        assert_invariant();
    }

    /*!
    @brief move constructor

    Move constructor. Constructs a JSON value with the contents of the given
    value @a other using move semantics. It "steals" the resources from @a
    other and leaves it as JSON null value.

    @param[in,out] other  value to move to this object

    @post `*this` has the same value as @a other before the call.
    @post @a other is a JSON null value.

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this constructor never throws
    exceptions.

    @requirement This function helps `basic_json` satisfying the
    [MoveConstructible](https://en.cppreference.com/w/cpp/named_req/MoveConstructible)
    requirements.

    @liveexample{The code below shows the move constructor explicitly called
    via std::move.,basic_json__moveconstructor}

    @since version 1.0.0
    */
    // Move constructor: takes over the type tag and payload of `other` in the
    // initializer list, then resets `other` to a null value.
    basic_json(basic_json&& other) noexcept
        : m_type(std::move(other.m_type)),
          m_value(std::move(other.m_value))
    {
        // check that passed value is valid
        // NOTE(review): the meaning of the `false` argument is defined by
        // assert_invariant(), which is not visible in this part of the file
        other.assert_invariant(false);

        // invalidate payload: `other` must no longer refer to the payload that
        // is now held by *this
        other.m_type = value_t::null;
        other.m_value = {};

        // presumably re-links child values to this new object -- helper is
        // defined elsewhere; confirm there
        set_parents();
        assert_invariant();
    }

    /*!
    @brief copy assignment

    Copy assignment operator. Copies a JSON value via the "copy and swap"
    strategy: It is expressed in terms of the copy constructor, destructor,
    and the `swap()` member function.

    @param[in] other  value to copy from

    @complexity Linear.

    @requirement This function helps `basic_json` satisfying the
    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
    requirements:
    - The complexity is linear.

    @liveexample{The code below shows an example for the copy assignment. It
    creates a copy of value `a` which is then swapped with `b`. Finally\, the
    copy of `a` (which is the null value after the swap) is
    destroyed.,basic_json__copyassignment}

    @since version 1.0.0
    */
    // Copy-and-swap assignment: `other` is taken by value, so the caller's
    // argument has already been copied (or moved) into it. The noexcept
    // specification holds when value_t and json_value are nothrow move
    // constructible and move assignable.
    basic_json& operator=(basic_json other) noexcept (
        std::is_nothrow_move_constructible<value_t>::value&&
        std::is_nothrow_move_assignable<value_t>::value&&
        std::is_nothrow_move_constructible<json_value>::value&&
        std::is_nothrow_move_assignable<json_value>::value
    )
    {
        // check that passed value is valid
        other.assert_invariant();

        // exchange type tag and payload; `using std::swap` keeps the calls
        // unqualified so a more specific swap can be picked up if one exists
        using std::swap;
        swap(m_type, other.m_type);
        swap(m_value, other.m_value);

        // the previous content of *this now lives in `other` and is released
        // by its destructor when this function returns
        set_parents();
        assert_invariant();
        return *this;
    }

    /*!
    @brief destructor

    Destroys the JSON value and frees all allocated memory.

    @complexity Linear.

    @requirement This function helps `basic_json` satisfying the
    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
    requirements:
    - The complexity is linear.
    - All stored elements are destroyed and all memory is freed.

    @since version 1.0.0
    */
    // Destructor: runs the invariant check and then asks the payload union to
    // release whatever it holds for the current type.
    ~basic_json() noexcept
    {
        // NOTE(review): the meaning of the `false` argument is defined by
        // assert_invariant(), which is not visible in this part of the file
        assert_invariant(false);
        // the union cannot know its active member, so the type tag is passed in
        m_value.destroy(m_type);
    }

    /// @}

  public:
    ///////////////////////
    // object inspection //
    ///////////////////////

    /// @name object inspection
    /// Functions to inspect the type of a JSON value.
    /// @{

    /*!
    @brief serialization

    Serialization function for JSON values. The function tries to mimic
    Python's `json.dumps()` function, and currently supports its @a indent
    and @a ensure_ascii parameters.

    @param[in] indent If indent is nonnegative, then array elements and object
    members will be pretty-printed with that indent level. An indent level of
    `0` will only insert newlines. `-1` (the default) selects the most compact
    representation.
    @param[in] indent_char The character to use for indentation if @a indent is
    greater than `0`. The default is ` ` (space).
    @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters
    in the output are escaped with `\uXXXX` sequences, and the result consists
    of ASCII characters only.
    @param[in] error_handler  how to react on decoding errors; there are three
    possible values: `strict` (throws an exception in case a decoding error
    occurs; default), `replace` (replace invalid UTF-8 sequences with U+FFFD),
    and `ignore` (ignore invalid UTF-8 sequences during serialization; all
    bytes are copied to the output unchanged).

    @return string containing the serialization of the JSON value

    @throw type_error.316 if a string stored inside the JSON value is not
                          UTF-8 encoded and @a error_handler is set to strict

    @note Binary values are serialized as object containing two keys:
      - "bytes": an array of bytes as integers
      - "subtype": the subtype as integer or "null" if the binary has no subtype

    @complexity Linear.

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes in the JSON value.

    @liveexample{The following example shows the effect of different @a indent\,
    @a indent_char\, and @a ensure_ascii parameters to the result of the
    serialization.,dump}

    @see https://docs.python.org/2/library/json.html#json.dump

    @since version 1.0.0; indentation character @a indent_char, option
           @a ensure_ascii and exceptions added in version 3.0.0; error
           handlers added in version 3.4.0; serialization of binary values added
           in version 3.8.0.
    */
    string_t dump(const int indent = -1,
                  const char indent_char = ' ',
                  const bool ensure_ascii = false,
                  const error_handler_t error_handler = error_handler_t::strict) const
    {
        // the serializer writes into this string through an output adapter
        string_t text;
        serializer writer(detail::output_adapter<char, string_t>(text), indent_char, error_handler);

        // a non-negative indent selects pretty-printing with that step width;
        // any negative indent selects the compact form with a step of zero
        const bool pretty_print = indent >= 0;
        const unsigned int indent_step = pretty_print ? static_cast<unsigned int>(indent) : 0u;

        writer.dump(*this, pretty_print, ensure_ascii, indent_step);
        return text;
    }

    /*!
    @brief return the type of the JSON value (explicit)

    Return the type of the JSON value as a value from the @ref value_t
    enumeration.

    @return the type of the JSON value
            Value type                | return value
            ------------------------- | -------------------------
            null                      | value_t::null
            boolean                   | value_t::boolean
            string                    | value_t::string
            number (integer)          | value_t::number_integer
            number (unsigned integer) | value_t::number_unsigned
            number (floating-point)   | value_t::number_float
            object                    | value_t::object
            array                     | value_t::array
            binary                    | value_t::binary
            discarded                 | value_t::discarded

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @liveexample{The following code exemplifies `type()` for all JSON
    types.,type}

    @sa see @ref operator value_t() -- return the type of the JSON value (implicit)
    @sa see @ref type_name() -- return the type as string

    @since version 1.0.0
    */
    constexpr value_t type() const noexcept
    {
        // plain accessor: hands out the stored type tag unchanged
        return m_type;
    }

    /*!
    @brief return whether type is primitive

    This function returns true if and only if the JSON type is primitive
    (string, number, boolean, binary, or null).

    @return `true` if type is primitive (string, number, boolean, binary, or
    null), `false` otherwise.

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @liveexample{The following code exemplifies `is_primitive()` for all JSON
    types.,is_primitive}

    @sa see @ref is_structured() -- returns whether JSON value is structured
    @sa see @ref is_null() -- returns whether JSON value is `null`
    @sa see @ref is_string() -- returns whether JSON value is a string
    @sa see @ref is_boolean() -- returns whether JSON value is a boolean
    @sa see @ref is_number() -- returns whether JSON value is a number
    @sa see @ref is_binary() -- returns whether JSON value is a binary array

    @since version 1.0.0
    */
    constexpr bool is_primitive() const noexcept
    {
        // built purely from the other predicates; note that binary values are
        // counted as primitive here as well
        return is_null() || is_boolean() || is_number() || is_string() || is_binary();
    }

    /*!
    @brief return whether type is structured

    This function returns true if and only if the JSON type is structured
    (array or object).

    @return `true` if type is structured (array or object), `false` otherwise.

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @liveexample{The following code exemplifies `is_structured()` for all JSON
    types.,is_structured}

    @sa see @ref is_primitive() -- returns whether value is primitive
    @sa see @ref is_array() -- returns whether value is an array
    @sa see @ref is_object() -- returns whether value is an object

    @since version 1.0.0
    */
    constexpr bool is_structured() const noexcept
    {
        // structured means one of the two container kinds
        return is_object() || is_array();
    }

    /*!
    @brief return whether value is null

    This function returns true if and only if the JSON value is null.

    @return `true` if type is null, `false` otherwise.

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @liveexample{The following code exemplifies `is_null()` for all JSON
    types.,is_null}

    @since version 1.0.0
    */
    constexpr bool is_null() const noexcept
    {
        // decided by the type tag alone; the payload is not inspected
        return m_type == value_t::null;
    }

    /*!
    @brief return whether value is a boolean

    This function returns true if and only if the JSON value is a boolean.

    @return `true` if type is boolean, `false` otherwise.

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @liveexample{The following code exemplifies `is_boolean()` for all JSON
    types.,is_boolean}

    @since version 1.0.0
    */
    constexpr bool is_boolean() const noexcept
    {
        // decided by the type tag alone; the stored true/false is irrelevant
        return m_type == value_t::boolean;
    }

    /*!
    @brief return whether value is a number

    This function returns true if and only if the JSON value is a number. This
    includes both integer (signed and unsigned) and floating-point values.

    @return `true` if type is number (regardless whether integer, unsigned
    integer or floating-type), `false` otherwise.

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @liveexample{The following code exemplifies `is_number()` for all JSON
    types.,is_number}

    @sa see @ref is_number_integer() -- check if value is an integer or unsigned
    integer number
    @sa see @ref is_number_unsigned() -- check if value is an unsigned integer
    number
    @sa see @ref is_number_float() -- check if value is a floating-point number

    @since version 1.0.0
    */
    constexpr bool is_number() const noexcept
    {
        // is_number_integer() already covers signed and unsigned integers, so
        // together with the floating-point check all numeric kinds are included
        return is_number_float() || is_number_integer();
    }

    /*!
    @brief return whether value is an integer number

    This function returns true if and only if the JSON value is a signed or
    unsigned integer number. This excludes floating-point values.

    @return `true` if type is an integer or unsigned integer number, `false`
    otherwise.

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @liveexample{The following code exemplifies `is_number_integer()` for all
    JSON types.,is_number_integer}

    @sa see @ref is_number() -- check if value is a number
    @sa see @ref is_number_unsigned() -- check if value is an unsigned integer
    number
    @sa see @ref is_number_float() -- check if value is a floating-point number

    @since version 1.0.0
    */
    constexpr bool is_number_integer() const noexcept
    {
        // "integer" deliberately includes values tagged as unsigned
        return m_type == value_t::number_unsigned || m_type == value_t::number_integer;
    }

    /*!
    @brief return whether value is an unsigned integer number

    This function returns true if and only if the JSON value is an unsigned
    integer number. This excludes floating-point and signed integer values.

    @return `true` if type is an unsigned integer number, `false` otherwise.

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @liveexample{The following code exemplifies `is_number_unsigned()` for all
    JSON types.,is_number_unsigned}

    @sa see @ref is_number() -- check if value is a number
    @sa see @ref is_number_integer() -- check if value is an integer or unsigned
    integer number
    @sa see @ref is_number_float() -- check if value is a floating-point number

    @since version 2.0.0
    */
    constexpr bool is_number_unsigned() const noexcept
    {
        // only the unsigned tag matches; values tagged number_integer do not
        return m_type == value_t::number_unsigned;
    }

    /*!
    @brief return whether value is a floating-point number

    This function returns true if and only if the JSON value is a
    floating-point number. This excludes signed and unsigned integer values.

    @return `true` if type is a floating-point number, `false` otherwise.

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @liveexample{The following code exemplifies `is_number_float()` for all
    JSON types.,is_number_float}

    @sa see @ref is_number() -- check if value is number
    @sa see @ref is_number_integer() -- check if value is an integer number
    @sa see @ref is_number_unsigned() -- check if value is an unsigned integer
    number

    @since version 1.0.0
    */
    constexpr bool is_number_float() const noexcept
    {
        // only the floating-point tag matches; both integer tags do not
        return m_type == value_t::number_float;
    }

    /*!
    @brief return whether value is an object

    This function returns true if and only if the JSON value is an object.

    @return `true` if type is object, `false` otherwise.

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @liveexample{The following code exemplifies `is_object()` for all JSON
    types.,is_object}

    @since version 1.0.0
    */
    constexpr bool is_object() const noexcept
    {
        // decided by the type tag alone; an empty object still counts
        return m_type == value_t::object;
    }

    /*!
    @brief return whether value is an array

    This function returns true if and only if the JSON value is an array.

    @return `true` if type is array, `false` otherwise.

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @liveexample{The following code exemplifies `is_array()` for all JSON
    types.,is_array}

    @since version 1.0.0
    */
    constexpr bool is_array() const noexcept
    {
        // decided by the type tag alone; an empty array still counts
        return m_type == value_t::array;
    }

    /*!
    @brief return whether value is a string

    This function returns true if and only if the JSON value is a string.

    @return `true` if type is string, `false` otherwise.

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @liveexample{The following code exemplifies `is_string()` for all JSON
    types.,is_string}

    @since version 1.0.0
    */
    constexpr bool is_string() const noexcept
    {
        // decided by the type tag alone; the string content is not inspected
        return m_type == value_t::string;
    }

    /*!
    @brief return whether value is a binary array

    This function returns true if and only if the JSON value is a binary array.

    @return `true` if type is binary array, `false` otherwise.

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @liveexample{The following code exemplifies `is_binary()` for all JSON
    types.,is_binary}

    @since version 3.8.0
    */
    constexpr bool is_binary() const noexcept
    {
        // decided by the type tag alone; the byte content is not inspected
        return m_type == value_t::binary;
    }

    /*!
    @brief return whether value is discarded

    This function returns true if and only if the JSON value was discarded
    during parsing with a callback function (see @ref parser_callback_t).

    @note This function will always be `false` for JSON values after parsing.
    That is, discarded values can only occur during parsing, but will be
    removed when inside a structured value or replaced by null in other cases.

    @return `true` if type is discarded, `false` otherwise.

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @liveexample{The following code exemplifies `is_discarded()` for all JSON
    types.,is_discarded}

    @since version 1.0.0
    */
    constexpr bool is_discarded() const noexcept
    {
        // `discarded` is an ordinary value of the type tag, so the test is the
        // same tag comparison used by the other is_*() predicates
        return m_type == value_t::discarded;
    }

    /*!
    @brief return the type of the JSON value (implicit)

    Implicitly return the type of the JSON value as a value from the @ref
    value_t enumeration.

    @return the type of the JSON value

    @complexity Constant.

    @exceptionsafety No-throw guarantee: this member function never throws
    exceptions.

    @liveexample{The following code exemplifies the @ref value_t operator for
    all JSON types.,operator__value_t}

    @sa see @ref type() -- return the type of the JSON value (explicit)
    @sa see @ref type_name() -- return the type as string

    @since version 1.0.0
    */
    constexpr operator value_t() const noexcept
    {
        // same result as type(); being a non-explicit conversion operator, it
        // lets a JSON value be used where a value_t is expected
        return m_type;
    }

    /// @}

  private:
    //////////////////
    // value access //
    //////////////////

    /// get a boolean (explicit)
    boolean_t get_impl(boolean_t* /*unused*/) const
    {
        // the pointer argument only selects this overload; it is never read

        // reject every value that is not tagged as boolean
        if (JSON_HEDLEY_UNLIKELY(!is_boolean()))
        {
            JSON_THROW(type_error::create(302, "type must be boolean, but is " + std::string(type_name()), *this));
        }

        return m_value.boolean;
    }

    /// get a pointer to the value (object)
    object_t* get_impl_ptr(object_t* /*unused*/) noexcept
    {
        // the unnamed pointer parameter exists only for overload selection
        if (is_object())
        {
            return m_value.object;
        }
        return nullptr;
    }

    /// get a pointer to the value (object), const overload
    constexpr const object_t* get_impl_ptr(const object_t* /*unused*/) const noexcept
    {
        return is_object() ? m_value.object : nullptr;
    }

    /// get a pointer to the value (array)
    array_t* get_impl_ptr(array_t* /*unused*/) noexcept
    {
        if (is_array())
        {
            return m_value.array;
        }
        return nullptr;
    }

    /// get a pointer to the value (array), const overload
    constexpr const array_t* get_impl_ptr(const array_t* /*unused*/) const noexcept
    {
        return is_array() ? m_value.array : nullptr;
    }

    /// get a pointer to the value (string)
    string_t* get_impl_ptr(string_t* /*unused*/) noexcept
    {
        if (is_string())
        {
            return m_value.string;
        }
        return nullptr;
    }

    /// get a pointer to the value (string), const overload
    constexpr const string_t* get_impl_ptr(const string_t* /*unused*/) const noexcept
    {
        return is_string() ? m_value.string : nullptr;
    }

    /// get a pointer to the value (boolean)
    boolean_t* get_impl_ptr(boolean_t* /*unused*/) noexcept
    {
        // scalars are stored in place, hence the address-of
        if (is_boolean())
        {
            return &m_value.boolean;
        }
        return nullptr;
    }

    /// get a pointer to the value (boolean), const overload
    constexpr const boolean_t* get_impl_ptr(const boolean_t* /*unused*/) const noexcept
    {
        return is_boolean() ? &m_value.boolean : nullptr;
    }

    /// get a pointer to the value (integer number)
    number_integer_t* get_impl_ptr(number_integer_t* /*unused*/) noexcept
    {
        // NOTE: is_number_integer() is also true for values tagged as
        // number_unsigned, so such values yield a non-null pointer to the
        // number_integer member here as well
        if (is_number_integer())
        {
            return &m_value.number_integer;
        }
        return nullptr;
    }

    /// get a pointer to the value (integer number), const overload
    constexpr const number_integer_t* get_impl_ptr(const number_integer_t* /*unused*/) const noexcept
    {
        // same note as for the non-const overload: unsigned values match, too
        return is_number_integer() ? &m_value.number_integer : nullptr;
    }

    /// get a pointer to the value (unsigned number)
    number_unsigned_t* get_impl_ptr(number_unsigned_t* /*unused*/) noexcept
    {
        if (is_number_unsigned())
        {
            return &m_value.number_unsigned;
        }
        return nullptr;
    }

    /// get a pointer to the value (unsigned number), const overload
    constexpr const number_unsigned_t* get_impl_ptr(const number_unsigned_t* /*unused*/) const noexcept
    {
        return is_number_unsigned() ? &m_value.number_unsigned : nullptr;
    }

    /// get a pointer to the value (floating-point number)
    number_float_t* get_impl_ptr(number_float_t* /*unused*/) noexcept
    {
        if (is_number_float())
        {
            return &m_value.number_float;
        }
        return nullptr;
    }

    /// get a pointer to the value (floating-point number), const overload
    constexpr const number_float_t* get_impl_ptr(const number_float_t* /*unused*/) const noexcept
    {
        return is_number_float() ? &m_value.number_float : nullptr;
    }

    /// get a pointer to the value (binary)
    binary_t* get_impl_ptr(binary_t* /*unused*/) noexcept
    {
        if (is_binary())
        {
            return m_value.binary;
        }
        return nullptr;
    }

    /// get a pointer to the value (binary), const overload
    constexpr const binary_t* get_impl_ptr(const binary_t* /*unused*/) const noexcept
    {
        return is_binary() ? m_value.binary : nullptr;
    }

    /*!
    @brief helper function to implement get_ref()

    This function helps to implement get_ref() without code duplication for
    const and non-const overloads

    @tparam ThisType will be deduced as `basic_json` or `const basic_json`

    @throw type_error.303 if ReferenceType does not match underlying value
    type of the current JSON
    */
    template<typename ReferenceType, typename ThisType>
    static ReferenceType get_ref_impl(ThisType& obj)
    {
        // get_ptr<>() does the type matching; a null result is the mismatch signal
        using pointer_type = typename std::add_pointer<ReferenceType>::type;
        auto* target = obj.template get_ptr<pointer_type>();

        if (JSON_HEDLEY_UNLIKELY(target == nullptr))
        {
            JSON_THROW(type_error::create(303, "incompatible ReferenceType for get_ref, actual type is " + std::string(obj.type_name()), obj));
        }

        return *target;
    }

  public:
    /// @name value access
    /// Direct access to the stored value of a JSON value.
    /// @{

    /*!
    @brief get special-case overload

    This overload avoids a lot of template boilerplate; it can be seen as the
    identity method

    @tparam BasicJsonType == @ref basic_json

    @return a copy of *this

    @complexity Constant.

    @since version 2.1.0
    */
    // Enabled when BasicJsonType, with a top-level const removed, is this
    // basic_json type itself.
    template<typename BasicJsonType, detail::enable_if_t<
                 std::is_same<typename std::remove_const<BasicJsonType>::type, basic_json_t>::value,
                 int> = 0>
    basic_json get() const
    {
        // returned by value, so the caller receives its own copy
        return *this;
    }

    /*!
    @brief get special-case overload

    This overload converts the current @ref basic_json into a different
    @ref basic_json type

    @tparam BasicJsonType == @ref basic_json

    @return a copy of *this, converted into @a BasicJsonType

    @complexity Depending on the implementation of the called `from_json()`
                method.

    @since version 3.2.0
    */
    // Enabled when BasicJsonType is some basic_json specialization other than
    // this one.
    template < typename BasicJsonType, detail::enable_if_t <
                   !std::is_same<BasicJsonType, basic_json>::value&&
                   detail::is_basic_json<BasicJsonType>::value, int > = 0 >
    BasicJsonType get() const
    {
        // the return statement converts *this to BasicJsonType implicitly;
        // NOTE(review): the constructor performing that conversion belongs to
        // BasicJsonType and is not visible in this part of the file
        return *this;
    }

    /*!
    @brief get a value (explicit)

    Explicit type conversion between the JSON value and a compatible value
    which is [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible)
    and [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible).
    The value is converted by calling the @ref json_serializer<ValueType>
    `from_json()` method.

    The function is equivalent to executing
    @code {.cpp}
    ValueType ret;
    JSONSerializer<ValueType>::from_json(*this, ret);
    return ret;
    @endcode

    This overload is chosen if:
    - @a ValueType is not @ref basic_json,
    - @ref json_serializer<ValueType> has a `from_json()` method of the form
      `void from_json(const basic_json&, ValueType&)`, and
    - @ref json_serializer<ValueType> does not have a `from_json()` method of
      the form `ValueType from_json(const basic_json&)`

    @tparam ValueTypeCV the provided value type
    @tparam ValueType the returned value type

    @return copy of the JSON value, converted to @a ValueType

    @throw what @ref json_serializer<ValueType> `from_json()` method throws

    @liveexample{The example below shows several conversions from JSON values
    to other types. There are a few things to note: (1) Floating-point numbers can
    be converted to integers\, (2) A JSON array can be converted to a standard
    `std::vector<short>`\, (3) A JSON object can be converted to C++
    associative containers such as `std::unordered_map<std::string\,
    json>`.,get__ValueType_const}

    @since version 2.1.0
    */
    template < typename ValueTypeCV, typename ValueType = detail::uncvref_t<ValueTypeCV>,
               detail::enable_if_t <
                   !detail::is_basic_json<ValueType>::value &&
                   detail::has_from_json<basic_json_t, ValueType>::value &&
                   !detail::has_non_default_from_json<basic_json_t, ValueType>::value,
                   int > = 0 >
    ValueType get() const noexcept(noexcept(
                                       JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>(), std::declval<ValueType&>())))
    {
        // the result object is created up front and filled in afterwards, so
        // the target type has to be default-constructible
        static_assert(std::is_default_constructible<ValueType>::value,
                      "types must be DefaultConstructible when used with get()");

        // ValueTypeCV cannot be required to be non-const here, because
        // get<const basic_json_t>() is supported; that is the reason the
        // cv/ref-stripped ValueType is used. References are rejected, though.
        static_assert(!std::is_reference<ValueTypeCV>::value,
                      "get() cannot be used with reference types, you might want to use get_ref()");

        ValueType converted{};
        JSONSerializer<ValueType>::from_json(*this, converted);
        return converted;
    }

    /*!
    @brief get a value (explicit); special case

    Explicit type conversion between the JSON value and a compatible value
    which is **not** [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible)
    and **not** [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible).
    The value is converted by calling the @ref json_serializer<ValueType>
    `from_json()` method.

    The function is equivalent to executing
    @code {.cpp}
    return JSONSerializer<ValueType>::from_json(*this);
    @endcode

    This overload is chosen if:
    - @a ValueType is not @ref basic_json and
    - @ref json_serializer<ValueType> has a `from_json()` method of the form
      `ValueType from_json(const basic_json&)`

    @note If @ref json_serializer<ValueType> has both overloads of
    `from_json()`, this one is chosen.

    @tparam ValueTypeCV the provided value type
    @tparam ValueType the returned value type

    @return copy of the JSON value, converted to @a ValueType

    @throw what @ref json_serializer<ValueType> `from_json()` method throws

    @since version 2.1.0
    */
    // Enabled when ValueType (ValueTypeCV without cv/ref qualifiers) is not
    // this basic_json type and detail::has_non_default_from_json holds for it.
    // noexcept exactly when the called from_json() overload is noexcept.
    template < typename ValueTypeCV, typename ValueType = detail::uncvref_t<ValueTypeCV>,
               detail::enable_if_t < !std::is_same<basic_json_t, ValueType>::value &&
                                     detail::has_non_default_from_json<basic_json_t, ValueType>::value,
                                     int > = 0 >
    ValueType get() const noexcept(noexcept(
                                       JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>())))
    {
        // reference types are rejected at compile time
        static_assert(!std::is_reference<ValueTypeCV>::value,
                      "get() cannot be used with reference types, you might want to use get_ref()");
        // the serializer builds and returns the value itself, so no
        // default-constructed ValueType object is created in this function
        return JSONSerializer<ValueType>::from_json(*this);
    }

    /*!
    @brief get a value (explicit)

    Explicit type conversion between the JSON value and a compatible value.
    The value is filled into the input parameter by calling the @ref json_serializer<ValueType>
    `from_json()` method.

    The function is equivalent to executing
    @code {.cpp}
    ValueType v;
    JSONSerializer<ValueType>::from_json(*this, v);
    @endcode

    This overload is chosen if:
    - @a ValueType is not @ref basic_json, and
    - @ref json_serializer<ValueType> has a `from_json()` method of the form
      `void from_json(const basic_json&, ValueType&)`.

    @tparam ValueType the input parameter type.

    @return the input parameter, allowing chaining calls.

    @throw what @ref json_serializer<ValueType> `from_json()` method throws

    @liveexample{The example below shows several conversions from JSON values
    to other types. There are a few things to note: (1) Floating-point numbers can
    be converted to integers\, (2) A JSON array can be converted to a standard
    `std::vector<short>`\, (3) A JSON object can be converted to C++
    associative containers such as `std::unordered_map<std::string\,
    json>`.,get_to}

    @since version 3.3.0
    */
    // Fills the caller-provided object @a v from this JSON value and returns
    // a reference to it. Enabled for every non-basic_json type whose serializer
    // has the two-argument `from_json(const basic_json&, ValueType&)` form; the
    // noexcept specification mirrors that of the serializer call.
    template < typename ValueType,
               detail::enable_if_t <
                   !detail::is_basic_json<ValueType>::value&&
                   detail::has_from_json<basic_json_t, ValueType>::value,
                   int > = 0 >
    ValueType & get_to(ValueType& v) const noexcept(noexcept(
                JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>(), v)))
    {
        // the serializer writes the converted value into v
        JSONSerializer<ValueType>::from_json(*this, v);
        // hand the same object back so that calls can be chained
        return v;
    }

    // specialization to allow to call get_to with a basic_json value
    // see https://github.com/nlohmann/json/issues/2175
    //
    // When the target itself is a basic_json type, no serializer is involved:
    // this value is simply assigned to @a v, and @a v is returned.
    template<typename ValueType,
             detail::enable_if_t <
                 detail::is_basic_json<ValueType>::value,
                 int> = 0>
    ValueType & get_to(ValueType& v) const
    {
        // plain assignment of this JSON value to the target
        v = *this;
        return v;
    }

    // get_to() overload for built-in arrays: fills the array @a v of N
    // elements of type T from this JSON value and returns a reference to it.
    // `Array` names the reference-to-array type `T (&)[N]`; the overload is
    // enabled only if the serializer provides a from_json() for that type.
    template <
        typename T, std::size_t N,
        typename Array = T (&)[N], // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
        detail::enable_if_t <
            detail::has_from_json<basic_json_t, Array>::value, int > = 0 >
    Array get_to(T (&v)[N]) const // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
    noexcept(noexcept(JSONSerializer<Array>::from_json(
                          std::declval<const basic_json_t&>(), v)))
    {
        // the serializer writes the converted elements into v
        JSONSerializer<Array>::from_json(*this, v);
        // return the same array so that calls can be chained
        return v;
    }


    /*!
    @brief get a pointer value (implicit)

    Implicit pointer access to the internally stored JSON value. No copies are
    made.

    @warning Writing data to the pointee of the result yields an undefined
    state.

    @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref
    object_t, @ref string_t, @ref boolean_t, @ref number_integer_t,
    @ref number_unsigned_t, or @ref number_float_t. Enforced by a static
    assertion.

    @return pointer to the internally stored JSON value if the requested
    pointer type @a PointerType fits to the JSON value; `nullptr` otherwise

    @complexity Constant.

    @liveexample{The example below shows how pointers to internal values of a
    JSON value can be requested. Note that no type conversions are made and a
    `nullptr` is returned if the value and the requested pointer type do not
    match.,get_ptr}

    @since version 1.0.0
    */
    // Non-const pointer access; enabled for any pointer type. The return type
    // is whatever get_impl_ptr() yields for an argument of type PointerType.
    template<typename PointerType, typename std::enable_if<
                 std::is_pointer<PointerType>::value, int>::type = 0>
    auto get_ptr() noexcept -> decltype(std::declval<basic_json_t&>().get_impl_ptr(std::declval<PointerType>()))
    {
        // delegate the call to get_impl_ptr<>()
        // (the typed null pointer is presumably used only to pick the matching
        // get_impl_ptr overload -- its value is not meaningful)
        return get_impl_ptr(static_cast<PointerType>(nullptr));
    }

    /*!
    @brief get a pointer value (implicit)
    @copydoc get_ptr()
    */
    // Const pointer access; enabled only for pointer-to-const types, so a
    // const JSON value never hands out a pointer to mutable data.
    template < typename PointerType, typename std::enable_if <
                   std::is_pointer<PointerType>::value&&
                   std::is_const<typename std::remove_pointer<PointerType>::type>::value, int >::type = 0 >
    constexpr auto get_ptr() const noexcept -> decltype(std::declval<const basic_json_t&>().get_impl_ptr(std::declval<PointerType>()))
    {
        // delegate the call to get_impl_ptr<>() const
        // (the typed null pointer is presumably used only to pick the matching
        // get_impl_ptr overload -- its value is not meaningful)
        return get_impl_ptr(static_cast<PointerType>(nullptr));
    }

    /*!
    @brief get a pointer value (explicit)

    Explicit pointer access to the internally stored JSON value. No copies are
    made.

    @warning The pointer becomes invalid if the underlying JSON object
    changes.

    @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref
    object_t, @ref string_t, @ref boolean_t, @ref number_integer_t,
    @ref number_unsigned_t, or @ref number_float_t.

    @return pointer to the internally stored JSON value if the requested
    pointer type @a PointerType fits to the JSON value; `nullptr` otherwise

    @complexity Constant.

    @liveexample{The example below shows how pointers to internal values of a
    JSON value can be requested. Note that no type conversions are made and a
    `nullptr` is returned if the value and the requested pointer type do not
    match.,get__PointerType}

    @sa see @ref get_ptr() for explicit pointer-member access

    @since version 1.0.0
    */
    // get<T*>() on a non-const value: a thin alias for get_ptr<T*>(), enabled
    // for any pointer type and returning exactly what get_ptr returns.
    template<typename PointerType, typename std::enable_if<
                 std::is_pointer<PointerType>::value, int>::type = 0>
    auto get() noexcept -> decltype(std::declval<basic_json_t&>().template get_ptr<PointerType>())
    {
        // delegate the call to get_ptr
        return get_ptr<PointerType>();
    }

    /*!
    @brief get a pointer value (explicit)
    @copydoc get()
    */
    // get<T*>() on a const value: a thin alias for the const get_ptr<T*>().
    // The trailing return type is only well-formed when the const get_ptr
    // overload accepts PointerType.
    template<typename PointerType, typename std::enable_if<
                 std::is_pointer<PointerType>::value, int>::type = 0>
    constexpr auto get() const noexcept -> decltype(std::declval<const basic_json_t&>().template get_ptr<PointerType>())
    {
        // delegate the call to get_ptr
        return get_ptr<PointerType>();
    }

    /*!
    @brief get a reference value (implicit)

    Implicit reference access to the internally stored JSON value. No copies
    are made.

    @warning Writing data to the referee of the result yields an undefined
    state.

    @tparam ReferenceType reference type; must be a reference to @ref array_t,
    @ref object_t, @ref string_t, @ref boolean_t, @ref number_integer_t, or
    @ref number_float_t. Enforced by static assertion.

    @return reference to the internally stored JSON value if the requested
    reference type @a ReferenceType fits to the JSON value; throws
    type_error.303 otherwise

    @throw type_error.303 in case passed type @a ReferenceType is incompatible
    with the stored JSON value; see example below

    @complexity Constant.

    @liveexample{The example shows several calls to `get_ref()`.,get_ref}

    @since version 1.1.0
    */
    // Non-const reference access; enabled for any reference type. The actual
    // type check and the error reporting happen in get_ref_impl().
    template<typename ReferenceType, typename std::enable_if<
                 std::is_reference<ReferenceType>::value, int>::type = 0>
    ReferenceType get_ref()
    {
        // delegate call to get_ref_impl
        return get_ref_impl<ReferenceType>(*this);
    }

    /*!
    @brief get a reference value (implicit)
    @copydoc get_ref()
    */
    // Const reference access; enabled only for references to const types, so
    // a const JSON value never hands out a mutable reference.
    template < typename ReferenceType, typename std::enable_if <
                   std::is_reference<ReferenceType>::value&&
                   std::is_const<typename std::remove_reference<ReferenceType>::type>::value, int >::type = 0 >
    ReferenceType get_ref() const
    {
        // delegate call to get_ref_impl
        return get_ref_impl<ReferenceType>(*this);
    }

    /*!
    @brief get a value (implicit)

    Implicit type conversion between the JSON value and a compatible value.
    The call is realized by calling @ref get() const.

    @tparam ValueType non-pointer type compatible to the JSON value, for
    instance `int` for JSON integer numbers, `bool` for JSON booleans, or
    `std::vector` types for JSON arrays. The character type of @ref string_t
    as well as an initializer list of this type is excluded to avoid
    ambiguities as these types implicitly convert to `std::string`.

    @return copy of the JSON value, converted to type @a ValueType

    @throw type_error.302 in case passed type @a ValueType is incompatible
    to the JSON value type (e.g., the JSON value is of type boolean, but a
    string is requested); see example below

    @complexity Linear in the size of the JSON value.

    @liveexample{The example below shows several conversions from JSON values
    to other types. There are a few things to note: (1) Floating-point numbers can
    be converted to integers\, (2) A JSON array can be converted to a standard
    `std::vector<short>`\, (3) A JSON object can be converted to C++
    associative containers such as `std::unordered_map<std::string\,
    json>`.,operator__ValueType}

    @since version 1.0.0
    */
    // Conversion operator to ValueType, realized through get<ValueType>().
    // The enable_if condition removes the operator for pointers, json_ref,
    // the string character type, basic_json types, initializer lists of the
    // character type and (on the listed C++17 compilers) std::string_view; in
    // addition, get<ValueType>() must be detected as callable on a const
    // basic_json. Whether the operator is `explicit` is controlled by the
    // JSON_EXPLICIT macro.
    template < typename ValueType, typename std::enable_if <
                   !std::is_pointer<ValueType>::value&&
                   !std::is_same<ValueType, detail::json_ref<basic_json>>::value&&
                   !std::is_same<ValueType, typename string_t::value_type>::value&&
                   !detail::is_basic_json<ValueType>::value
                   && !std::is_same<ValueType, std::initializer_list<typename string_t::value_type>>::value
#if defined(JSON_HAS_CPP_17) && (defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER >= 1910 && _MSC_VER <= 1914))
                   && !std::is_same<ValueType, typename std::string_view>::value
#endif
                   && detail::is_detected<detail::get_template_function, const basic_json_t&, ValueType>::value
                   , int >::type = 0 >
    JSON_EXPLICIT operator ValueType() const
    {
        // delegate the call to get<>() const
        return get<ValueType>();
    }

    /*!
    @return reference to the binary value

    @throw type_error.302 if the value is not binary

    @sa see @ref is_binary() to check if the value is binary

    @since version 3.8.0
    */
    binary_t& get_binary()
    {
        // a binary value hands out its stored payload directly
        if (is_binary())
        {
            binary_t* const payload = get_ptr<binary_t*>();
            return *payload;
        }

        // every other value type is reported as a type error
        JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(type_name()), *this));
    }

    /// @copydoc get_binary()
    const binary_t& get_binary() const
    {
        // a binary value hands out its stored payload directly
        if (is_binary())
        {
            const binary_t* const payload = get_ptr<const binary_t*>();
            return *payload;
        }

        // every other value type is reported as a type error
        JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(type_name()), *this));
    }

    /// @}


    ////////////////////
    // element access //
    ////////////////////

    /// @name element access
    /// Access to the JSON value.
    /// @{

    /*!
    @brief access specified array element with bounds checking

    Returns a reference to the element at specified location @a idx, with
    bounds checking.

    @param[in] idx  index of the element to access

    @return reference to the element at index @a idx

    @throw type_error.304 if the JSON value is not an array; in this case,
    calling `at` with an index makes no sense. See example below.
    @throw out_of_range.401 if the index @a idx is out of range of the array;
    that is, `idx >= size()`. See example below.

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes in the JSON value.

    @complexity Constant.

    @since version 1.0.0

    @liveexample{The example below shows how array elements can be read and
    written using `at()`. It also demonstrates the different exceptions that
    can be thrown.,at__size_type}
    */
    reference at(size_type idx)
    {
        // indexed at() is only meaningful for arrays; reject everything else
        if (JSON_HEDLEY_UNLIKELY(!is_array()))
        {
            JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name()), *this));
        }

        JSON_TRY
        {
            // the container's at() signals a bad index via std::out_of_range
            return set_parent(m_value.array->at(idx));
        }
        JSON_CATCH (std::out_of_range&)
        {
            // translate into the library's exception with a clearer message
            JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range", *this));
        }
    }

    /*!
    @brief access specified array element with bounds checking

    Returns a const reference to the element at specified location @a idx,
    with bounds checking.

    @param[in] idx  index of the element to access

    @return const reference to the element at index @a idx

    @throw type_error.304 if the JSON value is not an array; in this case,
    calling `at` with an index makes no sense. See example below.
    @throw out_of_range.401 if the index @a idx is out of range of the array;
    that is, `idx >= size()`. See example below.

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes in the JSON value.

    @complexity Constant.

    @since version 1.0.0

    @liveexample{The example below shows how array elements can be read using
    `at()`. It also demonstrates the different exceptions that can be thrown.,
    at__size_type_const}
    */
    const_reference at(size_type idx) const
    {
        // indexed at() is only meaningful for arrays; reject everything else
        if (JSON_HEDLEY_UNLIKELY(!is_array()))
        {
            JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name()), *this));
        }

        JSON_TRY
        {
            // the container's at() signals a bad index via std::out_of_range
            return m_value.array->at(idx);
        }
        JSON_CATCH (std::out_of_range&)
        {
            // translate into the library's exception with a clearer message
            JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range", *this));
        }
    }

    /*!
    @brief access specified object element with bounds checking

    Returns a reference to the element with specified key @a key, with
    bounds checking.

    @param[in] key  key of the element to access

    @return reference to the element at key @a key

    @throw type_error.304 if the JSON value is not an object; in this case,
    calling `at` with a key makes no sense. See example below.
    @throw out_of_range.403 if the key @a key is not stored in the object;
    that is, `find(key) == end()`. See example below.

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes in the JSON value.

    @complexity Logarithmic in the size of the container.

    @sa see @ref operator[](const typename object_t::key_type&) for unchecked
    access by reference
    @sa see @ref value() for access by value with a default value

    @since version 1.0.0

    @liveexample{The example below shows how object elements can be read and
    written using `at()`. It also demonstrates the different exceptions that
    can be thrown.,at__object_t_key_type}
    */
    reference at(const typename object_t::key_type& key)
    {
        // keyed at() is only meaningful for objects; reject everything else
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name()), *this));
        }

        JSON_TRY
        {
            // the container's at() signals a missing key via std::out_of_range
            return set_parent(m_value.object->at(key));
        }
        JSON_CATCH (std::out_of_range&)
        {
            // translate into the library's exception with a clearer message
            JSON_THROW(out_of_range::create(403, "key '" + key + "' not found", *this));
        }
    }

    /*!
    @brief access specified object element with bounds checking

    Returns a const reference to the element with specified key @a key,
    with bounds checking.

    @param[in] key  key of the element to access

    @return const reference to the element at key @a key

    @throw type_error.304 if the JSON value is not an object; in this case,
    calling `at` with a key makes no sense. See example below.
    @throw out_of_range.403 if the key @a key is not stored in the object;
    that is, `find(key) == end()`. See example below.

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes in the JSON value.

    @complexity Logarithmic in the size of the container.

    @sa see @ref operator[](const typename object_t::key_type&) for unchecked
    access by reference
    @sa see @ref value() for access by value with a default value

    @since version 1.0.0

    @liveexample{The example below shows how object elements can be read using
    `at()`. It also demonstrates the different exceptions that can be thrown.,
    at__object_t_key_type_const}
    */
    const_reference at(const typename object_t::key_type& key) const
    {
        // keyed at() is only meaningful for objects; reject everything else
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name()), *this));
        }

        JSON_TRY
        {
            // the container's at() signals a missing key via std::out_of_range
            return m_value.object->at(key);
        }
        JSON_CATCH (std::out_of_range&)
        {
            // translate into the library's exception with a clearer message
            JSON_THROW(out_of_range::create(403, "key '" + key + "' not found", *this));
        }
    }

    /*!
    @brief access specified array element

    Returns a reference to the element at specified location @a idx.

    @note If @a idx is beyond the range of the array (i.e., `idx >= size()`),
    then the array is silently filled up with `null` values to make `idx` a
    valid reference to the last stored element.

    @param[in] idx  index of the element to access

    @return reference to the element at index @a idx

    @throw type_error.305 if the JSON value is not an array or null; in that
    case, using the [] operator with an index makes no sense.

    @complexity Constant if @a idx is in the range of the array. Otherwise
    linear in `idx - size()`.

    @liveexample{The example below shows how array elements can be read and
    written using `[]` operator. Note the addition of `null`
    values.,operatorarray__size_type}

    @since version 1.0.0
    */
    reference operator[](size_type idx)
    {
        // a null value silently becomes an empty array first
        if (is_null())
        {
            m_type = value_t::array;
            m_value.array = create<array_t>();
            assert_invariant();
        }

        // whatever is still not an array cannot be indexed numerically
        if (JSON_HEDLEY_UNLIKELY(!is_array()))
        {
            JSON_THROW(type_error::create(305, "cannot use operator[] with a numeric argument with " + std::string(type_name()), *this));
        }

        auto& elements = *m_value.array;

        // grow the array so that idx refers to an existing element
        if (idx >= elements.size())
        {
#if JSON_DIAGNOSTICS
            // element count before growing; needed to locate the new elements
            const auto old_count = elements.size();
#endif
            elements.resize(idx + 1);

#if JSON_DIAGNOSTICS
            // register the parent for every element added by the resize
            set_parents(begin() + static_cast<typename iterator::difference_type>(old_count), static_cast<typename iterator::difference_type>(idx + 1 - old_count));
#endif
        }

        return elements[idx];
    }

    /*!
    @brief access specified array element

    Returns a const reference to the element at specified location @a idx.

    @param[in] idx  index of the element to access

    @return const reference to the element at index @a idx

    @throw type_error.305 if the JSON value is not an array; in that case,
    using the [] operator with an index makes no sense.

    @complexity Constant.

    @liveexample{The example below shows how array elements can be read using
    the `[]` operator.,operatorarray__size_type_const}

    @since version 1.0.0
    */
    const_reference operator[](size_type idx) const
    {
        // numeric indexing of a const value requires an array
        if (JSON_HEDLEY_UNLIKELY(!is_array()))
        {
            JSON_THROW(type_error::create(305, "cannot use operator[] with a numeric argument with " + std::string(type_name()), *this));
        }

        // no bounds check is made here; idx is passed straight to the container
        return (*m_value.array)[idx];
    }

    /*!
    @brief access specified object element

    Returns a reference to the element with specified key @a key.

    @note If @a key is not found in the object, then it is silently added to
    the object and filled with a `null` value to make `key` a valid reference.
    In case the value was `null` before, it is converted to an object.

    @param[in] key  key of the element to access

    @return reference to the element at key @a key

    @throw type_error.305 if the JSON value is not an object or null; in that
    case, using the [] operator with a key makes no sense.

    @complexity Logarithmic in the size of the container.

    @liveexample{The example below shows how object elements can be read and
    written using the `[]` operator.,operatorarray__key_type}

    @sa see @ref at(const typename object_t::key_type&) for access by reference
    with range checking
    @sa see @ref value() for access by value with a default value

    @since version 1.0.0
    */
    reference operator[](const typename object_t::key_type& key)
    {
        // a null value silently becomes an empty object first
        if (is_null())
        {
            m_type = value_t::object;
            m_value.object = create<object_t>();
            assert_invariant();
        }

        // whatever is still not an object cannot be indexed by key
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name()), *this));
        }

        // the container's operator[] supplies the element for key
        return set_parent((*m_value.object)[key]);
    }

    /*!
    @brief read-only access specified object element

    Returns a const reference to the element with specified key @a key. No
    bounds checking is performed.

    @warning If the element with key @a key does not exist, the behavior is
    undefined.

    @param[in] key  key of the element to access

    @return const reference to the element at key @a key

    @pre The element with key @a key must exist. **This precondition is
         enforced with an assertion.**

    @throw type_error.305 if the JSON value is not an object; in that case,
    using the [] operator with a key makes no sense.

    @complexity Logarithmic in the size of the container.

    @liveexample{The example below shows how object elements can be read using
    the `[]` operator.,operatorarray__key_type_const}

    @sa see @ref at(const typename object_t::key_type&) for access by reference
    with range checking
    @sa see @ref value() for access by value with a default value

    @since version 1.0.0
    */
    const_reference operator[](const typename object_t::key_type& key) const
    {
        // key lookup on a const value requires an object
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name()), *this));
        }

        // the key must already exist: a const value cannot insert it
        const auto entry = m_value.object->find(key);
        JSON_ASSERT(entry != m_value.object->end());
        return entry->second;
    }

    /*!
    @brief access specified object element

    Returns a reference to the element with specified key @a key.

    @note If @a key is not found in the object, then it is silently added to
    the object and filled with a `null` value to make `key` a valid reference.
    In case the value was `null` before, it is converted to an object.

    @param[in] key  key of the element to access

    @return reference to the element at key @a key

    @throw type_error.305 if the JSON value is not an object or null; in that
    case, using the [] operator with a key makes no sense.

    @complexity Logarithmic in the size of the container.

    @liveexample{The example below shows how object elements can be read and
    written using the `[]` operator.,operatorarray__key_type}

    @sa see @ref at(const typename object_t::key_type&) for access by reference
    with range checking
    @sa see @ref value() for access by value with a default value

    @since version 1.1.0
    */
    template<typename T>
    JSON_HEDLEY_NON_NULL(2)
    reference operator[](T* key)
    {
        // a null value silently becomes an object first
        if (is_null())
        {
            m_type = value_t::object;
            m_value = value_t::object;
            assert_invariant();
        }

        // whatever is still not an object cannot be indexed by key
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name()), *this));
        }

        // the container's operator[] supplies the element for key
        return set_parent((*m_value.object)[key]);
    }

    /*!
    @brief read-only access specified object element

    Returns a const reference to the element with specified key @a key. No
    bounds checking is performed.

    @warning If the element with key @a key does not exist, the behavior is
    undefined.

    @param[in] key  key of the element to access

    @return const reference to the element at key @a key

    @pre The element with key @a key must exist. **This precondition is
         enforced with an assertion.**

    @throw type_error.305 if the JSON value is not an object; in that case,
    using the [] operator with a key makes no sense.

    @complexity Logarithmic in the size of the container.

    @liveexample{The example below shows how object elements can be read using
    the `[]` operator.,operatorarray__key_type_const}

    @sa see @ref at(const typename object_t::key_type&) for access by reference
    with range checking
    @sa see @ref value() for access by value with a default value

    @since version 1.1.0
    */
    template<typename T>
    JSON_HEDLEY_NON_NULL(2)
    const_reference operator[](T* key) const
    {
        // key lookup on a const value requires an object
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name()), *this));
        }

        // the key must already exist: a const value cannot insert it
        const auto entry = m_value.object->find(key);
        JSON_ASSERT(entry != m_value.object->end());
        return entry->second;
    }

    /*!
    @brief access specified object element with default value

    Returns either a copy of an object's element at the specified key @a key
    or a given default value if no element with key @a key exists.

    The function is basically equivalent to executing
    @code {.cpp}
    try {
        return at(key);
    } catch(out_of_range) {
        return default_value;
    }
    @endcode

    @note Unlike @ref at(const typename object_t::key_type&), this function
    does not throw if the given key @a key was not found.

    @note Unlike @ref operator[](const typename object_t::key_type& key), this
    function does not implicitly add an element to the position defined by @a
    key. This function is furthermore also applicable to const objects.

    @param[in] key  key of the element to access
    @param[in] default_value  the value to return if @a key is not found

    @tparam ValueType type compatible to JSON values, for instance `int` for
    JSON integer numbers, `bool` for JSON booleans, or `std::vector` types for
    JSON arrays. Note the type of the expected value at @a key and the default
    value @a default_value must be compatible.

    @return copy of the element at key @a key or @a default_value if @a key
    is not found

    @throw type_error.302 if @a default_value does not match the type of the
    value at @a key
    @throw type_error.306 if the JSON value is not an object; in that case,
    using `value()` with a key makes no sense.

    @complexity Logarithmic in the size of the container.

    @liveexample{The example below shows how object elements can be queried
    with a default value.,basic_json__value}

    @sa see @ref at(const typename object_t::key_type&) for access by reference
    with range checking
    @sa see @ref operator[](const typename object_t::key_type&) for unchecked
    access by reference

    @since version 1.0.0
    */
    // using std::is_convertible in a std::enable_if will fail when using explicit conversions
    template < class ValueType, typename std::enable_if <
                   detail::is_getable<basic_json_t, ValueType>::value
                   && !std::is_same<value_t, ValueType>::value, int >::type = 0 >
    ValueType value(const typename object_t::key_type& key, const ValueType& default_value) const
    {
        // value() with a key is only defined for objects
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(306, "cannot use value() with " + std::string(type_name()), *this));
        }

        const auto match = find(key);

        // missing key: fall back to a copy of the caller's default
        if (match == end())
        {
            return default_value;
        }

        // key present: convert the stored element to the requested type
        return match->template get<ValueType>();
    }

    /*!
    @brief overload for a default value of type const char*
    @copydoc basic_json::value(const typename object_t::key_type&, const ValueType&) const
    */
    string_t value(const typename object_t::key_type& key, const char* default_value) const
    {
        // wrap the C string in a string_t so that the generic overload is
        // instantiated with ValueType = string_t
        const string_t fallback(default_value);
        return value(key, fallback);
    }

    /*!
    @brief access specified object element via JSON Pointer with default value

    Returns either a copy of an object's element at the specified key @a key
    or a given default value if no element with key @a key exists.

    The function is basically equivalent to executing
    @code {.cpp}
    try {
        return at(ptr);
    } catch(out_of_range) {
        return default_value;
    }
    @endcode

    @note Unlike @ref at(const json_pointer&), this function does not throw
    if the given key @a key was not found.

    @param[in] ptr  a JSON pointer to the element to access
    @param[in] default_value  the value to return if @a ptr found no value

    @tparam ValueType type compatible to JSON values, for instance `int` for
    JSON integer numbers, `bool` for JSON booleans, or `std::vector` types for
    JSON arrays. Note the type of the expected value at @a key and the default
    value @a default_value must be compatible.

    @return copy of the element at key @a key or @a default_value if @a key
    is not found

    @throw type_error.302 if @a default_value does not match the type of the
    value at @a ptr
    @throw type_error.306 if the JSON value is not an object; in that case,
    using `value()` with a key makes no sense.

    @complexity Logarithmic in the size of the container.

    @liveexample{The example below shows how object elements can be queried
    with a default value.,basic_json__value_ptr}

    @sa see @ref operator[](const json_pointer&) for unchecked access by reference

    @since version 2.0.2
    */
    // value() overload addressed by JSON pointer: returns the element @a ptr
    // resolves to, converted to ValueType, or a copy of @a default_value if
    // resolving the pointer raises out_of_range. Throws type_error 306 if this
    // value is not an object.
    template<class ValueType, typename std::enable_if<
                 detail::is_getable<basic_json_t, ValueType>::value, int>::type = 0>
    ValueType value(const json_pointer& ptr, const ValueType& default_value) const
    {
        // value() with a JSON pointer is only defined for objects
        if (JSON_HEDLEY_LIKELY(is_object()))
        {
            // if pointer resolves a value, return it or use default value
            JSON_TRY
            {
                // checked resolution; a failed lookup surfaces as out_of_range
                return ptr.get_checked(this).template get<ValueType>();
            }
            JSON_INTERNAL_CATCH (out_of_range&)
            {
                // only out_of_range is mapped to the default; other exceptions
                // (e.g. from the conversion) are not caught here
                return default_value;
            }
        }

        // reached only when this value is not an object
        JSON_THROW(type_error::create(306, "cannot use value() with " + std::string(type_name()), *this));
    }

    /*!
    @brief overload for a default value of type const char*
    @copydoc basic_json::value(const json_pointer&, ValueType) const
    */
    JSON_HEDLEY_NON_NULL(3)
    string_t value(const json_pointer& ptr, const char* default_value) const
    {
        // materialise the C string once, then defer to the generic pointer-based overload
        const string_t fallback(default_value);
        return value(ptr, fallback);
    }

    /*!
    @brief access the first element

    Returns a reference to the first element in the container. For a JSON
    container `c`, the expression `c.front()` is equivalent to `*c.begin()`.

    @return In case of a structured type (array or object), a reference to the
    first element is returned. In case of number, string, boolean, or binary
    values, a reference to the value is returned.

    @complexity Constant.

    @pre The JSON value must not be `null` (would throw `invalid_iterator.214`)
    or an empty array or object (undefined behavior, **guarded by
    assertions**).
    @post The JSON value remains unchanged.

    @throw invalid_iterator.214 when called on `null` value

    @liveexample{The following code shows an example for `front()`.,front}

    @sa see @ref back() -- access the last element

    @since version 1.0.0
    */
    reference front()
    {
        // the first element is whatever begin() designates
        iterator head = begin();
        return *head;
    }

    /*!
    @copydoc basic_json::front()
    */
    const_reference front() const
    {
        // the first element is whatever cbegin() designates
        const_iterator head = cbegin();
        return *head;
    }

    /*!
    @brief access the last element

    Returns a reference to the last element in the container. For a JSON
    container `c`, the expression `c.back()` is equivalent to
    @code {.cpp}
    auto tmp = c.end();
    --tmp;
    return *tmp;
    @endcode

    @return In case of a structured type (array or object), a reference to the
    last element is returned. In case of number, string, boolean, or binary
    values, a reference to the value is returned.

    @complexity Constant.

    @pre The JSON value must not be `null` (would throw `invalid_iterator.214`)
    or an empty array or object (undefined behavior, **guarded by
    assertions**).
    @post The JSON value remains unchanged.

    @throw invalid_iterator.214 when called on a `null` value. See example
    below.

    @liveexample{The following code shows an example for `back()`.,back}

    @sa see @ref front() -- access the first element

    @since version 1.0.0
    */
    reference back()
    {
        // step back once from the past-the-end position to reach the last element
        iterator cursor = end();
        --cursor;
        return *cursor;
    }

    /*!
    @copydoc basic_json::back()
    */
    const_reference back() const
    {
        // step back once from the past-the-end position to reach the last element
        const_iterator cursor = cend();
        --cursor;
        return *cursor;
    }

    /*!
    @brief remove element given an iterator

    Removes the element specified by iterator @a pos. The iterator @a pos must
    be valid and dereferenceable. Thus the `end()` iterator (which is valid,
    but is not dereferenceable) cannot be used as a value for @a pos.

    If called on a primitive type other than `null`, the resulting JSON value
    will be `null`.

    @param[in] pos iterator to the element to remove
    @return Iterator following the last removed element. If the iterator @a
    pos refers to the last element, the `end()` iterator is returned.

    @tparam IteratorType an @ref iterator or @ref const_iterator

    @post Invalidates iterators and references at or after the point of the
    erase, including the `end()` iterator.

    @throw type_error.307 if called on a `null` value; example: `"cannot use
    erase() with null"`
    @throw invalid_iterator.202 if called on an iterator which does not belong
    to the current JSON value; example: `"iterator does not fit current
    value"`
    @throw invalid_iterator.205 if called on a primitive type with invalid
    iterator (i.e., any iterator which is not `begin()`); example: `"iterator
    out of range"`

    @complexity The complexity depends on the type:
    - objects: amortized constant
    - arrays: linear in distance between @a pos and the end of the container
    - strings and binary: linear in the length of the member
    - other types: constant

    @liveexample{The example shows the result of `erase()` for different JSON
    types.,erase__IteratorType}

    @sa see @ref erase(IteratorType, IteratorType) -- removes the elements in
    the given range
    @sa see @ref erase(const typename object_t::key_type&) -- removes the element
    from an object at the given key
    @sa see @ref erase(const size_type) -- removes the element from an array at
    the given index

    @since version 1.0.0
    */
    template < class IteratorType, typename std::enable_if <
                   std::is_same<IteratorType, typename basic_json_t::iterator>::value ||
                   std::is_same<IteratorType, typename basic_json_t::const_iterator>::value, int >::type
               = 0 >
    IteratorType erase(IteratorType pos)
    {
        // make sure iterator fits the current value
        if (JSON_HEDLEY_UNLIKELY(this != pos.m_object))
        {
            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", *this));
        }

        IteratorType result = end();

        switch (m_type)
        {
            case value_t::boolean:
            case value_t::number_float:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::string:
            case value_t::binary:
            {
                if (JSON_HEDLEY_UNLIKELY(!pos.m_it.primitive_iterator.is_begin()))
                {
                    JSON_THROW(invalid_iterator::create(205, "iterator out of range", *this));
                }

                if (is_string())
                {
                    AllocatorType<string_t> alloc;
                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.string);
                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.string, 1);
                    m_value.string = nullptr;
                }
                else if (is_binary())
                {
                    AllocatorType<binary_t> alloc;
                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.binary);
                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.binary, 1);
                    m_value.binary = nullptr;
                }

                m_type = value_t::null;
                assert_invariant();
                break;
            }

            case value_t::object:
            {
                result.m_it.object_iterator = m_value.object->erase(pos.m_it.object_iterator);
                break;
            }

            case value_t::array:
            {
                result.m_it.array_iterator = m_value.array->erase(pos.m_it.array_iterator);
                break;
            }

            default:
                JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name()), *this));
        }

        return result;
    }

    /*!
    @brief remove elements given an iterator range

    Removes the element specified by the range `[first; last)`. The iterator
    @a first does not need to be dereferenceable if `first == last`: erasing
    an empty range is a no-op.

    If called on a primitive type other than `null`, the resulting JSON value
    will be `null`.

    @param[in] first iterator to the beginning of the range to remove
    @param[in] last iterator past the end of the range to remove
    @return Iterator following the last removed element. If the iterator @a
    last refers to the `end()` iterator, the `end()` iterator is returned.

    @tparam IteratorType an @ref iterator or @ref const_iterator

    @post Invalidates iterators and references at or after the point of the
    erase, including the `end()` iterator.

    @throw type_error.307 if called on a `null` value; example: `"cannot use
    erase() with null"`
    @throw invalid_iterator.203 if called on iterators which do not belong
    to the current JSON value; example: `"iterators do not fit current value"`
    @throw invalid_iterator.204 if called on a primitive type with invalid
    iterators (i.e., if `first != begin()` or `last != end()`); example:
    `"iterators out of range"`

    @complexity The complexity depends on the type:
    - objects: `log(size()) + std::distance(first, last)`
    - arrays: linear in the distance between @a first and @a last, plus linear
      in the distance between @a last and end of the container
    - strings and binary: linear in the length of the member
    - other types: constant

    @liveexample{The example shows the result of `erase()` for different JSON
    types.,erase__IteratorType_IteratorType}

    @sa see @ref erase(IteratorType) -- removes the element at a given position
    @sa see @ref erase(const typename object_t::key_type&) -- removes the element
    from an object at the given key
    @sa see @ref erase(const size_type) -- removes the element from an array at
    the given index

    @since version 1.0.0
    */
    template < class IteratorType, typename std::enable_if <
                   std::is_same<IteratorType, typename basic_json_t::iterator>::value ||
                   std::is_same<IteratorType, typename basic_json_t::const_iterator>::value, int >::type
               = 0 >
    IteratorType erase(IteratorType first, IteratorType last)
    {
        // both iterators must have been obtained from this very JSON value
        if (JSON_HEDLEY_UNLIKELY(this != first.m_object || this != last.m_object))
        {
            JSON_THROW(invalid_iterator::create(203, "iterators do not fit current value", *this));
        }

        // start from end(); the container cases overwrite the matching member
        IteratorType result = end();

        switch (m_type)
        {
            case value_t::boolean:
            case value_t::number_float:
            case value_t::number_integer:
            case value_t::number_unsigned:
            case value_t::string:
            case value_t::binary:
            {
                // A primitive can only be erased as the full range [begin, end).
                // Anything else is an error, so this branch is hinted as unlikely,
                // in line with the other throw guards in the erase() overloads.
                if (JSON_HEDLEY_UNLIKELY(!first.m_it.primitive_iterator.is_begin()
                                         || !last.m_it.primitive_iterator.is_end()))
                {
                    JSON_THROW(invalid_iterator::create(204, "iterators out of range", *this));
                }

                // strings and binaries are held by pointer: destroy, deallocate, clear
                if (is_string())
                {
                    AllocatorType<string_t> alloc;
                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.string);
                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.string, 1);
                    m_value.string = nullptr;
                }
                else if (is_binary())
                {
                    AllocatorType<binary_t> alloc;
                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.binary);
                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.binary, 1);
                    m_value.binary = nullptr;
                }

                // the erased primitive turns into null
                m_type = value_t::null;
                assert_invariant();
                break;
            }

            case value_t::object:
            {
                // delegate to the underlying object container
                result.m_it.object_iterator = m_value.object->erase(first.m_it.object_iterator,
                                              last.m_it.object_iterator);
                break;
            }

            case value_t::array:
            {
                // delegate to the underlying array container
                result.m_it.array_iterator = m_value.array->erase(first.m_it.array_iterator,
                                             last.m_it.array_iterator);
                break;
            }

            default:
                // all remaining types are rejected with type_error 307
                JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name()), *this));
        }

        return result;
    }

    /*!
    @brief remove element from a JSON object given a key

    Removes elements from a JSON object with the key value @a key.

    @param[in] key value of the elements to remove

    @return Number of elements removed. If @a ObjectType is the default
    `std::map` type, the return value will always be `0` (@a key was not
    found) or `1` (@a key was found).

    @post References and iterators to the erased elements are invalidated.
    Other references and iterators are not affected.

    @throw type_error.307 when called on a type other than JSON object;
    example: `"cannot use erase() with null"`

    @complexity `log(size()) + count(key)`

    @liveexample{The example shows the effect of `erase()`.,erase__key_type}

    @sa see @ref erase(IteratorType) -- removes the element at a given position
    @sa see @ref erase(IteratorType, IteratorType) -- removes the elements in
    the given range
    @sa see @ref erase(const size_type) -- removes the element from an array at
    the given index

    @since version 1.0.0
    */
    size_type erase(const typename object_t::key_type& key)
    {
        // key-based erase is only defined for objects
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name()), *this));
        }

        // the underlying container reports how many entries were removed
        return m_value.object->erase(key);
    }

    /*!
    @brief remove element from a JSON array given an index

    Removes element from a JSON array at the index @a idx.

    @param[in] idx index of the element to remove

    @throw type_error.307 when called on a type other than JSON array;
    example: `"cannot use erase() with null"`
    @throw out_of_range.401 when `idx >= size()`; example: `"array index 17
    is out of range"`

    @complexity Linear in distance between @a idx and the end of the container.

    @liveexample{The example shows the effect of `erase()`.,erase__size_type}

    @sa see @ref erase(IteratorType) -- removes the element at a given position
    @sa see @ref erase(IteratorType, IteratorType) -- removes the elements in
    the given range
    @sa see @ref erase(const typename object_t::key_type&) -- removes the element
    from an object at the given key

    @since version 1.0.0
    */
    void erase(const size_type idx)
    {
        // index-based erase is only defined for arrays
        if (JSON_HEDLEY_UNLIKELY(!is_array()))
        {
            JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name()), *this));
        }

        // the index must designate an existing element
        if (JSON_HEDLEY_UNLIKELY(idx >= size()))
        {
            JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range", *this));
        }

        const auto victim = m_value.array->begin() + static_cast<difference_type>(idx);
        m_value.array->erase(victim);
    }

    /// @}


    ////////////
    // lookup //
    ////////////

    /// @name lookup
    /// @{

    /*!
    @brief find an element in a JSON object

    Finds an element in a JSON object with key equivalent to @a key. If the
    element is not found or the JSON value is not an object, end() is
    returned.

    @note This method always returns @ref end() when executed on a JSON type
          that is not an object.

    @param[in] key key value of the element to search for.

    @return Iterator to an element with key equivalent to @a key. If no such
    element is found or the JSON value is not an object, past-the-end (see
    @ref end()) iterator is returned.

    @complexity Logarithmic in the size of the JSON object.

    @liveexample{The example shows how `find()` is used.,find__key_type}

    @sa see @ref contains(KeyT&&) const -- checks whether a key exists

    @since version 1.0.0
    */
    template<typename KeyT>
    iterator find(KeyT&& key)
    {
        iterator hit = end();

        // non-objects have no keys, so the end() iterator is the answer
        if (!is_object())
        {
            return hit;
        }

        // let the underlying object container do the lookup
        hit.m_it.object_iterator = m_value.object->find(std::forward<KeyT>(key));
        return hit;
    }

    /*!
    @brief find an element in a JSON object
    @copydoc find(KeyT&&)
    */
    template<typename KeyT>
    const_iterator find(KeyT&& key) const
    {
        const_iterator hit = cend();

        // non-objects have no keys, so the cend() iterator is the answer
        if (!is_object())
        {
            return hit;
        }

        // let the underlying object container do the lookup
        hit.m_it.object_iterator = m_value.object->find(std::forward<KeyT>(key));
        return hit;
    }

    /*!
    @brief returns the number of occurrences of a key in a JSON object

    Returns the number of elements with key @a key. If ObjectType is the
    default `std::map` type, the return value will always be `0` (@a key was
    not found) or `1` (@a key was found).

    @note This method always returns `0` when executed on a JSON type that is
          not an object.

    @param[in] key key value of the element to count

    @return Number of elements with key @a key. If the JSON value is not an
    object, the return value will be `0`.

    @complexity Logarithmic in the size of the JSON object.

    @liveexample{The example shows how `count()` is used.,count}

    @since version 1.0.0
    */
    template<typename KeyT>
    size_type count(KeyT&& key) const
    {
        // only objects have keys; every other type reports zero matches
        if (!is_object())
        {
            return 0;
        }

        return m_value.object->count(std::forward<KeyT>(key));
    }

    /*!
    @brief check the existence of an element in a JSON object

    Check whether an element exists in a JSON object with key equivalent to
    @a key. If the element is not found or the JSON value is not an object,
    false is returned.

    @note This method always returns false when executed on a JSON type
          that is not an object.

    @param[in] key key value to check its existence.

    @return true if an element with specified @a key exists. If no such
    element with such key is found or the JSON value is not an object,
    false is returned.

    @complexity Logarithmic in the size of the JSON object.

    @liveexample{The following code shows an example for `contains()`.,contains}

    @sa see @ref find(KeyT&&) -- returns an iterator to an object element
    @sa see @ref contains(const json_pointer&) const -- checks the existence for a JSON pointer

    @since version 3.6.0
    */
    template < typename KeyT, typename std::enable_if <
                   !std::is_same<typename std::decay<KeyT>::type, json_pointer>::value, int >::type = 0 >
    bool contains(KeyT && key) const
    {
        // only objects can contain a key
        if (!is_object())
        {
            return false;
        }

        // the key exists when the lookup does not run off the end
        const auto hit = m_value.object->find(std::forward<KeyT>(key));
        return hit != m_value.object->end();
    }

    /*!
    @brief check the existence of an element in a JSON object given a JSON pointer

    Check whether the given JSON pointer @a ptr can be resolved in the current
    JSON value.

    @note This method can be executed on any JSON value type.

    @param[in] ptr JSON pointer to check its existence.

    @return true if the JSON pointer can be resolved to a stored value, false
    otherwise.

    @post If `j.contains(ptr)` returns true, it is safe to call `j[ptr]`.

    @throw parse_error.106   if an array index begins with '0'
    @throw parse_error.109   if an array index was not a number

    @complexity Logarithmic in the size of the JSON object.

    @liveexample{The following code shows an example for `contains()`.,contains_json_pointer}

    @sa see @ref contains(KeyT &&) const -- checks the existence of a key

    @since version 3.7.0
    */
    bool contains(const json_pointer& ptr) const
    {
        // the pointer itself decides whether it resolves inside this value
        const bool resolvable = ptr.contains(this);
        return resolvable;
    }

    /// @}


    ///////////////
    // iterators //
    ///////////////

    /// @name iterators
    /// @{

    /*!
    @brief returns an iterator to the first element

    Returns an iterator to the first element.

    @image html range-begin-end.svg "Illustration from cppreference.com"

    @return iterator to the first element

    @complexity Constant.

    @requirement This function helps `basic_json` satisfying the
    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
    requirements:
    - The complexity is constant.

    @liveexample{The following code shows an example for `begin()`.,begin}

    @sa see @ref cbegin() -- returns a const iterator to the beginning
    @sa see @ref end() -- returns an iterator to the end
    @sa see @ref cend() -- returns a const iterator to the end

    @since version 1.0.0
    */
    iterator begin() noexcept
    {
        // bind an iterator to this value, then move it to the first position
        iterator first(this);
        first.set_begin();
        return first;
    }

    /*!
    @copydoc basic_json::cbegin()
    */
    const_iterator begin() const noexcept
    {
        // the const overload simply hands out what cbegin() produces
        const_iterator first = cbegin();
        return first;
    }

    /*!
    @brief returns a const iterator to the first element

    Returns a const iterator to the first element.

    @image html range-begin-end.svg "Illustration from cppreference.com"

    @return const iterator to the first element

    @complexity Constant.

    @requirement This function helps `basic_json` satisfying the
    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
    requirements:
    - The complexity is constant.
    - Has the semantics of `const_cast<const basic_json&>(*this).begin()`.

    @liveexample{The following code shows an example for `cbegin()`.,cbegin}

    @sa see @ref begin() -- returns an iterator to the beginning
    @sa see @ref end() -- returns an iterator to the end
    @sa see @ref cend() -- returns a const iterator to the end

    @since version 1.0.0
    */
    const_iterator cbegin() const noexcept
    {
        // bind a const iterator to this value, then move it to the first position
        const_iterator first(this);
        first.set_begin();
        return first;
    }

    /*!
    @brief returns an iterator to one past the last element

    Returns an iterator to one past the last element.

    @image html range-begin-end.svg "Illustration from cppreference.com"

    @return iterator one past the last element

    @complexity Constant.

    @requirement This function helps `basic_json` satisfying the
    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
    requirements:
    - The complexity is constant.

    @liveexample{The following code shows an example for `end()`.,end}

    @sa see @ref cend() -- returns a const iterator to the end
    @sa see @ref begin() -- returns an iterator to the beginning
    @sa see @ref cbegin() -- returns a const iterator to the beginning

    @since version 1.0.0
    */
    iterator end() noexcept
    {
        // bind an iterator to this value, then move it past the last position
        iterator past_last(this);
        past_last.set_end();
        return past_last;
    }

    /*!
    @copydoc basic_json::cend()
    */
    const_iterator end() const noexcept
    {
        // the const overload simply hands out what cend() produces
        const_iterator past_last = cend();
        return past_last;
    }

    /*!
    @brief returns a const iterator to one past the last element

    Returns a const iterator to one past the last element.

    @image html range-begin-end.svg "Illustration from cppreference.com"

    @return const iterator one past the last element

    @complexity Constant.

    @requirement This function helps `basic_json` satisfying the
    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
    requirements:
    - The complexity is constant.
    - Has the semantics of `const_cast<const basic_json&>(*this).end()`.

    @liveexample{The following code shows an example for `cend()`.,cend}

    @sa see @ref end() -- returns an iterator to the end
    @sa see @ref begin() -- returns an iterator to the beginning
    @sa see @ref cbegin() -- returns a const iterator to the beginning

    @since version 1.0.0
    */
    const_iterator cend() const noexcept
    {
        // bind a const iterator to this value, then move it past the last position
        const_iterator past_last(this);
        past_last.set_end();
        return past_last;
    }

    /*!
    @brief returns an iterator to the reverse-beginning

    Returns an iterator to the reverse-beginning; that is, the last element.

    @image html range-rbegin-rend.svg "Illustration from cppreference.com"

    @complexity Constant.

    @requirement This function helps `basic_json` satisfying the
    [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
    requirements:
    - The complexity is constant.
    - Has the semantics of `reverse_iterator(end())`.

    @liveexample{The following code shows an example for `rbegin()`.,rbegin}

    @sa see @ref crbegin() -- returns a const reverse iterator to the beginning
    @sa see @ref rend() -- returns a reverse iterator to the end
    @sa see @ref crend() -- returns a const reverse iterator to the end

    @since version 1.0.0
    */
    reverse_iterator rbegin() noexcept
    {
        // reverse iteration starts from the forward end() position
        const iterator past_last = end();
        return reverse_iterator(past_last);
    }

    /*!
    @copydoc basic_json::crbegin()
    */
    const_reverse_iterator rbegin() const noexcept
    {
        // the const overload simply hands out what crbegin() produces
        const_reverse_iterator rev = crbegin();
        return rev;
    }

    /*!
    @brief returns an iterator to the reverse-end

    Returns an iterator to the reverse-end; that is, one before the first
    element.

    @image html range-rbegin-rend.svg "Illustration from cppreference.com"

    @complexity Constant.

    @requirement This function helps `basic_json` satisfying the
    [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
    requirements:
    - The complexity is constant.
    - Has the semantics of `reverse_iterator(begin())`.

    @liveexample{The following code shows an example for `rend()`.,rend}

    @sa see @ref crend() -- returns a const reverse iterator to the end
    @sa see @ref rbegin() -- returns a reverse iterator to the beginning
    @sa see @ref crbegin() -- returns a const reverse iterator to the beginning

    @since version 1.0.0
    */
    reverse_iterator rend() noexcept
    {
        // reverse iteration stops at the forward begin() position
        const iterator first = begin();
        return reverse_iterator(first);
    }

    /*!
    @copydoc basic_json::crend()
    */
    const_reverse_iterator rend() const noexcept
    {
        // the const overload simply hands out what crend() produces
        const_reverse_iterator rev = crend();
        return rev;
    }

    /*!
    @brief returns a const reverse iterator to the last element

    Returns a const iterator to the reverse-beginning; that is, the last
    element.

    @image html range-rbegin-rend.svg "Illustration from cppreference.com"

    @complexity Constant.

    @requirement This function helps `basic_json` satisfying the
    [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
    requirements:
    - The complexity is constant.
    - Has the semantics of `const_cast<const basic_json&>(*this).rbegin()`.

    @liveexample{The following code shows an example for `crbegin()`.,crbegin}

    @sa see @ref rbegin() -- returns a reverse iterator to the beginning
    @sa see @ref rend() -- returns a reverse iterator to the end
    @sa see @ref crend() -- returns a const reverse iterator to the end

    @since version 1.0.0
    */
    const_reverse_iterator crbegin() const noexcept
    {
        // reverse iteration starts from the forward cend() position
        const const_iterator past_last = cend();
        return const_reverse_iterator(past_last);
    }

    /*!
    @brief returns a const reverse iterator to one before the first

    Returns a const reverse iterator to the reverse-end; that is, one before
    the first element.

    @image html range-rbegin-rend.svg "Illustration from cppreference.com"

    @complexity Constant.

    @requirement This function helps `basic_json` satisfying the
    [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
    requirements:
    - The complexity is constant.
    - Has the semantics of `const_cast<const basic_json&>(*this).rend()`.

    @liveexample{The following code shows an example for `crend()`.,crend}

    @sa see @ref rend() -- returns a reverse iterator to the end
    @sa see @ref rbegin() -- returns a reverse iterator to the beginning
    @sa see @ref crbegin() -- returns a const reverse iterator to the beginning

    @since version 1.0.0
    */
    const_reverse_iterator crend() const noexcept
    {
        // reverse iteration stops at the forward cbegin() position
        const const_iterator first = cbegin();
        return const_reverse_iterator(first);
    }

  public:
    /*!
    @brief wrapper to access iterator member functions in range-based for

    This function allows to access @ref iterator::key() and @ref
    iterator::value() during range-based for loops. In these loops, a
    reference to the JSON values is returned, so there is no access to the
    underlying iterator.

    For loop without iterator_wrapper:

    @code{cpp}
    for (auto it = j_object.begin(); it != j_object.end(); ++it)
    {
        std::cout << "key: " << it.key() << ", value:" << it.value() << '\n';
    }
    @endcode

    Range-based for loop without iterator proxy:

    @code{cpp}
    for (auto it : j_object)
    {
        // "it" is of type json::reference and has no key() member
        std::cout << "value: " << it << '\n';
    }
    @endcode

    Range-based for loop with iterator proxy:

    @code{cpp}
    for (auto it : json::iterator_wrapper(j_object))
    {
        std::cout << "key: " << it.key() << ", value:" << it.value() << '\n';
    }
    @endcode

    @note When iterating over an array, `key()` will return the index of the
          element as string (see example).

    @param[in] ref  reference to a JSON value
    @return iteration proxy object wrapping @a ref with an interface to use in
            range-based for loops

    @liveexample{The following code shows how the wrapper is used,iterator_wrapper}

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes in the JSON value.

    @complexity Constant.

    @note The name of this function is not yet final and may change in the
    future.

    @deprecated This function is deprecated and will be removed in
                version 4.0.0 of the library. Please use @ref items() instead;
                that is, replace `json::iterator_wrapper(j)` with `j.items()`.
    */
    JSON_HEDLEY_DEPRECATED_FOR(3.1.0, items())
    static iteration_proxy<iterator> iterator_wrapper(reference ref) noexcept
    {
        // build the same proxy that ref.items() hands out
        return iteration_proxy<iterator>(ref);
    }

    /*!
    @copydoc iterator_wrapper(reference)
    */
    JSON_HEDLEY_DEPRECATED_FOR(3.1.0, items())
    static iteration_proxy<const_iterator> iterator_wrapper(const_reference ref) noexcept
    {
        // build the same proxy that ref.items() hands out
        return iteration_proxy<const_iterator>(ref);
    }

    /*!
    @brief helper to access iterator member functions in range-based for

    This function allows to access @ref iterator::key() and @ref
    iterator::value() during range-based for loops. In these loops, a
    reference to the JSON values is returned, so there is no access to the
    underlying iterator.

    For loop without `items()` function:

    @code{cpp}
    for (auto it = j_object.begin(); it != j_object.end(); ++it)
    {
        std::cout << "key: " << it.key() << ", value:" << it.value() << '\n';
    }
    @endcode

    Range-based for loop without `items()` function:

    @code{cpp}
    for (auto it : j_object)
    {
        // "it" is of type json::reference and has no key() member
        std::cout << "value: " << it << '\n';
    }
    @endcode

    Range-based for loop with `items()` function:

    @code{cpp}
    for (auto& el : j_object.items())
    {
        std::cout << "key: " << el.key() << ", value:" << el.value() << '\n';
    }
    @endcode

    The `items()` function also allows to use
    [structured bindings](https://en.cppreference.com/w/cpp/language/structured_binding)
    (C++17):

    @code{cpp}
    for (auto& [key, val] : j_object.items())
    {
        std::cout << "key: " << key << ", value:" << val << '\n';
    }
    @endcode

    @note When iterating over an array, `key()` will return the index of the
          element as string (see example). For primitive types (e.g., numbers),
          `key()` returns an empty string.

    @warning Using `items()` on temporary objects is dangerous. Make sure the
             object's lifetime exceeds the iteration. See
             <https://github.com/nlohmann/json/issues/2040> for more
             information.

    @return iteration proxy object wrapping @a ref with an interface to use in
            range-based for loops

    @liveexample{The following code shows how the function is used.,items}

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes in the JSON value.

    @complexity Constant.

    @since version 3.1.0, structured bindings support since 3.5.0.
    */
    iteration_proxy<iterator> items() noexcept
    {
        // build the mutable-iterator proxy directly from *this
        return iteration_proxy<iterator>(*this);
    }

    /*!
    @copydoc items()
    */
    iteration_proxy<const_iterator> items() const noexcept
    {
        // const overload: same construction, but the proxy hands out const_iterators
        return iteration_proxy<const_iterator>(*this);
    }

    /// @}


    //////////////
    // capacity //
    //////////////

    /// @name capacity
    /// @{

    /*!
    @brief checks whether the container is empty.

    Checks if a JSON value has no elements (i.e. whether its @ref size is `0`).

    @return The return value depends on the different types and is
            defined as follows:
            Value type  | return value
            ----------- | -------------
            null        | `true`
            boolean     | `false`
            string      | `false`
            number      | `false`
            binary      | `false`
            object      | result of function `object_t::empty()`
            array       | result of function `array_t::empty()`

    @liveexample{The following code uses `empty()` to check if a JSON
    object contains any elements.,empty}

    @complexity Constant, as long as @ref array_t and @ref object_t satisfy
    the Container concept; that is, their `empty()` functions have constant
    complexity.

    @iterators No changes.

    @exceptionsafety No-throw guarantee: this function never throws exceptions.

    @note This function does not return whether a string stored as JSON value
    is empty - it returns whether the JSON container itself is empty which is
    false in the case of a string.

    @requirement This function helps `basic_json` satisfying the
    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
    requirements:
    - The complexity is constant.
    - Has the semantics of `begin() == end()`.

    @sa see @ref size() -- returns the number of elements

    @since version 1.0.0
    */
    bool empty() const noexcept
    {
        // a null value never holds anything
        if (m_type == value_t::null)
        {
            return true;
        }

        // arrays and objects answer for themselves
        if (m_type == value_t::array)
        {
            return m_value.array->empty();
        }
        if (m_type == value_t::object)
        {
            return m_value.object->empty();
        }

        // every remaining type counts as one element and is never empty
        return false;
    }

    /*!
    @brief returns the number of elements

    Returns the number of elements in a JSON value.

    @return The return value depends on the different types and is
            defined as follows:
            Value type  | return value
            ----------- | -------------
            null        | `0`
            boolean     | `1`
            string      | `1`
            number      | `1`
            binary      | `1`
            object      | result of function object_t::size()
            array       | result of function array_t::size()

    @liveexample{The following code calls `size()` on the different value
    types.,size}

    @complexity Constant, as long as @ref array_t and @ref object_t satisfy
    the Container concept; that is, their size() functions have constant
    complexity.

    @iterators No changes.

    @exceptionsafety No-throw guarantee: this function never throws exceptions.

    @note This function does not return the length of a string stored as JSON
    value - it returns the number of elements in the JSON value which is 1 in
    the case of a string.

    @requirement This function helps `basic_json` satisfying the
    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
    requirements:
    - The complexity is constant.
    - Has the semantics of `std::distance(begin(), end())`.

    @sa see @ref empty() -- checks whether the container is empty
    @sa see @ref max_size() -- returns the maximal number of elements

    @since version 1.0.0
    */
    size_type size() const noexcept
    {
        // containers report their own element count
        if (m_type == value_t::array)
        {
            return m_value.array->size();
        }
        if (m_type == value_t::object)
        {
            return m_value.object->size();
        }

        // null counts as zero elements, every other value as exactly one
        if (m_type == value_t::null)
        {
            return 0;
        }
        return 1;
    }

    /*!
    @brief returns the maximum possible number of elements

    Returns the maximum number of elements a JSON value is able to hold due to
    system or library implementation limitations, i.e. `std::distance(begin(),
    end())` for the JSON value.

    @return The return value depends on the different types and is
            defined as follows:
            Value type  | return value
            ----------- | -------------
            null        | `0` (same as `size()`)
            boolean     | `1` (same as `size()`)
            string      | `1` (same as `size()`)
            number      | `1` (same as `size()`)
            binary      | `1` (same as `size()`)
            object      | result of function `object_t::max_size()`
            array       | result of function `array_t::max_size()`

    @liveexample{The following code calls `max_size()` on the different value
    types. Note the output is implementation specific.,max_size}

    @complexity Constant, as long as @ref array_t and @ref object_t satisfy
    the Container concept; that is, their `max_size()` functions have constant
    complexity.

    @iterators No changes.

    @exceptionsafety No-throw guarantee: this function never throws exceptions.

    @requirement This function helps `basic_json` satisfying the
    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
    requirements:
    - The complexity is constant.
    - Has the semantics of returning `b.size()` where `b` is the largest
      possible JSON value.

    @sa see @ref size() -- returns the number of elements

    @since version 1.0.0
    */
    size_type max_size() const noexcept
    {
        // only the two container types have a limit of their own
        if (m_type == value_t::array)
        {
            return m_value.array->max_size();
        }
        if (m_type == value_t::object)
        {
            return m_value.object->max_size();
        }

        // for everything else the maximum is simply the current size()
        return size();
    }

    /// @}


    ///////////////
    // modifiers //
    ///////////////

    /// @name modifiers
    /// @{

    /*!
    @brief clears the contents

    Clears the content of a JSON value and resets it to the default value as
    if @ref basic_json(value_t) would have been called with the current value
    type from @ref type():

    Value type  | initial value
    ----------- | -------------
    null        | `null`
    boolean     | `false`
    string      | `""`
    number      | `0`
    binary      | An empty byte vector
    object      | `{}`
    array       | `[]`

    @post Has the same effect as calling
    @code {.cpp}
    *this = basic_json(type());
    @endcode

    @liveexample{The example below shows the effect of `clear()` to different
    JSON types.,clear}

    @complexity Linear in the size of the JSON value.

    @iterators All iterators, pointers and references related to this container
               are invalidated.

    @exceptionsafety No-throw guarantee: this function never throws exceptions.

    @sa see @ref basic_json(value_t) -- constructor that creates an object with the
        same value as calling `clear()`

    @since version 1.0.0
    */
    void clear() noexcept
    {
        // Reset the stored payload in place; m_type itself is not modified.
        switch (m_type)
        {
            // payloads reached through a pointer: empty the pointee
            case value_t::object:
                m_value.object->clear();
                break;

            case value_t::array:
                m_value.array->clear();
                break;

            case value_t::binary:
                m_value.binary->clear();
                break;

            case value_t::string:
                m_value.string->clear();
                break;

            // scalar payloads: overwrite with the zero/false value
            case value_t::boolean:
                m_value.boolean = false;
                break;

            case value_t::number_float:
                m_value.number_float = 0.0;
                break;

            case value_t::number_unsigned:
                m_value.number_unsigned = 0;
                break;

            case value_t::number_integer:
                m_value.number_integer = 0;
                break;

            // remaining types (e.g. null) carry nothing to reset
            default:
                break;
        }
    }

    /*!
    @brief add an object to an array

    Appends the given element @a val to the end of the JSON value. If the
    function is called on a JSON null value, an empty array is created before
    appending @a val.

    @param[in] val the value to add to the JSON array

    @throw type_error.308 when called on a type other than JSON array or
    null; example: `"cannot use push_back() with number"`

    @complexity Amortized constant.

    @liveexample{The example shows how `push_back()` and `+=` can be used to
    add elements to a JSON array. Note how the `null` value was silently
    converted to a JSON array.,push_back}

    @since version 1.0.0
    */
    void push_back(basic_json&& val)
    {
        const bool starts_as_null = is_null();

        // appending is only defined for arrays and for null (which becomes an array)
        if (JSON_HEDLEY_UNLIKELY(!starts_as_null && !is_array()))
        {
            JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name()), *this));
        }

        if (starts_as_null)
        {
            // turn the null value into an empty array before appending
            m_type = value_t::array;
            m_value = value_t::array;
            assert_invariant();
        }

        // move val into the array and pass the stored element to set_parent();
        // the caller's val is left in whatever state the move leaves it
        auto& elements = *m_value.array;
        elements.push_back(std::move(val));
        set_parent(elements.back());
    }

    /*!
    @brief add an object to an array
    @copydoc push_back(basic_json&&)
    */
    reference operator+=(basic_json&& val)
    {
        // forward to the rvalue push_back overload; return *this so calls can be chained
        push_back(std::move(val));
        return *this;
    }

    /*!
    @brief add an object to an array
    @copydoc push_back(basic_json&&)
    */
    void push_back(const basic_json& val)
    {
        const bool starts_as_null = is_null();

        // appending is only defined for arrays and for null (which becomes an array)
        if (JSON_HEDLEY_UNLIKELY(!starts_as_null && !is_array()))
        {
            JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name()), *this));
        }

        if (starts_as_null)
        {
            // turn the null value into an empty array before appending
            m_type = value_t::array;
            m_value = value_t::array;
            assert_invariant();
        }

        // copy val to the end of the array and pass the stored copy to set_parent()
        auto& elements = *m_value.array;
        elements.push_back(val);
        set_parent(elements.back());
    }

    /*!
    @brief add an object to an array
    @copydoc push_back(basic_json&&)
    */
    reference operator+=(const basic_json& val)
    {
        // forward to the copying push_back overload; return *this so calls can be chained
        push_back(val);
        return *this;
    }

    /*!
    @brief add an object to an object

    Inserts the given element @a val to the JSON object. If the function is
    called on a JSON null value, an empty object is created before inserting
    @a val.

    @param[in] val the value to add to the JSON object

    @throw type_error.308 when called on a type other than JSON object or
    null; example: `"cannot use push_back() with number"`

    @complexity Logarithmic in the size of the container, O(log(`size()`)).

    @liveexample{The example shows how `push_back()` and `+=` can be used to
    add elements to a JSON object. Note how the `null` value was silently
    converted to a JSON object.,push_back__object_t__value}

    @since version 1.0.0
    */
    void push_back(const typename object_t::value_type& val)
    {
        const bool starts_as_null = is_null();

        // key/value insertion is only defined for objects and for null (which becomes an object)
        if (JSON_HEDLEY_UNLIKELY(!starts_as_null && !is_object()))
        {
            JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name()), *this));
        }

        if (starts_as_null)
        {
            // turn the null value into an empty object before inserting
            m_type = value_t::object;
            m_value = value_t::object;
            assert_invariant();
        }

        // insert() yields {iterator, inserted?}; the iterator addresses the element
        // stored under the key, which is handed to set_parent()
        auto outcome = m_value.object->insert(val);
        set_parent(outcome.first->second);
    }

    /*!
    @brief add an object to an object
    @copydoc push_back(const typename object_t::value_type&)
    */
    reference operator+=(const typename object_t::value_type& val)
    {
        // forward to the key/value push_back overload; return *this so calls can be chained
        push_back(val);
        return *this;
    }

    /*!
    @brief add an object to an object

    This function allows to use `push_back` with an initializer list. In case

    1. the current value is an object,
    2. the initializer list @a init contains only two elements, and
    3. the first element of @a init is a string,

    @a init is converted into an object element and added using
    @ref push_back(const typename object_t::value_type&). Otherwise, @a init
    is converted to a JSON value and added using @ref push_back(basic_json&&).

    @param[in] init  an initializer list

    @complexity Linear in the size of the initializer list @a init.

    @note This function is required to resolve an ambiguous overload error,
          because pairs like `{"key", "value"}` can be both interpreted as
          `object_t::value_type` or `std::initializer_list<basic_json>`, see
          https://github.com/nlohmann/json/issues/235 for more information.

    @liveexample{The example shows how initializer lists are treated as
    objects when possible.,push_back__initializer_list}
    */
    void push_back(initializer_list_t init)
    {
        // Object case: on an object, a two-entry list whose first entry is a
        // string is taken as a key/value pair.
        if (is_object() && init.size() == 2 && (*init.begin())->is_string())
        {
            // materialize the key first so its string can be moved out of it below
            basic_json&& key = init.begin()->moved_or_copied();
            push_back(typename object_t::value_type(
                          std::move(key.get_ref<string_t&>()), (init.begin() + 1)->moved_or_copied()));
        }
        else
        {
            // Otherwise build a single JSON value from the whole list and
            // append that value (rvalue push_back overload).
            push_back(basic_json(init));
        }
    }

    /*!
    @brief add an object to an object
    @copydoc push_back(initializer_list_t)
    */
    reference operator+=(initializer_list_t init)
    {
        // forward to the initializer-list push_back overload; return *this so calls can be chained
        push_back(init);
        return *this;
    }

    /*!
    @brief add an object to an array

    Creates a JSON value from the passed parameters @a args and appends it to
    the end of the JSON value. If the function is called on a JSON null value,
    an empty array is created before appending the value created from @a args.

    @param[in] args arguments to forward to a constructor of @ref basic_json
    @tparam Args compatible types to create a @ref basic_json object

    @return reference to the inserted element

    @throw type_error.311 when called on a type other than JSON array or
    null; example: `"cannot use emplace_back() with number"`

    @complexity Amortized constant.

    @liveexample{The example shows how `emplace_back()` can be used to add
    elements to a JSON array. Note how the `null` value was silently converted
    to a JSON array.,emplace_back}

    @since version 2.0.8, returns reference since 3.7.0
    */
    template<class... Args>
    reference emplace_back(Args&& ... args)
    {
        const bool starts_as_null = is_null();

        // in-place appending is only defined for arrays and for null (which becomes an array)
        if (JSON_HEDLEY_UNLIKELY(!starts_as_null && !is_array()))
        {
            JSON_THROW(type_error::create(311, "cannot use emplace_back() with " + std::string(type_name()), *this));
        }

        if (starts_as_null)
        {
            // turn the null value into an empty array before appending
            m_type = value_t::array;
            m_value = value_t::array;
            assert_invariant();
        }

        // construct the new element at the end of the array from the forwarded arguments
#ifdef JSON_HAS_CPP_17
        // here the container's emplace_back result is used directly
        return set_parent(m_value.array->emplace_back(std::forward<Args>(args)...));
#else
        // otherwise the new element is fetched via back() after the call
        m_value.array->emplace_back(std::forward<Args>(args)...);
        return set_parent(m_value.array->back());
#endif
    }

    /*!
    @brief add an object to an object if key does not exist

    Inserts a new element into a JSON object constructed in-place with the
    given @a args if there is no element with the key in the container. If the
    function is called on a JSON null value, an empty object is created before
    appending the value created from @a args.

    @param[in] args arguments to forward to a constructor of @ref basic_json
    @tparam Args compatible types to create a @ref basic_json object

    @return a pair consisting of an iterator to the inserted element, or the
            already-existing element if no insertion happened, and a bool
            denoting whether the insertion took place.

    @throw type_error.311 when called on a type other than JSON object or
    null; example: `"cannot use emplace() with number"`

    @complexity Logarithmic in the size of the container, O(log(`size()`)).

    @liveexample{The example shows how `emplace()` can be used to add elements
    to a JSON object. Note how the `null` value was silently converted to a
    JSON object. Further note how no value is added if there was already one
    value stored with the same key.,emplace}

    @since version 2.0.8
    */
    template<class... Args>
    std::pair<iterator, bool> emplace(Args&& ... args)
    {
        // emplace only works for null objects or arrays
        if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_object())))
        {
            JSON_THROW(type_error::create(311, "cannot use emplace() with " + std::string(type_name()), *this));
        }

        // transform null object into an object
        if (is_null())
        {
            m_type = value_t::object;
            m_value = value_t::object;
            assert_invariant();
        }

        // add element to array (perfect forwarding)
        auto res = m_value.object->emplace(std::forward<Args>(args)...);
        set_parent(res.first->second);

        // create result iterator and set iterator to the result of emplace
        auto it = begin();
        it.m_it.object_iterator = res.first;

        // return pair of iterator and boolean
        return {it, res.second};
    }

    /// Helper for insertion of an iterator
    /// @note: This uses std::distance to support GCC 4.8,
    ///        see https://github.com/nlohmann/json/pull/1257
    template<typename... Args>
    iterator insert_iterator(const_iterator pos, Args&& ... args)
    {
        iterator inserted_at(this);
        JSON_ASSERT(m_value.array != nullptr);

        // Record the offset of pos from begin() before inserting, then rebuild
        // the resulting iterator from begin() + offset afterwards.
        // The return value of the container's insert() is deliberately not
        // used: it is missing in GCC 4.8, hence this detour.
        auto offset = std::distance(m_value.array->begin(), pos.m_it.array_iterator);
        m_value.array->insert(pos.m_it.array_iterator, std::forward<Args>(args)...);
        inserted_at.m_it.array_iterator = m_value.array->begin() + offset;

        return inserted_at;
    }

    /*!
    @brief inserts element

    Inserts element @a val before iterator @a pos.

    @param[in] pos iterator before which the content will be inserted; may be
    the end() iterator
    @param[in] val element to insert
    @return iterator pointing to the inserted @a val.

    @throw type_error.309 if called on JSON values other than arrays;
    example: `"cannot use insert() with string"`
    @throw invalid_iterator.202 if @a pos is not an iterator of *this;
    example: `"iterator does not fit current value"`

    @complexity Constant plus linear in the distance between @a pos and end of
    the container.

    @liveexample{The example shows how `insert()` is used.,insert}

    @since version 1.0.0
    */
    iterator insert(const_iterator pos, const basic_json& val)
    {
        // positional insertion is only defined for arrays
        if (JSON_HEDLEY_UNLIKELY(!is_array()))
        {
            JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()), *this));
        }

        // pos has to be an iterator into this very value
        if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
        {
            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", *this));
        }

        // copy val in front of pos; exactly one element was added
        const auto added = static_cast<typename iterator::difference_type>(1);
        return set_parents(insert_iterator(pos, val), added);
    }

    /*!
    @brief inserts element
    @copydoc insert(const_iterator, const basic_json&)
    */
    iterator insert(const_iterator pos, basic_json&& val)
    {
        // Inside this function `val` is an lvalue, so delegating with
        // insert(pos, val) would select the const& overload and deep-copy the
        // value. The checks are repeated here so val can be forwarded as an
        // rvalue and the new array element is move-constructed instead.

        // insert only works for arrays
        if (JSON_HEDLEY_UNLIKELY(!is_array()))
        {
            JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()), *this));
        }

        // check if iterator pos fits to this JSON value
        if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
        {
            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", *this));
        }

        // move val into the array and return an iterator to the new element
        return set_parents(insert_iterator(pos, std::move(val)), static_cast<typename iterator::difference_type>(1));
    }

    /*!
    @brief inserts elements

    Inserts @a cnt copies of @a val before iterator @a pos.

    @param[in] pos iterator before which the content will be inserted; may be
    the end() iterator
    @param[in] cnt number of copies of @a val to insert
    @param[in] val element to insert
    @return iterator pointing to the first element inserted, or @a pos if
    `cnt==0`

    @throw type_error.309 if called on JSON values other than arrays; example:
    `"cannot use insert() with string"`
    @throw invalid_iterator.202 if @a pos is not an iterator of *this;
    example: `"iterator does not fit current value"`

    @complexity Linear in @a cnt plus linear in the distance between @a pos
    and end of the container.

    @liveexample{The example shows how `insert()` is used.,insert__count}

    @since version 1.0.0
    */
    iterator insert(const_iterator pos, size_type cnt, const basic_json& val)
    {
        // positional insertion is only defined for arrays
        if (JSON_HEDLEY_UNLIKELY(!is_array()))
        {
            JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()), *this));
        }

        // pos has to be an iterator into this very value
        if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
        {
            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", *this));
        }

        // insert cnt copies of val in front of pos
        const auto added = static_cast<typename iterator::difference_type>(cnt);
        return set_parents(insert_iterator(pos, cnt, val), added);
    }

    /*!
    @brief inserts elements

    Inserts elements from range `[first, last)` before iterator @a pos.

    @param[in] pos iterator before which the content will be inserted; may be
    the end() iterator
    @param[in] first begin of the range of elements to insert
    @param[in] last end of the range of elements to insert

    @throw type_error.309 if called on JSON values other than arrays; example:
    `"cannot use insert() with string"`
    @throw invalid_iterator.202 if @a pos is not an iterator of *this;
    example: `"iterator does not fit current value"`
    @throw invalid_iterator.210 if @a first and @a last do not belong to the
    same JSON value; example: `"iterators do not fit"`
    @throw invalid_iterator.211 if @a first or @a last are iterators into
    container for which insert is called; example: `"passed iterators may not
    belong to container"`

    @return iterator pointing to the first element inserted, or @a pos if
    `first==last`

    @complexity Linear in `std::distance(first, last)` plus linear in the
    distance between @a pos and end of the container.

    @liveexample{The example shows how `insert()` is used.,insert__range}

    @since version 1.0.0
    */
    iterator insert(const_iterator pos, const_iterator first, const_iterator last)
    {
        // insert only works for arrays
        if (JSON_HEDLEY_UNLIKELY(!is_array()))
        {
            JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()), *this));
        }

        // check if iterator pos fits to this JSON value
        if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
        {
            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", *this));
        }

        // check if range iterators belong to the same JSON object
        if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
        {
            JSON_THROW(invalid_iterator::create(210, "iterators do not fit", *this));
        }

        // the source range must come from a different JSON value than *this
        if (JSON_HEDLEY_UNLIKELY(first.m_object == this))
        {
            JSON_THROW(invalid_iterator::create(211, "passed iterators may not belong to container", *this));
        }

        // NOTE(review): nothing above verifies that first/last refer to an
        // array before their array_iterator members are used below -- confirm
        // whether callers are expected to guarantee this.

        // insert to array and return iterator; the number of added elements is
        // the distance between the two source iterators
        return set_parents(insert_iterator(pos, first.m_it.array_iterator, last.m_it.array_iterator), std::distance(first, last));
    }

    /*!
    @brief inserts elements

    Inserts elements from initializer list @a ilist before iterator @a pos.

    @param[in] pos iterator before which the content will be inserted; may be
    the end() iterator
    @param[in] ilist initializer list to insert the values from

    @throw type_error.309 if called on JSON values other than arrays; example:
    `"cannot use insert() with string"`
    @throw invalid_iterator.202 if @a pos is not an iterator of *this;
    example: `"iterator does not fit current value"`

    @return iterator pointing to the first element inserted, or @a pos if
    `ilist` is empty

    @complexity Linear in `ilist.size()` plus linear in the distance between
    @a pos and end of the container.

    @liveexample{The example shows how `insert()` is used.,insert__ilist}

    @since version 1.0.0
    */
    iterator insert(const_iterator pos, initializer_list_t ilist)
    {
        // positional insertion is only defined for arrays
        if (JSON_HEDLEY_UNLIKELY(!is_array()))
        {
            JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()), *this));
        }

        // pos has to be an iterator into this very value
        if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
        {
            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", *this));
        }

        // insert every list entry in front of pos
        const auto added = static_cast<typename iterator::difference_type>(ilist.size());
        return set_parents(insert_iterator(pos, ilist.begin(), ilist.end()), added);
    }

    /*!
    @brief inserts elements

    Inserts elements from range `[first, last)`.

    @param[in] first begin of the range of elements to insert
    @param[in] last end of the range of elements to insert

    @throw type_error.309 if called on JSON values other than objects; example:
    `"cannot use insert() with string"`
    @throw invalid_iterator.202 if iterator @a first or @a last does not
    point to an object; example: `"iterators first and last must point to
    objects"`
    @throw invalid_iterator.210 if @a first and @a last do not belong to the
    same JSON value; example: `"iterators do not fit"`

    @complexity Logarithmic: `O(N*log(size() + N))`, where `N` is the number
    of elements to insert.

    @liveexample{The example shows how `insert()` is used.,insert__range_object}

    @since version 3.0.0
    */
    void insert(const_iterator first, const_iterator last)
    {
        // insert only works for objects
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()), *this));
        }

        // check if range iterators belong to the same JSON object
        if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
        {
            JSON_THROW(invalid_iterator::create(210, "iterators do not fit", *this));
        }

        // passed iterators must belong to objects
        // (first and last share m_object at this point, so one check covers both)
        if (JSON_HEDLEY_UNLIKELY(!first.m_object->is_object()))
        {
            JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects", *this));
        }

        // copy the members of the range; keys that already exist are left untouched
        m_value.object->insert(first.m_it.object_iterator, last.m_it.object_iterator);

        // Every other modifier in this class hands newly stored children to
        // set_parent()/set_parents(); this overload did not, leaving the copied
        // members without that bookkeeping. Refresh it for all members.
        // NOTE(review): presumably this only does work in diagnostics builds -- confirm.
        set_parents();
    }

    /*!
    @brief updates a JSON object from another object, overwriting existing keys

    Inserts all values from JSON object @a j and overwrites existing keys.

    @param[in] j  JSON object to read values from

    @throw type_error.312 if called on JSON values other than objects; example:
    `"cannot use update() with string"`

    @complexity O(N*log(size() + N)), where N is the number of elements to
                insert.

    @liveexample{The example shows how `update()` is used.,update}

    @sa https://docs.python.org/3.6/library/stdtypes.html#dict.update

    @since version 3.0.0
    */
    void update(const_reference j)
    {
        // implicitly convert null value to an empty object
        if (is_null())
        {
            m_type = value_t::object;
            m_value.object = create<object_t>();
            assert_invariant();
        }

        // both the target and the source must be objects
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(type_name()), *this));
        }
        if (JSON_HEDLEY_UNLIKELY(!j.is_object()))
        {
            JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(j.type_name()), *this));
        }

        for (auto it = j.cbegin(); it != j.cend(); ++it)
        {
            // operator[] creates the key when it is missing; the assignment
            // then overwrites whatever is stored there
            auto& slot = m_value.object->operator[](it.key());
            slot = it.value();

            // A key created just above has not been through set_parent() yet,
            // unlike elements added via push_back()/emplace(); do it here so
            // newly created and overwritten members are treated alike.
            set_parent(slot);
        }
    }

    /*!
    @brief updates a JSON object from another object, overwriting existing keys

    Inserts all values from range `[first, last)` and overwrites existing
    keys.

    @param[in] first begin of the range of elements to insert
    @param[in] last end of the range of elements to insert

    @throw type_error.312 if called on JSON values other than objects; example:
    `"cannot use update() with string"`
    @throw invalid_iterator.202 if iterator @a first or @a last does not
    point to an object; example: `"iterators first and last must point to
    objects"`
    @throw invalid_iterator.210 if @a first and @a last do not belong to the
    same JSON value; example: `"iterators do not fit"`

    @complexity O(N*log(size() + N)), where N is the number of elements to
                insert.

    @liveexample{The example shows how `update()` is used.,update__range}

    @sa https://docs.python.org/3.6/library/stdtypes.html#dict.update

    @since version 3.0.0
    */
    void update(const_iterator first, const_iterator last)
    {
        // implicitly convert null value to an empty object
        if (is_null())
        {
            m_type = value_t::object;
            m_value.object = create<object_t>();
            assert_invariant();
        }

        // the target must be an object
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(type_name()), *this));
        }

        // check if range iterators belong to the same JSON object
        if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
        {
            JSON_THROW(invalid_iterator::create(210, "iterators do not fit", *this));
        }

        // passed iterators must belong to objects
        if (JSON_HEDLEY_UNLIKELY(!first.m_object->is_object()
                                 || !last.m_object->is_object()))
        {
            JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects", *this));
        }

        for (auto it = first; it != last; ++it)
        {
            // operator[] creates the key when it is missing; the assignment
            // then overwrites whatever is stored there
            auto& slot = m_value.object->operator[](it.key());
            slot = it.value();

            // A key created just above has not been through set_parent() yet,
            // unlike elements added via push_back()/emplace(); do it here so
            // newly created and overwritten members are treated alike.
            set_parent(slot);
        }
    }

    /*!
    @brief exchanges the values

    Exchanges the contents of the JSON value with those of @a other. Does not
    invoke any move, copy, or swap operations on individual elements. All
    iterators and references remain valid. The past-the-end iterator is
    invalidated.

    @param[in,out] other JSON value to exchange the contents with

    @complexity Constant.

    @liveexample{The example below shows how JSON values can be swapped with
    `swap()`.,swap__reference}

    @since version 1.0.0
    */
    void swap(reference other) noexcept (
        std::is_nothrow_move_constructible<value_t>::value&&
        std::is_nothrow_move_assignable<value_t>::value&&
        std::is_nothrow_move_constructible<json_value>::value&&
        std::is_nothrow_move_assignable<json_value>::value
    )
    {
        // trade the payload and the type tag; individual elements are not
        // touched
        std::swap(m_value, other.m_value);
        std::swap(m_type, other.m_type);

        // each side now owns the other's former children, so both refresh
        // whatever parent bookkeeping set_parents() maintains
        other.set_parents();
        set_parents();

        // only this value is checked afterwards
        assert_invariant();
    }

    /*!
    @brief exchanges the values

    Exchanges the contents of the JSON value from @a left with those of @a right. Does not
    invoke any move, copy, or swap operations on individual elements. All
    iterators and references remain valid. The past-the-end iterator is
    invalidated. Implemented as a friend function callable via ADL.

    @param[in,out] left JSON value to exchange the contents with
    @param[in,out] right JSON value to exchange the contents with

    @complexity Constant.

    @liveexample{The example below shows how JSON values can be swapped with
    `swap()`.,swap__reference}

    @since version 1.0.0
    */
    friend void swap(reference left, reference right) noexcept (
        std::is_nothrow_move_constructible<value_t>::value&&
        std::is_nothrow_move_assignable<value_t>::value&&
        std::is_nothrow_move_constructible<json_value>::value&&
        std::is_nothrow_move_assignable<json_value>::value
    )
    {
        // Free-function form of swap: forwards to the member swap of the left
        // operand, so `swap(a, b)` and `a.swap(b)` do exactly the same work.
        left.swap(right);
    }

    /*!
    @brief exchanges the values

    Exchanges the contents of a JSON array with those of @a other. Does not
    invoke any move, copy, or swap operations on individual elements. All
    iterators and references remain valid. The past-the-end iterator is
    invalidated.

    @param[in,out] other array to exchange the contents with

    @throw type_error.310 when JSON value is not an array; example: `"cannot
    use swap() with string"`

    @complexity Constant.

    @liveexample{The example below shows how arrays can be swapped with
    `swap()`.,swap__array_t}

    @since version 1.0.0
    */
    void swap(array_t& other) // NOLINT(bugprone-exception-escape)
    {
        // a value that does not hold an array cannot take part in the swap
        if (JSON_HEDLEY_UNLIKELY(!is_array()))
        {
            JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()), *this));
        }
        else
        {
            // exchange the stored array with the caller's container
            std::swap(*(m_value.array), other);
        }
    }

    /*!
    @brief exchanges the values

    Exchanges the contents of a JSON object with those of @a other. Does not
    invoke any move, copy, or swap operations on individual elements. All
    iterators and references remain valid. The past-the-end iterator is
    invalidated.

    @param[in,out] other object to exchange the contents with

    @throw type_error.310 when JSON value is not an object; example:
    `"cannot use swap() with string"`

    @complexity Constant.

    @liveexample{The example below shows how objects can be swapped with
    `swap()`.,swap__object_t}

    @since version 1.0.0
    */
    void swap(object_t& other) // NOLINT(bugprone-exception-escape)
    {
        // a value that does not hold an object cannot take part in the swap
        if (JSON_HEDLEY_UNLIKELY(!is_object()))
        {
            JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()), *this));
        }
        else
        {
            // exchange the stored object with the caller's container
            std::swap(*(m_value.object), other);
        }
    }

    /*!
    @brief exchanges the values

    Exchanges the contents of a JSON string with those of @a other. Does not
    invoke any move, copy, or swap operations on individual elements. All
    iterators and references remain valid. The past-the-end iterator is
    invalidated.

    @param[in,out] other string to exchange the contents with

    @throw type_error.310 when JSON value is not a string; example: `"cannot
    use swap() with boolean"`

    @complexity Constant.

    @liveexample{The example below shows how strings can be swapped with
    `swap()`.,swap__string_t}

    @since version 1.0.0
    */
    void swap(string_t& other) // NOLINT(bugprone-exception-escape)
    {
        // a value that does not hold a string cannot take part in the swap
        if (JSON_HEDLEY_UNLIKELY(!is_string()))
        {
            JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()), *this));
        }
        else
        {
            // exchange the stored string with the caller's string
            std::swap(*(m_value.string), other);
        }
    }

    /*!
    @brief exchanges the values

    Exchanges the contents of a JSON binary value with those of @a other. Does not
    invoke any move, copy, or swap operations on individual elements. All
    iterators and references remain valid. The past-the-end iterator is
    invalidated.

    @param[in,out] other binary to exchange the contents with

    @throw type_error.310 when JSON value is not a binary value; example: `"cannot
    use swap() with boolean"`

    @complexity Constant.

    @liveexample{The example below shows how binary values can be swapped with
    `swap()`.,swap__binary_t}

    @since version 3.8.0
    */
    void swap(binary_t& other) // NOLINT(bugprone-exception-escape)
    {
        // a value that does not hold binary data cannot take part in the swap
        if (JSON_HEDLEY_UNLIKELY(!is_binary()))
        {
            JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()), *this));
        }
        else
        {
            // exchange the stored binary value with the caller's one
            std::swap(*(m_value.binary), other);
        }
    }

    /// @copydoc swap(binary_t&)
    void swap(typename binary_t::container_type& other) // NOLINT(bugprone-exception-escape)
    {
        // a value that does not hold binary data cannot take part in the swap
        if (JSON_HEDLEY_UNLIKELY(!is_binary()))
        {
            JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()), *this));
        }
        else
        {
            // exchange the stored binary value with the caller's raw container
            std::swap(*(m_value.binary), other);
        }
    }

    /// @}

  public:
    //////////////////////////////////////////
    // lexicographical comparison operators //
    //////////////////////////////////////////

    /// @name lexicographical comparison operators
    /// @{

    /*!
    @brief comparison: equal

    Compares two JSON values for equality according to the following rules:
    - Two JSON values are equal if (1) they are from the same type and (2)
      their stored values are the same according to their respective
      `operator==`.
    - Integer and floating-point numbers are automatically converted before
      comparison. Note that two NaN values are always treated as unequal.
    - Two JSON null values are equal.

    @note Floating-point numbers inside JSON values are compared with
    `json::number_float_t::operator==` which is `double::operator==` by
    default. To compare floating-point values while respecting an epsilon, an alternative
    [comparison function](https://github.com/mariokonrad/marnav/blob/master/include/marnav/math/floatingpoint.hpp#L34-#L39)
    could be used, for instance
    @code {.cpp}
    template<typename T, typename = typename std::enable_if<std::is_floating_point<T>::value, T>::type>
    inline bool is_same(T a, T b, T epsilon = std::numeric_limits<T>::epsilon()) noexcept
    {
        return std::abs(a - b) <= epsilon;
    }
    @endcode
    Or you can define your own equality function like this:
    @code {.cpp}
    bool my_equal(const_reference lhs, const_reference rhs) {
    const auto lhs_type = lhs.type();
    const auto rhs_type = rhs.type();
    if (lhs_type == rhs_type) {
        switch(lhs_type)
            // self_defined case
            case value_t::number_float:
                return std::abs(lhs - rhs) <= std::numeric_limits<float>::epsilon();
            // other cases remain the same with the original
            ...
    }
    ...
    }
    @endcode

    @note NaN values never compare equal to themselves or to other NaN values.

    @param[in] lhs  first JSON value to consider
    @param[in] rhs  second JSON value to consider
    @return whether the values @a lhs and @a rhs are equal

    @exceptionsafety No-throw guarantee: this function never throws exceptions.

    @complexity Linear.

    @liveexample{The example demonstrates comparing several JSON
    types.,operator__equal}

    @since version 1.0.0
    */
    friend bool operator==(const_reference lhs, const_reference rhs) noexcept
    {
        // Equality of two JSON values:
        // - same type: compare the stored values with their own operator==
        //   (two nulls are equal, an unknown type tag compares unequal);
        // - two different number types: compare by value;
        // - any other combination of types: unequal.
        const auto lhs_type = lhs.type();
        const auto rhs_type = rhs.type();

        if (lhs_type == rhs_type)
        {
            switch (lhs_type)
            {
                case value_t::array:
                    return *lhs.m_value.array == *rhs.m_value.array;

                case value_t::object:
                    return *lhs.m_value.object == *rhs.m_value.object;

                case value_t::null:
                    return true;

                case value_t::string:
                    return *lhs.m_value.string == *rhs.m_value.string;

                case value_t::boolean:
                    return lhs.m_value.boolean == rhs.m_value.boolean;

                case value_t::number_integer:
                    return lhs.m_value.number_integer == rhs.m_value.number_integer;

                case value_t::number_unsigned:
                    return lhs.m_value.number_unsigned == rhs.m_value.number_unsigned;

                case value_t::number_float:
                    return lhs.m_value.number_float == rhs.m_value.number_float;

                case value_t::binary:
                    return *lhs.m_value.binary == *rhs.m_value.binary;

                default:
                    return false;
            }
        }
        else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_float)
        {
            return static_cast<number_float_t>(lhs.m_value.number_integer) == rhs.m_value.number_float;
        }
        else if (lhs_type == value_t::number_float && rhs_type == value_t::number_integer)
        {
            return lhs.m_value.number_float == static_cast<number_float_t>(rhs.m_value.number_integer);
        }
        else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_float)
        {
            return static_cast<number_float_t>(lhs.m_value.number_unsigned) == rhs.m_value.number_float;
        }
        else if (lhs_type == value_t::number_float && rhs_type == value_t::number_unsigned)
        {
            return lhs.m_value.number_float == static_cast<number_float_t>(rhs.m_value.number_unsigned);
        }
        else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_integer)
        {
            // A negative integer never equals an unsigned value. Casting the
            // unsigned side to the signed type instead would wrap large
            // values around and make e.g. the maximum unsigned value compare
            // equal to -1, so the non-negative signed side is converted.
            return rhs.m_value.number_integer >= 0
                   && lhs.m_value.number_unsigned == static_cast<number_unsigned_t>(rhs.m_value.number_integer);
        }
        else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_unsigned)
        {
            // mirror image of the branch above
            return lhs.m_value.number_integer >= 0
                   && static_cast<number_unsigned_t>(lhs.m_value.number_integer) == rhs.m_value.number_unsigned;
        }

        return false;
    }

    /*!
    @brief comparison: equal
    @copydoc operator==(const_reference, const_reference)
    */
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator==(const_reference lhs, ScalarType rhs) noexcept
    {
        // wrap the scalar in a JSON value and defer to the JSON/JSON overload
        const basic_json rhs_json(rhs);
        return lhs == rhs_json;
    }

    /*!
    @brief comparison: equal
    @copydoc operator==(const_reference, const_reference)
    */
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator==(ScalarType lhs, const_reference rhs) noexcept
    {
        return basic_json(lhs) == rhs;
    }

    /*!
    @brief comparison: not equal

    Compares two JSON values for inequality by calculating `not (lhs == rhs)`.

    @param[in] lhs  first JSON value to consider
    @param[in] rhs  second JSON value to consider
    @return whether the values @a lhs and @a rhs are not equal

    @complexity Linear.

    @exceptionsafety No-throw guarantee: this function never throws exceptions.

    @liveexample{The example demonstrates comparing several JSON
    types.,operator__notequal}

    @since version 1.0.0
    */
    friend bool operator!=(const_reference lhs, const_reference rhs) noexcept
    {
        // inequality is defined as the negation of operator==
        const bool values_equal = (lhs == rhs);
        return !values_equal;
    }

    /*!
    @brief comparison: not equal
    @copydoc operator!=(const_reference, const_reference)
    */
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator!=(const_reference lhs, ScalarType rhs) noexcept
    {
        // wrap the scalar in a JSON value and defer to the JSON/JSON overload
        const basic_json rhs_json(rhs);
        return lhs != rhs_json;
    }

    /*!
    @brief comparison: not equal
    @copydoc operator!=(const_reference, const_reference)
    */
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator!=(ScalarType lhs, const_reference rhs) noexcept
    {
        return basic_json(lhs) != rhs;
    }

    /*!
    @brief comparison: less than

    Compares whether one JSON value @a lhs is less than another JSON value @a
    rhs according to the following rules:
    - If @a lhs and @a rhs have the same type, the values are compared using
      the default `<` operator.
    - Integer and floating-point numbers are automatically converted before
      comparison
    - In case @a lhs and @a rhs have different types, the values are ignored
      and the order of the types is considered, see
      @ref operator<(const value_t, const value_t).

    @param[in] lhs  first JSON value to consider
    @param[in] rhs  second JSON value to consider
    @return whether @a lhs is less than @a rhs

    @complexity Linear.

    @exceptionsafety No-throw guarantee: this function never throws exceptions.

    @liveexample{The example demonstrates comparing several JSON
    types.,operator__less}

    @since version 1.0.0
    */
    friend bool operator<(const_reference lhs, const_reference rhs) noexcept
    {
        // Ordering of two JSON values:
        // - same type: compare the stored values with their own operator<
        //   (null is never less than null);
        // - two different number types: compare by value;
        // - any other combination: fall back to the ordering of the types.
        const auto lhs_type = lhs.type();
        const auto rhs_type = rhs.type();

        if (lhs_type == rhs_type)
        {
            switch (lhs_type)
            {
                case value_t::array:
                    // note parentheses are necessary, see
                    // https://github.com/nlohmann/json/issues/1530
                    return (*lhs.m_value.array) < (*rhs.m_value.array);

                case value_t::object:
                    return (*lhs.m_value.object) < (*rhs.m_value.object);

                case value_t::null:
                    return false;

                case value_t::string:
                    return (*lhs.m_value.string) < (*rhs.m_value.string);

                case value_t::boolean:
                    return (lhs.m_value.boolean) < (rhs.m_value.boolean);

                case value_t::number_integer:
                    return (lhs.m_value.number_integer) < (rhs.m_value.number_integer);

                case value_t::number_unsigned:
                    return (lhs.m_value.number_unsigned) < (rhs.m_value.number_unsigned);

                case value_t::number_float:
                    return (lhs.m_value.number_float) < (rhs.m_value.number_float);

                case value_t::binary:
                    return (*lhs.m_value.binary) < (*rhs.m_value.binary);

                default:
                    return false;
            }
        }
        else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_float)
        {
            return static_cast<number_float_t>(lhs.m_value.number_integer) < rhs.m_value.number_float;
        }
        else if (lhs_type == value_t::number_float && rhs_type == value_t::number_integer)
        {
            return lhs.m_value.number_float < static_cast<number_float_t>(rhs.m_value.number_integer);
        }
        else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_float)
        {
            return static_cast<number_float_t>(lhs.m_value.number_unsigned) < rhs.m_value.number_float;
        }
        else if (lhs_type == value_t::number_float && rhs_type == value_t::number_unsigned)
        {
            return lhs.m_value.number_float < static_cast<number_float_t>(rhs.m_value.number_unsigned);
        }
        else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_unsigned)
        {
            // A negative integer is below every unsigned value. Otherwise the
            // non-negative signed side is converted to unsigned; casting the
            // unsigned side to the signed type instead would wrap large
            // values around to negative numbers and invert the result.
            return lhs.m_value.number_integer < 0
                   || static_cast<number_unsigned_t>(lhs.m_value.number_integer) < rhs.m_value.number_unsigned;
        }
        else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_integer)
        {
            // An unsigned value is never below a negative integer; see the
            // branch above for why the signed side is the one converted.
            return rhs.m_value.number_integer >= 0
                   && lhs.m_value.number_unsigned < static_cast<number_unsigned_t>(rhs.m_value.number_integer);
        }

        // We only reach this line if we cannot compare values. In that case,
        // we compare types. Note we have to call the operator explicitly,
        // because MSVC has problems otherwise.
        return operator<(lhs_type, rhs_type);
    }

    /*!
    @brief comparison: less than
    @copydoc operator<(const_reference, const_reference)
    */
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator<(const_reference lhs, ScalarType rhs) noexcept
    {
        // wrap the scalar in a JSON value and defer to the JSON/JSON overload
        const basic_json rhs_json(rhs);
        return lhs < rhs_json;
    }

    /*!
    @brief comparison: less than
    @copydoc operator<(const_reference, const_reference)
    */
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator<(ScalarType lhs, const_reference rhs) noexcept
    {
        return basic_json(lhs) < rhs;
    }

    /*!
    @brief comparison: less than or equal

    Compares whether one JSON value @a lhs is less than or equal to another
    JSON value by calculating `not (rhs < lhs)`.

    @param[in] lhs  first JSON value to consider
    @param[in] rhs  second JSON value to consider
    @return whether @a lhs is less than or equal to @a rhs

    @complexity Linear.

    @exceptionsafety No-throw guarantee: this function never throws exceptions.

    @liveexample{The example demonstrates comparing several JSON
    types.,operator__lessequal}

    @since version 1.0.0
    */
    friend bool operator<=(const_reference lhs, const_reference rhs) noexcept
    {
        // lhs <= rhs is expressed through operator< as "rhs is not less than lhs"
        const bool rhs_is_less = (rhs < lhs);
        return !rhs_is_less;
    }

    /*!
    @brief comparison: less than or equal
    @copydoc operator<=(const_reference, const_reference)
    */
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator<=(const_reference lhs, ScalarType rhs) noexcept
    {
        // wrap the scalar in a JSON value and defer to the JSON/JSON overload
        const basic_json rhs_json(rhs);
        return lhs <= rhs_json;
    }

    /*!
    @brief comparison: less than or equal
    @copydoc operator<=(const_reference, const_reference)
    */
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator<=(ScalarType lhs, const_reference rhs) noexcept
    {
        return basic_json(lhs) <= rhs;
    }

    /*!
    @brief comparison: greater than

    Compares whether one JSON value @a lhs is greater than another
    JSON value by calculating `not (lhs <= rhs)`.

    @param[in] lhs  first JSON value to consider
    @param[in] rhs  second JSON value to consider
    @return whether @a lhs is greater than @a rhs

    @complexity Linear.

    @exceptionsafety No-throw guarantee: this function never throws exceptions.

    @liveexample{The example demonstrates comparing several JSON
    types.,operator__greater}

    @since version 1.0.0
    */
    friend bool operator>(const_reference lhs, const_reference rhs) noexcept
    {
        // lhs > rhs is expressed as the negation of lhs <= rhs
        const bool lhs_not_greater = (lhs <= rhs);
        return !lhs_not_greater;
    }

    /*!
    @brief comparison: greater than
    @copydoc operator>(const_reference, const_reference)
    */
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator>(const_reference lhs, ScalarType rhs) noexcept
    {
        // wrap the scalar in a JSON value and defer to the JSON/JSON overload
        const basic_json rhs_json(rhs);
        return lhs > rhs_json;
    }

    /*!
    @brief comparison: greater than
    @copydoc operator>(const_reference, const_reference)
    */
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator>(ScalarType lhs, const_reference rhs) noexcept
    {
        return basic_json(lhs) > rhs;
    }

    /*!
    @brief comparison: greater than or equal

    Compares whether one JSON value @a lhs is greater than or equal to another
    JSON value by calculating `not (lhs < rhs)`.

    @param[in] lhs  first JSON value to consider
    @param[in] rhs  second JSON value to consider
    @return whether @a lhs is greater than or equal to @a rhs

    @complexity Linear.

    @exceptionsafety No-throw guarantee: this function never throws exceptions.

    @liveexample{The example demonstrates comparing several JSON
    types.,operator__greaterequal}

    @since version 1.0.0
    */
    friend bool operator>=(const_reference lhs, const_reference rhs) noexcept
    {
        // lhs >= rhs is expressed as the negation of lhs < rhs
        const bool lhs_is_less = (lhs < rhs);
        return !lhs_is_less;
    }

    /*!
    @brief comparison: greater than or equal
    @copydoc operator>=(const_reference, const_reference)
    */
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator>=(const_reference lhs, ScalarType rhs) noexcept
    {
        // wrap the scalar in a JSON value and defer to the JSON/JSON overload
        const basic_json rhs_json(rhs);
        return lhs >= rhs_json;
    }

    /*!
    @brief comparison: greater than or equal
    @copydoc operator>=(const_reference, const_reference)
    */
    template<typename ScalarType, typename std::enable_if<
                 std::is_scalar<ScalarType>::value, int>::type = 0>
    friend bool operator>=(ScalarType lhs, const_reference rhs) noexcept
    {
        return basic_json(lhs) >= rhs;
    }

    /// @}

    ///////////////////
    // serialization //
    ///////////////////

    /// @name serialization
    /// @{

    /*!
    @brief serialize to stream

    Serialize the given JSON value @a j to the output stream @a o. The JSON
    value will be serialized using the @ref dump member function.

    - The indentation of the output can be controlled with the member variable
      `width` of the output stream @a o. For instance, using the manipulator
      `std::setw(4)` on @a o sets the indentation level to `4` and the
      serialization result is the same as calling `dump(4)`.

    - The indentation character can be controlled with the member variable
      `fill` of the output stream @a o. For instance, the manipulator
      `std::setfill('\\t')` sets indentation to use a tab character rather than
      the default space character.

    @param[in,out] o  stream to serialize to
    @param[in] j  JSON value to serialize

    @return the stream @a o

    @throw type_error.316 if a string stored inside the JSON value is not
                          UTF-8 encoded

    @complexity Linear.

    @liveexample{The example below shows the serialization with different
    parameters to `width` to adjust the indentation level.,operator_serialize}

    @since version 1.0.0; indentation character added in version 3.0.0
    */
    friend std::ostream& operator<<(std::ostream& o, const basic_json& j)
    {
        // a positive stream width switches on pretty-printing and doubles as
        // the indentation parameter; any other width means no indentation
        const auto stream_width = o.width();
        const bool pretty_print = stream_width > 0;
        const auto indentation = pretty_print ? stream_width : 0;

        // the width has been consumed: clear it so that later insertions into
        // this stream are not affected
        o.width(0);

        // hand the value to a serializer that writes into the stream and uses
        // the stream's fill character for indentation
        serializer s(detail::output_adapter<char>(o), o.fill());
        s.dump(j, pretty_print, false, static_cast<unsigned int>(indentation));
        return o;
    }

    /*!
    @brief serialize to stream
    @deprecated This stream operator is deprecated and will be removed in
                future 4.0.0 of the library. Please use
                @ref operator<<(std::ostream&, const basic_json&)
                instead; that is, replace calls like `j >> o;` with `o << j;`.
    @since version 1.0.0; deprecated since version 3.0.0
    */
    JSON_HEDLEY_DEPRECATED_FOR(3.0.0, operator<<(std::ostream&, const basic_json&))
    friend std::ostream& operator>>(const basic_json& j, std::ostream& o)
    {
        // legacy spelling `j >> o`: performs exactly `o << j` and hands the
        // stream back
        o << j;
        return o;
    }

    /// @}


    /////////////////////
    // deserialization //
    /////////////////////

    /// @name deserialization
    /// @{

    /*!
    @brief deserialize from a compatible input

    @tparam InputType A compatible input, for instance
    - an std::istream object
    - a FILE pointer
    - a C-style array of characters
    - a pointer to a null-terminated string of single byte characters
    - an object obj for which begin(obj) and end(obj) produces a valid pair of
      iterators.

    @param[in] i  input to read from
    @param[in] cb  a parser callback function of type @ref parser_callback_t
    which is used to control the deserialization by filtering unwanted values
    (optional)
    @param[in] allow_exceptions  whether to throw exceptions in case of a
    parse error (optional, true by default)
    @param[in] ignore_comments  whether comments should be ignored and treated
    like whitespace (true) or yield a parse error (false); (optional, false by
    default)

    @return deserialized JSON value; in case of a parse error and
            @a allow_exceptions set to `false`, the return value will be
            value_t::discarded.

    @throw parse_error.101 if a parse error occurs; example: `""unexpected end
    of input; expected string literal""`
    @throw parse_error.102 if to_unicode fails or surrogate error
    @throw parse_error.103 if to_unicode fails

    @complexity Linear in the length of the input. The parser is a predictive
    LL(1) parser. The complexity can be higher if the parser callback function
    @a cb or reading from the input @a i has a super-linear complexity.

    @note A UTF-8 byte order mark is silently ignored.

    @liveexample{The example below demonstrates the `parse()` function reading
    from an array.,parse__array__parser_callback_t}

    @liveexample{The example below demonstrates the `parse()` function with
    and without callback function.,parse__string__parser_callback_t}

    @liveexample{The example below demonstrates the `parse()` function with
    and without callback function.,parse__istream__parser_callback_t}

    @liveexample{The example below demonstrates the `parse()` function reading
    from a contiguous container.,parse__contiguouscontainer__parser_callback_t}

    @since version 2.0.3 (contiguous containers); version 3.9.0 allowed to
    ignore comments.
    */
    template<typename InputType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json parse(InputType&& i,
                            const parser_callback_t cb = nullptr,
                            const bool allow_exceptions = true,
                            const bool ignore_comments = false)
    {
        // the parser fills this value in place; it is returned whatever the
        // outcome of the parse
        basic_json parsed;

        // adapt the forwarded input, build a parser on top of it and run it
        // with `true` as its first argument
        parser(detail::input_adapter(std::forward<InputType>(i)),
               cb,
               allow_exceptions,
               ignore_comments).parse(true, parsed);

        return parsed;
    }

    /*!
    @brief deserialize from a pair of character iterators

    The value_type of the iterator must be an integral type with size of 1, 2 or
    4 bytes, which will be interpreted respectively as UTF-8, UTF-16 and UTF-32.

    @param[in] first iterator to start of character range
    @param[in] last  iterator to end of character range
    @param[in] cb  a parser callback function of type @ref parser_callback_t
    which is used to control the deserialization by filtering unwanted values
    (optional)
    @param[in] allow_exceptions  whether to throw exceptions in case of a
    parse error (optional, true by default)
    @param[in] ignore_comments  whether comments should be ignored and treated
    like whitespace (true) or yield a parse error (false); (optional, false by
    default)

    @return deserialized JSON value; in case of a parse error and
            @a allow_exceptions set to `false`, the return value will be
            value_t::discarded.

    @throw parse_error.101 if a parse error occurs; example: `""unexpected end
    of input; expected string literal""`
    @throw parse_error.102 if to_unicode fails or surrogate error
    @throw parse_error.103 if to_unicode fails
    */
    template<typename IteratorType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json parse(IteratorType first,
                            IteratorType last,
                            const parser_callback_t cb = nullptr,
                            const bool allow_exceptions = true,
                            const bool ignore_comments = false)
    {
        // the parser fills this value in place; it is returned whatever the
        // outcome of the parse
        basic_json parsed;

        // adapt the iterator pair (both iterators are moved into the
        // adapter), build a parser on top of it and run it
        parser(detail::input_adapter(std::move(first), std::move(last)),
               cb,
               allow_exceptions,
               ignore_comments).parse(true, parsed);

        return parsed;
    }

    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, parse(ptr, ptr + len))
    static basic_json parse(detail::span_input_adapter&& i,
                            const parser_callback_t cb = nullptr,
                            const bool allow_exceptions = true,
                            const bool ignore_comments = false)
    {
        // deprecated entry point: the span adapter already wraps the input,
        // so its underlying adapter is handed straight to the parser
        basic_json parsed;

        parser(i.get(),
               cb,
               allow_exceptions,
               ignore_comments).parse(true, parsed);

        return parsed;
    }

    /*!
    @brief check if the input is valid JSON

    Unlike the @ref parse(InputType&&, const parser_callback_t,const bool)
    function, this function neither throws an exception in case of invalid JSON
    input (i.e., a parse error) nor creates diagnostic information.

    @tparam InputType A compatible input, for instance
    - an std::istream object
    - a FILE pointer
    - a C-style array of characters
    - a pointer to a null-terminated string of single byte characters
    - an object obj for which begin(obj) and end(obj) produces a valid pair of
      iterators.

    @param[in] i input to read from
    @param[in] ignore_comments  whether comments should be ignored and treated
    like whitespace (true) or yield a parse error (false); (optional, false by
    default)

    @return Whether the input read from @a i is valid JSON.

    @complexity Linear in the length of the input. The parser is a predictive
    LL(1) parser.

    @note A UTF-8 byte order mark is silently ignored.

    @liveexample{The example below demonstrates the `accept()` function reading
    from a string.,accept__string}
    */
    template<typename InputType>
    static bool accept(InputType&& i,
                       const bool ignore_comments = false)
    {
        // build a parser without callback and with exceptions switched off,
        // then report what its accept() call returns
        const bool accepted = parser(detail::input_adapter(std::forward<InputType>(i)),
                                     nullptr,
                                     false,
                                     ignore_comments).accept(true);
        return accepted;
    }

    template<typename IteratorType>
    static bool accept(IteratorType first, IteratorType last,
                       const bool ignore_comments = false)
    {
        // Iterator-pair variant: same settings as the single-input overload
        // (no callback, exceptions disabled).
        auto&& checker = parser(detail::input_adapter(std::move(first), std::move(last)),
                                nullptr,
                                false,
                                ignore_comments);
        return checker.accept(true);
    }

    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, accept(ptr, ptr + len))
    static bool accept(detail::span_input_adapter&& i,
                       const bool ignore_comments = false)
    {
        // Deprecated overload (see the attribute above): validate the input
        // exposed by the span wrapper without a callback or exceptions.
        auto&& checker = parser(i.get(), nullptr, false, ignore_comments);
        return checker.accept(true);
    }

    /*!
    @brief generate SAX events

    The SAX event listener must follow the interface of @ref json_sax.

    This function reads from a compatible input. Examples are:
    - an std::istream object
    - a FILE pointer
    - a C-style array of characters
    - a pointer to a null-terminated string of single byte characters
    - an object obj for which begin(obj) and end(obj) produces a valid pair of
      iterators.

    @param[in] i  input to read from
    @param[in,out] sax  SAX event listener
    @param[in] format  the format to parse (JSON, CBOR, MessagePack, or UBJSON)
    @param[in] strict  whether the input has to be consumed completely
    @param[in] ignore_comments  whether comments should be ignored and treated
    like whitespace (true) or yield a parse error (false); (optional, false by
    default); only applies to the JSON file format.

    @return return value of the last processed SAX event

    @throw parse_error.101 if a parse error occurs; example: `""unexpected end
    of input; expected string literal""`
    @throw parse_error.102 if to_unicode fails or surrogate error
    @throw parse_error.103 if to_unicode fails

    @complexity Linear in the length of the input. The parser is a predictive
    LL(1) parser. The complexity can be higher if the SAX consumer @a sax has
    a super-linear complexity.

    @note A UTF-8 byte order mark is silently ignored.

    @liveexample{The example below demonstrates the `sax_parse()` function
    reading from string and processing the events with a user-defined SAX
    event consumer.,sax_parse}

    @since version 3.2.0
    */
    template <typename InputType, typename SAX>
    JSON_HEDLEY_NON_NULL(2)
    static bool sax_parse(InputType&& i, SAX* sax,
                          input_format_t format = input_format_t::json,
                          const bool strict = true,
                          const bool ignore_comments = false)
    {
        auto adapter = detail::input_adapter(std::forward<InputType>(i));

        // Textual JSON goes through the regular parser (no callback,
        // exceptions allowed) ...
        if (format == input_format_t::json)
        {
            return parser(std::move(adapter), nullptr, true, ignore_comments).sax_parse(sax, strict);
        }

        // ... every other format is handled by the binary reader.
        using reader_type = detail::binary_reader<basic_json, decltype(adapter), SAX>;
        return reader_type(std::move(adapter)).sax_parse(format, sax, strict);
    }

    template<class IteratorType, class SAX>
    JSON_HEDLEY_NON_NULL(3)
    static bool sax_parse(IteratorType first, IteratorType last, SAX* sax,
                          input_format_t format = input_format_t::json,
                          const bool strict = true,
                          const bool ignore_comments = false)
    {
        auto adapter = detail::input_adapter(std::move(first), std::move(last));

        // Textual JSON goes through the regular parser (no callback,
        // exceptions allowed) ...
        if (format == input_format_t::json)
        {
            return parser(std::move(adapter), nullptr, true, ignore_comments).sax_parse(sax, strict);
        }

        // ... every other format is handled by the binary reader.
        using reader_type = detail::binary_reader<basic_json, decltype(adapter), SAX>;
        return reader_type(std::move(adapter)).sax_parse(format, sax, strict);
    }

    template <typename SAX>
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, sax_parse(ptr, ptr + len, ...))
    JSON_HEDLEY_NON_NULL(2)
    static bool sax_parse(detail::span_input_adapter&& i, SAX* sax,
                          input_format_t format = input_format_t::json,
                          const bool strict = true,
                          const bool ignore_comments = false)
    {
        // Deprecated overload (see the attribute above): take the adapter out
        // of the span wrapper, then dispatch on the requested format.
        auto adapter = i.get();

        if (format == input_format_t::json)
        {
            // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
            return parser(std::move(adapter), nullptr, true, ignore_comments).sax_parse(sax, strict);
        }

        using reader_type = detail::binary_reader<basic_json, decltype(adapter), SAX>;
        // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
        return reader_type(std::move(adapter)).sax_parse(format, sax, strict);
    }

    /*!
    @brief deserialize from stream
    @deprecated This stream operator is deprecated and will be removed in
                version 4.0.0 of the library. Please use
                @ref operator>>(std::istream&, basic_json&)
                instead; that is, replace calls like `j << i;` with `i >> j;`.
    @since version 1.0.0; deprecated since version 3.0.0
    */
    JSON_HEDLEY_DEPRECATED_FOR(3.0.0, operator>>(std::istream&, basic_json&))
    friend std::istream& operator<<(basic_json& j, std::istream& i)
    {
        // Deprecated spelling: forward to operator>> with the operands swapped.
        std::istream& stream = operator>>(i, j);
        return stream;
    }

    /*!
    @brief deserialize from stream

    Deserializes an input stream to a JSON value.

    @param[in,out] i  input stream to read a serialized JSON value from
    @param[in,out] j  JSON value to write the deserialized input to

    @throw parse_error.101 in case of an unexpected token
    @throw parse_error.102 if to_unicode fails or surrogate error
    @throw parse_error.103 if to_unicode fails

    @complexity Linear in the length of the input. The parser is a predictive
    LL(1) parser.

    @note A UTF-8 byte order mark is silently ignored.

    @liveexample{The example below shows how a JSON value is constructed by
    reading a serialization from a stream.,operator_deserialize}

    @sa parse(std::istream&, const parser_callback_t) for a variant with a
    parser callback function to filter values while parsing

    @since version 1.0.0
    */
    friend std::istream& operator>>(std::istream& i, basic_json& j)
    {
        {
            // Keep the parser (and the stream adapter it owns) inside this
            // scope so that it is destroyed before the stream is returned.
            auto&& reader = parser(detail::input_adapter(i));
            reader.parse(false, j);
        }
        return i;
    }

    /// @}

    ///////////////////////////
    // convenience functions //
    ///////////////////////////

    /*!
    @brief return the type as string

    Returns the type name as string to be used in error messages - usually to
    indicate that a function was called on a wrong JSON type.

    @return a string representation of the @a m_type member:
            Value type  | return value
            ----------- | -------------
            null        | `"null"`
            boolean     | `"boolean"`
            string      | `"string"`
            number      | `"number"` (for all number types)
            object      | `"object"`
            array       | `"array"`
            binary      | `"binary"`
            discarded   | `"discarded"`

    @exceptionsafety No-throw guarantee: this function never throws exceptions.

    @complexity Constant.

    @liveexample{The following code exemplifies `type_name()` for all JSON
    types.,type_name}

    @sa see @ref type() -- return the type of the JSON value
    @sa see @ref operator value_t() -- return the type of the JSON value (implicit)

    @since version 1.0.0, public since 2.1.0, `const char*` and `noexcept`
    since 3.0.0
    */
    JSON_HEDLEY_RETURNS_NON_NULL
    const char* type_name() const noexcept
    {
        // Any type not listed explicitly below (i.e. all number types) is
        // reported as "number".
        const char* label = "number";

        switch (m_type)
        {
            case value_t::null:
                label = "null";
                break;
            case value_t::boolean:
                label = "boolean";
                break;
            case value_t::string:
                label = "string";
                break;
            case value_t::object:
                label = "object";
                break;
            case value_t::array:
                label = "array";
                break;
            case value_t::binary:
                label = "binary";
                break;
            case value_t::discarded:
                label = "discarded";
                break;
            default:
                break;
        }

        return label;
    }


  JSON_PRIVATE_UNLESS_TESTED:
    //////////////////////
    // member variables //
    //////////////////////

    /// the type of the current element (defaults to value_t::null through the
    /// member initializer)
    value_t m_type = value_t::null;

    /// the value of the current element (initialized from an empty brace list
    /// by default)
    json_value m_value = {};

#if JSON_DIAGNOSTICS
    /// a pointer to a parent value (for debugging purposes); this member is
    /// only compiled in when JSON_DIAGNOSTICS is non-zero and starts as nullptr
    basic_json* m_parent = nullptr;
#endif

    //////////////////////////////////////////
    // binary serialization/deserialization //
    //////////////////////////////////////////

    /// @name binary serialization/deserialization support
    /// @{

  public:
    /*!
    @brief create a CBOR serialization of a given JSON value

    Serializes a given JSON value @a j to a byte vector using the CBOR (Concise
    Binary Object Representation) serialization format. CBOR is a binary
    serialization format which aims to be more compact than JSON itself, yet
    more efficient to parse.

    The library uses the following mapping from JSON values types to
    CBOR types according to the CBOR specification (RFC 7049):

    JSON value type | value/range                                | CBOR type                          | first byte
    --------------- | ------------------------------------------ | ---------------------------------- | ---------------
    null            | `null`                                     | Null                               | 0xF6
    boolean         | `true`                                     | True                               | 0xF5
    boolean         | `false`                                    | False                              | 0xF4
    number_integer  | -9223372036854775808..-2147483649          | Negative integer (8 bytes follow)  | 0x3B
    number_integer  | -2147483648..-32769                        | Negative integer (4 bytes follow)  | 0x3A
    number_integer  | -32768..-129                               | Negative integer (2 bytes follow)  | 0x39
    number_integer  | -128..-25                                  | Negative integer (1 byte follow)   | 0x38
    number_integer  | -24..-1                                    | Negative integer                   | 0x20..0x37
    number_integer  | 0..23                                      | Integer                            | 0x00..0x17
    number_integer  | 24..255                                    | Unsigned integer (1 byte follow)   | 0x18
    number_integer  | 256..65535                                 | Unsigned integer (2 bytes follow)  | 0x19
    number_integer  | 65536..4294967295                          | Unsigned integer (4 bytes follow)  | 0x1A
    number_integer  | 4294967296..18446744073709551615           | Unsigned integer (8 bytes follow)  | 0x1B
    number_unsigned | 0..23                                      | Integer                            | 0x00..0x17
    number_unsigned | 24..255                                    | Unsigned integer (1 byte follow)   | 0x18
    number_unsigned | 256..65535                                 | Unsigned integer (2 bytes follow)  | 0x19
    number_unsigned | 65536..4294967295                          | Unsigned integer (4 bytes follow)  | 0x1A
    number_unsigned | 4294967296..18446744073709551615           | Unsigned integer (8 bytes follow)  | 0x1B
    number_float    | *any value representable by a float*       | Single-Precision Float             | 0xFA
    number_float    | *any value NOT representable by a float*   | Double-Precision Float             | 0xFB
    string          | *length*: 0..23                            | UTF-8 string                       | 0x60..0x77
    string          | *length*: 24..255                          | UTF-8 string (1 byte follow)       | 0x78
    string          | *length*: 256..65535                       | UTF-8 string (2 bytes follow)      | 0x79
    string          | *length*: 65536..4294967295                | UTF-8 string (4 bytes follow)      | 0x7A
    string          | *length*: 4294967296..18446744073709551615 | UTF-8 string (8 bytes follow)      | 0x7B
    array           | *size*: 0..23                              | array                              | 0x80..0x97
    array           | *size*: 24..255                            | array (1 byte follow)              | 0x98
    array           | *size*: 256..65535                         | array (2 bytes follow)             | 0x99
    array           | *size*: 65536..4294967295                  | array (4 bytes follow)             | 0x9A
    array           | *size*: 4294967296..18446744073709551615   | array (8 bytes follow)             | 0x9B
    object          | *size*: 0..23                              | map                                | 0xA0..0xB7
    object          | *size*: 24..255                            | map (1 byte follow)                | 0xB8
    object          | *size*: 256..65535                         | map (2 bytes follow)               | 0xB9
    object          | *size*: 65536..4294967295                  | map (4 bytes follow)               | 0xBA
    object          | *size*: 4294967296..18446744073709551615   | map (8 bytes follow)               | 0xBB
    binary          | *size*: 0..23                              | byte string                        | 0x40..0x57
    binary          | *size*: 24..255                            | byte string (1 byte follow)        | 0x58
    binary          | *size*: 256..65535                         | byte string (2 bytes follow)       | 0x59
    binary          | *size*: 65536..4294967295                  | byte string (4 bytes follow)       | 0x5A
    binary          | *size*: 4294967296..18446744073709551615   | byte string (8 bytes follow)       | 0x5B

    @note The mapping is **complete** in the sense that any JSON value type
          can be converted to a CBOR value.

    @note If NaN or Infinity are stored inside a JSON number, they are
          serialized properly. This behavior differs from the @ref dump()
          function which serializes NaN or Infinity to `null`.

    @note The following CBOR types are not used in the conversion:
          - UTF-8 strings terminated by "break" (0x7F)
          - arrays terminated by "break" (0x9F)
          - maps terminated by "break" (0xBF)
          - byte strings terminated by "break" (0x5F)
          - date/time (0xC0..0xC1)
          - bignum (0xC2..0xC3)
          - decimal fraction (0xC4)
          - bigfloat (0xC5)
          - expected conversions (0xD5..0xD7)
          - simple values (0xE0..0xF3, 0xF8)
          - undefined (0xF7)
          - half-precision floats (0xF9)
          - break (0xFF)

    @param[in] j  JSON value to serialize
    @return CBOR serialization as byte vector

    @complexity Linear in the size of the JSON value @a j.

    @liveexample{The example shows the serialization of a JSON value to a byte
    vector in CBOR format.,to_cbor}

    @sa http://cbor.io
    @sa see @ref from_cbor(InputType&&, const bool, const bool, const cbor_tag_handler_t) for the
        analogous deserialization
    @sa see @ref to_msgpack(const basic_json&) for the related MessagePack format
    @sa see @ref to_ubjson(const basic_json&, const bool, const bool) for the
             related UBJSON format

    @since version 2.0.9; compact representation of floating-point numbers
           since version 3.8.0
    */
    static std::vector<uint8_t> to_cbor(const basic_json& j)
    {
        // Delegate to the output-adapter overload, using a fresh vector as
        // the sink.
        std::vector<uint8_t> bytes;
        to_cbor(j, bytes);
        return bytes;
    }

    static void to_cbor(const basic_json& j, detail::output_adapter<uint8_t> o)
    {
        binary_writer<uint8_t>(o).write_cbor(j);
    }

    static void to_cbor(const basic_json& j, detail::output_adapter<char> o)
    {
        binary_writer<char>(o).write_cbor(j);
    }

    /*!
    @brief create a MessagePack serialization of a given JSON value

    Serializes a given JSON value @a j to a byte vector using the MessagePack
    serialization format. MessagePack is a binary serialization format which
    aims to be more compact than JSON itself, yet more efficient to parse.

    The library uses the following mapping from JSON values types to
    MessagePack types according to the MessagePack specification:

    JSON value type | value/range                       | MessagePack type | first byte
    --------------- | --------------------------------- | ---------------- | ----------
    null            | `null`                            | nil              | 0xC0
    boolean         | `true`                            | true             | 0xC3
    boolean         | `false`                           | false            | 0xC2
    number_integer  | -9223372036854775808..-2147483649 | int64            | 0xD3
    number_integer  | -2147483648..-32769               | int32            | 0xD2
    number_integer  | -32768..-129                      | int16            | 0xD1
    number_integer  | -128..-33                         | int8             | 0xD0
    number_integer  | -32..-1                           | negative fixint  | 0xE0..0xFF
    number_integer  | 0..127                            | positive fixint  | 0x00..0x7F
    number_integer  | 128..255                          | uint 8           | 0xCC
    number_integer  | 256..65535                        | uint 16          | 0xCD
    number_integer  | 65536..4294967295                 | uint 32          | 0xCE
    number_integer  | 4294967296..18446744073709551615  | uint 64          | 0xCF
    number_unsigned | 0..127                            | positive fixint  | 0x00..0x7F
    number_unsigned | 128..255                          | uint 8           | 0xCC
    number_unsigned | 256..65535                        | uint 16          | 0xCD
    number_unsigned | 65536..4294967295                 | uint 32          | 0xCE
    number_unsigned | 4294967296..18446744073709551615  | uint 64          | 0xCF
    number_float    | *any value representable by a float*     | float 32 | 0xCA
    number_float    | *any value NOT representable by a float* | float 64 | 0xCB
    string          | *length*: 0..31                   | fixstr           | 0xA0..0xBF
    string          | *length*: 32..255                 | str 8            | 0xD9
    string          | *length*: 256..65535              | str 16           | 0xDA
    string          | *length*: 65536..4294967295       | str 32           | 0xDB
    array           | *size*: 0..15                     | fixarray         | 0x90..0x9F
    array           | *size*: 16..65535                 | array 16         | 0xDC
    array           | *size*: 65536..4294967295         | array 32         | 0xDD
    object          | *size*: 0..15                     | fix map          | 0x80..0x8F
    object          | *size*: 16..65535                 | map 16           | 0xDE
    object          | *size*: 65536..4294967295         | map 32           | 0xDF
    binary          | *size*: 0..255                    | bin 8            | 0xC4
    binary          | *size*: 256..65535                | bin 16           | 0xC5
    binary          | *size*: 65536..4294967295         | bin 32           | 0xC6

    @note The mapping is **complete** in the sense that any JSON value type
          can be converted to a MessagePack value.

    @note The following values can **not** be converted to a MessagePack value:
          - strings with more than 4294967295 bytes
          - byte strings with more than 4294967295 bytes
          - arrays with more than 4294967295 elements
          - objects with more than 4294967295 elements

    @note Any MessagePack output created by @ref to_msgpack can be successfully
          parsed by @ref from_msgpack.

    @note If NaN or Infinity are stored inside a JSON number, they are
          serialized properly. This behavior differs from the @ref dump()
          function which serializes NaN or Infinity to `null`.

    @param[in] j  JSON value to serialize
    @return MessagePack serialization as byte vector

    @complexity Linear in the size of the JSON value @a j.

    @liveexample{The example shows the serialization of a JSON value to a byte
    vector in MessagePack format.,to_msgpack}

    @sa http://msgpack.org
    @sa see @ref from_msgpack for the analogous deserialization
    @sa see @ref to_cbor(const basic_json&) for the related CBOR format
    @sa see @ref to_ubjson(const basic_json&, const bool, const bool) for the
             related UBJSON format

    @since version 2.0.9
    */
    static std::vector<uint8_t> to_msgpack(const basic_json& j)
    {
        // Delegate to the output-adapter overload, using a fresh vector as
        // the sink.
        std::vector<uint8_t> bytes;
        to_msgpack(j, bytes);
        return bytes;
    }

    static void to_msgpack(const basic_json& j, detail::output_adapter<uint8_t> o)
    {
        binary_writer<uint8_t>(o).write_msgpack(j);
    }

    static void to_msgpack(const basic_json& j, detail::output_adapter<char> o)
    {
        binary_writer<char>(o).write_msgpack(j);
    }

    /*!
    @brief create a UBJSON serialization of a given JSON value

    Serializes a given JSON value @a j to a byte vector using the UBJSON
    (Universal Binary JSON) serialization format. UBJSON aims to be more compact
    than JSON itself, yet more efficient to parse.

    The library uses the following mapping from JSON values types to
    UBJSON types according to the UBJSON specification:

    JSON value type | value/range                       | UBJSON type | marker
    --------------- | --------------------------------- | ----------- | ------
    null            | `null`                            | null        | `Z`
    boolean         | `true`                            | true        | `T`
    boolean         | `false`                           | false       | `F`
    number_integer  | -9223372036854775808..-2147483649 | int64       | `L`
    number_integer  | -2147483648..-32769               | int32       | `l`
    number_integer  | -32768..-129                      | int16       | `I`
    number_integer  | -128..127                         | int8        | `i`
    number_integer  | 128..255                          | uint8       | `U`
    number_integer  | 256..32767                        | int16       | `I`
    number_integer  | 32768..2147483647                 | int32       | `l`
    number_integer  | 2147483648..9223372036854775807   | int64       | `L`
    number_unsigned | 0..127                            | int8        | `i`
    number_unsigned | 128..255                          | uint8       | `U`
    number_unsigned | 256..32767                        | int16       | `I`
    number_unsigned | 32768..2147483647                 | int32       | `l`
    number_unsigned | 2147483648..9223372036854775807   | int64       | `L`
    number_unsigned | 9223372036854775808..18446744073709551615 | high-precision | `H`
    number_float    | *any value*                       | float64     | `D`
    string          | *with shortest length indicator*  | string      | `S`
    array           | *see notes on optimized format*   | array       | `[`
    object          | *see notes on optimized format*   | map         | `{`

    @note The mapping is **complete** in the sense that any JSON value type
          can be converted to a UBJSON value.

    @note The following values can **not** be converted to a UBJSON value:
          - strings with more than 9223372036854775807 bytes (theoretical)

    @note The following markers are not used in the conversion:
          - `N`: no-op values are not created.
          - `C`: single-byte strings are serialized with `S` markers.

    @note Any UBJSON output created by @ref to_ubjson can be successfully parsed
          by @ref from_ubjson.

    @note If NaN or Infinity are stored inside a JSON number, they are
          serialized properly. This behavior differs from the @ref dump()
          function which serializes NaN or Infinity to `null`.

    @note The optimized formats for containers are supported: Parameter
          @a use_size adds size information to the beginning of a container and
          removes the closing marker. Parameter @a use_type further checks
          whether all elements of a container have the same type and adds the
          type marker to the beginning of the container. The @a use_type
          parameter must only be used together with @a use_size = true. Note
          that @a use_size = true alone may result in larger representations -
          the benefit of this parameter is that the receiving side is
          immediately informed on the number of elements of the container.

    @note If the JSON data contains the binary type, the value stored is a list
          of integers, as suggested by the UBJSON documentation.  In particular,
          this means that serialization and the deserialization of a JSON
          containing binary values into UBJSON and back will result in a
          different JSON object.

    @param[in] j  JSON value to serialize
    @param[in] use_size  whether to add size annotations to container types
    @param[in] use_type  whether to add type annotations to container types
                         (must be combined with @a use_size = true)
    @return UBJSON serialization as byte vector

    @complexity Linear in the size of the JSON value @a j.

    @liveexample{The example shows the serialization of a JSON value to a byte
    vector in UBJSON format.,to_ubjson}

    @sa http://ubjson.org
    @sa see @ref from_ubjson(InputType&&, const bool, const bool) for the
        analogous deserialization
    @sa see @ref to_cbor(const basic_json&) for the related CBOR format
    @sa see @ref to_msgpack(const basic_json&) for the related MessagePack format

    @since version 3.1.0
    */
    static std::vector<uint8_t> to_ubjson(const basic_json& j,
                                          const bool use_size = false,
                                          const bool use_type = false)
    {
        // Delegate to the output-adapter overload, using a fresh vector as
        // the sink; both container flags are passed through unchanged.
        std::vector<uint8_t> bytes;
        to_ubjson(j, bytes, use_size, use_type);
        return bytes;
    }

    static void to_ubjson(const basic_json& j, detail::output_adapter<uint8_t> o,
                          const bool use_size = false, const bool use_type = false)
    {
        binary_writer<uint8_t>(o).write_ubjson(j, use_size, use_type);
    }

    static void to_ubjson(const basic_json& j, detail::output_adapter<char> o,
                          const bool use_size = false, const bool use_type = false)
    {
        binary_writer<char>(o).write_ubjson(j, use_size, use_type);
    }


    /*!
    @brief Serializes the given JSON object `j` to BSON and returns a vector
           containing the corresponding BSON-representation.

    BSON (Binary JSON) is a binary format in which zero or more ordered key/value pairs are
    stored as a single entity (a so-called document).

    The library uses the following mapping from JSON values types to BSON types:

    JSON value type | value/range                       | BSON type   | marker
    --------------- | --------------------------------- | ----------- | ------
    null            | `null`                            | null        | 0x0A
    boolean         | `true`, `false`                   | boolean     | 0x08
    number_integer  | -9223372036854775808..-2147483649 | int64       | 0x12
    number_integer  | -2147483648..2147483647           | int32       | 0x10
    number_integer  | 2147483648..9223372036854775807   | int64       | 0x12
    number_unsigned | 0..2147483647                     | int32       | 0x10
    number_unsigned | 2147483648..9223372036854775807   | int64       | 0x12
    number_unsigned | 9223372036854775808..18446744073709551615| --   | --
    number_float    | *any value*                       | double      | 0x01
    string          | *any value*                       | string      | 0x02
    array           | *any value*                       | document    | 0x04
    object          | *any value*                       | document    | 0x03
    binary          | *any value*                       | binary      | 0x05

    @warning The mapping is **incomplete**, since only JSON-objects (and things
    contained therein) can be serialized to BSON.
    Also, integers larger than 9223372036854775807 cannot be serialized to BSON,
    and the keys may not contain U+0000, since they are serialized as
    zero-terminated C strings.

    @throw out_of_range.407  if `j.is_number_unsigned() && j.get<std::uint64_t>() > 9223372036854775807`
    @throw out_of_range.409  if a key in `j` contains a NULL (U+0000)
    @throw type_error.317    if `!j.is_object()`

    @pre The input `j` is required to be an object: `j.is_object() == true`.

    @note Any BSON output created via @ref to_bson can be successfully parsed
          by @ref from_bson.

    @param[in] j  JSON value to serialize
    @return BSON serialization as byte vector

    @complexity Linear in the size of the JSON value @a j.

    @liveexample{The example shows the serialization of a JSON value to a byte
    vector in BSON format.,to_bson}

    @sa http://bsonspec.org/spec.html
    @sa see @ref from_bson(detail::input_adapter&&, const bool strict) for the
        analogous deserialization
    @sa see @ref to_ubjson(const basic_json&, const bool, const bool) for the
             related UBJSON format
    @sa see @ref to_cbor(const basic_json&) for the related CBOR format
    @sa see @ref to_msgpack(const basic_json&) for the related MessagePack format
    */
    static std::vector<uint8_t> to_bson(const basic_json& j)
    {
        // Delegate to the output-adapter overload, using a fresh vector as
        // the sink.
        std::vector<uint8_t> bytes;
        to_bson(j, bytes);
        return bytes;
    }

    /*!
    @brief Serializes the given JSON object `j` to BSON and forwards the
           corresponding BSON-representation to the given output_adapter `o`.
    @param j The JSON object to convert to BSON.
    @param o The output adapter that receives the binary BSON representation.
    @pre The input `j` shall be an object: `j.is_object() == true`
    @sa see @ref to_bson(const basic_json&)
    */
    static void to_bson(const basic_json& j, detail::output_adapter<uint8_t> o)
    {
        binary_writer<uint8_t>(o).write_bson(j);
    }

    /*!
    @copydoc to_bson(const basic_json&, detail::output_adapter<uint8_t>)
    */
    static void to_bson(const basic_json& j, detail::output_adapter<char> o)
    {
        binary_writer<char>(o).write_bson(j);
    }


    /*!
    @brief create a JSON value from an input in CBOR format

    Deserializes a given input @a i to a JSON value using the CBOR (Concise
    Binary Object Representation) serialization format.

    The library maps CBOR types to JSON value types as follows:

    CBOR type              | JSON value type | first byte
    ---------------------- | --------------- | ----------
    Integer                | number_unsigned | 0x00..0x17
    Unsigned integer       | number_unsigned | 0x18
    Unsigned integer       | number_unsigned | 0x19
    Unsigned integer       | number_unsigned | 0x1A
    Unsigned integer       | number_unsigned | 0x1B
    Negative integer       | number_integer  | 0x20..0x37
    Negative integer       | number_integer  | 0x38
    Negative integer       | number_integer  | 0x39
    Negative integer       | number_integer  | 0x3A
    Negative integer       | number_integer  | 0x3B
    Byte string            | binary          | 0x40..0x57
    Byte string            | binary          | 0x58
    Byte string            | binary          | 0x59
    Byte string            | binary          | 0x5A
    Byte string            | binary          | 0x5B
    UTF-8 string           | string          | 0x60..0x77
    UTF-8 string           | string          | 0x78
    UTF-8 string           | string          | 0x79
    UTF-8 string           | string          | 0x7A
    UTF-8 string           | string          | 0x7B
    UTF-8 string           | string          | 0x7F
    array                  | array           | 0x80..0x97
    array                  | array           | 0x98
    array                  | array           | 0x99
    array                  | array           | 0x9A
    array                  | array           | 0x9B
    array                  | array           | 0x9F
    map                    | object          | 0xA0..0xB7
    map                    | object          | 0xB8
    map                    | object          | 0xB9
    map                    | object          | 0xBA
    map                    | object          | 0xBB
    map                    | object          | 0xBF
    False                  | `false`         | 0xF4
    True                   | `true`          | 0xF5
    Null                   | `null`          | 0xF6
    Half-Precision Float   | number_float    | 0xF9
    Single-Precision Float | number_float    | 0xFA
    Double-Precision Float | number_float    | 0xFB

    @warning The mapping is **incomplete** in the sense that not all CBOR
             types can be converted to a JSON value. The following CBOR types
             are not supported and will yield parse errors (parse_error.112):
             - date/time (0xC0..0xC1)
             - bignum (0xC2..0xC3)
             - decimal fraction (0xC4)
             - bigfloat (0xC5)
             - expected conversions (0xD5..0xD7)
             - simple values (0xE0..0xF3, 0xF8)
             - undefined (0xF7)

    @warning CBOR allows map keys of any type, whereas JSON only allows
             strings as keys in object values. Therefore, CBOR maps with keys
             other than UTF-8 strings are rejected (parse_error.113).

    @note Any CBOR output created by @ref to_cbor can be successfully parsed
          by @ref from_cbor.

    @param[in] i  an input in CBOR format convertible to an input adapter
    @param[in] strict  whether to expect the input to be consumed until EOF
                       (true by default)
    @param[in] allow_exceptions  whether to throw exceptions in case of a
    parse error (optional, true by default)
    @param[in] tag_handler how to treat CBOR tags (optional, error by default)

    @return deserialized JSON value; in case of a parse error and
            @a allow_exceptions set to `false`, the return value will be
            value_t::discarded.

    @throw parse_error.110 if the given input ends prematurely or the end of
    file was not reached when @a strict was set to true
    @throw parse_error.112 if unsupported features from CBOR were
    used in the given input @a i or if the input is not valid CBOR
    @throw parse_error.113 if a string was expected as map key, but not found

    @complexity Linear in the size of the input @a i.

    @liveexample{The example shows the deserialization of a byte vector in CBOR
    format to a JSON value.,from_cbor}

    @sa http://cbor.io
    @sa see @ref to_cbor(const basic_json&) for the analogous serialization
    @sa see @ref from_msgpack(InputType&&, const bool, const bool) for the
        related MessagePack format
    @sa see @ref from_ubjson(InputType&&, const bool, const bool) for the
        related UBJSON format

    @since version 2.0.9; parameter @a start_index since 2.1.1; changed to
           consume input adapters, removed start_index parameter, and added
           @a strict parameter since 3.0.0; added @a allow_exceptions parameter
           since 3.2.0; added @a tag_handler parameter since 3.9.0.
    */
    template<typename InputType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_cbor(InputType&& i,
                                const bool strict = true,
                                const bool allow_exceptions = true,
                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
    {
        basic_json result;
        // SAX consumer that is handed `result` as the value to populate
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::forward<InputType>(i));
        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
        if (!res)
        {
            // parse error without exception: report a discarded value
            return basic_json(value_t::discarded);
        }
        // Return the local by name: `res ? result : basic_json(...)` yields a
        // prvalue that is copy-constructed from the lvalue `result`, whereas a
        // plain return statement allows the value to be moved or elided.
        return result;
    }

    /*!
    @copydoc from_cbor(InputType&&, const bool, const bool, const cbor_tag_handler_t)
    */
    template<typename IteratorType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_cbor(IteratorType first, IteratorType last,
                                const bool strict = true,
                                const bool allow_exceptions = true,
                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
    {
        basic_json result;
        // SAX consumer that is handed `result` as the value to populate
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::move(first), std::move(last));
        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
        if (!res)
        {
            // parse error without exception: report a discarded value
            return basic_json(value_t::discarded);
        }
        // Return the local by name: `res ? result : basic_json(...)` yields a
        // prvalue that is copy-constructed from the lvalue `result`, whereas a
        // plain return statement allows the value to be moved or elided.
        return result;
    }

    // Deprecated pointer/length overload: treats the buffer as the half-open
    // range [ptr, ptr + len) and forwards to the iterator-pair overload.
    template<typename T>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len))
    static basic_json from_cbor(const T* ptr, std::size_t len,
                                const bool strict = true,
                                const bool allow_exceptions = true,
                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
    {
        const T* const range_begin = ptr;
        const T* const range_end = ptr + len;
        return from_cbor(range_begin, range_end, strict, allow_exceptions, tag_handler);
    }


    // Deprecated overload taking a span_input_adapter; parses CBOR from the
    // adapter obtained via i.get(). Returns the parsed value, or a discarded
    // value when sax_parse reports failure.
    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len))
    static basic_json from_cbor(detail::span_input_adapter&& i,
                                const bool strict = true,
                                const bool allow_exceptions = true,
                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
    {
        basic_json result;
        // SAX consumer that is handed `result` as the value to populate
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = i.get();
        // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
        if (!res)
        {
            return basic_json(value_t::discarded);
        }
        // Return the local by name: `res ? result : basic_json(...)` yields a
        // prvalue that is copy-constructed from the lvalue `result`, whereas a
        // plain return statement allows the value to be moved or elided.
        return result;
    }

    /*!
    @brief create a JSON value from an input in MessagePack format

    Deserializes a given input @a i to a JSON value using the MessagePack
    serialization format.

    The library maps MessagePack types to JSON value types as follows:

    MessagePack type | JSON value type | first byte
    ---------------- | --------------- | ----------
    positive fixint  | number_unsigned | 0x00..0x7F
    fixmap           | object          | 0x80..0x8F
    fixarray         | array           | 0x90..0x9F
    fixstr           | string          | 0xA0..0xBF
    nil              | `null`          | 0xC0
    false            | `false`         | 0xC2
    true             | `true`          | 0xC3
    float 32         | number_float    | 0xCA
    float 64         | number_float    | 0xCB
    uint 8           | number_unsigned | 0xCC
    uint 16          | number_unsigned | 0xCD
    uint 32          | number_unsigned | 0xCE
    uint 64          | number_unsigned | 0xCF
    int 8            | number_integer  | 0xD0
    int 16           | number_integer  | 0xD1
    int 32           | number_integer  | 0xD2
    int 64           | number_integer  | 0xD3
    str 8            | string          | 0xD9
    str 16           | string          | 0xDA
    str 32           | string          | 0xDB
    array 16         | array           | 0xDC
    array 32         | array           | 0xDD
    map 16           | object          | 0xDE
    map 32           | object          | 0xDF
    bin 8            | binary          | 0xC4
    bin 16           | binary          | 0xC5
    bin 32           | binary          | 0xC6
    ext 8            | binary          | 0xC7
    ext 16           | binary          | 0xC8
    ext 32           | binary          | 0xC9
    fixext 1         | binary          | 0xD4
    fixext 2         | binary          | 0xD5
    fixext 4         | binary          | 0xD6
    fixext 8         | binary          | 0xD7
    fixext 16        | binary          | 0xD8
    negative fixint  | number_integer  | 0xE0..0xFF

    @note Any MessagePack output created by @ref to_msgpack can be
          successfully parsed by @ref from_msgpack.

    @param[in] i  an input in MessagePack format convertible to an input
                  adapter
    @param[in] strict  whether to expect the input to be consumed until EOF
                       (true by default)
    @param[in] allow_exceptions  whether to throw exceptions in case of a
    parse error (optional, true by default)

    @return deserialized JSON value; in case of a parse error and
            @a allow_exceptions set to `false`, the return value will be
            value_t::discarded.

    @throw parse_error.110 if the given input ends prematurely or the end of
    file was not reached when @a strict was set to true
    @throw parse_error.112 if unsupported features from MessagePack were
    used in the given input @a i or if the input is not valid MessagePack
    @throw parse_error.113 if a string was expected as map key, but not found

    @complexity Linear in the size of the input @a i.

    @liveexample{The example shows the deserialization of a byte vector in
    MessagePack format to a JSON value.,from_msgpack}

    @sa http://msgpack.org
    @sa see @ref to_msgpack(const basic_json&) for the analogous serialization
    @sa see @ref from_cbor(InputType&&, const bool, const bool, const cbor_tag_handler_t) for the
        related CBOR format
    @sa see @ref from_ubjson(InputType&&, const bool, const bool) for
        the related UBJSON format
    @sa see @ref from_bson(InputType&&, const bool, const bool) for
        the related BSON format

    @since version 2.0.9; parameter @a start_index since 2.1.1; changed to
           consume input adapters, removed start_index parameter, and added
           @a strict parameter since 3.0.0; added @a allow_exceptions parameter
           since 3.2.0
    */
    template<typename InputType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_msgpack(InputType&& i,
                                   const bool strict = true,
                                   const bool allow_exceptions = true)
    {
        basic_json result;
        // SAX consumer that is handed `result` as the value to populate
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::forward<InputType>(i));
        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict);
        if (!res)
        {
            // parse error without exception: report a discarded value
            return basic_json(value_t::discarded);
        }
        // Return the local by name: `res ? result : basic_json(...)` yields a
        // prvalue that is copy-constructed from the lvalue `result`, whereas a
        // plain return statement allows the value to be moved or elided.
        return result;
    }

    /*!
    @copydoc from_msgpack(InputType&&, const bool, const bool)
    */
    template<typename IteratorType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_msgpack(IteratorType first, IteratorType last,
                                   const bool strict = true,
                                   const bool allow_exceptions = true)
    {
        basic_json result;
        // SAX consumer that is handed `result` as the value to populate
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::move(first), std::move(last));
        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict);
        if (!res)
        {
            // parse error without exception: report a discarded value
            return basic_json(value_t::discarded);
        }
        // Return the local by name: `res ? result : basic_json(...)` yields a
        // prvalue that is copy-constructed from the lvalue `result`, whereas a
        // plain return statement allows the value to be moved or elided.
        return result;
    }


    // Deprecated pointer/length overload: treats the buffer as the half-open
    // range [ptr, ptr + len) and forwards to the iterator-pair overload.
    template<typename T>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_msgpack(ptr, ptr + len))
    static basic_json from_msgpack(const T* ptr, std::size_t len,
                                   const bool strict = true,
                                   const bool allow_exceptions = true)
    {
        const T* const range_begin = ptr;
        const T* const range_end = ptr + len;
        return from_msgpack(range_begin, range_end, strict, allow_exceptions);
    }

    // Deprecated overload taking a span_input_adapter; parses MessagePack from
    // the adapter obtained via i.get(). Returns the parsed value, or a
    // discarded value when sax_parse reports failure.
    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_msgpack(ptr, ptr + len))
    static basic_json from_msgpack(detail::span_input_adapter&& i,
                                   const bool strict = true,
                                   const bool allow_exceptions = true)
    {
        basic_json result;
        // SAX consumer that is handed `result` as the value to populate
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = i.get();
        // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict);
        if (!res)
        {
            return basic_json(value_t::discarded);
        }
        // Return the local by name: `res ? result : basic_json(...)` yields a
        // prvalue that is copy-constructed from the lvalue `result`, whereas a
        // plain return statement allows the value to be moved or elided.
        return result;
    }


    /*!
    @brief create a JSON value from an input in UBJSON format

    Deserializes a given input @a i to a JSON value using the UBJSON (Universal
    Binary JSON) serialization format.

    The library maps UBJSON types to JSON value types as follows:

    UBJSON type | JSON value type                         | marker
    ----------- | --------------------------------------- | ------
    no-op       | *no value, next value is read*          | `N`
    null        | `null`                                  | `Z`
    false       | `false`                                 | `F`
    true        | `true`                                  | `T`
    float32     | number_float                            | `d`
    float64     | number_float                            | `D`
    uint8       | number_unsigned                         | `U`
    int8        | number_integer                          | `i`
    int16       | number_integer                          | `I`
    int32       | number_integer                          | `l`
    int64       | number_integer                          | `L`
    high-precision number | number_integer, number_unsigned, or number_float - depends on number string | `H`
    string      | string                                  | `S`
    char        | string                                  | `C`
    array       | array (optimized values are supported)  | `[`
    object      | object (optimized values are supported) | `{`

    @note The mapping is **complete** in the sense that any UBJSON value can
          be converted to a JSON value.

    @param[in] i  an input in UBJSON format convertible to an input adapter
    @param[in] strict  whether to expect the input to be consumed until EOF
                       (true by default)
    @param[in] allow_exceptions  whether to throw exceptions in case of a
    parse error (optional, true by default)

    @return deserialized JSON value; in case of a parse error and
            @a allow_exceptions set to `false`, the return value will be
            value_t::discarded.

    @throw parse_error.110 if the given input ends prematurely or the end of
    file was not reached when @a strict was set to true
    @throw parse_error.112 if a parse error occurs
    @throw parse_error.113 if a string could not be parsed successfully

    @complexity Linear in the size of the input @a i.

    @liveexample{The example shows the deserialization of a byte vector in
    UBJSON format to a JSON value.,from_ubjson}

    @sa http://ubjson.org
    @sa see @ref to_ubjson(const basic_json&, const bool, const bool) for the
             analogous serialization
    @sa see @ref from_cbor(InputType&&, const bool, const bool, const cbor_tag_handler_t) for the
        related CBOR format
    @sa see @ref from_msgpack(InputType&&, const bool, const bool) for
        the related MessagePack format
    @sa see @ref from_bson(InputType&&, const bool, const bool) for
        the related BSON format

    @since version 3.1.0; added @a allow_exceptions parameter since 3.2.0
    */
    template<typename InputType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_ubjson(InputType&& i,
                                  const bool strict = true,
                                  const bool allow_exceptions = true)
    {
        basic_json result;
        // SAX consumer that is handed `result` as the value to populate
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::forward<InputType>(i));
        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict);
        if (!res)
        {
            // parse error without exception: report a discarded value
            return basic_json(value_t::discarded);
        }
        // Return the local by name: `res ? result : basic_json(...)` yields a
        // prvalue that is copy-constructed from the lvalue `result`, whereas a
        // plain return statement allows the value to be moved or elided.
        return result;
    }

    /*!
    @copydoc from_ubjson(InputType&&, const bool, const bool)
    */
    template<typename IteratorType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_ubjson(IteratorType first, IteratorType last,
                                  const bool strict = true,
                                  const bool allow_exceptions = true)
    {
        basic_json result;
        // SAX consumer that is handed `result` as the value to populate
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::move(first), std::move(last));
        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict);
        if (!res)
        {
            // parse error without exception: report a discarded value
            return basic_json(value_t::discarded);
        }
        // Return the local by name: `res ? result : basic_json(...)` yields a
        // prvalue that is copy-constructed from the lvalue `result`, whereas a
        // plain return statement allows the value to be moved or elided.
        return result;
    }

    // Deprecated pointer/length overload: treats the buffer as the half-open
    // range [ptr, ptr + len) and forwards to the iterator-pair overload.
    template<typename T>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_ubjson(ptr, ptr + len))
    static basic_json from_ubjson(const T* ptr, std::size_t len,
                                  const bool strict = true,
                                  const bool allow_exceptions = true)
    {
        const T* const range_begin = ptr;
        const T* const range_end = ptr + len;
        return from_ubjson(range_begin, range_end, strict, allow_exceptions);
    }

    // Deprecated overload taking a span_input_adapter; parses UBJSON from the
    // adapter obtained via i.get(). Returns the parsed value, or a discarded
    // value when sax_parse reports failure.
    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_ubjson(ptr, ptr + len))
    static basic_json from_ubjson(detail::span_input_adapter&& i,
                                  const bool strict = true,
                                  const bool allow_exceptions = true)
    {
        basic_json result;
        // SAX consumer that is handed `result` as the value to populate
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = i.get();
        // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict);
        if (!res)
        {
            return basic_json(value_t::discarded);
        }
        // Return the local by name: `res ? result : basic_json(...)` yields a
        // prvalue that is copy-constructed from the lvalue `result`, whereas a
        // plain return statement allows the value to be moved or elided.
        return result;
    }


    /*!
    @brief Create a JSON value from an input in BSON format

    Deserializes a given input @a i to a JSON value using the BSON (Binary JSON)
    serialization format.

    The library maps BSON record types to JSON value types as follows:

    BSON type       | BSON marker byte | JSON value type
    --------------- | ---------------- | ---------------------------
    double          | 0x01             | number_float
    string          | 0x02             | string
    document        | 0x03             | object
    array           | 0x04             | array
    binary          | 0x05             | binary
    undefined       | 0x06             | still unsupported
    ObjectId        | 0x07             | still unsupported
    boolean         | 0x08             | boolean
    UTC Date-Time   | 0x09             | still unsupported
    null            | 0x0A             | null
    Regular Expr.   | 0x0B             | still unsupported
    DB Pointer      | 0x0C             | still unsupported
    JavaScript Code | 0x0D             | still unsupported
    Symbol          | 0x0E             | still unsupported
    Code with scope | 0x0F             | still unsupported
    int32           | 0x10             | number_integer
    Timestamp       | 0x11             | still unsupported
    128-bit decimal float | 0x13       | still unsupported
    Max Key         | 0x7F             | still unsupported
    Min Key         | 0xFF             | still unsupported

    @warning The mapping is **incomplete**. The unsupported mappings
             are indicated in the table above.

    @param[in] i  an input in BSON format convertible to an input adapter
    @param[in] strict  whether to expect the input to be consumed until EOF
                       (true by default)
    @param[in] allow_exceptions  whether to throw exceptions in case of a
    parse error (optional, true by default)

    @return deserialized JSON value; in case of a parse error and
            @a allow_exceptions set to `false`, the return value will be
            value_t::discarded.

    @throw parse_error.114 if an unsupported BSON record type is encountered

    @complexity Linear in the size of the input @a i.

    @liveexample{The example shows the deserialization of a byte vector in
    BSON format to a JSON value.,from_bson}

    @sa http://bsonspec.org/spec.html
    @sa see @ref to_bson(const basic_json&) for the analogous serialization
    @sa see @ref from_cbor(InputType&&, const bool, const bool, const cbor_tag_handler_t) for the
        related CBOR format
    @sa see @ref from_msgpack(InputType&&, const bool, const bool) for
        the related MessagePack format
    @sa see @ref from_ubjson(InputType&&, const bool, const bool) for the
        related UBJSON format
    */
    template<typename InputType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_bson(InputType&& i,
                                const bool strict = true,
                                const bool allow_exceptions = true)
    {
        basic_json result;
        // SAX consumer that is handed `result` as the value to populate
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::forward<InputType>(i));
        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict);
        if (!res)
        {
            // parse error without exception: report a discarded value
            return basic_json(value_t::discarded);
        }
        // Return the local by name: `res ? result : basic_json(...)` yields a
        // prvalue that is copy-constructed from the lvalue `result`, whereas a
        // plain return statement allows the value to be moved or elided.
        return result;
    }

    /*!
    @copydoc from_bson(InputType&&, const bool, const bool)
    */
    template<typename IteratorType>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json from_bson(IteratorType first, IteratorType last,
                                const bool strict = true,
                                const bool allow_exceptions = true)
    {
        basic_json result;
        // SAX consumer that is handed `result` as the value to populate
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = detail::input_adapter(std::move(first), std::move(last));
        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict);
        if (!res)
        {
            // parse error without exception: report a discarded value
            return basic_json(value_t::discarded);
        }
        // Return the local by name: `res ? result : basic_json(...)` yields a
        // prvalue that is copy-constructed from the lvalue `result`, whereas a
        // plain return statement allows the value to be moved or elided.
        return result;
    }

    // Deprecated pointer/length overload: treats the buffer as the half-open
    // range [ptr, ptr + len) and forwards to the iterator-pair overload.
    template<typename T>
    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_bson(ptr, ptr + len))
    static basic_json from_bson(const T* ptr, std::size_t len,
                                const bool strict = true,
                                const bool allow_exceptions = true)
    {
        const T* const range_begin = ptr;
        const T* const range_end = ptr + len;
        return from_bson(range_begin, range_end, strict, allow_exceptions);
    }

    // Deprecated overload taking a span_input_adapter; parses BSON from the
    // adapter obtained via i.get(). Returns the parsed value, or a discarded
    // value when sax_parse reports failure.
    JSON_HEDLEY_WARN_UNUSED_RESULT
    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_bson(ptr, ptr + len))
    static basic_json from_bson(detail::span_input_adapter&& i,
                                const bool strict = true,
                                const bool allow_exceptions = true)
    {
        basic_json result;
        // SAX consumer that is handed `result` as the value to populate
        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
        auto ia = i.get();
        // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict);
        if (!res)
        {
            return basic_json(value_t::discarded);
        }
        // Return the local by name: `res ? result : basic_json(...)` yields a
        // prvalue that is copy-constructed from the lvalue `result`, whereas a
        // plain return statement allows the value to be moved or elided.
        return result;
    }
    /// @}

    //////////////////////////
    // JSON Pointer support //
    //////////////////////////

    /// @name JSON Pointer functions
    /// @{

    /*!
    @brief access specified element via JSON Pointer

    Uses a JSON pointer to retrieve a reference to the respective JSON value.
    No bound checking is performed. Similar to @ref operator[](const typename
    object_t::key_type&), `null` values are created in arrays and objects if
    necessary.

    In particular:
    - If the JSON pointer points to an object key that does not exist, it
      is created and filled with a `null` value before a reference to it
      is returned.
    - If the JSON pointer points to an array index that does not exist, it
      is created and filled with a `null` value before a reference to it
      is returned. All indices between the current maximum and the given
      index are also filled with `null`.
    - The special value `-` is treated as a synonym for the index past the
      end.

    @param[in] ptr  a JSON pointer

    @return reference to the element pointed to by @a ptr

    @complexity Constant.

    @throw parse_error.106   if an array index begins with '0'
    @throw parse_error.109   if an array index was not a number
    @throw out_of_range.404  if the JSON pointer can not be resolved

    @liveexample{The behavior is shown in the example.,operatorjson_pointer}

    @since version 2.0.0
    */
    reference operator[](const json_pointer& ptr)
    {
        // the pointer performs the unchecked lookup, with this value as root
        reference resolved = ptr.get_unchecked(this);
        return resolved;
    }

    /*!
    @brief access specified element via JSON Pointer

    Uses a JSON pointer to retrieve a reference to the respective JSON value.
    No bound checking is performed. The function does not change the JSON
    value; no `null` values are created. In particular, the special value
    `-` yields an exception.

    @param[in] ptr  JSON pointer to the desired element

    @return const reference to the element pointed to by @a ptr

    @complexity Constant.

    @throw parse_error.106   if an array index begins with '0'
    @throw parse_error.109   if an array index was not a number
    @throw out_of_range.402  if the array index '-' is used
    @throw out_of_range.404  if the JSON pointer can not be resolved

    @liveexample{The behavior is shown in the example.,operatorjson_pointer_const}

    @since version 2.0.0
    */
    const_reference operator[](const json_pointer& ptr) const
    {
        // the pointer performs the unchecked lookup, with this value as root
        const_reference resolved = ptr.get_unchecked(this);
        return resolved;
    }

    /*!
    @brief access specified element via JSON Pointer

    Returns a reference to the element at the specified JSON pointer @a ptr,
    with bounds checking.

    @param[in] ptr  JSON pointer to the desired element

    @return reference to the element pointed to by @a ptr

    @throw parse_error.106 if an array index in the passed JSON pointer @a ptr
    begins with '0'. See example below.

    @throw parse_error.109 if an array index in the passed JSON pointer @a ptr
    is not a number. See example below.

    @throw out_of_range.401 if an array index in the passed JSON pointer @a ptr
    is out of range. See example below.

    @throw out_of_range.402 if the array index '-' is used in the passed JSON
    pointer @a ptr. As `at` provides checked access (and no elements are
    implicitly inserted), the index '-' is always invalid. See example below.

    @throw out_of_range.403 if the JSON pointer describes a key of an object
    which cannot be found. See example below.

    @throw out_of_range.404 if the JSON pointer @a ptr can not be resolved.
    See example below.

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes in the JSON value.

    @complexity Constant.

    @since version 2.0.0

    @liveexample{The behavior is shown in the example.,at_json_pointer}
    */
    reference at(const json_pointer& ptr)
    {
        // the pointer performs the checked lookup, with this value as root
        reference resolved = ptr.get_checked(this);
        return resolved;
    }

    /*!
    @brief access specified element via JSON Pointer

    Returns a const reference to the element at the specified JSON pointer
    @a ptr, with bounds checking.

    @param[in] ptr  JSON pointer to the desired element

    @return const reference to the element pointed to by @a ptr

    @throw parse_error.106 if an array index in the passed JSON pointer @a ptr
    begins with '0'. See example below.

    @throw parse_error.109 if an array index in the passed JSON pointer @a ptr
    is not a number. See example below.

    @throw out_of_range.401 if an array index in the passed JSON pointer @a ptr
    is out of range. See example below.

    @throw out_of_range.402 if the array index '-' is used in the passed JSON
    pointer @a ptr. As `at` provides checked access (and no elements are
    implicitly inserted), the index '-' is always invalid. See example below.

    @throw out_of_range.403 if the JSON pointer describes a key of an object
    which cannot be found. See example below.

    @throw out_of_range.404 if the JSON pointer @a ptr can not be resolved.
    See example below.

    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
    changes in the JSON value.

    @complexity Constant.

    @since version 2.0.0

    @liveexample{The behavior is shown in the example.,at_json_pointer_const}
    */
    const_reference at(const json_pointer& ptr) const
    {
        // the pointer performs the checked lookup, with this value as root
        const_reference resolved = ptr.get_checked(this);
        return resolved;
    }

    /*!
    @brief return flattened JSON value

    The function creates a JSON object whose keys are JSON pointers (see [RFC
    6901](https://tools.ietf.org/html/rfc6901)) and whose values are all
    primitive. The original JSON value can be restored using the @ref
    unflatten() function.

    @return an object that maps JSON pointers to primitive values

    @note Empty objects and arrays are flattened to `null` and will not be
          reconstructed correctly by the @ref unflatten() function.

    @complexity Linear in the size of the JSON value.

    @liveexample{The following code shows how a JSON object is flattened to an
    object whose keys consist of JSON pointers.,flatten}

    @sa see @ref unflatten() for the reverse function

    @since version 2.0.0
    */
    basic_json flatten() const
    {
        // start with an empty object and pass it to json_pointer::flatten as
        // the output, using the empty reference string for the root
        basic_json flat_map(value_t::object);
        json_pointer::flatten("", *this, flat_map);
        return flat_map;
    }

    /*!
    @brief unflatten a previously flattened JSON value

    The function restores the arbitrary nesting of a JSON value that has been
    flattened before using the @ref flatten() function. The JSON value must
    meet certain constraints:
    1. The value must be an object.
    2. The keys must be JSON pointers (see
       [RFC 6901](https://tools.ietf.org/html/rfc6901))
    3. The mapped values must be primitive JSON types.

    @return the original JSON from a flattened version

    @note Empty objects and arrays are flattened by @ref flatten() to `null`
          values and cannot be unflattened to their original type. Apart from
          this exception, for a JSON value `j`, the following is always true:
          `j == j.flatten().unflatten()`.

    @complexity Linear in the size of the JSON value.

    @throw type_error.314  if value is not an object
    @throw type_error.315  if object values are not primitive

    @liveexample{The following code shows how a flattened JSON object is
    unflattened into the original nested JSON object.,unflatten}

    @sa see @ref flatten() for the reverse function

    @since version 2.0.0
    */
    basic_json unflatten() const
    {
        // the reconstruction itself is delegated to json_pointer
        const basic_json& flattened = *this;
        return json_pointer::unflatten(flattened);
    }

    /// @}

    //////////////////////////
    // JSON Patch functions //
    //////////////////////////

    /// @name JSON Patch functions
    /// @{

    /*!
    @brief applies a JSON patch

    [JSON Patch](http://jsonpatch.com) defines a JSON document structure for
    expressing a sequence of operations to apply to a JSON document. With
    this function, a JSON Patch is applied to the current JSON value by
    executing all operations from the patch.

    @param[in] json_patch  JSON patch document
    @return patched document

    @note The application of a patch is atomic: Either all operations succeed
          and the patched document is returned or an exception is thrown. In
          any case, the original value is not changed: the patch is applied
          to a copy of the value.

    @throw parse_error.104 if the JSON patch does not consist of an array of
    objects

    @throw parse_error.105 if the JSON patch is malformed (e.g., mandatory
    attributes are missing); example: `"operation add must have member path"`

    @throw out_of_range.401 if an array index is out of range.

    @throw out_of_range.403 if a JSON pointer inside the patch could not be
    resolved successfully in the current JSON value; example: `"key baz not
    found"`

    @throw out_of_range.405 if JSON pointer has no parent ("add", "remove",
    "move")

    @throw other_error.501 if "test" operation was unsuccessful

    @complexity Linear in the size of the JSON value and the length of the
    JSON patch. As usually only a fraction of the JSON value is affected by
    the patch, the complexity can usually be neglected.

    @liveexample{The following code shows how a JSON patch is applied to a
    value.,patch}

    @sa see @ref diff -- create a JSON patch by comparing two JSON values

    @sa [RFC 6902 (JSON Patch)](https://tools.ietf.org/html/rfc6902)
    @sa [RFC 6901 (JSON Pointer)](https://tools.ietf.org/html/rfc6901)

    @since version 2.0.0
    */
    basic_json patch(const basic_json& json_patch) const
    {
        // make a working copy to apply the patch to; *this is never modified,
        // so an operation that throws leaves the original value untouched
        basic_json result = *this;

        // the valid JSON Patch operations
        enum class patch_operations {add, remove, replace, move, copy, test, invalid};

        // map the string found in the "op" member to the enumeration above;
        // anything unknown becomes patch_operations::invalid
        const auto get_op = [](const std::string & op)
        {
            if (op == "add")
            {
                return patch_operations::add;
            }
            if (op == "remove")
            {
                return patch_operations::remove;
            }
            if (op == "replace")
            {
                return patch_operations::replace;
            }
            if (op == "move")
            {
                return patch_operations::move;
            }
            if (op == "copy")
            {
                return patch_operations::copy;
            }
            if (op == "test")
            {
                return patch_operations::test;
            }

            return patch_operations::invalid;
        };

        // wrapper for "add" operation; add value at ptr
        // note: ptr is modified (its last token is popped)
        const auto operation_add = [&result](json_pointer & ptr, basic_json val)
        {
            // adding to the root of the target document means replacing it
            if (ptr.empty())
            {
                result = val;
                return;
            }

            // make sure the top element of the pointer exists
            json_pointer top_pointer = ptr.top();
            if (top_pointer != ptr)
            {
                result.at(top_pointer);
            }

            // get reference to parent of JSON pointer ptr
            const auto last_path = ptr.back();
            ptr.pop_back();
            basic_json& parent = result[ptr];

            switch (parent.m_type)
            {
                case value_t::null:
                case value_t::object:
                {
                    // use operator[] to add value
                    parent[last_path] = val;
                    break;
                }

                case value_t::array:
                {
                    if (last_path == "-")
                    {
                        // special case: append to back
                        parent.push_back(val);
                    }
                    else
                    {
                        const auto idx = json_pointer::array_index(last_path);
                        if (JSON_HEDLEY_UNLIKELY(idx > parent.size()))
                        {
                            // avoid undefined behavior
                            JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range", parent));
                        }

                        // default case: insert add offset
                        parent.insert(parent.begin() + static_cast<difference_type>(idx), val);
                    }
                    break;
                }

                // if there exists a parent it cannot be primitive
                default:            // LCOV_EXCL_LINE
                    JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
            }
        };

        // wrapper for "remove" operation; remove value at ptr
        // note: ptr is modified (its last token is popped)
        const auto operation_remove = [this, &result](json_pointer & ptr)
        {
            // get reference to parent of JSON pointer ptr
            const auto last_path = ptr.back();
            ptr.pop_back();
            basic_json& parent = result.at(ptr);

            // remove child
            if (parent.is_object())
            {
                // perform range check
                auto it = parent.find(last_path);
                if (JSON_HEDLEY_LIKELY(it != parent.end()))
                {
                    parent.erase(it);
                }
                else
                {
                    JSON_THROW(out_of_range::create(403, "key '" + last_path + "' not found", *this));
                }
            }
            else if (parent.is_array())
            {
                // note erase performs range check
                parent.erase(json_pointer::array_index(last_path));
            }
            else
            {
                // the parent is neither an object nor an array, so it cannot
                // contain the target location. RFC 6902 requires the target
                // of "remove" to exist; report the unresolved token (the same
                // error at() raises for such a pointer) instead of silently
                // treating the operation as successful
                JSON_THROW(out_of_range::create(404, "unresolved reference token '" + last_path + "'", parent));
            }
        };

        // type check: top level value must be an array
        if (JSON_HEDLEY_UNLIKELY(!json_patch.is_array()))
        {
            JSON_THROW(parse_error::create(104, 0, "JSON patch must be an array of objects", json_patch));
        }

        // iterate and apply the operations
        for (const auto& val : json_patch)
        {
            // wrapper to get a value for an operation
            // op:          operation name, only used for the error message
            // member:      name of the member to fetch from the operation
            // string_type: whether the member is required to be a string
            const auto get_value = [&val](const std::string & op,
                                          const std::string & member,
                                          bool string_type) -> basic_json &
            {
                // find value
                auto it = val.m_value.object->find(member);

                // context-sensitive error message
                const auto error_msg = (op == "op") ? "operation" : "operation '" + op + "'";

                // check if desired value is present
                if (JSON_HEDLEY_UNLIKELY(it == val.m_value.object->end()))
                {
                    // NOLINTNEXTLINE(performance-inefficient-string-concatenation)
                    JSON_THROW(parse_error::create(105, 0, error_msg + " must have member '" + member + "'", val));
                }

                // check if result is of type string
                if (JSON_HEDLEY_UNLIKELY(string_type && !it->second.is_string()))
                {
                    // NOLINTNEXTLINE(performance-inefficient-string-concatenation)
                    JSON_THROW(parse_error::create(105, 0, error_msg + " must have string member '" + member + "'", val));
                }

                // no error: return value
                return it->second;
            };

            // type check: every element of the array must be an object
            // (this must precede any get_value call, which accesses
            // val.m_value.object directly)
            if (JSON_HEDLEY_UNLIKELY(!val.is_object()))
            {
                JSON_THROW(parse_error::create(104, 0, "JSON patch must be an array of objects", val));
            }

            // collect mandatory members
            const auto op = get_value("op", "op", true).template get<std::string>();
            const auto path = get_value(op, "path", true).template get<std::string>();
            json_pointer ptr(path);

            switch (get_op(op))
            {
                case patch_operations::add:
                {
                    operation_add(ptr, get_value("add", "value", false));
                    break;
                }

                case patch_operations::remove:
                {
                    operation_remove(ptr);
                    break;
                }

                case patch_operations::replace:
                {
                    // the "path" location must exist - use at()
                    result.at(ptr) = get_value("replace", "value", false);
                    break;
                }

                case patch_operations::move:
                {
                    const auto from_path = get_value("move", "from", true).template get<std::string>();
                    json_pointer from_ptr(from_path);

                    // the "from" location must exist - use at()
                    basic_json v = result.at(from_ptr);

                    // The move operation is functionally identical to a
                    // "remove" operation on the "from" location, followed
                    // immediately by an "add" operation at the target
                    // location with the value that was just removed.
                    operation_remove(from_ptr);
                    operation_add(ptr, v);
                    break;
                }

                case patch_operations::copy:
                {
                    const auto from_path = get_value("copy", "from", true).template get<std::string>();
                    const json_pointer from_ptr(from_path);

                    // the "from" location must exist - use at()
                    basic_json v = result.at(from_ptr);

                    // The copy is functionally identical to an "add"
                    // operation at the target location using the value
                    // specified in the "from" member.
                    operation_add(ptr, v);
                    break;
                }

                case patch_operations::test:
                {
                    bool success = false;
                    JSON_TRY
                    {
                        // check if "value" matches the one at "path"
                        // the "path" location must exist - use at()
                        success = (result.at(ptr) == get_value("test", "value", false));
                    }
                    JSON_INTERNAL_CATCH (out_of_range&)
                    {
                        // ignore out of range errors: success remains false
                    }

                    // throw an exception if test fails
                    if (JSON_HEDLEY_UNLIKELY(!success))
                    {
                        JSON_THROW(other_error::create(501, "unsuccessful: " + val.dump(), val));
                    }

                    break;
                }

                default:
                {
                    // op must be "add", "remove", "replace", "move", "copy", or
                    // "test"
                    JSON_THROW(parse_error::create(105, 0, "operation value '" + op + "' is invalid", val));
                }
            }
        }

        return result;
    }

    /*!
    @brief creates a diff as a JSON patch

    Creates a [JSON Patch](http://jsonpatch.com) so that value @a source can
    be changed into the value @a target by calling @ref patch function.

    @invariant For two JSON values @a source and @a target, the following code
    always yields `true`:
    @code {.cpp}
    source.patch(diff(source, target)) == target;
    @endcode

    @note Currently, only `remove`, `add`, and `replace` operations are
          generated.

    @param[in] source  JSON value to compare from
    @param[in] target  JSON value to compare against
    @param[in] path    helper value to create JSON pointers

    @return a JSON patch to convert the @a source to @a target

    @complexity Linear in the lengths of @a source and @a target.

    @liveexample{The following code shows how a JSON patch is created as a
    diff for two JSON values.,diff}

    @sa see @ref patch -- apply a JSON patch
    @sa see @ref merge_patch -- apply a JSON Merge Patch

    @sa [RFC 6902 (JSON Patch)](https://tools.ietf.org/html/rfc6902)

    @since version 2.0.0
    */
    JSON_HEDLEY_WARN_UNUSED_RESULT
    static basic_json diff(const basic_json& source, const basic_json& target,
                           const std::string& path = "")
    {
        // operations collected so far; remains an empty array if nothing differs
        basic_json result(value_t::array);

        // equal values need no operation at all
        if (source == target)
        {
            return result;
        }

        // a change of type is expressed by replacing the whole value
        if (source.type() != target.type())
        {
            result.push_back(
            {
                {"op", "replace"}, {"path", path}, {"value", target}
            });
            return result;
        }

        switch (source.type())
        {
            case value_t::array:
            {
                // compare the positions present in both arrays
                std::size_t idx = 0;
                for (; idx < source.size() && idx < target.size(); ++idx)
                {
                    // recurse into the pair of elements stored at idx
                    auto element_ops = diff(source[idx], target[idx], path + "/" + std::to_string(idx));
                    result.insert(result.end(), element_ops.begin(), element_ops.end());
                }

                // idx is now past the end of at least one of the arrays

                // surplus elements of source are removed; every "remove" is
                // inserted at the same position, so the operations end up in
                // descending index order and never refer to a stale index
                const auto removal_pos = static_cast<difference_type>(result.size());
                for (; idx < source.size(); ++idx)
                {
                    result.insert(result.begin() + removal_pos, object(
                    {
                        {"op", "remove"},
                        {"path", path + "/" + std::to_string(idx)}
                    }));
                }

                // surplus elements of target are appended
                for (; idx < target.size(); ++idx)
                {
                    result.push_back(
                    {
                        {"op", "add"},
                        {"path", path + "/-"},
                        {"value", target[idx]}
                    });
                }

                break;
            }

            case value_t::object:
            {
                // members of source: either compared recursively or removed
                for (auto member = source.cbegin(); member != source.cend(); ++member)
                {
                    // the key is escaped so it can be used inside a JSON pointer
                    const auto member_path = path + "/" + detail::escape(member.key());

                    if (target.find(member.key()) == target.end())
                    {
                        // key is absent from target -> remove it
                        result.push_back(object(
                        {
                            {"op", "remove"}, {"path", member_path}
                        }));
                        continue;
                    }

                    // key exists on both sides -> compare the two values
                    auto member_ops = diff(member.value(), target[member.key()], member_path);
                    result.insert(result.end(), member_ops.begin(), member_ops.end());
                }

                // members that only exist in target are added
                for (auto member = target.cbegin(); member != target.cend(); ++member)
                {
                    if (source.find(member.key()) != source.end())
                    {
                        continue;
                    }

                    const auto member_path = path + "/" + detail::escape(member.key());
                    result.push_back(
                    {
                        {"op", "add"}, {"path", member_path},
                        {"value", member.value()}
                    });
                }

                break;
            }

            default:
            {
                // two differing primitives of the same type: replace the value
                result.push_back(
                {
                    {"op", "replace"}, {"path", path}, {"value", target}
                });
                break;
            }
        }

        return result;
    }

    /// @}

    ////////////////////////////////
    // JSON Merge Patch functions //
    ////////////////////////////////

    /// @name JSON Merge Patch functions
    /// @{

    /*!
    @brief applies a JSON Merge Patch

    The merge patch format is primarily intended for use with the HTTP PATCH
    method as a means of describing a set of modifications to a target
    resource's content. This function applies a merge patch to the current
    JSON value.

    The function implements the following algorithm from Section 2 of
    [RFC 7396 (JSON Merge Patch)](https://tools.ietf.org/html/rfc7396):

    ```
    define MergePatch(Target, Patch):
      if Patch is an Object:
        if Target is not an Object:
          Target = {} // Ignore the contents and set it to an empty Object
        for each Name/Value pair in Patch:
          if Value is null:
            if Name exists in Target:
              remove the Name/Value pair from Target
          else:
            Target[Name] = MergePatch(Target[Name], Value)
        return Target
      else:
        return Patch
    ```

    Thereby, `Target` is the current object; that is, the patch is applied to
    the current value.

    @param[in] apply_patch  the patch to apply

    @complexity Linear in the length of @a apply_patch.

    @liveexample{The following code shows how a JSON Merge Patch is applied to
    a JSON document.,merge_patch}

    @sa see @ref patch -- apply a JSON patch
    @sa [RFC 7396 (JSON Merge Patch)](https://tools.ietf.org/html/rfc7396)

    @since version 3.0.0
    */
    void merge_patch(const basic_json& apply_patch)
    {
        // a patch that is not an object replaces the target as a whole
        if (!apply_patch.is_object())
        {
            *this = apply_patch;
            return;
        }

        // an object patch can only be merged into an object; any other
        // current content is discarded first
        if (!is_object())
        {
            *this = object();
        }

        for (auto member = apply_patch.begin(); member != apply_patch.end(); ++member)
        {
            if (member.value().is_null())
            {
                // a null member asks for the key to be removed
                erase(member.key());
                continue;
            }

            // any other member is merged recursively into the entry with the
            // same key
            operator[](member.key()).merge_patch(member.value());
        }
    }

    /// @}
};

/*!
@brief user-defined to_string function for JSON values

This function implements a user-defined to_string for JSON objects.

@param[in] j  a JSON object
@return a std::string object
*/

NLOHMANN_BASIC_JSON_TPL_DECLARATION
std::string to_string(const NLOHMANN_BASIC_JSON_TPL& j)
{
    // dump() is called without arguments, so its default settings apply
    return j.dump();
}
} // namespace nlohmann

///////////////////////
// nonmember support //
///////////////////////

// specialization of std::swap, and std::hash
namespace std
{

/// hash value for JSON objects
/// @note code that uses std::hash<nlohmann::json> obtains the value computed
///       by nlohmann::detail::hash
template<>
struct hash<nlohmann::json>
{
    /*!
    @brief return a hash value for a JSON object

    @param[in] j  JSON value to hash
    @return the result of nlohmann::detail::hash for @a j

    @since version 1.0.0
    */
    std::size_t operator()(const nlohmann::json& j) const
    {
        return nlohmann::detail::hash(j);
    }
};

/// specialization for std::less<value_t>
/// @note: do not remove the space after '<',
///        see https://github.com/nlohmann/json/pull/679
template<>
struct less<::nlohmann::detail::value_t>
{
    /*!
    @brief compare two value_t enum values

    @param[in] lhs  first value type
    @param[in] rhs  second value type
    @return the result of nlohmann::detail::operator< for @a lhs and @a rhs

    @since version 3.0.0
    */
    bool operator()(nlohmann::detail::value_t lhs,
                    nlohmann::detail::value_t rhs) const noexcept
    {
        return nlohmann::detail::operator<(lhs, rhs);
    }
};

// C++20 prohibits function specializations in the std namespace, so the
// std::swap specialization below is only provided when JSON_HAS_CPP_20 is not
// defined.
#ifndef JSON_HAS_CPP_20

/*!
@brief exchanges the values of two JSON objects

Forwards to the member function swap of @a j1. The function is noexcept only
if nlohmann::json is nothrow move constructible and nothrow move assignable.

@param[in,out] j1  first JSON value
@param[in,out] j2  second JSON value

@since version 1.0.0
*/
template<>
inline void swap<nlohmann::json>(nlohmann::json& j1, nlohmann::json& j2) noexcept( // NOLINT(readability-inconsistent-declaration-parameter-name)
    is_nothrow_move_constructible<nlohmann::json>::value&&  // NOLINT(misc-redundant-expression)
    is_nothrow_move_assignable<nlohmann::json>::value
                              )
{
    j1.swap(j2);
}

#endif

} // namespace std

/*!
@brief user-defined string literal for JSON values

This operator implements a user-defined string literal for JSON objects. It
can be used by adding `"_json"` to a string literal and returns a JSON object
if no parse error occurred.

@param[in] s  a string representation of a JSON object
@param[in] n  the length of string @a s
@return a JSON object

@since version 1.0.0
*/
JSON_HEDLEY_NON_NULL(1)
inline nlohmann::json operator "" _json(const char* s, std::size_t n)
{
    // the literal occupies the n characters starting at s
    const char* const literal_end = s + n;
    return nlohmann::json::parse(s, literal_end);
}

/*!
@brief user-defined string literal for JSON pointer

This operator implements a user-defined string literal for JSON Pointers. It
can be used by adding `"_json_pointer"` to a string literal and returns a JSON pointer
object if no parse error occurred.

@param[in] s  a string representation of a JSON Pointer
@param[in] n  the length of string @a s
@return a JSON pointer object

@since version 2.0.0
*/
JSON_HEDLEY_NON_NULL(1)
inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std::size_t n)
{
    // copy the n characters of the literal into a string and build the
    // pointer from it
    const std::string pointer_text(s, n);
    return nlohmann::json::json_pointer(pointer_text);
}

// #include <nlohmann/detail/macro_unscope.hpp>


// restore GCC/clang diagnostic settings
#if defined(__clang__)
    #pragma GCC diagnostic pop
#endif

// clean up
#undef JSON_ASSERT
#undef JSON_INTERNAL_CATCH
#undef JSON_CATCH
#undef JSON_THROW
#undef JSON_TRY
#undef JSON_PRIVATE_UNLESS_TESTED
#undef JSON_HAS_CPP_14
#undef JSON_HAS_CPP_17
#undef NLOHMANN_BASIC_JSON_TPL_DECLARATION
#undef NLOHMANN_BASIC_JSON_TPL
#undef JSON_EXPLICIT

// #include <nlohmann/thirdparty/hedley/hedley_undef.hpp>


#undef JSON_HEDLEY_ALWAYS_INLINE
#undef JSON_HEDLEY_ARM_VERSION
#undef JSON_HEDLEY_ARM_VERSION_CHECK
#undef JSON_HEDLEY_ARRAY_PARAM
#undef JSON_HEDLEY_ASSUME
#undef JSON_HEDLEY_BEGIN_C_DECLS
#undef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE
#undef JSON_HEDLEY_CLANG_HAS_BUILTIN
#undef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE
#undef JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE
#undef JSON_HEDLEY_CLANG_HAS_EXTENSION
#undef JSON_HEDLEY_CLANG_HAS_FEATURE
#undef JSON_HEDLEY_CLANG_HAS_WARNING
#undef JSON_HEDLEY_COMPCERT_VERSION
#undef JSON_HEDLEY_COMPCERT_VERSION_CHECK
#undef JSON_HEDLEY_CONCAT
#undef JSON_HEDLEY_CONCAT3
#undef JSON_HEDLEY_CONCAT3_EX
#undef JSON_HEDLEY_CONCAT_EX
#undef JSON_HEDLEY_CONST
#undef JSON_HEDLEY_CONSTEXPR
#undef JSON_HEDLEY_CONST_CAST
#undef JSON_HEDLEY_CPP_CAST
#undef JSON_HEDLEY_CRAY_VERSION
#undef JSON_HEDLEY_CRAY_VERSION_CHECK
#undef JSON_HEDLEY_C_DECL
#undef JSON_HEDLEY_DEPRECATED
#undef JSON_HEDLEY_DEPRECATED_FOR
#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_
#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES
#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION
#undef JSON_HEDLEY_DIAGNOSTIC_POP
#undef JSON_HEDLEY_DIAGNOSTIC_PUSH
#undef JSON_HEDLEY_DMC_VERSION
#undef JSON_HEDLEY_DMC_VERSION_CHECK
#undef JSON_HEDLEY_EMPTY_BASES
#undef JSON_HEDLEY_EMSCRIPTEN_VERSION
#undef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK
#undef JSON_HEDLEY_END_C_DECLS
#undef JSON_HEDLEY_FLAGS
#undef JSON_HEDLEY_FLAGS_CAST
#undef JSON_HEDLEY_GCC_HAS_ATTRIBUTE
#undef JSON_HEDLEY_GCC_HAS_BUILTIN
#undef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE
#undef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE
#undef JSON_HEDLEY_GCC_HAS_EXTENSION
#undef JSON_HEDLEY_GCC_HAS_FEATURE
#undef JSON_HEDLEY_GCC_HAS_WARNING
#undef JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK
#undef JSON_HEDLEY_GCC_VERSION
#undef JSON_HEDLEY_GCC_VERSION_CHECK
#undef JSON_HEDLEY_GNUC_HAS_ATTRIBUTE
#undef JSON_HEDLEY_GNUC_HAS_BUILTIN
#undef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE
#undef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE
#undef JSON_HEDLEY_GNUC_HAS_EXTENSION
#undef JSON_HEDLEY_GNUC_HAS_FEATURE
#undef JSON_HEDLEY_GNUC_HAS_WARNING
#undef JSON_HEDLEY_GNUC_VERSION
#undef JSON_HEDLEY_GNUC_VERSION_CHECK
#undef JSON_HEDLEY_HAS_ATTRIBUTE
#undef JSON_HEDLEY_HAS_BUILTIN
#undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE
#undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS
#undef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE
#undef JSON_HEDLEY_HAS_EXTENSION
#undef JSON_HEDLEY_HAS_FEATURE
#undef JSON_HEDLEY_HAS_WARNING
#undef JSON_HEDLEY_IAR_VERSION
#undef JSON_HEDLEY_IAR_VERSION_CHECK
#undef JSON_HEDLEY_IBM_VERSION
#undef JSON_HEDLEY_IBM_VERSION_CHECK
#undef JSON_HEDLEY_IMPORT
#undef JSON_HEDLEY_INLINE
#undef JSON_HEDLEY_INTEL_CL_VERSION
#undef JSON_HEDLEY_INTEL_CL_VERSION_CHECK
#undef JSON_HEDLEY_INTEL_VERSION
#undef JSON_HEDLEY_INTEL_VERSION_CHECK
#undef JSON_HEDLEY_IS_CONSTANT
#undef JSON_HEDLEY_IS_CONSTEXPR_
#undef JSON_HEDLEY_LIKELY
#undef JSON_HEDLEY_MALLOC
#undef JSON_HEDLEY_MCST_LCC_VERSION
#undef JSON_HEDLEY_MCST_LCC_VERSION_CHECK
#undef JSON_HEDLEY_MESSAGE
#undef JSON_HEDLEY_MSVC_VERSION
#undef JSON_HEDLEY_MSVC_VERSION_CHECK
#undef JSON_HEDLEY_NEVER_INLINE
#undef JSON_HEDLEY_NON_NULL
#undef JSON_HEDLEY_NO_ESCAPE
#undef JSON_HEDLEY_NO_RETURN
#undef JSON_HEDLEY_NO_THROW
#undef JSON_HEDLEY_NULL
#undef JSON_HEDLEY_PELLES_VERSION
#undef JSON_HEDLEY_PELLES_VERSION_CHECK
#undef JSON_HEDLEY_PGI_VERSION
#undef JSON_HEDLEY_PGI_VERSION_CHECK
#undef JSON_HEDLEY_PREDICT
#undef JSON_HEDLEY_PRINTF_FORMAT
#undef JSON_HEDLEY_PRIVATE
#undef JSON_HEDLEY_PUBLIC
#undef JSON_HEDLEY_PURE
#undef JSON_HEDLEY_REINTERPRET_CAST
#undef JSON_HEDLEY_REQUIRE
#undef JSON_HEDLEY_REQUIRE_CONSTEXPR
#undef JSON_HEDLEY_REQUIRE_MSG
#undef JSON_HEDLEY_RESTRICT
#undef JSON_HEDLEY_RETURNS_NON_NULL
#undef JSON_HEDLEY_SENTINEL
#undef JSON_HEDLEY_STATIC_ASSERT
#undef JSON_HEDLEY_STATIC_CAST
#undef JSON_HEDLEY_STRINGIFY
#undef JSON_HEDLEY_STRINGIFY_EX
#undef JSON_HEDLEY_SUNPRO_VERSION
#undef JSON_HEDLEY_SUNPRO_VERSION_CHECK
#undef JSON_HEDLEY_TINYC_VERSION
#undef JSON_HEDLEY_TINYC_VERSION_CHECK
#undef JSON_HEDLEY_TI_ARMCL_VERSION
#undef JSON_HEDLEY_TI_ARMCL_VERSION_CHECK
#undef JSON_HEDLEY_TI_CL2000_VERSION
#undef JSON_HEDLEY_TI_CL2000_VERSION_CHECK
#undef JSON_HEDLEY_TI_CL430_VERSION
#undef JSON_HEDLEY_TI_CL430_VERSION_CHECK
#undef JSON_HEDLEY_TI_CL6X_VERSION
#undef JSON_HEDLEY_TI_CL6X_VERSION_CHECK
#undef JSON_HEDLEY_TI_CL7X_VERSION
#undef JSON_HEDLEY_TI_CL7X_VERSION_CHECK
#undef JSON_HEDLEY_TI_CLPRU_VERSION
#undef JSON_HEDLEY_TI_CLPRU_VERSION_CHECK
#undef JSON_HEDLEY_TI_VERSION
#undef JSON_HEDLEY_TI_VERSION_CHECK
#undef JSON_HEDLEY_UNAVAILABLE
#undef JSON_HEDLEY_UNLIKELY
#undef JSON_HEDLEY_UNPREDICTABLE
#undef JSON_HEDLEY_UNREACHABLE
#undef JSON_HEDLEY_UNREACHABLE_RETURN
#undef JSON_HEDLEY_VERSION
#undef JSON_HEDLEY_VERSION_DECODE_MAJOR
#undef JSON_HEDLEY_VERSION_DECODE_MINOR
#undef JSON_HEDLEY_VERSION_DECODE_REVISION
#undef JSON_HEDLEY_VERSION_ENCODE
#undef JSON_HEDLEY_WARNING
#undef JSON_HEDLEY_WARN_UNUSED_RESULT
#undef JSON_HEDLEY_WARN_UNUSED_RESULT_MSG
#undef JSON_HEDLEY_FALL_THROUGH


#endif  // INCLUDE_NLOHMANN_JSON_HPP_


================================================
FILE: src/m4/ax_check_opencl.m4
================================================
# Check if OpenCL is available and that it supports a CPU device.
# The check for a CPU device is the same check that is performed
# by opencl_create_device in ocl_utilities.c
#
# HAVE_OPENCL is substituted and starts out as no.  It is set to yes only
# if the CL/opencl.h header is found, the OpenCL library provides
# clGetPlatformIDs, and a test program linked against that library runs
# successfully.  The test program asks for one platform and then for one
# CPU device on that platform and fails if either call reports an error.
# LIBS is extended only for the duration of the run test.
AC_DEFUN([AX_CHECK_OPENCL], [
	AC_SUBST(HAVE_OPENCL)
	HAVE_OPENCL=no
	AC_CHECK_HEADER([CL/opencl.h], [
		AC_CHECK_LIB([OpenCL], [clGetPlatformIDs], [
			SAVE_LIBS=$LIBS
			LIBS="$LIBS -lOpenCL"
			AC_MSG_CHECKING([for OpenCL CPU device])
			AC_RUN_IFELSE([AC_LANG_PROGRAM(
				[[#include <CL/opencl.h>]], [[
	cl_platform_id platform;
	cl_device_id dev;

	if (clGetPlatformIDs(1, &platform, NULL) < 0)
		return 1;
	if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL) < 0)
		return 1;
				]])], [HAVE_OPENCL=yes])
			AC_MSG_RESULT($HAVE_OPENCL)
			LIBS=$SAVE_LIBS
			])])
])


================================================
FILE: src/m4/ax_check_openmp.m4
================================================
# Check if $CC supports openmp.
#
# HAVE_OPENMP is substituted and starts out as no.  The check has two steps.
# First the compiler is run in syntax-only mode on empty input with
# -fopenmp and -Werror; HAVE_OPENMP becomes yes if that succeeds.
# Then, with -fopenmp temporarily added to CFLAGS, a small program with a
# parallel for loop is compiled and run.  If that run test fails, a notice
# is printed and HAVE_OPENMP is reset to no.
AC_DEFUN([AX_CHECK_OPENMP], [
	AC_SUBST(HAVE_OPENMP)
	HAVE_OPENMP=no
	AC_MSG_CHECKING([for OpenMP support by $CC])
	echo | $CC -x c - -fsyntax-only -fopenmp -Werror >/dev/null 2>/dev/null
	if test $? -eq 0; then
		HAVE_OPENMP=yes
	fi
	AC_MSG_RESULT($HAVE_OPENMP)

	if test $HAVE_OPENMP = yes; then
		SAVE_CFLAGS=$CFLAGS
		CFLAGS="$CFLAGS -fopenmp"
		# Using some version of clang, the value of "m" becomes zero
		# after the parallel for loop.
		# The test program aborts in that case so that the run test fails.
		AC_RUN_IFELSE([AC_LANG_PROGRAM([[
		#include <stdlib.h>

		static void f(int m, double A[m])
		{
			#pragma omp parallel for
			for (int c0 = 0; c0 < m; c0 += 1)
				A[c0] = 0.;
			if (m != 100)
				abort();
		}
		]],[[
		double A[100];

		f(100, A);
		]])],[],[
			AC_MSG_NOTICE([OpenMP support broken, disabling])
			HAVE_OPENMP=no
		],[])
		CFLAGS=$SAVE_CFLAGS
	fi
])


================================================
FILE: src/m4/ax_detect_git_head.m4
================================================
# Determine an identifier for the source revision that is being configured.
#
# Substituted variables:
#   GIT_HEAD_ID       revision id found at configure time, or UNKNOWN
#   GIT_HEAD          path of the git index file; only set when a git
#                     repository was found
#   GIT_HEAD_VERSION  GIT_HEAD_ID when no repository was found; otherwise the
#                     text of a backquoted git describe command, stored
#                     unevaluated (presumably run later by the build; TODO
#                     confirm against the makefiles)
#
# The sources of the id are tried in this order: a regular file named .git
# in the source directory, a .git/HEAD file, a GIT_HEAD_ID file, and finally
# the name of the source directory itself.
AC_DEFUN([AX_DETECT_GIT_HEAD], [
	AC_SUBST(GIT_HEAD_ID)
	AC_SUBST(GIT_HEAD)
	AC_SUBST(GIT_HEAD_VERSION)
	if test -f $srcdir/.git; then
		# .git is a regular file: ask git where the repository directory is
		gitdir=`GIT_DIR=$srcdir/.git git rev-parse --git-dir`
		GIT_HEAD="$gitdir/index"
		GIT_REPO="$gitdir"
		GIT_HEAD_ID=`GIT_DIR=$GIT_REPO git describe --always`
	elif test -f $srcdir/.git/HEAD; then
		# .git is a directory that holds the repository
		GIT_HEAD="$srcdir/.git/index"
		GIT_REPO="$srcdir/.git"
		GIT_HEAD_ID=`GIT_DIR=$GIT_REPO git describe --always`
	elif test -f $srcdir/GIT_HEAD_ID; then
		# no repository: use the id recorded in the GIT_HEAD_ID file
		GIT_HEAD_ID=`cat $srcdir/GIT_HEAD_ID`
	else
		# Last resort: take the part of the source directory name after
		# the last dash.  It is used only if it contains no character
		# other than 0-9 and a-f and nothing is left after dropping 40
		# characters; this looks intended to accept a full 40 character
		# commit id.
		mysrcdir=`(cd $srcdir; pwd)`
		head=`basename $mysrcdir | sed -e 's/.*-//'`
		head2=`echo $head | sed -e 's/[^0-9a-f]//'`
		head3=`echo $head2 | sed -e 's/........................................//'`
		if test "x$head3" = "x" -a "x$head" = "x$head2"; then
			GIT_HEAD_ID="$head"
		else
			GIT_HEAD_ID="UNKNOWN"
		fi
	fi
	if test -z "$GIT_REPO" ; then
		GIT_HEAD_VERSION="$GIT_HEAD_ID"
	else
		# the backquotes are escaped so the command is stored as text
		GIT_HEAD_VERSION="\`GIT_DIR=$GIT_REPO git describe --always\`"
	fi
])


================================================
FILE: src/m4/ax_submodule.m4
================================================
# Helper that decides which copy of a submodule is used and stores the
# decision in a shell variable named with_ followed by argument 2.
#
# Arguments:
#   1  name of the submodule as used in option names and messages
#   2  the same name in a form usable inside shell variable names
#   3  the allowed choices; the text is used both in the help string and as
#      a shell case pattern, so alternatives are separated by a bar
#   4  the default choice
#
# Options are declared depending on argument 3: the main option only if it
# contains a bar, the prefix and exec-prefix options only if the pattern
# matches system, and the builddir option only if it contains build.
# Giving a prefix or exec prefix forces the choice system, giving a builddir
# forces the choice build, and a conflicting explicit choice is an error.
# If the resulting choice does not match argument 3, the default is applied.
# For the default bundled, the bundled copy is only selected when its
# configure script exists in the source tree; otherwise system is used if
# allowed and no if not.
AC_DEFUN([_AX_SUBMODULE],
[

m4_if(m4_bregexp($3,|,choice),choice,
	[AC_ARG_WITH($2,
		[AS_HELP_STRING([--with-$1=$3],
				[Which $1 to use [default=$4]])])])
case "system" in
$3)
	AC_ARG_WITH($2_prefix,
		    [AS_HELP_STRING([--with-$1-prefix=DIR],
				    [Prefix of $1 installation])])
	AC_ARG_WITH($2_exec_prefix,
		    [AS_HELP_STRING([--with-$1-exec-prefix=DIR],
				    [Exec prefix of $1 installation])])
esac
m4_if(m4_bregexp($3,build,build),build,
	[AC_ARG_WITH($2_builddir,
		[AS_HELP_STRING([--with-$1-builddir=DIR],
				[Location of $1 builddir])])])
# A prefix given without an exec prefix is used for both.
if test "x$with_$2_prefix" != "x" -a "x$with_$2_exec_prefix" = "x"; then
	with_$2_exec_prefix=$with_$2_prefix
fi
if test "x$with_$2_prefix" != "x" -o "x$with_$2_exec_prefix" != "x"; then
	if test "x$with_$2" != "x" -a "x$with_$2" != "xsystem"; then
		AC_MSG_ERROR([Setting $with_$2_prefix implies use of system $1])
	fi
	with_$2="system"
fi
if test "x$with_$2_builddir" != "x"; then
	if test "x$with_$2" != "x" -a "x$with_$2" != "xbuild"; then
		AC_MSG_ERROR([Setting $with_$2_builddir implies use of build $1])
	fi
	with_$2="build"
	# Obtain the source directory from config.status in the build directory.
	$2_srcdir=`echo @abs_srcdir@ | $with_$2_builddir/config.status --file=-`
	AC_MSG_NOTICE($1 sources in $$2_srcdir)
fi
# Put the pkgconfig directory of the exec prefix first in the search path.
if test "x$with_$2_exec_prefix" != "x"; then
	export PKG_CONFIG_PATH="$with_$2_exec_prefix/lib/pkgconfig${PKG_CONFIG_PATH+:$PKG_CONFIG_PATH}"
fi
case "$with_$2" in
$3)
	;;
*)
	case "$4" in
	bundled)
		if test -d $srcdir/.git -a \
			-d $srcdir/$1 -a \
			"`cd $srcdir; git submodule status $1 | cut -c1`" = '-'; then
			AC_MSG_WARN([git repo detected, but submodule $1 not initialized])
			AC_MSG_WARN([You may want to run])
			AC_MSG_WARN([	git submodule init])
			AC_MSG_WARN([	git submodule update])
			AC_MSG_WARN([	sh autogen.sh])
		fi
		if test -f $srcdir/$1/configure; then
			with_$2="bundled"
		else
			case "system" in
			$3)
				with_$2="system"
				;;
			*)
				with_$2="no"
				;;
			esac
		fi
		;;
	*)
		with_$2="$4"
		;;
	esac
	;;
esac
AC_MSG_CHECKING([which $1 to use])
AC_MSG_RESULT($with_$2)

])

# Public entry point taking the submodule name, the allowed choices and the
# default choice.  It forwards to the helper macro defined in this file and
# additionally passes the name with every character other than an
# underscore, a lowercase letter or a digit replaced by an underscore.
AC_DEFUN([AX_SUBMODULE], [
	_AX_SUBMODULE($1, m4_bpatsubst([$1],
			[[^_abcdefghijklmnopqrstuvwxyz0123456789]],[_]), $2, $3)
])


================================================
FILE: src/main.cpp
================================================
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <isl/ctx.h>
#include <isl/id.h>
#include <isl/val.h>
#include <isl/set.h>
#include <isl/union_set.h>
#include <isl/union_map.h>
#include <isl/aff.h>
#include <isl/flow.h>
#include <isl/options.h>
#include <isl/schedule.h>
#include <isl/ast.h>
#include <isl/id_to_ast_expr.h>
#include <isl/ast_build.h>
#include <isl/schedule.h>
#include <isl/arg.h>
#include <isl/options.h>
#include <pet.h>
#include "ppcg.h"
#include "ppcg_options.h"
//#include "cuda.h"
//#include "opencl.h"
//#include "cpu.h"

#include <iostream>

using namespace std;

int main(int argc, char **argv)
{
	/* Hand the command line to autosa_main_wrap and
	 * use its result as the process exit status.
	 */
	return autosa_main_wrap(argc, argv);
}


================================================
FILE: src/ocl_utilities.c
================================================
#include <stdio.h>
#include <stdlib.h>
#include "ocl_utilities.h"

/* Return the OpenCL error string for a given error number.
 *
 * The table below is indexed by the negated error code.
 * Codes that are positive, that lie beyond the end of the table or
 * that fall in a gap of the table (no entry was specified for them)
 * all yield "Unspecified Error".  The function never returns NULL,
 * so the result can be passed directly to a "%s" conversion.
 */
const char *opencl_error_string(cl_int error)
{
	int errorCount;
	const char *name;

	static const char *errorString[] = {
		[CL_SUCCESS] = "CL_SUCCESS",
		[-CL_DEVICE_NOT_FOUND] = "CL_DEVICE_NOT_FOUND",
		[-CL_DEVICE_NOT_AVAILABLE] = "CL_DEVICE_NOT_AVAILABLE",
		[-CL_COMPILER_NOT_AVAILABLE] = "CL_COMPILER_NOT_AVAILABLE",
		[-CL_MEM_OBJECT_ALLOCATION_FAILURE] =
			"CL_MEM_OBJECT_ALLOCATION_FAILURE",
		[-CL_OUT_OF_RESOURCES] = "CL_OUT_OF_RESOURCES",
		[-CL_OUT_OF_HOST_MEMORY] = "CL_OUT_OF_HOST_MEMORY",
		[-CL_PROFILING_INFO_NOT_AVAILABLE] =
			"CL_PROFILING_INFO_NOT_AVAILABLE",
		[-CL_MEM_COPY_OVERLAP] = "CL_MEM_COPY_OVERLAP",
		[-CL_IMAGE_FORMAT_MISMATCH] = "CL_IMAGE_FORMAT_MISMATCH",
		[-CL_IMAGE_FORMAT_NOT_SUPPORTED] =
			"CL_IMAGE_FORMAT_NOT_SUPPORTED",
		[-CL_BUILD_PROGRAM_FAILURE] = "CL_BUILD_PROGRAM_FAILURE",
		[-CL_MAP_FAILURE] = "CL_MAP_FAILURE",
		[-CL_INVALID_VALUE] = "CL_INVALID_VALUE",
		[-CL_INVALID_DEVICE_TYPE] = "CL_INVALID_DEVICE_TYPE",
		[-CL_INVALID_PLATFORM] = "CL_INVALID_PLATFORM",
		[-CL_INVALID_DEVICE] = "CL_INVALID_DEVICE",
		[-CL_INVALID_CONTEXT] = "CL_INVALID_CONTEXT",
		[-CL_INVALID_QUEUE_PROPERTIES] = "CL_INVALID_QUEUE_PROPERTIES",
		[-CL_INVALID_COMMAND_QUEUE] = "CL_INVALID_COMMAND_QUEUE",
		[-CL_INVALID_HOST_PTR] = "CL_INVALID_HOST_PTR",
		[-CL_INVALID_MEM_OBJECT] = "CL_INVALID_MEM_OBJECT",
		[-CL_INVALID_IMAGE_FORMAT_DESCRIPTOR] =
			"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
		[-CL_INVALID_IMAGE_SIZE] = "CL_INVALID_IMAGE_SIZE",
		[-CL_INVALID_SAMPLER] = "CL_INVALID_SAMPLER",
		[-CL_INVALID_BINARY] = "CL_INVALID_BINARY",
		[-CL_INVALID_BUILD_OPTIONS] = "CL_INVALID_BUILD_OPTIONS",
		[-CL_INVALID_PROGRAM] = "CL_INVALID_PROGRAM",
		[-CL_INVALID_PROGRAM_EXECUTABLE] =
			"CL_INVALID_PROGRAM_EXECUTABLE",
		[-CL_INVALID_KERNEL_NAME] = "CL_INVALID_KERNEL_NAME",
		[-CL_INVALID_KERNEL_DEFINITION] =
			"CL_INVALID_KERNEL_DEFINITION",
		[-CL_INVALID_KERNEL] = "CL_INVALID_KERNEL",
		[-CL_INVALID_ARG_INDEX] = "CL_INVALID_ARG_INDEX",
		[-CL_INVALID_ARG_VALUE] = "CL_INVALID_ARG_VALUE",
		[-CL_INVALID_ARG_SIZE] = "CL_INVALID_ARG_SIZE",
		[-CL_INVALID_KERNEL_ARGS] = "CL_INVALID_KERNEL_ARGS",
		[-CL_INVALID_WORK_DIMENSION] = "CL_INVALID_WORK_DIMENSION",
		[-CL_INVALID_WORK_GROUP_SIZE] = "CL_INVALID_WORK_GROUP_SIZE",
		[-CL_INVALID_WORK_ITEM_SIZE] = "CL_INVALID_WORK_ITEM_SIZE",
		[-CL_INVALID_GLOBAL_OFFSET] = "CL_INVALID_GLOBAL_OFFSET",
		[-CL_INVALID_EVENT_WAIT_LIST] = "CL_INVALID_EVENT_WAIT_LIST",
		[-CL_INVALID_EVENT] = "CL_INVALID_EVENT",
		[-CL_INVALID_OPERATION] = "CL_INVALID_OPERATION",
		[-CL_INVALID_GL_OBJECT] = "CL_INVALID_GL_OBJECT",
		[-CL_INVALID_BUFFER_SIZE] = "CL_INVALID_BUFFER_SIZE",
		[-CL_INVALID_MIP_LEVEL] = "CL_INVALID_MIP_LEVEL",
		[-CL_INVALID_GLOBAL_WORK_SIZE] = "CL_INVALID_GLOBAL_WORK_SIZE",
		[-CL_INVALID_PROPERTY] = "CL_INVALID_PROPERTY"
	};

	errorCount = sizeof(errorString) / sizeof(errorString[0]);

	/* Check the range on the original value: negating first
	 * would overflow for the most negative integer.
	 */
	if (error > 0 || error <= -errorCount)
		return "Unspecified Error";

	/* Slots without an explicit initializer are NULL. */
	name = errorString[-error];

	return name ? name : "Unspecified Error";
}

/* Find a GPU or a CPU associated with the first available platform.
 * If use_gpu is set, then this function first tries to look for a GPU
 * in the first available platform.
 * If this fails or if use_gpu is not set, then it tries to use the CPU.
 */
cl_device_id opencl_create_device(int use_gpu)
{
	cl_platform_id first_platform;
	cl_device_id device;
	int status;

	/* Only the first reported platform is ever considered. */
	status = clGetPlatformIDs(1, &first_platform, NULL);
	if (status < 0) {
		fprintf(stderr, "Error %s while looking for a platform.\n",
				opencl_error_string(status));
		exit(1);
	}

	/* When no GPU is requested, behave as if the GPU lookup
	 * had reported that no device was found.
	 */
	if (use_gpu)
		status = clGetDeviceIDs(first_platform, CL_DEVICE_TYPE_GPU,
				1, &device, NULL);
	else
		status = CL_DEVICE_NOT_FOUND;

	/* Fall back to a CPU device. */
	if (status == CL_DEVICE_NOT_FOUND)
		status = clGetDeviceIDs(first_platform, CL_DEVICE_TYPE_CPU,
				1, &device, NULL);

	if (status < 0) {
		fprintf(stderr, "Error %s while looking for a device.\n",
				opencl_error_string(status));
		exit(1);
	}

	return device;
}

/* Create an OpenCL program from a string and compile it.
 *
 * "program_source" holds "program_size" bytes of source text and
 * "opencl_options" is passed on to clBuildProgram.
 * On any failure, a message is printed to stderr and the process exits
 * with status 1.  If the build itself fails, the build log for "dev"
 * is printed as well, in so far as it can be retrieved.
 */
cl_program opencl_build_program_from_string(cl_context ctx, cl_device_id dev,
	const char *program_source, size_t program_size,
	const char *opencl_options)
{
	int err;
	cl_program program;
	char *program_log;
	size_t log_size = 0;

	program = clCreateProgramWithSource(ctx, 1,
			&program_source, &program_size, &err);
	if (err < 0) {
		fprintf(stderr, "Could not create the program\n");
		exit(1);
	}
	err = clBuildProgram(program, 0, NULL, opencl_options, NULL, NULL);
	if (err < 0) {
		fprintf(stderr, "Could not build the program.\n");
		/* First query only the size of the log.
		 * If that fails, fall back to an empty log instead of
		 * relying on an unset size.
		 */
		if (clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
				0, NULL, &log_size) < 0)
			log_size = 0;
		program_log = (char *) malloc(log_size + 1);
		if (program_log != NULL) {
			/* Terminate up front so that the buffer is a valid
			 * string whatever the second query writes.
			 */
			program_log[0] = '\0';
			program_log[log_size] = '\0';
			clGetProgramBuildInfo(program, dev,
					CL_PROGRAM_BUILD_LOG,
					log_size + 1, program_log, NULL);
			fprintf(stderr, "%s\n", program_log);
			free(program_log);
		}
		exit(1);
	}
	return program;
}

/* Create an OpenCL program from a source file and compile it.
 *
 * The entire contents of "filename" are read into memory and
 * handed to opencl_build_program_from_string together with
 * "opencl_options".
 * If the file cannot be opened, measured or read, or if memory
 * cannot be allocated, a message is printed to stderr and
 * the process exits with status 1.
 */
cl_program opencl_build_program_from_file(cl_context ctx, cl_device_id dev,
	const char* filename, const char* opencl_options)
{
	cl_program program;
	FILE *program_file;
	char *program_source;
	size_t program_size, read;
	long file_size;

	program_file = fopen(filename, "r");
	if (program_file == NULL) {
		fprintf(stderr, "Could not find the source file.\n");
		exit(1);
	}
	/* Determine the size of the file by seeking to its end.
	 * ftell reports failure as a negative value, which must not
	 * be converted to a (huge) unsigned size.
	 */
	if (fseek(program_file, 0, SEEK_END) != 0 ||
	    (file_size = ftell(program_file)) < 0) {
		fprintf(stderr, "Error while reading the kernel.\n");
		exit(1);
	}
	program_size = (size_t) file_size;
	rewind(program_file);
	program_source = (char *) malloc(program_size + 1);
	if (program_source == NULL) {
		fprintf(stderr,
			"Could not allocate memory for the kernel.\n");
		exit(1);
	}
	program_source[program_size] = '\0';
	read = fread(program_source, sizeof(char), program_size, program_file);
	if (read != program_size) {
		fprintf(stderr, "Error while reading the kernel.\n");
		exit(1);
	}
	fclose(program_file);

	program = opencl_build_program_from_string(ctx, dev, program_source,
						program_size, opencl_options);
	free(program_source);

	return program;
}


================================================
FILE: src/ocl_utilities.h
================================================
#ifndef OCL_UTILITIES_H
#define OCL_UTILITIES_H

/* The OpenCL header lives in a different directory on Apple platforms. */
#if defined(__APPLE__)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif

/* Map an OpenCL error number to its symbolic name.
 */
const char *opencl_error_string(cl_int error);

/* Pick a device from the first available platform.
 * A GPU is tried first if use_gpu is set; if that fails or
 * if use_gpu is not set, a CPU is looked for instead.
 */
cl_device_id opencl_create_device(int use_gpu);

/* Create an OpenCL program from "program_size" bytes of source text
 * in "program_source" and compile it with options "opencl_options".
 */
cl_program opencl_build_program_from_string(cl_context ctx, cl_device_id dev,
	const char *program_source, size_t program_size,
	const char *opencl_options);

/* Create an OpenCL program from the source file "filename" and
 * compile it with options "opencl_options".
 */
cl_program opencl_build_program_from_file(cl_context ctx, cl_device_id dev,
	const char *filename, const char *opencl_options);

#endif


================================================
FILE: src/opencl_test.sh.in
================================================
#!/bin/sh

# By default, the output directory is removed at the end of the script.
keep=no

# Scan the command-line arguments.  --keep is the only option that is
# recognized; any other argument is silently ignored.
for option; do
	case "$option" in
		--keep)
			keep=yes
			;;
	esac
done

# The @...@ tokens are template placeholders
# (presumably replaced when this .in file is turned into the actual
# script -- confirm against the build system).
EXEEXT=@EXEEXT@
VERSION=@GIT_HEAD_VERSION@
CC="@CC@"
CFLAGS="--std=gnu99"
srcdir="@srcdir@"

# With --keep, results go to a directory named after the version in the
# current directory; creating it fails (and aborts) if it already exists.
# Otherwise a fresh temporary directory below $TMPDIR (default /tmp) is used.
if [ $keep = "yes" ]; then
	OUTDIR="opencl_test.$VERSION"
	mkdir "$OUTDIR" || exit 1
else
	if test "x$TMPDIR" = "x"; then
		TMPDIR=/tmp
	fi
	OUTDIR=`mktemp -d $TMPDIR/ppcg.XXXXXXXXXX` || exit 1
fi

# run_tests SUBDIR [PPCG_OPTIONS]
#
# For every C file in $srcdir/tests: generate OpenCL host code with ppcg
# (using PPCG_OPTIONS in addition to the fixed options below), compile
# the result together with ocl_utilities.c and run the resulting binary.
# All outputs are placed in ${OUTDIR}/SUBDIR.
# The script exits as soon as any of these steps fails.
run_tests () {
	subdir=$1
	ppcg_options=$2

	echo Test with PPCG options \'$ppcg_options\'
	mkdir ${OUTDIR}/${subdir} || exit 1
	for i in $srcdir/tests/*.c; do
		echo $i
		name=`basename $i`
		name="${name%.c}"
		out_c="${OUTDIR}/${subdir}/$name.ppcg.c"
		out="${OUTDIR}/${subdir}/$name.ppcg$EXEEXT"
		options="--target=opencl --opencl-no-use-gpu $ppcg_options"
		# A test may come with a file of extra OpenCL functions;
		# if so, pass it along to ppcg.
		functions="$srcdir/tests/${name}_opencl_functions.cl"
		if test -f $functions; then
			options="$options --opencl-include-file=$functions"
			options="$options --opencl-compiler-options=-I."
		fi
		# $options is deliberately left unquoted so that it is
		# split into separate arguments.
		./ppcg$EXEEXT $options $i -o "$out_c" || exit
		$CC $CFLAGS -I "$srcdir" "$srcdir/ocl_utilities.c" -lOpenCL \
			-I. "$out_c" -o "$out" || exit
		$out || exit
	done
}

# Run the tests in $srcdir/tests twice: once with the default options and
# once with the kernel code embedded.
run_tests default
run_tests embed --opencl-embed-kernel-code

# For every example, build both the original program and the
# ppcg-generated OpenCL version, run both and check that they
# produce exactly the same standard output.
for i in $srcdir/examples/*.c; do
	echo $i
	name=`basename $i`
	name="${name%.c}"
	exe_ref="${OUTDIR}/$name.ref$EXEEXT"
	gen_ocl="${OUTDIR}/$name.ppcg.c"
	exe_ocl="${OUTDIR}/$name.ppcg$EXEEXT"
	output_ref="${OUTDIR}/$name.ref.out"
	output_ocl="${OUTDIR}/$name.ppcg.out"
	$CC $CFLAGS $i -o $exe_ref || exit
	./ppcg$EXEEXT --target=opencl --opencl-no-use-gpu $i -o "$gen_ocl" || \
		exit
	$CC $CFLAGS -I "$srcdir" "$srcdir/ocl_utilities.c" -lOpenCL \
		"$gen_ocl" -o "$exe_ocl" || exit
	$exe_ref > $output_ref || exit
	$exe_ocl > $output_ocl || exit
	cmp $output_ref $output_ocl || exit
done

# This point is only reached if everything succeeded; clean up
# unless the user asked to keep the results.
if [ $keep = "no" ]; then
	rm -r "${OUTDIR}"
fi


================================================
FILE: src/polybench_test.sh.in
================================================
#!/bin/sh

# Defaults: remove the output directory at the end and
# do not echo the commands that are run.
keep=no
verbose=no

# Scan the command-line arguments.  Only --keep and --verbose are
# recognized; any other argument is silently ignored.
for option; do
	case "$option" in
		--keep)
			keep=yes
			;;
		--verbose)
			verbose=yes
			;;
	esac
done

# The @...@ tokens are template placeholders
# (presumably replaced when this .in file is turned into the actual
# script -- confirm against the build system).
EXEEXT=@EXEEXT@
DIR=@POLYBENCH_DIR@
VERSION=@GIT_HEAD_VERSION@
SIZE=-DMINI_DATASET
CC="@CC@"
HAVE_OPENCL=@HAVE_OPENCL@
HAVE_OPENMP=@HAVE_OPENMP@
srcdir="@srcdir@"
# With --keep, results go to a directory named after the version in the
# current directory; creating it fails (and aborts) if it already exists.
# Otherwise a fresh temporary directory below $TMPDIR (default /tmp) is used.
if [ $keep = "yes" ]; then
	OUTDIR="out.$VERSION"
	mkdir "$OUTDIR" || exit 1
else
	if test "x$TMPDIR" = "x"; then
		TMPDIR=/tmp
	fi
	OUTDIR=`mktemp -d $TMPDIR/ppcg.XXXXXXXXXX` || exit 1
fi
# Preprocessor flags shared by ppcg and the C compiler.
CPPFLAGS="-DPOLYBENCH_USE_C99_PROTO -DPOLYBENCH_DUMP_ARRAYS"
CPPFLAGS="$CPPFLAGS $SIZE -I $DIR/utilities"
CFLAGS="-lm --std=gnu99"

echo "Running tests in folder ${OUTDIR}"

# run_tests EXT [PPCG_OPTIONS [CC_OPTIONS]]
#
# For every benchmark listed in $DIR/utilities/benchmark_list:
# transform it with ppcg using PPCG_OPTIONS, compile both the original
# and the transformed source (the latter with the extra CC_OPTIONS),
# run both and compare what they write to standard error.
# EXT is used to name the files derived from the transformed source.
run_tests () {
	ext=$1

	ppcg_options=$2
	cc_options=$3

	# Printable forms of the (possibly empty) option strings.
	if [ "x$ppcg_options" = "x" ]; then
		ppcg_option_str="none"
	else
		ppcg_option_str=$ppcg_options
	fi

	if [ "x$cc_options" = "x" ]; then
		cc_option_str="none"
	else
		cc_option_str=$cc_options
	fi

	echo Test: $ext, ppcg options: $ppcg_option_str, CC options: $cc_option_str
	for i in `cat $DIR/utilities/benchmark_list`; do
		echo $i
		name=`basename $i`
		name=${name%.c}
		source_opt="${OUTDIR}/$name.$ext.c"
		prog_orig=${OUTDIR}/$name.orig${EXEEXT}
		prog_opt=${OUTDIR}/$name.$ext${EXEEXT}
		output_orig=${OUTDIR}/$name.orig.out
		output_opt=${OUTDIR}/$name.$ext.out
		dir=`dirname $i`
		if [ $verbose = "yes" ]; then
			echo ./ppcg$EXEEXT -I $DIR/$dir $DIR/$i \
				$CPPFLAGS -o $source_opt $ppcg_options
		fi
		./ppcg$EXEEXT -I $DIR/$dir $DIR/$i $CPPFLAGS \
			-o $source_opt $ppcg_options || exit
		# NOTE(review): unlike the other steps, building and running
		# the original program is not followed by "|| exit" --
		# confirm whether that is intentional.
		$CC -I $DIR/$dir $CPPFLAGS $DIR/$i -o $prog_orig \
			$DIR/utilities/polybench.c $CFLAGS
		$prog_orig 2> $output_orig
		if [ $verbose = "yes" ]; then
			echo $CC -I $DIR/$dir $CPPFLAGS $source_opt \
				-o $prog_opt $DIR/utilities/polybench.c \
				$CFLAGS $cc_options
		fi
		$CC -I $DIR/$dir $CPPFLAGS $source_opt -o $prog_opt \
			$DIR/utilities/polybench.c $CFLAGS $cc_options || exit

		# The compared output is what the programs write to stderr.
		$prog_opt 2> $output_opt || exit
		cmp $output_orig $output_opt || exit
	done
}

# C target with tiling, with and without live-range reordering.
run_tests ppcg "--target=c --tile"
run_tests ppcg_live "--target=c --no-live-range-reordering --tile"

# Test OpenMP code, if compiler supports openmp
if [ $HAVE_OPENMP = "yes" ]; then
	run_tests ppcg_omp "--target=c --openmp" -fopenmp
	# Report the number of lines in the output directory
	# that contain "omp parallel".
	echo Introduced `grep -R 'omp parallel' "${OUTDIR}" | wc -l` '"pragma omp parallel for"'
else
	echo Compiler does not support OpenMP. Skipping OpenMP tests.
fi

# The OpenCL tests need ocl_utilities.c and the OpenCL library
# at compile time.
if [ $HAVE_OPENCL = "yes" ]; then
	run_tests ppcg_opencl "--target=opencl --opencl-no-use-gpu" \
				"-I $srcdir $srcdir/ocl_utilities.c -lOpenCL"
fi

# This point is only reached if no step aborted the script; clean up
# unless the user asked to keep the results.
if [ $keep = "no" ]; then
	rm -r "${OUTDIR}"
fi


================================================
FILE: src/ppcg.c
================================================
/*
 * Copyright 2011      INRIA Saclay
 * Copyright 2013      Ecole Normale Superieure
 * Copyright 2015      Sven Verdoolaege
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
 * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
 * 91893 Orsay, France
 * and Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <isl/ctx.h>
#include <isl/id.h>
#include <isl/val.h>
#include <isl/set.h>
#include <isl/union_set.h>
#include <isl/union_map.h>
#include <isl/space.h>
#include <isl/aff.h>
#include <isl/flow.h>
#include <isl/options.h>
#include <isl/schedule.h>
#include <isl/ast.h>
#include <isl/id_to_ast_expr.h>
#include <isl/ast_build.h>
#include <isl/schedule.h>
#include <isl/constraint.h>
#include <pet.h>
#include <math.h>
#include "ppcg.h"
#include "ppcg_options.h"
//#include "cuda.h"
//#include "opencl.h"
//#include "cpu.h"
#include "autosa_xilinx_hls_c.h"
#include "autosa_intel_opencl.h"
#include "autosa_catapult_hls_c.h"
#include "autosa_tapa_cpp.h"

//#define _DEBUG

struct options {
	struct pet_options *pet;
	struct ppcg_options *ppcg;
	char *input;
	char *output;
};

//const char *ppcg_version(void);
//static void print_version(void)
//{
//	printf("%s", ppcg_version());
//}

/* Description of the command line arguments stored in struct options:
 * the pet and ppcg child option groups, the -o output filename and
 * the "input" argument.
 * The version option is currently disabled.
 */
ISL_ARGS_START(struct options, options_args)
ISL_ARG_CHILD(struct options, pet, "pet", &pet_options_args, "pet options")
ISL_ARG_CHILD(struct options, ppcg, NULL, &ppcg_options_args, "ppcg options")
ISL_ARG_STR(struct options, output, 'o', NULL,
	"filename", NULL, "output filename (c and opencl targets)")
ISL_ARG_ARG(struct options, input, "input", NULL)
//ISL_ARG_VERSION(print_version)
ISL_ARGS_END

/* Instantiate the isl-provided helpers for "options" based on options_args
 * (see isl/arg.h for what exactly ISL_ARG_DEF generates).
 */
ISL_ARG_DEF(options, struct options, options_args)

/* Return a pointer to the final path component of "filename" or
 * to "filename" itself if it does not contain any components.
 */
const char *ppcg_base_name(const char *filename)
{
	const char *last_slash = strrchr(filename, '/');

	/* No separator at all: the whole string is the final component. */
	if (last_slash == NULL)
		return filename;

	/* The final component starts right after the last separator. */
	return last_slash + 1;
}

/* Copy the base name of "input" to "name" and return its length.
 * "name" is not NUL-terminated.
 *
 * In particular, remove all leading directory components and
 * the final extension, if any.
 */
int ppcg_extract_base_name(char *name, const char *input)
{
	const char *start = ppcg_base_name(input);
	const char *last_dot = strrchr(start, '.');
	int n;

	/* Stop at the final '.', if there is one;
	 * otherwise take the entire final path component.
	 */
	if (last_dot)
		n = last_dot - start;
	else
		n = strlen(start);

	/* Copy the characters only; no terminator is written. */
	memcpy(name, start, n);

	return n;
}

/* Does "scop" refer to any arrays that are declared, but not
 * exposed to the code after the scop?
 */
int ppcg_scop_any_hidden_declarations(struct ppcg_scop *scop)
{
	int pos = 0;

	/* A missing scop has no arrays at all. */
	if (!scop)
		return 0;

	while (pos < scop->pet->n_array) {
		/* Skip arrays that are not declared or that are exposed. */
		if (!scop->pet->arrays[pos]->declared ||
		    scop->pet->arrays[pos]->exposed) {
			++pos;
			continue;
		}
		return 1;
	}

	return 0;
}

/* Collect all variable names that are in use in "scop".
 * In particular, collect all parameters in the context and
 * all the array names.
 * Store these names in an isl_id_to_ast_expr by mapping
 * them to a dummy value (0).
 */
static __isl_give isl_id_to_ast_expr *collect_names(struct pet_scop *scop)
{
	int pos, n_param;
	isl_ctx *ctx = isl_set_get_ctx(scop->context);
	isl_ast_expr *dummy;
	isl_id_to_ast_expr *table;

	n_param = isl_set_dim(scop->context, isl_dim_param);

	/* One entry per parameter plus one entry per array. */
	table = isl_id_to_ast_expr_alloc(ctx, n_param + scop->n_array);
	/* Every name is mapped to (a copy of) the same dummy value 0. */
	dummy = isl_ast_expr_from_val(isl_val_zero(ctx));

	/* The parameters of the context. */
	for (pos = 0; pos < n_param; ++pos) {
		isl_id *param;

		param = isl_set_get_dim_id(scop->context, isl_dim_param, pos);
		table = isl_id_to_ast_expr_set(table, param,
						isl_ast_expr_copy(dummy));
	}

	/* The names of the arrays, taken from their extents. */
	for (pos = 0; pos < scop->n_array; ++pos) {
		isl_id *array_id;

		array_id = isl_set_get_tuple_id(scop->arrays[pos]->extent);
		table = isl_id_to_ast_expr_set(table, array_id,
						isl_ast_expr_copy(dummy));
	}

	isl_ast_expr_free(dummy);

	return table;
}

/* Return an isl_id called "prefix%d", with "%d" set to "i".
 * If an isl_id with such a name already appears among the variable names
 * of "scop", then adjust the name to "prefix%d_%d".
 *
 * The buffer holding the name is sized according to the length of "prefix"
 * such that the name is never truncated.  A truncated name could keep
 * colliding with an existing name no matter how the suffix is changed,
 * in which case the search below would not terminate.
 *
 * Return NULL if the buffer cannot be allocated or
 * if the lookup in scop->names fails.
 */
static __isl_give isl_id *generate_name(struct ppcg_scop *scop,
	const char *prefix, int i)
{
	int j;
	char *name;
	size_t size;
	isl_ctx *ctx;
	isl_id *id;
	int has_name;

	/* Room for the prefix, two decimal integers (at most
	 * 3 * sizeof(int) digits and a sign each), the '_' separator and
	 * the terminating NUL.
	 */
	size = strlen(prefix) + 2 * (3 * sizeof(int) + 1) + 2;
	name = malloc(size);
	if (!name)
		return NULL;

	ctx = isl_set_get_ctx(scop->context);
	snprintf(name, size, "%s%d", prefix, i);
	id = isl_id_alloc(ctx, name, NULL);

	/* Keep trying "prefix%d_0", "prefix%d_1", ... until
	 * the name is no longer in use (or an error occurs).
	 */
	j = 0;
	while ((has_name = isl_id_to_ast_expr_has(scop->names, id)) == 1) {
		isl_id_free(id);
		snprintf(name, size, "%s%d_%d", prefix, i, j++);
		id = isl_id_alloc(ctx, name, NULL);
	}

	/* The identifier does not refer to this buffer: the original code
	 * already returned an identifier created from a local array.
	 */
	free(name);

	return has_name < 0 ? isl_id_free(id) : id;
}

/* Return a list of "n" isl_ids of the form "prefix%d".
 * If an isl_id with such a name already appears among the variable names
 * of "scop", then adjust the name to "prefix%d_%d".
 */
__isl_give isl_id_list *ppcg_scop_generate_names(struct ppcg_scop *scop,
	int n, const char *prefix)
{
	int pos;
	isl_id_list *list;

	list = isl_id_list_alloc(isl_set_get_ctx(scop->context), n);

	/* Append one generated name per position, in order. */
	for (pos = 0; pos < n; ++pos)
		list = isl_id_list_add(list,
					generate_name(scop, prefix, pos));

	return list;
}

/* Is "stmt" not a kill statement?
 */
static int is_not_kill(struct pet_stmt *stmt)
{
	/* Invert the answer of pet_stmt_is_kill. */
	if (pet_stmt_is_kill(stmt))
		return 0;

	return 1;
}

/* Collect the iteration domains of the statements in "scop" that
 * satisfy "pred".
 *
 * "pred" returns a positive value if the statement should be included,
 * zero if it should be skipped and a negative value on error.
 *
 * Return NULL if "scop" is NULL, if "pred" reports an error or
 * if a selected statement has data dependent conditions.
 */
static __isl_give isl_union_set *collect_domains(struct pet_scop *scop,
	int (*pred)(struct pet_stmt *stmt))
{
	int i;
	isl_set *domain_i;
	isl_union_set *domain;

	if (!scop)
		return NULL;

	domain = isl_union_set_empty(isl_set_get_space(scop->context));

	for (i = 0; i < scop->n_stmt; ++i) {
		struct pet_stmt *stmt = scop->stmts[i];
		int selected;

		selected = pred(stmt);
		/* Propagate an error instead of treating it
		 * as a positive answer.
		 */
		if (selected < 0)
			return isl_union_set_free(domain);
		if (!selected)
			continue;

		if (stmt->n_arg > 0)
			isl_die(isl_union_set_get_ctx(domain),
				isl_error_unsupported,
				"data dependent conditions not supported",
				return isl_union_set_free(domain));

		domain_i = isl_set_copy(scop->stmts[i]->domain);
		domain = isl_union_set_add_set(domain, domain_i);
	}

	return domain;
}

/* Collect the iteration domains of the statements in "scop",
 * skipping kill statements.
 */
static __isl_give isl_union_set *collect_non_kill_domains(struct pet_scop *scop)
{
	/* Select exactly those statements for which is_not_kill holds. */
	return collect_domains(scop, is_not_kill);
}

/* This function is used as a callback to pet_expr_foreach_call_expr
 * to detect if there is any call expression in the input expression.
 * Assign the value 1 to the integer that "user" points to and
 * abort the search since we have found what we were looking for.
 */
static int set_has_call(__isl_keep pet_expr *expr, void *user)
{
	/* "user" points to the integer flag to be raised. */
	*(int *) user = 1;

	/* A negative return value aborts the traversal. */
	return -1;
}

/* Does "expr" contain any call expressions?
 */
static int expr_has_call(__isl_keep pet_expr *expr)
{
	int found = 0;
	int r;

	r = pet_expr_foreach_call_expr(expr, &set_has_call, &found);

	/* A negative result together with a raised flag just means
	 * that the callback aborted the search.
	 */
	if (found)
		return found;

	/* A negative result without a raised flag is a genuine error. */
	return r < 0 ? -1 : 0;
}

/* This function is a callback for pet_tree_foreach_expr.
 * If "expr" contains any call (sub)expressions, then set *has_call
 * and abort the search.
 */
static int check_call(__isl_keep pet_expr *expr, void *user)
{
	int *flag = user;

	/* Any non-zero answer from expr_has_call raises the flag. */
	if (expr_has_call(expr) != 0)
		*flag = 1;

	/* Stop looking once the flag has been raised. */
	if (*flag)
		return -1;

	return 0;
}

/* Does "stmt" contain any call expressions?
 */
static int has_call(struct pet_stmt *stmt)
{
	int found = 0;
	int r;

	r = pet_tree_foreach_expr(stmt->body, &check_call, &found);

	/* A negative result together with a raised flag just means
	 * that the callback aborted the search.
	 */
	if (found)
		return found;

	/* A negative result without a raised flag is a genuine error. */
	return r < 0 ? -1 : 0;
}

/* Collect the iteration domains of the statements in "scop"
 * that contain a call expression.
 */
static __isl_give isl_union_set *collect_call_domains(struct pet_scop *scop)
{
	/* Select exactly those statements for which has_call holds. */
	return collect_domains(scop, has_call);
}

/* Given a union of "tagged" access relations of the form
 *
 *	[S_i[...] -> R_j[]] -> A_k[...]
 *
 * project out the "tags" (R_j[]).
 * That is, return a union of relations of the form
 *
 *	S_i[...] -> A_k[...]
 */
static __isl_give isl_union_map *project_out_tags(
	__isl_take isl_union_map *umap)
{
	/* The tags form the range of the wrapped relation in the domain,
	 * so keeping only the domain of that relation removes them.
	 */
	umap = isl_union_map_domain_factor_domain(umap);

	return umap;
}

/* Construct a function from tagged iteration domains to the corresponding
 * untagged iteration domains with as range of the wrapped map in the domain
 * the reference tags that appear in any of the reads, writes or kills.
 * Store the result in ps->tagger.
 *
 * For example, if the statement with iteration space S[i,j]
 * contains two array references R_1[] and R_2[], then ps->tagger will contain
 *
 *	{ [S[i,j] -> R_1[]] -> S[i,j]; [S[i,j] -> R_2[]] -> S[i,j] }
 */
static void compute_tagger(struct ppcg_scop *ps)
{
	isl_union_map *accesses;

	/* Combine the tagged reads, may-writes and must-kills. */
	accesses = isl_union_map_union(
				isl_union_map_copy(ps->tagged_reads),
				isl_union_map_copy(ps->tagged_may_writes));
	accesses = isl_union_map_union(accesses,
				isl_union_map_copy(ps->tagged_must_kills));

	/* Only the spaces matter, so drop all constraints first, and then
	 * turn the tagged domains back into relations from
	 * statement instances to reference tags.
	 */
	accesses = isl_union_map_universe(accesses);
	accesses = isl_union_set_unwrap(isl_union_map_domain(accesses));

	/* The tagger is the function mapping each such pair
	 * to its domain element.
	 */
	ps->tagger = isl_union_map_domain_map_union_pw_multi_aff(accesses);
}

/* Compute the live out accesses, i.e., the writes that are
 * potentially not killed by any kills or any other writes, and
 * store them in ps->live_out.
 *
 * We compute the "dependence" of any "kill" (an explicit kill
 * or a must write) on any may write.
 * The elements accessed by the may writes with a "depending" kill
 * also accessing the element are definitely killed.
 * The remaining may writes can potentially be live out.
 *
 * The result of the dependence analysis is
 *
 *	{ IW -> [IK -> A] }
 *
 * with IW the instance of the write statement, IK the instance of kill
 * statement and A the element that was killed.
 * The range factor range is
 *
 *	{ IW -> A }
 *
 * containing all such pairs for which there is a kill statement instance,
 * i.e., all pairs that have been killed.
 */
static void compute_live_out(struct ppcg_scop *ps)
{
	isl_union_map *killers;
	isl_union_map *killed;
	isl_union_map *candidates;
	isl_union_access_info *info;
	isl_union_flow *flow;

	/* Both must-writes and explicit kills act as "kills" here. */
	killers = isl_union_map_union(isl_union_map_copy(ps->must_writes),
				      isl_union_map_copy(ps->must_kills));

	/* Compute the "dependence" of the kills on the may-writes. */
	info = isl_union_access_info_from_sink(killers);
	info = isl_union_access_info_set_may_source(info,
				    isl_union_map_copy(ps->may_writes));
	info = isl_union_access_info_set_schedule(info,
				    isl_schedule_copy(ps->schedule));
	flow = isl_union_access_info_compute_flow(info);
	killed = isl_union_flow_get_full_may_dependence(flow);
	isl_union_flow_free(flow);

	/* Keep only the (write instance, element) pairs that were killed and
	 * remove them from the may-writes.
	 */
	killed = isl_union_map_range_factor_range(killed);
	candidates = isl_union_map_copy(ps->may_writes);
	ps->live_out = isl_union_map_subtract(candidates, killed);
}

/* Compute the tagged flow dependences and the live_in accesses and store
 * the results in ps->tagged_dep_flow and ps->live_in.
 *
 * Both must-writes and must-kills are allowed to kill dependences
 * from earlier writes to subsequent reads.
 * The must-kills are not included in the potential sources, though.
 * The flow dependences with a must-kill as source would
 * reflect possibly uninitialized reads.
 * No dependences need to be introduced to protect such reads
 * (other than those imposed by potential flows from may writes
 * that follow the kill).  Those flow dependences are therefore not needed.
 * The dead code elimination also assumes
 * the flow sources are non-kill instances.
 */
static void compute_tagged_flow_dep_only(struct ppcg_scop *ps)
{
	isl_schedule *tagged_schedule;
	isl_union_map *killers;
	isl_union_access_info *info;
	isl_union_flow *flow;

	/* Express the schedule in terms of tagged statement instances. */
	tagged_schedule = isl_schedule_pullback_union_pw_multi_aff(
				isl_schedule_copy(ps->schedule),
				isl_union_pw_multi_aff_copy(ps->tagger));

	/* Must-kills and must-writes both kill earlier writes. */
	killers = isl_union_map_union(
				isl_union_map_copy(ps->tagged_must_kills),
				isl_union_map_copy(ps->tagged_must_writes));

	/* The reads are the sinks; only the may-writes are sources. */
	info = isl_union_access_info_from_sink(
				isl_union_map_copy(ps->tagged_reads));
	info = isl_union_access_info_set_kill(info, killers);
	info = isl_union_access_info_set_may_source(info,
				isl_union_map_copy(ps->tagged_may_writes));
	info = isl_union_access_info_set_schedule(info, tagged_schedule);
	flow = isl_union_access_info_compute_flow(info);

	ps->tagged_dep_flow = isl_union_flow_get_may_dependence(flow);
	/* Reads without any source are live-in; store them untagged. */
	ps->live_in = project_out_tags(
				isl_union_flow_get_may_no_source(flow));
	isl_union_flow_free(flow);
}

/* Compute ps->dep_flow from ps->tagged_dep_flow
 * by projecting out the reference tags.
 */
static void derive_flow_dep_from_tagged_flow_dep(struct ppcg_scop *ps)
{
	ps->dep_flow = isl_union_map_copy(ps->tagged_dep_flow);
	ps->dep_flow = isl_union_map_factor_domain(ps->dep_flow);
}

/* Compute the flow dependences and the live_in accesses and store
 * the results in ps->dep_flow and ps->live_in.
 * A copy of the flow dependences, tagged with the reference tags
 * is stored in ps->tagged_dep_flow.
 *
 * We first compute ps->tagged_dep_flow, i.e., the tagged flow dependences
 * and then project out the tags.
 */
static void compute_tagged_flow_dep(struct ppcg_scop *ps)
{
	/* First the tagged version (this also sets ps->live_in) ... */
	compute_tagged_flow_dep_only(ps);

	/* ... then the untagged version derived from it. */
	derive_flow_dep_from_tagged_flow_dep(ps);
}

/* Compute the order dependences that prevent the potential live ranges
 * from overlapping.
 *
 * In particular, construct a union of relations
 *
 *	[R[...] -> R_1[]] -> [W[...] -> R_2[]]
 *
 * where [R[...] -> R_1[]] is the range of one or more live ranges
 * (i.e., a read) and [W[...] -> R_2[]] is the domain of one or more
 * live ranges (i.e., a write).  Moreover, the read and the write
 * access the same memory element and the read occurs before the write
 * in the original schedule.
 * The scheduler allows some of these dependences to be violated, provided
 * the adjacent live ranges are all local (i.e., their domain and range
 * are mapped to the same point by the current schedule band).
 *
 * Note that if a live range is not local, then we need to make
 * sure it does not overlap with _any_ other live range, and not
 * just with the "previous" and/or the "next" live range.
 * We therefore add order dependences between reads and
 * _any_ later potential write.
 *
 * We also need to be careful about writes without a corresponding read.
 * They are already prevented from moving past non-local preceding
 * intervals, but we also need to prevent them from moving past non-local
 * following intervals.  We therefore also add order dependences from
 * potential writes that do not appear in any intervals
 * to all later potential writes.
 * Note that dead code elimination should have removed most of these
 * dead writes, but the dead code elimination may not remove all dead writes,
 * so we need to consider them to be safe.
 *
 * The order dependences are computed by computing the "dataflow"
 * from the above unmatched writes and the reads to the may writes.
 * The unmatched writes and the reads are treated as may sources
 * such that they would not kill order dependences from earlier
 * such writes and reads.
 */
static void compute_order_dependences(struct ppcg_scop *ps)
{
	isl_union_map *sources;
	isl_union_map *unmatched_writes;
	isl_union_map *order;
	isl_schedule *tagged_schedule;
	isl_union_access_info *info;
	isl_union_flow *flow;

	/* Express the schedule in terms of tagged statement instances. */
	tagged_schedule = isl_schedule_pullback_union_pw_multi_aff(
				isl_schedule_copy(ps->schedule),
				isl_union_pw_multi_aff_copy(ps->tagger));

	/* The may-writes that are not the source of any flow dependence. */
	unmatched_writes = isl_union_map_subtract_domain(
		isl_union_map_copy(ps->tagged_may_writes),
		isl_union_map_domain(isl_union_map_copy(ps->tagged_dep_flow)));

	/* The reads and the unmatched writes are the (may) sources;
	 * all may-writes are the sinks.
	 */
	sources = isl_union_map_union(isl_union_map_copy(ps->tagged_reads),
					unmatched_writes);
	info = isl_union_access_info_from_sink(
				isl_union_map_copy(ps->tagged_may_writes));
	info = isl_union_access_info_set_may_source(info, sources);
	info = isl_union_access_info_set_schedule(info, tagged_schedule);
	flow = isl_union_access_info_compute_flow(info);
	order = isl_union_flow_get_may_dependence(flow);
	isl_union_flow_free(flow);

	/* Store the result both with and without the reference tags. */
	ps->tagged_dep_order = isl_union_map_copy(order);
	ps->dep_order = isl_union_map_factor_domain(order);
}

/* Compute those validity dependences of the program represented by "scop"
 * that should be unconditionally enforced even when live-range reordering
 * is used.
 *
 * In particular, compute the external false dependences
 * as well as order dependences between sources with the same sink.
 * The anti-dependences are already taken care of by the order dependences.
 * The external false dependences are only used to ensure that live-in and
 * live-out data is not overwritten by any writes inside the scop.
 * The independences are removed from the external false dependences,
 * but not from the order dependences between sources with the same sink.
 *
 * In particular, the reads from live-in data need to precede any
 * later write to the same memory element.
 * As to live-out data, the last writes need to remain the last writes.
 * That is, any earlier write in the original schedule needs to precede
 * the last write to the same memory element in the computed schedule.
 * The possible last writes have been computed by compute_live_out.
 * They may include kills, but if the last access is a kill,
 * then the corresponding dependences will effectively be ignored
 * since we do not schedule any kill statements.
 *
 * Note that the set of live-in and live-out accesses may be
 * an overapproximation.  There may therefore be potential writes
 * before a live-in access and after a live-out access.
 *
 * In the presence of may-writes, there may be multiple live-ranges
 * with the same sink, accessing the same memory element.
 * The sources of these live-ranges need to be executed
 * in the same relative order as in the original program
 * since we do not know which of the may-writes will actually
 * perform a write.  Consider all sources that share a sink and
 * that may write to the same memory element and compute
 * the order dependences among them.
 */
static void compute_forced_dependences(struct ppcg_scop *ps)
{
	isl_union_access_info *info;
	isl_union_flow *flow;
	isl_union_map *forced;
	isl_union_map *after_live_in;
	isl_union_map *same_sink;
	isl_union_map *sink_refs;

	/* Live-out side: may-writes that reach a possible last write. */
	info = isl_union_access_info_from_sink(
				isl_union_map_copy(ps->live_out));
	info = isl_union_access_info_set_may_source(info,
				isl_union_map_copy(ps->may_writes));
	info = isl_union_access_info_set_schedule(info,
				isl_schedule_copy(ps->schedule));
	flow = isl_union_access_info_compute_flow(info);
	forced = isl_union_flow_get_may_dependence(flow);
	isl_union_flow_free(flow);

	/* Live-in side: live-in accesses followed by a may-write. */
	info = isl_union_access_info_from_sink(
				isl_union_map_copy(ps->may_writes));
	info = isl_union_access_info_set_may_source(info,
				isl_union_map_copy(ps->live_in));
	info = isl_union_access_info_set_schedule(info,
				isl_schedule_copy(ps->schedule));
	flow = isl_union_access_info_compute_flow(info);
	after_live_in = isl_union_flow_get_may_dependence(flow);
	isl_union_flow_free(flow);

	/* Only these external false dependences are filtered through
	 * the independences; the order dependences added below are not.
	 */
	forced = isl_union_map_union(forced, after_live_in);
	forced = isl_union_map_subtract(forced,
				isl_union_map_copy(ps->independence));

	/* Pair every flow source with its sink and the written element,
	 * then order the sources that share such a pair.
	 */
	sink_refs = isl_union_map_range_product(
				isl_union_map_copy(ps->tagged_dep_flow),
				isl_union_map_copy(ps->tagged_may_writes));
	sink_refs = isl_union_map_domain_factor_domain(sink_refs);
	info = isl_union_access_info_from_sink(
				isl_union_map_copy(sink_refs));
	info = isl_union_access_info_set_may_source(info, sink_refs);
	info = isl_union_access_info_set_schedule(info,
				isl_schedule_copy(ps->schedule));
	flow = isl_union_access_info_compute_flow(info);
	same_sink = isl_union_flow_get_may_dependence(flow);
	isl_union_flow_free(flow);

	ps->dep_forced = isl_union_map_union(forced, same_sink);
}

/* Remove independence from the tagged flow dependences.
 * Since the user has guaranteed that source and sink of an independence
 * can be executed in any order, there cannot be a flow dependence
 * between them, so they can be removed from the set of flow dependences.
 * However, if the source of such a flow dependence is a must write,
 * then it may have killed other potential sources, which would have
 * to be recovered if we were to remove those flow dependences.
 * We therefore keep the flow dependences that originate in a must write,
 * even if it corresponds to a known independence.
 */
static void remove_independences_from_tagged_flow(struct ppcg_scop *ps)
{
	isl_union_set *indep_pairs;
	isl_union_set *must_write_refs;
	isl_union_map *removable;

	indep_pairs = isl_union_map_wrap(
				isl_union_map_copy(ps->independence));
	must_write_refs = isl_union_map_domain(
				isl_union_map_copy(ps->tagged_must_writes));

	/* Zip so that the statement pair ends up in the domain, keep the
	 * pairs that are declared independent, and zip back.
	 */
	removable = isl_union_map_zip(
				isl_union_map_copy(ps->tagged_dep_flow));
	removable = isl_union_map_intersect_domain(removable, indep_pairs);
	removable = isl_union_map_zip(removable);

	/* Dependences whose source is a must-write are kept. */
	removable = isl_union_map_subtract_domain(removable, must_write_refs);

	ps->tagged_dep_flow = isl_union_map_subtract(ps->tagged_dep_flow,
				removable);
}

/* Compute the dependences of the program represented by "scop"
 * in case live range reordering is allowed.
 *
 * We compute the actual live ranges and the corresponding order
 * false dependences.
 *
 * The independences are removed from the flow dependences
 * (provided the source is not a must-write) as well as
 * from the external false dependences (by compute_forced_dependences).
 */
static void compute_live_range_reordering_dependences(struct ppcg_scop *ps)
{
	/* Defined outside this view; presumably fills ps->tagged_dep_flow,
	 * which the next step reads and updates.
	 */
	compute_tagged_flow_dep_only(ps);
	/* Drop user-declared independences whose source is not a must-write. */
	remove_independences_from_tagged_flow(ps);
	/* Defined outside this view; by its name it derives ps->dep_flow
	 * from the tagged flow dependences.
	 */
	derive_flow_dep_from_tagged_flow_dep(ps);
	/* Sets ps->tagged_dep_order and ps->dep_order. */
	compute_order_dependences(ps);
	/* Sets ps->dep_forced; reads ps->tagged_dep_flow computed above. */
	compute_forced_dependences(ps);
}

/* Compute the potential flow dependences and the potential live in
 * accesses.
 *
 * Both must-writes and must-kills are allowed to kill dependences
 * from earlier writes to subsequent reads, as in compute_tagged_flow_dep_only.
 */
static void compute_flow_dep(struct ppcg_scop *ps)
{
	isl_union_map *killers;
	isl_union_access_info *info;
	isl_union_flow *flow;

	/* Must-kills and must-writes both terminate earlier sources. */
	killers = isl_union_map_union(isl_union_map_copy(ps->must_kills),
				isl_union_map_copy(ps->must_writes));

	info = isl_union_access_info_from_sink(isl_union_map_copy(ps->reads));
	info = isl_union_access_info_set_kill(info, killers);
	info = isl_union_access_info_set_may_source(info,
				isl_union_map_copy(ps->may_writes));
	info = isl_union_access_info_set_schedule(info,
				isl_schedule_copy(ps->schedule));
	flow = isl_union_access_info_compute_flow(info);

	/* Reads without any source are the potential live-in accesses. */
	ps->live_in = isl_union_flow_get_may_no_source(flow);
	ps->dep_flow = isl_union_flow_get_may_dependence(flow);
	isl_union_flow_free(flow);
}

/* Callback for isl_union_map_every_map.
 *
 * "user" points to a tagged read access of the form
 * { [S1[] -> pet_ref1[]] -> A[] } and "map" is one tagged flow dependence
 * of the form { [S1[] -> pet_ref1[]] -> [S2[] -> pet_ref2[]] }.
 *
 * Return isl_bool_false if the reference tag of the read access is the
 * source or the sink tag of the flow dependence, i.e., the read takes part
 * in this dependence.  Return isl_bool_true otherwise.
 * A tuple without a name can never match, so it is treated as "no match"
 * instead of being handed to strcmp.
 * Return isl_bool_error if one of the spaces could not be obtained.
 */
static isl_bool is_external_access(__isl_keep isl_map *map, void *user) 
{
  isl_map *read_access = (isl_map *)(user);
  isl_space *read_space;
  isl_space *dep_space;
  const char *read_name;
  const char *src_name;
  const char *sink_name;
  isl_bool external = isl_bool_true;

  /* { [S1[] -> pet_ref1[]] -> A[] }  ==>  { pet_ref1[] -> A[] } */
  read_space = isl_space_domain_factor_range(isl_map_get_space(read_access));
  /* { [S1[] -> pet_ref1[]] -> [S2[] -> pet_ref2[]] }
   *   ==>  { pet_ref1[] -> pet_ref2[] }
   */
  dep_space = isl_space_factor_range(isl_map_get_space(map));

  if (!read_space || !dep_space) {
    external = isl_bool_error;
  } else {
    /* The names are owned by the spaces, which stay alive until the
     * comparison below is done.
     */
    read_name = isl_space_get_tuple_name(read_space, isl_dim_in);
    src_name = isl_space_get_tuple_name(dep_space, isl_dim_in);
    sink_name = isl_space_get_tuple_name(dep_space, isl_dim_out);

    if (read_name) {
      if (src_name && !strcmp(read_name, src_name))
        external = isl_bool_false;
      else if (sink_name && !strcmp(read_name, sink_name))
        external = isl_bool_false;
    }
  }

  isl_space_free(read_space);
  isl_space_free(dep_space);

  return external;
}

/* Extract the access matrix of the tagged access relation "map",
 * which has the form { [S1[] -> pet_ref..] -> A[i,j] }.
 *
 * The result has one row per array dimension and one column per
 * iterator of the statement.  Only the iterator coefficients are
 * recorded; parameters, integer divisions and constants are truncated.
 *
 * A row is filled in from an equality in which exactly one array
 * dimension appears, with coefficient 1 or -1.  Array dimensions without
 * such an equality are left as all-zero rows.  The matrix is zeroed
 * explicitly because isl_mat_alloc does not initialize its elements.
 *
 * The access relation is required to consist of a single basic map.
 */
static __isl_give isl_mat *get_acc_mat_from_tagged_acc(__isl_keep isl_map *map) 
{
  isl_map *acc = isl_map_domain_factor_domain(isl_map_copy(map));
  int n_out = isl_map_dim(acc, isl_dim_out);
  int n_in = isl_map_dim(acc, isl_dim_in);
  isl_mat *acc_mat = isl_mat_alloc(isl_map_get_ctx(acc), n_out, n_in);

  for (int row = 0; row < n_out; row++)
    for (int col = 0; col < n_in; col++)
      acc_mat = isl_mat_set_element_si(acc_mat, row, col, 0);

  assert(isl_map_n_basic_map(acc) == 1);
  isl_basic_map_list *bmap_list = isl_map_get_basic_map_list(acc);
  isl_basic_map *bmap = isl_basic_map_list_get_basic_map(bmap_list, 0);

  /* Column layout: array dims, iterators, divs, parameters, constant. */
  isl_mat *eq_mat = isl_basic_map_equalities_matrix(bmap, isl_dim_out, isl_dim_in, isl_dim_div, isl_dim_param, isl_dim_cst);
  int n_eq = isl_mat_rows(eq_mat);

  for (int row = 0; row < n_eq; row++) {
    int index = -1;   /* array dimension defined by this equality */
    int sign = 0;     /* sign of its coefficient */
    int n_non_zero = 0;

    for (int col = 0; col < n_out; col++) {
      isl_val *coef = isl_mat_get_element_val(eq_mat, row, col);
      if (!isl_val_is_zero(coef)) {
        n_non_zero++;
        if (isl_val_is_one(coef)) {
          index = col;
          sign = 1;
        } else if (isl_val_is_negone(coef)) {
          index = col;
          sign = -1;
        }
      }
      isl_val_free(coef);
    }
    /* Skip equalities that do not define a single array dimension
     * with a unit coefficient.
     */
    if (n_non_zero != 1 || index < 0)
      continue;

    /* sign * a + sum_k c_k * i_k + ... = 0  ==>  a = -sign * sum_k c_k * i_k */
    for (int col = 0; col < n_in; col++) {
      isl_val *coef = isl_mat_get_element_val(eq_mat, row, n_out + col);
      if (sign > 0)
        coef = isl_val_neg(coef);
      acc_mat = isl_mat_set_element_val(acc_mat, index, col, coef);
    }
  }

  isl_mat_free(eq_mat);
  isl_map_free(acc);

  isl_basic_map_list_free(bmap_list);
  isl_basic_map_free(bmap);

  return acc_mat;
}

/* There could be multiple solutions (basis vectors) in the null space.
 * Each column of "mat" is one candidate dependence vector.
 * This function selects one column based on the heuristics below:
 * a dependence distance with a simpler pattern is preferred.
 *
 * We first count the non-zero components of each candidate and only
 * consider those with the least number of non-zero components.
 * Among those, the candidate with the smallest score
 *   sum(abs(ele_of_dep) * 2^(loop_depth))
 * is the default pick.
 * We favor non-zero components at the upper level, since they are more
 * likely to be carried by the space loops.
 *
 * For T2S only:
 * when t2s_tile is set and t2s_tile_phase is 1, any candidate with a
 * negative component gets the score -1 and is discarded.
 *
 * Temporary: We only allow one non-zero component in the reuse vector to
 * simplify the generation of hardware. We may relax it in the future.
 *
 * If "user_choice" is -1, every legal candidate is printed,
 * "*n_candidates" is incremented for each of them, "*n_default" is set to
 * the candidate number of the default pick and the column of the default
 * pick is returned.
 * Otherwise nothing is printed and the column of candidate number
 * "user_choice" is returned.
 * In both cases -1 is returned if there is no such column.
 */
static int rar_sol_smart_pick(
  __isl_keep isl_mat *mat, struct ppcg_scop *ps, int *n_candidates, int *n_default, int user_choice)
{
  int n_col = isl_mat_cols(mat);
  int n_row = isl_mat_rows(mat);
  int pick_idx = -1;
  int min_score = 0;
  int min_non_zero_cnt = -1;

  /* Nothing to choose from.  This also keeps the arrays below from
   * being declared with a non-positive length.
   */
  if (n_col <= 0)
    return pick_idx;

  int score[n_col];
  int non_zero_cnts[n_col];

  for (int c = 0; c < n_col; c++) {
    int non_zero_cnt = 0;
    for (int r = 0; r < n_row; r++) {
      isl_val *val = isl_mat_get_element_val(mat, r, c);
      if (isl_val_get_num_si(val) != 0)
        non_zero_cnt++;
      isl_val_free(val);
    }
    non_zero_cnts[c] = non_zero_cnt;
    if (min_non_zero_cnt == -1 || non_zero_cnt < min_non_zero_cnt)
      min_non_zero_cnt = non_zero_cnt;
  }

  /* Temporary: We only allow one non-zero component in the reuse vector to simplify
   * the generation of hardware. We may relax it in the future.
   */
  if (min_non_zero_cnt > 1)
    return pick_idx;

  for (int c = 0; c < n_col; c++) {
    score[c] = 0;
    for (int r = 0; r < n_row; r++) {
      isl_val *val = isl_mat_get_element_val(mat, r, c);
      long val_int = isl_val_get_num_si(val);
      isl_val_free(val);
      /* Integer weight 2^r; r is a loop depth and therefore small. */
      score[c] += (int)(labs(val_int) * (1L << r));
      if (ps->options->autosa->t2s_tile &&
          ps->options->autosa->t2s_tile_phase == 1) {
        if (val_int < 0) {
          score[c] = -1;
          break;
        }
      }
    }

    if (score[c] < 0 || non_zero_cnts[c] != min_non_zero_cnt)
      continue;

    if (user_choice == -1) {
      /* Enumeration mode: print the candidate and track the best one. */
      printf("[AutoSA] Candidate %d: ", *n_candidates);
      isl_printer *p_tmp = isl_printer_to_file(isl_mat_get_ctx(mat), stdout);
      isl_vec *sol_tmp = isl_vec_alloc(isl_mat_get_ctx(mat), n_row);
      for (int r = 0; r < n_row; r++)
        sol_tmp = isl_vec_set_element_val(sol_tmp, r, isl_mat_get_element_val(mat, r, c));
      p_tmp = isl_printer_print_vec(p_tmp, sol_tmp);
      isl_printer_free(p_tmp);
      isl_vec_free(sol_tmp);
      printf("\n");
      if (pick_idx == -1 || min_score > score[c]) {
        pick_idx = c;
        min_score = score[c];
        *n_default = *n_candidates;
      }
    } else if (user_choice == *n_candidates) {
      /* Selection mode: stop at the requested candidate. */
      pick_idx = c;
      break;
    }
    (*n_candidates)++;
  }

  return pick_idx;
}

/* Construct a pseudo RAR dependence for the read access "map":
 * the identity relation on the domain of the access.
 */
static __isl_give isl_map *construct_pseudo_dep_rar(__isl_keep isl_map *map)
{
	isl_set *access_domain = isl_map_domain(isl_map_copy(map));

	return isl_set_identity(access_domain);
}

/* Construct the RAR dependence based on the dependence vector in "sol" and
 * the tagged access relation "map" of the form { [S[i] -> ref[]] -> A[..] }.
 *
 * The result lives in the space { [S[i] -> ref[]] -> [S[i'] -> ref[]] },
 * relates i' = i + sol and has both its domain and its range restricted to
 * the domain of "map".
 */
static __isl_give isl_map *construct_dep_rar(__isl_keep isl_vec *sol, 
	__isl_keep isl_map *map) 
{
  int n_sol = isl_vec_size(sol);

  /* Build the space [S -> ref] -> [S -> ref] from its two factors. */
  isl_space *tagged = isl_space_domain(isl_map_get_space(map));
  isl_space *space_d = isl_space_factor_domain(isl_space_copy(tagged));
  isl_space *space_r = isl_space_factor_range(tagged);
  isl_space *space_d_copy = isl_space_copy(space_d);
  isl_space *space_r_copy = isl_space_copy(space_r);
  isl_space *space_d_d = isl_space_map_from_domain_and_range(space_d, space_d_copy);
  isl_space *space_r_r = isl_space_map_from_domain_and_range(space_r, space_r_copy);
  isl_space *space = isl_space_product(space_d_d, space_r_r);
  isl_map *dep_map = isl_map_universe(isl_space_copy(space));

  /* Add the dep vector constraints: in_i - out_i + sol_i = 0.
   * The isl_constraint setters take and give the constraint, so their
   * results have to be kept.
   */
  isl_local_space *ls = isl_local_space_from_space(space);
  for (int i = 0; i < n_sol; i++) {
    isl_constraint *cst = isl_constraint_alloc_equality(isl_local_space_copy(ls));
    cst = isl_constraint_set_coefficient_si(cst, isl_dim_in, i, 1);
    cst = isl_constraint_set_coefficient_si(cst, isl_dim_out, i, -1);
    cst = isl_constraint_set_constant_val(cst, isl_vec_get_element_val(sol, i));
    dep_map = isl_map_add_constraint(dep_map, cst);
  }
  isl_local_space_free(ls);

  /* Add the iteration domain constraints on both ends. */
  isl_set *domain = isl_map_domain(isl_map_copy(map));
  isl_set *range = isl_set_copy(domain);
  dep_map = isl_map_intersect(dep_map, isl_map_from_domain_and_range(domain, range));

  return dep_map;
}

/* Data passed to extract_size_of_type.
 *
 * "type" is the tuple name to look for.
 * "res" receives the set with that tuple name; it stays NULL if
 * no such set is found.
 */
struct autosa_extract_size_data
{
  const char *type;
  isl_set *res;
};

/* Callback for isl_union_set_foreach_set.
 * If the tuple name of "size" equals data->type, hand "size" over to
 * data->res and return isl_stat_error so that the iteration stops.
 * Otherwise free "size" and let the iteration continue.
 */
static isl_stat extract_size_of_type(__isl_take isl_set *size, void *user)
{
  struct autosa_extract_size_data *data = (struct autosa_extract_size_data *)user;
  const char *tuple_name = isl_set_get_tuple_name(size);

  if (!tuple_name || strcmp(tuple_name, data->type) != 0)
  {
    isl_set_free(size);
    return isl_stat_ok;
  }

  data->res = size;
  return isl_stat_error;
}

/* Apply "sizes" to the universe set named "kernel" and return the
 * resulting set whose tuple name is "type".
 * Return NULL if "sizes" is NULL or if there is no such set.
 */
static __isl_give isl_set *extract_sa_sizes(__isl_keep isl_union_map *sizes,
                                     const char *type)
{
  struct autosa_extract_size_data data = {type, NULL};
  isl_space *kernel_space;
  isl_union_set *kernel;
  isl_union_set *images;

  if (!sizes)
    return NULL;

  kernel_space = isl_space_set_from_params(isl_union_map_get_space(sizes));
  kernel_space = isl_space_set_tuple_name(kernel_space, isl_dim_set, "kernel");
  kernel = isl_union_set_from_set(isl_set_universe(kernel_space));

  images = isl_union_set_apply(kernel, isl_union_map_copy(sizes));
  /* The callback aborts the iteration once it has found a match,
   * so the return value is not an error indicator here.
   */
  isl_union_set_foreach_set(images, &extract_size_of_type, &data);
  isl_union_set_free(images);

  return data.res;
}

/* Parse "str" as an isl union map; a NULL string yields NULL. */
static __isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str)
{
  return str ? isl_union_map_read_from_str(ctx, str) : NULL;
}

/* Look up the RAR candidate selected by the user for the tagged access
 * "map", which has the form { [S[] -> pet_ref[]] -> A[] }.
 *
 * The option string ps->options->autosa->select_rar_dep is parsed as a
 * union map and applied to "kernel[]"; the resulting set whose tuple name
 * is the reference name of "map" holds the choice in its first dimension.
 *
 * Return the selected candidate number, or -1 if the reference has no
 * name, no entry exists for it, or the entry does not fix its first
 * dimension to an integer value.
 */
static int read_select_rar_dep_choices(struct ppcg_scop *ps, __isl_keep isl_map *map)
{
  int ret = -1;
  isl_space *space;
  const char *ref_name;
  isl_union_map *sizes;
  isl_set *size;

  /* { [S[] -> pet_ref[]] -> A[] }  ==>  { S[] -> pet_ref[] } */
  space = isl_space_unwrap(isl_space_domain(isl_map_get_space(map)));
  ref_name = isl_space_get_tuple_name(space, isl_dim_out);
  if (!ref_name) {
    isl_space_free(space);
    return ret;
  }

  sizes = extract_sizes_from_str(isl_map_get_ctx(map), ps->options->autosa->select_rar_dep);
  size = extract_sa_sizes(sizes, ref_name);
  isl_union_map_free(sizes);
  /* "ref_name" points into "space", so only release it now. */
  isl_space_free(space);

  if (size && isl_set_dim(size, isl_dim_set) > 0) {
    isl_val *v = isl_set_plain_get_val_if_fixed(size, isl_dim_set, 0);
    /* A dimension that is not fixed is reported as NaN. */
    if (isl_val_is_int(v) == isl_bool_true)
      ret = isl_val_get_num_si(v);
    isl_val_free(v);
  }
  isl_set_free(size);

  return ret;
}

/* Callback for isl_union_map_foreach_map over the tagged reads.
 * Builds the RAR dependence for the given access "map" and adds it to
 * ps->tagged_dep_rar ("user" points to the ppcg_scop).
 *
 * First we examine whether the access is an external access (not
 * associated with any flow dependence); other accesses are skipped.
 * Next, we compute the null space of the access matrix.
 * At present, we take one of the solutions in the null space as the
 * RAR dependence for the given array access: the heuristic default of
 * rar_sol_smart_pick, or the candidate requested by the user when more
 * than one candidate is legal.  A user choice that does not name an
 * existing candidate is reported and the default is used instead.
 *
 * If the null space is empty, no candidate is legal, or the constructed
 * dependence is empty, an identity map is used as a pseudo dependence.
 */
static isl_stat build_rar_dep(__isl_take isl_map *map, void *user) {
  struct ppcg_scop *ps = (struct ppcg_scop *)(user);
  isl_map *tagged_dep_rar = NULL;
  int col = -1;

  /* Examine if the read access is an external access. */
  isl_bool is_external = isl_union_map_every_map(ps->tagged_dep_flow, &is_external_access, map);
  if (!is_external) {
    isl_map_free(map);
    return isl_stat_ok;
  }

  /* Take the access function and compute the null space. */
  isl_mat *acc_mat = get_acc_mat_from_tagged_acc(map);
  isl_mat *acc_null_mat = isl_mat_right_kernel(acc_mat);

  if (isl_mat_cols(acc_null_mat) > 0) {
    /* TODO: Temporary solution. We construct the RAR dep
     * using one independent solution based on heuristics.
     */
    int n_candidates = 0;
    int default_candidate = -1;

    {
      printf("[AutoSA] Extract RAR dep for the array access: ");
      isl_space *space = isl_map_get_space(map);
      isl_map *map_tmp = isl_map_universe(space);
      isl_printer *p_tmp = isl_printer_to_file(isl_map_get_ctx(map_tmp), stdout);
      p_tmp = isl_printer_print_map(p_tmp, map_tmp);
      isl_printer_free(p_tmp);
      isl_map_free(map_tmp);
      printf("\n");
    }

    col = rar_sol_smart_pick(acc_null_mat, ps, &n_candidates, &default_candidate, -1);
    if (col >= 0) {
      /* Check if users have specified any choice. */
      int user_choice = read_select_rar_dep_choices(ps, map);
      if (n_candidates > 1) {
        printf("[AutoSA] Found more than one legal RAR deps. ");
        if (user_choice == -1) {
          printf("Candidate %d is used by default.\n", default_candidate);
        } else {
          int n_seen = 0;
          int unused_default = -1;
          int user_col = rar_sol_smart_pick(acc_null_mat, ps, &n_seen, &unused_default, user_choice);
          if (user_col >= 0) {
            printf("Candidate %d is used.\n", user_choice);
            col = user_col;
          } else {
            /* Keep the default column rather than indexing the matrix
             * with -1.
             */
            printf("Candidate %d does not exist. Candidate %d is used by default.\n",
                   user_choice, default_candidate);
          }
        }
      }

      isl_vec *sol = isl_vec_alloc(isl_map_get_ctx(map), isl_mat_rows(acc_null_mat));
      for (int row = 0; row < isl_mat_rows(acc_null_mat); row++) {
        sol = isl_vec_set_element_val(sol, row, isl_mat_get_element_val(acc_null_mat, row, col));
      }
      tagged_dep_rar = construct_dep_rar(sol, map);
      isl_vec_free(sol);

      /* Test if the dependence is empty. In such case, we will build an
       * identity map serving as a pseudo-dependence.
       */
      if (isl_map_is_empty(tagged_dep_rar)) {
        isl_map_free(tagged_dep_rar);
        tagged_dep_rar = NULL;
        col = -1;
      }
    }
  }

  /* No data reuse opportunity (or no usable candidate). */
  if (col < 0)
    tagged_dep_rar = construct_pseudo_dep_rar(map);

  ps->tagged_dep_rar = isl_union_map_union(ps->tagged_dep_rar, isl_union_map_from_map(tagged_dep_rar));

  isl_mat_free(acc_null_mat);
  isl_map_free(map);
  return isl_stat_ok;
}

/* Compute ps->dep_rar from ps->tagged_dep_rar
 * by projecting out the reference tags.
 */
static void derive_rar_dep_from_tagged_rar_dep(struct ppcg_scop *ps)
{
  ps->dep_rar = isl_union_map_copy(ps->tagged_dep_rar);
  ps->dep_rar = isl_union_map_factor_domain(ps->dep_rar);
}

/* Compute the tagged RAR dependences and accumulate the results in
 * ps->tagged_dep_rar.
 *
 * build_rar_dep is run on every tagged read access: if the read is an
 * external read access, it computes the null space of the access function
 * and constructs the RAR dependence from one solution in that null space.
 */
static void compute_tagged_rar_dep_only(struct ppcg_scop *ps)
{
  isl_union_map_foreach_map(ps->tagged_reads, &build_rar_dep, ps);
}

/* Compute the RAR dependence for each external read access.
 * The results are stored in ps->dep_rar.
 * A copy of the RAR dependences, tagged with the reference tags,
 * is stored in ps->tagged_dep_rar.
 *
 * We first compute ps->tagged_dep_rar, i.e., the tagged RAR dependences,
 * and then project out the tags.
 *
 * The accumulator starts out as an empty union map in the parameter space
 * of ps->tagged_dep_flow, so that it carries the same (named) parameters
 * as the maps that are added to it.
 */
static void compute_tagged_rar_dep(struct ppcg_scop *ps)
{
  isl_space *space = isl_union_map_get_space(ps->tagged_dep_flow);

  ps->tagged_dep_rar = isl_union_map_empty(space);
  compute_tagged_rar_dep_only(ps);
  derive_rar_dep_from_tagged_rar_dep(ps);
}

/* Compute the tagged WAW dependences and store them in ps->tagged_dep_waw.
 *
 * The tagged may-writes act as both sinks and may-sources, while the
 * tagged must-kills and tagged must-writes act as kills.  The schedule is
 * pulled back along ps->tagger so that it applies to tagged instances.
 */
static void compute_tagged_waw_dep_only(struct ppcg_scop *ps)
{
  isl_schedule *tagged_schedule;
  isl_union_map *killers;
  isl_union_access_info *info;
  isl_union_flow *flow;

  tagged_schedule = isl_schedule_pullback_union_pw_multi_aff(
      isl_schedule_copy(ps->schedule),
      isl_union_pw_multi_aff_copy(ps->tagger));
  killers = isl_union_map_union(
      isl_union_map_copy(ps->tagged_must_kills),
      isl_union_map_copy(ps->tagged_must_writes));

  info = isl_union_access_info_from_sink(
      isl_union_map_copy(ps->tagged_may_writes));
  info = isl_union_access_info_set_kill(info, killers);
  info = isl_union_access_info_set_may_source(info,
      isl_union_map_copy(ps->tagged_may_writes));
  info = isl_union_access_info_set_schedule(info, tagged_schedule);

  flow = isl_union_access_info_compute_flow(info);
  ps->tagged_dep_waw = isl_union_flow_get_may_dependence(flow);
  isl_union_flow_free(flow);
}

static void derive_waw_dep_from_tagged_waw_dep(struct ppcg_scop *ps)
{
  ps->dep_waw = isl_union_map_copy(ps->tagged_dep_waw);
  ps->dep_waw = isl_union_map_factor_domain(ps->dep_waw);
}

/* Compute the WAW dependence for each intermediate write access.
 * The results are stored in ps->dep_waw.
 * A copy of the waw dependences, tagged with the reference tags 
 * is stored in ps->tagged_dep_waw.
 *
 * We first compute ps->tagged_dep_waw, i.e., the tagged WAW dependences
 * and then project out the tags. 
 */
static void compute_tagged_waw_dep(struct ppcg_scop *ps)
{
  /* Fills ps->tagged_dep_waw. */
  compute_tagged_waw_dep_only(ps); 
  /* Fills ps->dep_waw from ps->tagged_dep_waw, so it has to come second. */
  derive_waw_dep_from_tagged_waw_dep(ps);
}

/* Compute the dependences of the program represented by "scop".
 * Store the computed potential flow dependences
 * in scop->dep_flow and the reads with potentially no corresponding writes in
 * scop->live_in.
 * Store the potential live out accesses in scop->live_out.
 * Store the potential false (anti and output) dependences in scop->dep_false.
 *
 * If live range reordering is allowed, then we compute a separate
 * set of order dependences and a set of external false dependences
 * in compute_live_range_reordering_dependences.
 * 
 * Extended by AutoSA: Add analysis for WAW and RAR dependences.
 */
static void compute_dependences(struct ppcg_scop *scop)
{
	isl_union_map *sources;
	isl_union_map *killers;
	isl_union_map *false_dep;
	isl_union_access_info *info;
	isl_union_flow *flow;

	if (!scop)
		return;

	compute_live_out(scop);

	/* Flow dependences: pick the variant that matches the options. */
	if (scop->options->live_range_reordering) {
		compute_live_range_reordering_dependences(scop);
	} else if (scop->options->target == PPCG_TARGET_C) {
		compute_flow_dep(scop);
	} else {
		compute_tagged_flow_dep(scop);
	}

	/* False dependences: any earlier read or may-write of an element
	 * that is written later, with must-writes and must-kills acting
	 * as kills.
	 */
	sources = isl_union_map_union(isl_union_map_copy(scop->may_writes),
					isl_union_map_copy(scop->reads));
	killers = isl_union_map_union(isl_union_map_copy(scop->must_writes),
					isl_union_map_copy(scop->must_kills));
	info = isl_union_access_info_from_sink(
				isl_union_map_copy(scop->may_writes));
	info = isl_union_access_info_set_kill(info, killers);
	info = isl_union_access_info_set_may_source(info, sources);
	info = isl_union_access_info_set_schedule(info,
				isl_schedule_copy(scop->schedule));
	flow = isl_union_access_info_compute_flow(info);

	false_dep = isl_union_flow_get_may_dependence(flow);
	scop->dep_false = isl_union_map_coalesce(false_dep);
	isl_union_flow_free(flow);

	/* AutoSA Extended: RAR and WAW dependences. */
	if (scop->options->autosa->autosa) {
		compute_tagged_rar_dep(scop);
		compute_tagged_waw_dep(scop);
	}
}

/* Eliminate dead code from ps->domain.
 *
 * In particular, intersect both ps->domain and the domain of
 * ps->schedule with the (parts of) iteration
 * domains that are needed to produce the output or for statement
 * iterations that call functions.
 * Also intersect the range of the dataflow dependences with
 * this domain such that the removed instances will no longer
 * be considered as targets of dataflow.
 *
 * We start with the iteration domains that call functions
 * and the set of iterations that last write to an array
 * (except those that are later killed).
 *
 * Then we add those statement iterations that produce
 * something needed by the "live" statements iterations.
 * We keep doing this until no more statement iterations can be added.
 * To ensure that the procedure terminates, we compute the affine
 * hull of the live iterations (bounded to the original iteration
 * domains) each time we have added extra iterations.
 */
static void eliminate_dead_code(struct ppcg_scop *ps)
{
	isl_union_set *live;
	isl_union_set *tagged_live;
	isl_union_map *producers_of;
	int growing = 1;

	/* Seed: iterations performing live-out accesses, plus the
	 * iterations that call functions (if any).
	 */
	live = isl_union_map_domain(isl_union_map_copy(ps->live_out));
	if (!isl_union_set_is_empty(ps->call)) {
		live = isl_union_set_union(live, isl_union_set_copy(ps->call));
		live = isl_union_set_coalesce(live);
	}

	/* Reversed flow dependences map a consumer to its producers. */
	producers_of = isl_union_map_reverse(isl_union_map_copy(ps->dep_flow));

	while (growing) {
		isl_union_set *needed;

		needed = isl_union_set_apply(isl_union_set_copy(live),
					    isl_union_map_copy(producers_of));
		if (isl_union_set_is_subset(needed, live)) {
			isl_union_set_free(needed);
			growing = 0;
		} else {
			/* The affine hull, cut back to the iteration
			 * domains, is what guarantees termination.
			 */
			live = isl_union_set_union(live, needed);
			live = isl_union_set_affine_hull(live);
			live = isl_union_set_intersect(live,
					    isl_union_set_copy(ps->domain));
		}
	}

	isl_union_map_free(producers_of);

	ps->domain = isl_union_set_intersect(ps->domain,
						isl_union_set_copy(live));
	ps->schedule = isl_schedule_intersect_domain(ps->schedule,
						isl_union_set_copy(live));
	ps->dep_flow = isl_union_map_intersect_range(ps->dep_flow,
						isl_union_set_copy(live));

	/* The tagged dependences need the live set in tagged form. */
	tagged_live = isl_union_set_preimage_union_pw_multi_aff(live,
				isl_union_pw_multi_aff_copy(ps->tagger));
	ps->tagged_dep_flow = isl_union_map_intersect_range(ps->tagged_dep_flow,
						tagged_live);
}

/* Intersect "set" with the set described by "str", taking the NULL
 * string to represent the universal set.
 */
static __isl_give isl_set *set_intersect_str(__isl_take isl_set *set,
	const char *str)
{
	/* A NULL string stands for the universe: nothing to intersect. */
	if (str) {
		isl_set *parsed;

		parsed = isl_set_read_from_str(isl_set_get_ctx(set), str);
		set = isl_set_intersect(set, parsed);
	}

	return set;
}

/* Release "ps" together with every isl object it owns.
 * A NULL "ps" is accepted.  Always returns NULL.
 */
static void *ppcg_scop_free(struct ppcg_scop *ps)
{
	if (ps) {
		/* Context, domains and schedule. */
		isl_set_free(ps->context);
		isl_union_set_free(ps->domain);
		isl_union_set_free(ps->call);
		isl_schedule_free(ps->schedule);
		isl_union_pw_multi_aff_free(ps->tagger);
		isl_id_to_ast_expr_free(ps->names);

		/* Access relations. */
		isl_union_map_free(ps->tagged_reads);
		isl_union_map_free(ps->reads);
		isl_union_map_free(ps->live_in);
		isl_union_map_free(ps->tagged_may_writes);
		isl_union_map_free(ps->tagged_must_writes);
		isl_union_map_free(ps->may_writes);
		isl_union_map_free(ps->must_writes);
		isl_union_map_free(ps->live_out);
		isl_union_map_free(ps->tagged_must_kills);
		isl_union_map_free(ps->must_kills);

		/* Dependence relations. */
		isl_union_map_free(ps->tagged_dep_flow);
		isl_union_map_free(ps->dep_flow);
		isl_union_map_free(ps->dep_false);
		isl_union_map_free(ps->dep_forced);
		isl_union_map_free(ps->tagged_dep_order);
		isl_union_map_free(ps->dep_order);
		isl_union_map_free(ps->independence);

		/* AutoSA Extended */
		isl_union_map_free(ps->tagged_dep_rar);
		isl_union_map_free(ps->dep_rar);
		isl_union_map_free(ps->tagged_dep_waw);
		isl_union_map_free(ps->dep_waw);

		free(ps);
	}

	return NULL;
}

/* Extract a ppcg_scop from a pet_scop.
 *
 * The constructed ppcg_scop refers to elements from the pet_scop
 * so the pet_scop should not be freed before the ppcg_scop.
 */
static struct ppcg_scop *ppcg_scop_from_pet_scop(struct pet_scop *scop,
	struct ppcg_options *options)
{
	int k;
	int complete;
	isl_ctx *ctx;
	struct ppcg_scop *ps;

	if (!scop)
		return NULL;

	ctx = isl_set_get_ctx(scop->context);
	ps = isl_calloc_type(ctx, struct ppcg_scop);
	if (!ps)
		return NULL;

	ps->names = collect_names(scop);
	ps->options = options;
	ps->start = pet_loc_get_start(scop->loc);
	ps->end = pet_loc_get_end(scop->loc);

	/* Context: the one from pet, refined by the user-supplied string
	 * and, on request, by non-negativity of the parameters.
	 */
	ps->context = set_intersect_str(isl_set_copy(scop->context),
					options->ctx);
	if (options->non_negative_parameters) {
		isl_set *non_negative;

		non_negative = isl_set_nat_universe(
					isl_set_get_space(ps->context));
		ps->context = isl_set_intersect(ps->context, non_negative);
	}

	ps->domain = collect_non_kill_domains(scop);
	ps->call = collect_call_domains(scop);

	/* Access relations, each in tagged and untagged form. */
	ps->tagged_reads = pet_scop_get_tagged_may_reads(scop);
	ps->reads = pet_scop_get_may_reads(scop);
	ps->tagged_may_writes = pet_scop_get_tagged_may_writes(scop);
	ps->may_writes = pet_scop_get_may_writes(scop);
	ps->tagged_must_writes = pet_scop_get_tagged_must_writes(scop);
	ps->must_writes = pet_scop_get_must_writes(scop);
	ps->tagged_must_kills = pet_scop_get_tagged_must_kills(scop);
	ps->must_kills = pet_scop_get_must_kills(scop);

	ps->schedule = isl_schedule_copy(scop->schedule);
	ps->pet = scop;

	/* Collect the filters of all independences into one relation. */
	ps->independence = isl_union_map_empty(isl_set_get_space(ps->context));
	for (k = 0; k < scop->n_independence; ++k) {
		isl_union_map *filter;

		filter = isl_union_map_copy(scop->independences[k]->filter);
		ps->independence = isl_union_map_union(ps->independence,
							filter);
	}

	compute_tagger(ps);
	compute_dependences(ps);
	eliminate_dead_code(ps);

	complete = ps->context && ps->domain && ps->call && ps->reads &&
		   ps->may_writes && ps->must_writes &&
		   ps->tagged_must_kills && ps->must_kills && ps->schedule &&
		   ps->independence && ps->names;
	if (!complete)
		return ppcg_scop_free(ps);

	return ps;
}

/* Internal data structure for ppcg_transform.
 */
/* "options" are the ppcg options handed to ppcg_scop_from_pet_scop and
 * print_original.
 * "transform" is the callback invoked on each extracted ppcg_scop;
 * it receives the printer, the scop and "user".
 * "user" is passed through to "transform" unchanged.
 */
struct ppcg_transform_data {
	struct ppcg_options *options;
	__isl_give isl_printer *(*transform)(__isl_take isl_printer *p,
		struct ppcg_scop *scop, void *user);
	void *user;
};

/* Should we print the original code?
 * That is, does "scop" involve any data dependent conditions or
 * nested expressions that cannot be handled by pet_stmt_build_ast_exprs?
 */
static int print_original(struct pet_scop *scop, struct ppcg_options *options)
{
	const char *reason = NULL;

	/* The second check is only made when the first one passes. */
	if (!pet_scop_can_build_ast_exprs(scop))
		reason = "Printing original code because "
			 "some index expressions cannot currently "
			 "be printed\n";
	else if (pet_scop_has_data_dependent_conditions(scop))
		reason = "Printing original code because "
			 "input involves data dependent conditions\n";

	if (!reason)
		return 0;

	if (options->debug->verbose)
		fputs(reason, stdout);

	return 1;
}

/* Callback for pet_transform_C_source.
 * "user" points to a ppcg_transform_data structure.
 *
 * If print_original decides that "scop" cannot be transformed
 * (data dependent conditions or expressions that may not be printable),
 * then simply print the original code.
 * Otherwise, align the parameters of "scop", derive a ppcg_scop from it,
 * hand that ppcg_scop to the ppcg_transform callback and
 * free the ppcg_scop again.
 * In both cases, "scop" is freed before returning.
 */
static __isl_give isl_printer *transform(__isl_take isl_printer *p,
	struct pet_scop *scop, void *user)
{
	struct ppcg_transform_data *data = user;

	if (!print_original(scop, data->options)) {
		struct ppcg_scop *ps;

		scop = pet_scop_align_params(scop);
		ps = ppcg_scop_from_pet_scop(scop, data->options);
		p = data->transform(p, ps, data->user);
		ppcg_scop_free(ps);
	} else {
		p = pet_scop_print_original(scop, p);
	}

	pet_scop_free(scop);

	return p;
}

/* Rewrite each scop in the C source file "input" by calling "fn" on it
 * and write the resulting C code to "out".
 *
 * "options" and "user" are bundled together with "fn" and
 * handed to pet_transform_C_source, which calls the local
 * "transform" callback.  That callback converts each pet_scop
 * into a ppcg_scop before calling "fn" with "user" as final argument.
 *
 * The return value is that of pet_transform_C_source.
 */
int ppcg_transform(isl_ctx *ctx, const char *input, FILE *out,
	struct ppcg_options *options,
	__isl_give isl_printer *(*fn)(__isl_take isl_printer *p,
		struct ppcg_scop *scop, void *user), void *user)
{
	struct ppcg_transform_data data;

	data.options = options;
	data.transform = fn;
	data.user = user;

	return pet_transform_C_source(ctx, input, out, &transform, &data);
}

/* Check that the options stored in "ctx" are consistent.
 *
 * The options need to be available in "ctx" and, if OpenMP
 * was requested, then the AST generator needs to be set up
 * to generate atomic upper bounds.
 *
 * Return 0 if the options are consistent and -1 on error.
 */
static int check_options(isl_ctx *ctx)
{
	struct options *options = isl_ctx_peek_options(ctx, &options_args);
	int atomic;

	if (!options)
		isl_die(ctx, isl_error_internal,
			"unable to find options", return -1);

	if (!options->ppcg->openmp)
		return 0;

	atomic = isl_options_get_ast_build_atomic_upper_bound(ctx);
	if (!atomic)
		isl_die(ctx, isl_error_invalid,
			"OpenMP requires atomic bounds", return -1);

	return 0;
}

//int main(int argc, char **argv)
//{
//	int r;
//	isl_ctx *ctx;
//	struct options *options;
//
//	options = options_new_with_defaults();
//	assert(options);
//
//	ctx = isl_ctx_alloc_with_options(&options_args, options);
//	ppcg_options_set_target_defaults(options->ppcg);
//	isl_options_set_ast_build_detect_min_max(ctx, 1);
//	isl_options_set_ast_print_macro_once(ctx, 1);
//	isl_options_set_schedule_whole_component(ctx, 0);
//	isl_options_set_schedule_maximize_band_depth(ctx, 1);
//	isl_options_set_schedule_maximize_coincidence(ctx, 1);
//	pet_options_set_encapsulate_dynamic_control(ctx, 1);
//	argc = options_parse(options, argc, argv, ISL_ARG_ALL);
//
//	if (check_options(ctx) < 0)
//		r = EXIT_FAILURE;
//	else if (options->ppcg->target == PPCG_TARGET_CUDA)
//		r = generate_cuda(ctx, options->ppcg, options->input);
//	else if (options->ppcg->target == PPCG_TARGET_OPENCL)
//		r = generate_opencl(ctx, options->ppcg, options->input,
//				options->output);
//	else
//		r = generate_cpu(ctx, options->ppcg, options->input,
//				options->output);
//
//	isl_ctx_free(ctx);
//
//	return r;
//}

/* Entry point of the AutoSA compilation flow.
 *
 * Create the options with their default values, set up an isl_ctx
 * holding these options, adjust a couple of isl and pet defaults,
 * parse the command line arguments "argv" (of length "argc") and
 * dispatch to the code generator of the selected target.
 *
 * Only the Xilinx HLS C, Intel OpenCL, Catapult HLS C and TAPA C++
 * back ends are currently enabled.  The CUDA, OpenCL and CPU back ends
 * of PPCG as well as the T2S and plain C back ends of AutoSA
 * are disabled (TODO: to fix).
 *
 * Return the value returned by the selected code generator, or
 * EXIT_FAILURE if the options are inconsistent or if the selected
 * target is not supported.  In the latter case, "r" used to be
 * returned without ever having been assigned a value.
 */
int autosa_main_wrap(int argc, char **argv)
{
	int r = EXIT_FAILURE;
	isl_ctx *ctx;
	struct options *options;

	options = options_new_with_defaults();
	assert(options);

	ctx = isl_ctx_alloc_with_options(&options_args, options);
	ppcg_options_set_target_defaults(options->ppcg);
	isl_options_set_ast_build_detect_min_max(ctx, 1);
	isl_options_set_ast_print_macro_once(ctx, 1);
	isl_options_set_schedule_whole_component(ctx, 0);
	isl_options_set_schedule_maximize_band_depth(ctx, 1);
	isl_options_set_schedule_maximize_coincidence(ctx, 1);
	pet_options_set_encapsulate_dynamic_control(ctx, 1);
	argc = options_parse(options, argc, argv, ISL_ARG_ALL);

	if (check_options(ctx) < 0) {
		r = EXIT_FAILURE;
	} else if (options->ppcg->target == AUTOSA_TARGET_XILINX_HLS_C) {
		r = generate_autosa_xilinx_hls_c(ctx, options->ppcg,
						options->input);
	} else if (options->ppcg->target == AUTOSA_TARGET_INTEL_OPENCL) {
		r = generate_autosa_intel_opencl(ctx, options->ppcg,
						options->input);
	} else if (options->ppcg->target == AUTOSA_TARGET_CATAPULT_HLS_C) {
		r = generate_autosa_catapult_hls_c(ctx, options->ppcg,
						options->input);
	} else if (options->ppcg->target == AUTOSA_TARGET_TAPA_CPP) {
		r = generate_autosa_tapa_cpp(ctx, options->ppcg,
						options->input);
	} else {
		/* None of the enabled back ends matches the target. */
		fprintf(stderr, "[AutoSA] Error: unsupported target\n");
		r = EXIT_FAILURE;
	}

	isl_ctx_free(ctx);

	return r;
}


================================================
FILE: src/ppcg.h
================================================
#ifndef PPCG_H
#define PPCG_H

#include <isl/schedule.h>
#include <isl/set.h>
#include <isl/id.h>
#include <isl/union_set.h>
#include <isl/union_map.h>
#include <isl/id_to_ast_expr.h>
#include <pet.h>

#include "ppcg_options.h"

/* Debugging switch.
 * NOTE(review): this macro is not tested anywhere in this header;
 * presumably other files check it -- confirm.
 * Identifiers starting with an underscore followed by an uppercase
 * letter are reserved for the implementation in C and C++.
 */
#define _DEBUG

/* Write the current file name and line number followed by the name and
 * the value of "var" to the stream "os".
 * The expansion relies on operator<< and std::endl and therefore
 * can only be used from C++ code.
 * Note that the expansion already ends in a semicolon.
 */
#define DBGVAR(os, var)                                  \
  (os) << "DBG: " << __FILE__ << "(" << __LINE__ << ") " \
       << #var << " = [" << (var) << "]" << std::endl;

/* Print the schedule node "node" in block YAML style to the file "os",
 * using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 */
#define DBGSCHDNODE(os, node, ctx)                                    {\
  printf("%s(%d) Print schedule_node.\n", __FILE__, __LINE__);         \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_set_yaml_style(p_debug, ISL_YAML_STYLE_BLOCK); \
  p_debug = isl_printer_print_schedule_node(p_debug, node);            \
  p_debug = isl_printer_free(p_debug);                                 \
}

/* Print the schedule passed as "node" in block YAML style to the file "os",
 * using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 */
#define DBGSCHD(os, node, ctx)                                        {\
  printf("%s(%d) Print schedule.\n", __FILE__, __LINE__);              \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_set_yaml_style(p_debug, ISL_YAML_STYLE_BLOCK); \
  p_debug = isl_printer_print_schedule(p_debug, node);                 \
  p_debug = isl_printer_free(p_debug);                                 \
} 

/* Print the set "set" followed by a newline to the file "os",
 * using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 */
#define DBGSET(os, set, ctx)                                          {\
  printf("%s(%d) Print set.\n", __FILE__, __LINE__);                   \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_print_set(p_debug, set);                       \
  p_debug = isl_printer_print_str(p_debug, "\n");                      \
  p_debug = isl_printer_free(p_debug);                                 \
}

/* Print the space "space" followed by a newline to the file "os",
 * using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 */
#define DBGSPACE(os, space, ctx)                                      {\
  printf("%s(%d) Print space.\n", __FILE__, __LINE__);                 \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_print_space(p_debug, space);                   \
  p_debug = isl_printer_print_str(p_debug, "\n");                      \
  p_debug = isl_printer_free(p_debug);                                 \
}

/* Print the union set "uset" followed by a newline to the file "os",
 * using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 */
#define DBGUSET(os, uset, ctx)                                        {\
  printf("%s(%d) Print union_set.\n", __FILE__, __LINE__);             \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_print_union_set(p_debug, uset);                \
  p_debug = isl_printer_print_str(p_debug, "\n");                      \
  p_debug = isl_printer_free(p_debug);                                 \
}

/* Print the union map "umap" followed by a newline to the file "os",
 * using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 */
#define DBGUMAP(os, umap, ctx)                                        {\
  printf("%s(%d) Print union_map.\n", __FILE__, __LINE__);             \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_print_union_map(p_debug, umap);                \
  p_debug = isl_printer_print_str(p_debug, "\n");                      \
  p_debug = isl_printer_free(p_debug);                                 \
}

/* Print the map "map" followed by a newline to the file "os",
 * using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 */
#define DBGMAP(os, map, ctx)                                          {\
  printf("%s(%d) Print map.\n", __FILE__, __LINE__);                   \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_print_map(p_debug, map);                       \
  p_debug = isl_printer_print_str(p_debug, "\n");                      \
  p_debug = isl_printer_free(p_debug);                                 \
}

/* Print the basic map "bmap" followed by a newline to the file "os",
 * using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 */
#define DBGBMAP(os, bmap, ctx)                                        {\
  printf("%s(%d) Print basic_map.\n", __FILE__, __LINE__);             \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_print_basic_map(p_debug, bmap);                \
  p_debug = isl_printer_print_str(p_debug, "\n");                      \
  p_debug = isl_printer_free(p_debug);                                 \
}

/* Print the multi_aff "ma" followed by a newline to the file "os",
 * using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 */
#define DBGMA(os, ma, ctx)                                            {\
  printf("%s(%d) Print multi_aff.\n", __FILE__, __LINE__);             \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_print_multi_aff(p_debug, ma);                  \
  p_debug = isl_printer_print_str(p_debug, "\n");                      \
  p_debug = isl_printer_free(p_debug);                                 \
}

/* Print the vector "vec" followed by a newline to the file "os",
 * using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 */
#define DBGVEC(os, vec, ctx)                                          {\
  printf("%s(%d) Print vec.\n", __FILE__, __LINE__);                   \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_print_vec(p_debug, vec);                       \
  p_debug = isl_printer_print_str(p_debug, "\n");                      \
  p_debug = isl_printer_free(p_debug);                                 \
}

/* Print the AST expression "astexpr" in C format, followed by a newline,
 * to the file "os", using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 */
#define DBGASTEXPR(os, astexpr, ctx)                                  {\
  printf("%s(%d) Print AST expr.\n", __FILE__, __LINE__);              \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_set_output_format(p_debug, ISL_FORMAT_C);      \
  p_debug = isl_printer_print_ast_expr(p_debug, astexpr);              \
  p_debug = isl_printer_print_str(p_debug, "\n");                      \
  p_debug = isl_printer_free(p_debug);                                 \
}

/* Print the AST node "astnode" in C format, followed by a newline,
 * to the file "os", using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 */
#define DBGASTNODE(os, astnode, ctx)                                  {\
  printf("%s(%d) Print AST node.\n", __FILE__, __LINE__);              \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_set_output_format(p_debug, ISL_FORMAT_C);      \
  p_debug = isl_printer_print_ast_node(p_debug, astnode);              \
  p_debug = isl_printer_print_str(p_debug, "\n");                      \
  p_debug = isl_printer_free(p_debug);                                 \
}

/* Print the multi_union_pw_aff "mupa" followed by a newline
 * to the file "os", using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 */
#define DBGMUPA(os, mupa, ctx)                                        {\
  printf("%s(%d) Print multi_union_pw_aff.\n", __FILE__, __LINE__);    \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_print_multi_union_pw_aff(p_debug, mupa);       \
  p_debug = isl_printer_print_str(p_debug, "\n");                      \
  p_debug = isl_printer_free(p_debug);                                 \
}

/* Print the union_pw_aff "upa" followed by a newline to the file "os",
 * using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 */
#define DBGUPA(os, upa, ctx)                                          {\
  printf("%s(%d) Print union_pw_aff.\n", __FILE__, __LINE__);          \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_print_union_pw_aff(p_debug, upa);              \
  p_debug = isl_printer_print_str(p_debug, "\n");                      \
  p_debug = isl_printer_free(p_debug);                                 \
}

/* Print the value "val" followed by a newline to the file "os",
 * using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 */
#define DBGVAL(os, val, ctx)                                          {\
  printf("%s(%d) Print val.\n", __FILE__, __LINE__);                   \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_print_val(p_debug, val);                       \
  p_debug = isl_printer_print_str(p_debug, "\n");                      \
  p_debug = isl_printer_free(p_debug);                                 \
}

/* Print the identifier "id" followed by a newline to the file "os",
 * using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 */
#define DBGID(os, id, ctx)                                            {\
  printf("%s(%d) Print id.\n", __FILE__, __LINE__);                    \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_print_id(p_debug, id);                         \
  p_debug = isl_printer_print_str(p_debug, "\n");                      \
  p_debug = isl_printer_free(p_debug);                                 \
}

/* Print the piecewise quasi-polynomial "pwqpoly" followed by a newline
 * to the file "os", using a temporary printer created in "ctx".
 * A header with the current file name and line number is written
 * to stdout first (through printf, independently of "os").
 * The header used to read "Print id.", which was copied from DBGID.
 */
#define DBGPWQPOLY(os, pwqpoly, ctx)                                  {\
  printf("%s(%d) Print pw_qpolynomial.\n", __FILE__, __LINE__);        \
  isl_printer *p_debug = isl_printer_to_file(ctx, os);                 \
  p_debug = isl_printer_print_pw_qpolynomial(p_debug, pwqpoly);        \
  p_debug = isl_printer_print_str(p_debug, "\n");                      \
  p_debug = isl_printer_free(p_debug);                                 \
}

#ifdef __cplusplus
extern "C"
{
#endif

	/* NOTE(review): presumably returns the part of "filename" that
	 * follows the last directory separator -- confirm against
	 * the definition.
	 */
	const char *ppcg_base_name(const char *filename);
	/* NOTE(review): presumably stores the base name of "input"
	 * (without directory and extension) in the caller-provided
	 * buffer "name" -- confirm the buffer size requirements and
	 * the meaning of the return value against the definition.
	 */
	int ppcg_extract_base_name(char *name, const char *input);

	/* Representation of the scop for use inside PPCG.
 *
 * "options" are the options specified by the user.
 * Some fields in this structure may depend on some of the options.
 *
 * "start" and "end" are file offsets of the corresponding program text.
 * "context" represents constraints on the parameters.
 * "domain" is the union of all iteration domains.
 * "call" contains the iteration domains of statements with a call expression.
 * "reads" contains all potential read accesses.
 * "tagged_reads" is the same as "reads", except that the domain is a wrapped
 *	relation mapping an iteration domain to a reference identifier
 * "live_in" contains the potential read accesses that potentially
 *	have no corresponding writes in the scop.
 * "may_writes" contains all potential write accesses.
 * "tagged_may_writes" is the same as "may_writes", except that the domain
 *	is a wrapped relation mapping an iteration domain
 *	to a reference identifier
 * "must_writes" contains all definite write accesses.
 * "tagged_must_writes" is the same as "must_writes", except that the domain
 *	is a wrapped relation mapping an iteration domain
 *	to a reference identifier
 * "live_out" contains the potential write accesses that are potentially
 *	not killed by any kills or any other writes.
 * "must_kills" contains all definite kill accesses.
 * "tagged_must_kills" is the same as "must_kills", except that the domain
 *	is a wrapped relation mapping an iteration domain
 *	to a reference identifier.
 *
 * "tagger" maps tagged iteration domains to the corresponding untagged
 *	iteration domain.
 *
 * "independence" is the union of all independence filters.
 *
 * "dep_flow" represents the potential flow dependences.
 * "tagged_dep_flow" is the same as "dep_flow", except that both domain and
 *	range are wrapped relations mapping an iteration domain to
 *	a reference identifier.  May be NULL if not computed.
 * "dep_false" represents the potential false (anti and output) dependences.
 * "dep_forced" represents the validity constraints that should be enforced
 *	even when live-range reordering is used.
 *	In particular, these constraints ensure that all live-in
 *	accesses remain live-in and that all live-out accesses remain live-out
 *	and that multiple potential sources for the same read are
 *	executed in the original order.
 * "dep_order"/"tagged_dep_order" represents the order dependences between
 *	the live range intervals in "dep_flow"/"tagged_dep_flow".
 *	It is only used if the live_range_reordering
 *	option is set.  Otherwise it is NULL.
 *	If "dep_order" is used, then "dep_false" only contains a limited
 *	set of anti and output dependences.
 * "schedule" represents the (original) schedule.
 *
 * "names" contains all variable names that are in use by the scop.
 * The names are mapped to a dummy value.
 *
 * "pet" is the original pet_scop.
 */
	struct ppcg_scop
	{
		struct ppcg_options *options;

		unsigned start;
		unsigned end;

		isl_set *context;
		isl_union_set *domain;
		isl_union_set *call;
		isl_union_map *tagged_reads;
		isl_union_map *reads;
		isl_union_map *live_in;
		isl_union_map *tagged_may_writes;
		isl_union_map *may_writes;
		isl_union_map *tagged_must_writes;
		isl_union_map *must_writes;
		isl_union_map *live_out;
		isl_union_map *tagged_must_kills;
		isl_union_map *must_kills;

		isl_union_pw_multi_aff *tagger;

		isl_union_map *independence;

		isl_union_map *dep_flow;
		isl_union_map *tagged_dep_flow;
		isl_union_map *dep_false;
		isl_union_map *dep_forced;
		isl_union_map *dep_order;
		isl_union_map *tagged_dep_order;
		isl_schedule *schedule;

		isl_id_to_ast_expr *names;

		struct pet_scop *pet;

		/* AutoSA Extended */
		/* The fields below are AutoSA additions that are not described
		 * in the comment preceding this structure.
		 * NOTE(review): judging by their names, "dep_rar" and "dep_waw"
		 * presumably hold the read-after-read and write-after-write
		 * dependences, with the "tagged_" variants relating
		 * tagged iteration domains (as for "tagged_dep_flow") --
		 * confirm against the code that computes them.
		 */
		isl_union_map *dep_rar;
		isl_union_map *tagged_dep_rar;
		isl_union_map *dep_waw;
		isl_union_map *tagged_dep_waw;
		/* AutoSA Extended */
	};

	/* NOTE(review): presumably reports whether "scop" contains any
	 * declarations that are hidden from the surrounding code --
	 * confirm against the definition.
	 */
	int ppcg_scop_any_hidden_declarations(struct ppcg_scop *scop);
	/* NOTE(review): presumably returns a list of "n" identifiers whose
	 * names start with "prefix" and do not clash with the names
	 * in use by "scop" (see the "names" field) -- confirm against
	 * the definition.
	 */
	__isl_give isl_id_list *ppcg_scop_generate_names(struct ppcg_scop *scop,
																									 int n, const char *prefix);

	/* Transform the C source file "input" by rewriting each scop
	 * through a call to "fn" and write the transformed C code to "out".
	 * "fn" is called with a ppcg_scop derived from the pet_scop and
	 * with "user" as its final argument.
	 */
	int ppcg_transform(isl_ctx *ctx, const char *input, FILE *out,
										 struct ppcg_options *options,
										 __isl_give isl_printer *(*fn)(__isl_take isl_printer *p,
																									 struct ppcg_scop *scop, void *user),
										 void *user);

	/* Entry point of the AutoSA compiler: parse the command line
	 * arguments "argv" (of length "argc") and run the code generator
	 * of the selected target.  Returns the code generator's result,
	 * or EXIT_FAILURE if the options are inconsistent.
	 */
	int autosa_main_wrap(int argc, char **argv);

#ifdef __cplusplus
}
#endif

#endif


================================================
FILE: src/ppcg_files/cuda.c
================================================
/*
 * Copyright 2012      Ecole Normale Superieure
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege,
 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
 */

#include <isl/aff.h>
#include <isl/ast.h>

#include "cuda_common.h"
#include "cuda.h"
#include "gpu.h"
#include "gpu_print.h"
#include "print.h"
#include "util.h"

/* Print the definitions of the cudaCheckReturn and cudaCheckKernel
 * error checking macros, followed by an empty line, to "p".
 */
static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
{
	static const char cuda_check_macros[] =
		"#define cudaCheckReturn(ret) \\\n"
		"  do { \\\n"
		"    cudaError_t cudaCheckReturn_e = (ret); \\\n"
		"    if (cudaCheckReturn_e != cudaSuccess) { \\\n"
		"      fprintf(stderr, \"CUDA error: %s\\n\", "
		"cudaGetErrorString(cudaCheckReturn_e)); \\\n"
		"      fflush(stderr); \\\n"
		"    } \\\n"
		"    assert(cudaCheckReturn_e == cudaSuccess); \\\n"
		"  } while(0)\n"
		"#define cudaCheckKernel() \\\n"
		"  do { \\\n"
		"    cudaCheckReturn(cudaGetLastError()); \\\n"
		"  } while(0)\n\n";

	return isl_printer_print_str(p, cuda_check_macros);
}

/* Print a declaration of the device copy "dev_<name>" of "array" on "p".
 *
 * A linearized array or an array with at most one index is declared
 * as a plain pointer.  Otherwise, it is declared as a pointer
 * to an array, with the sizes of all but the outermost dimension
 * taken from the bound expression of "array".
 */
static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
	struct gpu_array_info *array)
{
	int pos;
	int multi_dim = !array->linearize && array->n_index > 1;

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, array->type);
	p = isl_printer_print_str(p, " ");
	if (multi_dim)
		p = isl_printer_print_str(p, "(");
	p = isl_printer_print_str(p, "*dev_");
	p = isl_printer_print_str(p, array->name);
	if (multi_dim)
		p = isl_printer_print_str(p, ")");
	for (pos = 1; multi_dim && pos < array->n_index; pos++) {
		isl_ast_expr *size;

		size = isl_ast_expr_get_op_arg(array->bound_expr, 1 + pos);
		p = isl_printer_print_str(p, "[");
		p = isl_printer_print_ast_expr(p, size);
		p = isl_printer_print_str(p, "]");
		isl_ast_expr_free(size);
	}
	p = isl_printer_print_str(p, ";");

	return isl_printer_end_line(p);
}

/* Print declarations for the device copies of all arrays of "prog"
 * that require device allocation, followed by an empty line.
 */
static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
	struct gpu_prog *prog)
{
	int pos;

	for (pos = 0; pos < prog->n_array; ++pos) {
		struct gpu_array_info *array = &prog->array[pos];

		if (gpu_array_requires_device_allocation(array))
			p = declare_device_array(p, array);
	}

	p = isl_printer_start_line(p);
	return isl_printer_end_line(p);
}

/* Print cudaMalloc calls (wrapped in cudaCheckReturn) for the device
 * copies of all arrays of "prog" that require device allocation,
 * followed by an empty line.
 * Any macros needed by the bound expression of an array are printed
 * before the corresponding allocation.
 */
static __isl_give isl_printer *allocate_device_arrays(
	__isl_take isl_printer *p, struct gpu_prog *prog)
{
	int pos;

	for (pos = 0; pos < prog->n_array; ++pos) {
		struct gpu_array_info *info = &prog->array[pos];

		if (!gpu_array_requires_device_allocation(info))
			continue;

		p = ppcg_ast_expr_print_macros(info->bound_expr, p);
		p = isl_printer_start_line(p);
		p = isl_printer_print_str(p,
			"cudaCheckReturn(cudaMalloc((void **) &dev_");
		p = isl_printer_print_str(p, info->name);
		p = isl_printer_print_str(p, ", ");
		p = gpu_array_info_print_size(p, info);
		p = isl_printer_print_str(p, "));");
		p = isl_printer_end_line(p);
	}

	p = isl_printer_start_line(p);
	return isl_printer_end_line(p);
}

/* Print cudaFree calls (wrapped in cudaCheckReturn) for the device
 * copies of all arrays of "prog" that require device allocation.
 */
static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
	struct gpu_prog *prog)
{
	int pos;

	for (pos = 0; pos < prog->n_array; ++pos) {
		struct gpu_array_info *info = &prog->array[pos];

		if (!gpu_array_requires_device_allocation(info))
			continue;

		p = isl_printer_start_line(p);
		p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
		p = isl_printer_print_str(p, info->name);
		p = isl_printer_print_str(p, "));");
		p = isl_printer_end_line(p);
	}

	return p;
}

/* Print a cudaMemcpy call (wrapped in cudaCheckReturn) to "p"
 * that copies all of "array" from the host to its device copy.
 * For a scalar, the address of the host variable is used as source.
 * The size of the copy is printed by gpu_array_info_print_size,
 * which relies on the bounds on the extent of "array"
 * precomputed in extract_array_info.
 */
static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p,
	struct gpu_array_info *array)
{
	int is_scalar = gpu_array_is_scalar(array);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
	p = isl_printer_print_str(p, array->name);
	p = isl_printer_print_str(p, ", ");

	/* Host side source of the copy. */
	if (is_scalar)
		p = isl_printer_print_str(p, "&");
	p = isl_printer_print_str(p, array->name);
	p = isl_printer_print_str(p, ", ");

	p = gpu_array_info_print_size(p, array);
	p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");

	return isl_printer_end_line(p);
}

/* Print a cudaMemcpy call (wrapped in cudaCheckReturn) to "p"
 * that copies all of "array" from its device copy back to the host.
 * For a scalar, the address of the host variable is used as destination.
 * The size of the copy is printed by gpu_array_info_print_size,
 * which relies on the bounds on the extent of "array"
 * precomputed in extract_array_info.
 */
static __isl_give isl_printer *copy_array_from_device(
	__isl_take isl_printer *p, struct gpu_array_info *array)
{
	int is_scalar = gpu_array_is_scalar(array);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");

	/* Host side destination of the copy. */
	if (is_scalar)
		p = isl_printer_print_str(p, "&");
	p = isl_printer_print_str(p, array->name);

	p = isl_printer_print_str(p, ", dev_");
	p = isl_printer_print_str(p, array->name);
	p = isl_printer_print_str(p, ", ");
	p = gpu_array_info_print_size(p, array);
	p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");

	return isl_printer_end_line(p);
}

/* Print the "len" elements of "list" to "out" in reverse order,
 * as a parenthesized, comma separated list.
 * Nothing is printed if "out" is NULL or if "len" is zero.
 */
static void print_reverse_list(FILE *out, int len, int *list)
{
	int pos;

	if (!out || len == 0)
		return;

	fputc('(', out);
	for (pos = len - 1; pos >= 0; --pos) {
		fprintf(out, "%d", list[pos]);
		if (pos > 0)
			fputs(", ", out);
	}
	fputc(')', out);
}

/* Print the effective grid size of "kernel" as a parenthesized list
 * of the sizes in each dimension, from innermost to outermost.
 * Nothing is printed if the grid has no dimensions.
 */
static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
	struct ppcg_kernel *kernel)
{
	int pos;
	int n_dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);

	if (n_dim == 0)
		return p;

	p = isl_printer_print_str(p, "(");
	for (pos = n_dim - 1; pos >= 0; --pos) {
		isl_ast_expr *size;

		size = isl_ast_expr_get_op_arg(kernel->grid_size_expr,
						1 + pos);
		p = isl_printer_print_ast_expr(p, size);
		isl_ast_expr_free(size);

		if (pos > 0)
			p = isl_printer_print_str(p, ", ");
	}

	return isl_printer_print_str(p, ")");
}

/* Print the definition of the grid of "kernel", i.e.,
 * a declaration of the form "dim3 k<id>_dimGrid<grid size>;".
 */
static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
	struct ppcg_kernel *kernel)
{
	p = isl_printer_start_line(p);

	/* Name of the grid variable. */
	p = isl_printer_print_str(p, "dim3 k");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, "_dimGrid");

	/* Optional initializer. */
	p = print_grid_size(p, kernel);
	p = isl_printer_print_str(p, ";");

	return isl_printer_end_line(p);
}

/* Print the comma separated arguments of a declaration of or a call to
 * "kernel".  If "types" is set, then a declaration is printed,
 * meaning that each argument is preceded by its type.
 *
 * The arguments appear in the following order
 * - the arrays of "prog" that "kernel" requires as an argument
 * - the parameters (of type int)
 * - the host loop iterators (of the AST iterator type)
 */
static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
	struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
{
	int i, n_iter;
	int n_printed = 0;
	unsigned n_param;
	isl_space *param_space;
	const char *iter_type;

	for (i = 0; i < prog->n_array; ++i) {
		struct gpu_array_info *array = &prog->array[i];
		int required;

		required = ppcg_kernel_requires_array_argument(kernel, i);
		if (required < 0)
			return isl_printer_free(p);
		if (!required)
			continue;

		if (n_printed++ > 0)
			p = isl_printer_print_str(p, ", ");

		if (types)
			p = gpu_array_info_print_declaration_argument(p,
				array, NULL);
		else
			p = gpu_array_info_print_call_argument(p, array);
	}

	param_space = isl_union_set_get_space(kernel->arrays);
	n_param = isl_space_dim(param_space, isl_dim_param);
	for (i = 0; i < n_param; ++i) {
		const char *param;

		param = isl_space_get_dim_name(param_space, isl_dim_param, i);

		if (n_printed++ > 0)
			p = isl_printer_print_str(p, ", ");
		if (types)
			p = isl_printer_print_str(p, "int ");
		p = isl_printer_print_str(p, param);
	}
	isl_space_free(param_space);

	n_iter = isl_space_dim(kernel->space, isl_dim_set);
	iter_type = isl_options_get_ast_iterator_type(prog->ctx);
	for (i = 0; i < n_iter; ++i) {
		const char *iter;

		if (n_printed++ > 0)
			p = isl_printer_print_str(p, ", ");
		iter = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
		if (types) {
			p = isl_printer_print_str(p, iter_type);
			p = isl_printer_print_str(p, " ");
		}
		p = isl_printer_print_str(p, iter);
	}

	return p;
}

/* Print the header of "kernel", i.e.,
 * "__global__ void kernel<id>(<typed arguments>)", on a new line of "p".
 * The line is not terminated.
 */
static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
	struct gpu_prog *prog, struct ppcg_kernel *kernel)
{
	const int with_types = 1;

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "__global__ void kernel");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, "(");
	p = print_kernel_arguments(p, prog, kernel, with_types);

	return isl_printer_print_str(p, ")");
}

/* Print the header of "kernel" on a line of its own to "out",
 * followed by "suffix" if it is not NULL.
 */
static void print_kernel_header_to_file(FILE *out, struct gpu_prog *prog,
	struct ppcg_kernel *kernel, const char *suffix)
{
	isl_printer *printer;

	printer = isl_printer_to_file(prog->ctx, out);
	printer = isl_printer_set_output_format(printer, ISL_FORMAT_C);
	printer = print_kernel_header(printer, prog, kernel);
	if (suffix)
		printer = isl_printer_print_str(printer, suffix);
	printer = isl_printer_end_line(printer);
	isl_printer_free(printer);
}

/* Print the header of "kernel" to both cuda->kernel_h and cuda->kernel_c.
 * In the header file, the kernel header is turned into a declaration
 * by appending a semicolon.
 */
static void print_kernel_headers(struct gpu_prog *prog,
	struct ppcg_kernel *kernel, struct cuda_info *cuda)
{
	print_kernel_header_to_file(cuda->kernel_h, prog, kernel, ";");
	print_kernel_header_to_file(cuda->kernel_c, prog, kernel, NULL);
}

/* Print an empty string padded to a field width of "indent" to "dst",
 * i.e., "indent" spaces for a positive "indent".
 */
static void print_indent(FILE *dst, int indent)
{
	static const char nothing[] = "";

	fprintf(dst, "%*s", indent, nothing);
}

/* Print a declaration to "out" of iterators of type "type"
 * with the names in "ids".
 * Each iterator is initialized to one of the cuda identifiers
 * in "cuda_dims", with the identifiers assigned in reverse order.
 * That is, the last iterator gets the first cuda identifier
 * (the x identifier).
 * Nothing is printed if "ids" does not contain any elements.
 */
static void print_iterators(FILE *out, const char *type,
	__isl_keep isl_id_list *ids, const char *cuda_dims[])
{
	int pos;
	int n_id = isl_id_list_n_id(ids);

	if (n_id <= 0)
		return;

	print_indent(out, 4);
	fprintf(out, "%s ", type);
	for (pos = 0; pos < n_id; ++pos) {
		isl_id *id = isl_id_list_get_id(ids, pos);
		const char *cuda_dim = cuda_dims[n_id - 1 - pos];

		if (pos > 0)
			fprintf(out, ", ");
		fprintf(out, "%s = %s", isl_id_get_name(id), cuda_dim);
		isl_id_free(id);
	}
	fprintf(out, ";\n");
}

/* Print declarations of the block and thread iterators of "kernel"
 * to "out", initialized to the corresponding blockIdx and
 * threadIdx components.
 * The type of the iterators is the AST iterator type.
 */
static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
{
	const char *block_idx[] = { "blockIdx.x", "blockIdx.y" };
	const char *thread_idx[] = { "threadIdx.x", "threadIdx.y",
					"threadIdx.z" };
	isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
	const char *iter_type = isl_options_get_ast_iterator_type(ctx);

	print_iterators(out, iter_type, kernel->block_ids, block_idx);
	print_iterators(out, iter_type, kernel->thread_ids, thread_idx);
}

/* Print a declaration of the kernel variable "var" to "p".
 * The variable is declared as an array with one dimension
 * for each index of the corresponding array, with the sizes
 * taken from var->size.
 * Variables of type ppcg_access_shared are marked "__shared__".
 */
static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
	struct ppcg_kernel_var *var)
{
	int dim;

	p = isl_printer_start_line(p);
	if (var->type == ppcg_access_shared)
		p = isl_printer_print_str(p, "__shared__ ");
	p = isl_printer_print_str(p, var->array->type);
	p = isl_printer_print_str(p, " ");
	p = isl_printer_print_str(p,  var->name);

	for (dim = 0; dim < var->array->n_index; ++dim) {
		isl_val *size = isl_vec_get_element_val(var->size, dim);

		p = isl_printer_print_str(p, "[");
		p = isl_printer_print_val(p, size);
		p = isl_printer_print_str(p, "]");
		isl_val_free(size);
	}

	p = isl_printer_print_str(p, ";");

	return isl_printer_end_line(p);
}

/* Print declarations of all the variables of "kernel" to "p".
 */
static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
	struct ppcg_kernel *kernel)
{
	int pos = 0;

	while (pos < kernel->n_var) {
		p = print_kernel_var(p, &kernel->var[pos]);
		++pos;
	}

	return p;
}

/* Print a synchronization statement, i.e., a call to __syncthreads,
 * on a line of its own.  "stmt" is not used.
 */
static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
	struct ppcg_kernel_stmt *stmt)
{
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "__syncthreads();");

	return isl_printer_end_line(p);
}

/* Print the user statement "node" of a kernel AST to "p".
 * This function is called for each user statement in the AST,
 * i.e., for each kernel body statement, copy statement or sync statement.
 *
 * The ppcg_kernel_stmt is extracted from the annotation of "node" and
 * printed by the function corresponding to its type.
 * "print_options" is not needed and is freed.
 */
static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options,
	__isl_keep isl_ast_node *node, void *user)
{
	isl_id *annotation;
	struct ppcg_kernel_stmt *stmt;

	annotation = isl_ast_node_get_annotation(node);
	stmt = isl_id_get_user(annotation);
	isl_id_free(annotation);

	isl_ast_print_options_free(print_options);

	if (stmt->type == ppcg_kernel_copy)
		return ppcg_kernel_print_copy(p, stmt);
	if (stmt->type == ppcg_kernel_sync)
		return print_sync(p, stmt);
	if (stmt->type == ppcg_kernel_domain)
		return ppcg_kernel_print_domain(p, stmt);

	return p;
}

/* Print "kernel" to the output files in "cuda".
 *
 * The header is printed to both cuda->kernel_h and cuda->kernel_c.
 * The body, printed to cuda->kernel_c only, consists of
 * the declarations of the block and thread iterators,
 * the declarations of the kernel variables, the required macro
 * definitions and the AST of the kernel, with the user statements
 * printed by print_kernel_stmt.
 */
static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
	struct cuda_info *cuda)
{
	FILE *out = cuda->kernel_c;
	isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
	isl_ast_print_options *options;
	isl_printer *printer;

	print_kernel_headers(prog, kernel, cuda);
	fputs("{\n", out);
	print_kernel_iterators(out, kernel);

	printer = isl_printer_to_file(ctx, out);
	printer = isl_printer_set_output_format(printer, ISL_FORMAT_C);
	printer = isl_printer_indent(printer, 2);

	printer = print_kernel_vars(printer, kernel);
	printer = isl_printer_end_line(printer);
	printer = ppcg_set_macro_names(printer);
	printer = gpu_print_macros(printer, kernel->tree);

	options = isl_ast_print_options_alloc(ctx);
	options = isl_ast_print_options_set_print_user(options,
						    &print_kernel_stmt, NULL);
	printer = isl_ast_node_print(kernel->tree, printer, options);
	isl_printer_free(printer);

	fputs("}\n", out);
}

/* Print the code that prepares the device for the execution
 * of the transformed code.
 * That is, print the CUDA error checking macros, the declarations
 * of the locally defined variables and the declarations and
 * allocations of the required device copies of the arrays.
 */
static __isl_give isl_printer *init_device(__isl_take isl_printer *p,
	struct gpu_prog *prog)
{
	p = print_cuda_macros(p);
	p = gpu_print_local_declarations(p, prog);
	p = declare_device_arrays(p, prog);

	return allocate_device_arrays(p, prog);
}

/* Print the code that cleans up the device after the execution
 * of the transformed code, i.e., the code that frees the memory
 * that was allocated on the device.
 */
static __isl_give isl_printer *clear_device(__isl_take isl_printer *p,
	struct gpu_prog *prog)
{
	return free_device_arrays(p, prog);
}

/* Print the statement "node" that either copies an array to or
 * from the device or that initializes or clears the device.
 *
 * The kind of statement is derived from the name of the statement
 * identifier:
 * - "init_device" initializes the device,
 * - "clear_device" clears the device,
 * - "to_device_<array name>" copies an array to the device and
 * - any other name (expected to be "from_device_<array name>")
 *   copies an array back from the device.
 * In the last two cases, the user pointer of the identifier
 * points to the gpu_array_info of the array that needs to be copied.
 *
 * The printer is freed (and NULL is returned) if the identifier
 * has no name or if a copy statement has no associated array.
 */
static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p,
	__isl_keep isl_ast_node *node, struct gpu_prog *prog)
{
	isl_ast_expr *call, *callee;
	isl_id *stmt_id;
	const char *stmt_name;
	struct gpu_array_info *info;

	call = isl_ast_node_user_get_expr(node);
	callee = isl_ast_expr_get_op_arg(call, 0);
	stmt_id = isl_ast_expr_get_id(callee);
	stmt_name = isl_id_get_name(stmt_id);
	info = isl_id_get_user(stmt_id);
	isl_id_free(stmt_id);
	isl_ast_expr_free(callee);
	isl_ast_expr_free(call);

	if (!stmt_name)
		return isl_printer_free(p);
	if (strcmp(stmt_name, "init_device") == 0)
		return init_device(p, prog);
	if (strcmp(stmt_name, "clear_device") == 0)
		return clear_device(p, prog);
	if (!info)
		return isl_printer_free(p);

	return prefixcmp(stmt_name, "to_device") ?
		copy_array_from_device(p, info) :
		copy_array_to_device(p, info);
}

/* Data handed to print_host_user through its "user" argument.
 * "cuda" is passed on to print_kernel when a kernel launch is printed.
 * "prog" is passed on to print_device_node, print_kernel_arguments and
 * print_kernel.
 * print_host_code initializes this structure positionally,
 * so the order of the fields matters.
 */
struct print_host_user_data {
	struct cuda_info *cuda;
	struct gpu_prog *prog;
};

/* Print the user statement of the host code to "p".
 *
 * The host code may contain original user statements, kernel launches,
 * statements that copy data to/from the device and statements
 * that initialize or clear the device.
 * Only the original user statements and the kernel launches carry
 * an annotation; a node without annotation is handed to
 * print_device_node.
 * The annotation on the original user statements is called "user".
 *
 * For a kernel launch, print a block of statements that
 * defines the grid and the block, launches the kernel and
 * calls cudaCheckKernel.  The kernel itself is printed
 * through print_kernel.
 *
 * "user" points to a print_host_user_data structure.
 */
static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options,
	__isl_keep isl_ast_node *node, void *user)
{
	struct print_host_user_data *host = user;
	struct ppcg_kernel *kernel = NULL;
	struct ppcg_kernel_stmt *stmt = NULL;
	isl_id *annotation;
	int is_user;

	isl_ast_print_options_free(print_options);

	annotation = isl_ast_node_get_annotation(node);
	if (!annotation)
		return print_device_node(p, node, host->prog);

	is_user = strcmp(isl_id_get_name(annotation), "user") == 0;
	if (is_user)
		stmt = isl_id_get_user(annotation);
	else
		kernel = isl_id_get_user(annotation);
	isl_id_free(annotation);

	if (is_user)
		return ppcg_kernel_print_domain(p, stmt);

	p = ppcg_start_block(p);

	/* Declaration of the block dimensions.  The list of sizes is
	 * written directly to the file underlying the printer.
	 */
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "dim3 k");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, "_dimBlock");
	print_reverse_list(isl_printer_get_file(p),
				kernel->n_block, kernel->block_dim);
	p = isl_printer_print_str(p, ";");
	p = isl_printer_end_line(p);

	/* Declaration of the grid dimensions. */
	p = print_grid(p, kernel);

	/* The kernel launch itself. */
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "kernel");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, " <<<k");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, "_dimGrid, k");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, "_dimBlock>>> (");
	p = print_kernel_arguments(p, host->prog, kernel, 0);
	p = isl_printer_print_str(p, ");");
	p = isl_printer_end_line(p);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "cudaCheckKernel();");
	p = isl_printer_end_line(p);

	p = ppcg_end_block(p);

	/* Empty line after the launch block. */
	p = isl_printer_start_line(p);
	p = isl_printer_end_line(p);

	print_kernel(host->prog, kernel, host->cuda);

	return p;
}

/* Print the host code corresponding to "tree" to "p".
 *
 * The macro definitions needed by "tree" are printed first.
 * The tree itself is then printed with print_host_user installed
 * as the callback for user nodes; the callback receives
 * "cuda" and "prog" through a print_host_user_data structure.
 */
static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
	struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
	struct cuda_info *cuda)
{
	struct print_host_user_data callback_data;
	isl_ast_print_options *options;

	callback_data.cuda = cuda;
	callback_data.prog = prog;

	options = isl_ast_print_options_alloc(isl_ast_node_get_ctx(tree));
	options = isl_ast_print_options_set_print_user(options,
						&print_host_user,
						&callback_data);

	p = gpu_print_macros(p, tree);

	return isl_ast_node_print(tree, p, options);
}

/* Given a gpu_prog "prog" and the corresponding transformed AST
 * "tree", print the entire CUDA code to "p".
 * "types" collects the types for which a definition has already
 * been printed.
 * "user" points to the cuda_info holding the output files.
 *
 * The type definitions are first printed to the kernel source file
 * through a temporary printer.  If that fails, "p" is freed and
 * NULL is returned.  Otherwise, the host code is printed to "p".
 */
static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
	struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
	struct gpu_types *types, void *user)
{
	struct cuda_info *cuda = user;
	isl_printer *kernel;
	int failed;

	kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
	kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
	kernel = gpu_print_types(kernel, types, prog);
	/* Record the outcome before releasing the printer: the value of
	 * "kernel" must not be inspected once the printer has been freed.
	 */
	failed = !kernel;
	isl_printer_free(kernel);

	if (failed)
		return isl_printer_free(p);

	return print_host_code(p, prog, tree, cuda);
}

/* Transform the code in the file called "input" by replacing
 * all scops by corresponding CUDA code.
 * The names of the output files are derived from "input".
 *
 * The actual work is performed by generate_gpu, which calls back
 * into print_cuda for printing the AST.
 *
 * The output files are opened before generate_gpu is called and
 * closed again once it has finished.
 * The value returned by generate_gpu is passed on to the caller.
 */
int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
	const char *input)
{
	struct cuda_info files;
	int status;

	cuda_open_files(&files, input);
	status = generate_gpu(ctx, input, files.host_c, options,
				&print_cuda, &files);
	cuda_close_files(&files);

	return status;
}


================================================
FILE: src/ppcg_files/cuda.h
================================================
#ifndef _CUDA_H
#define _CUDA_H

#include "ppcg_options.h"
#include "ppcg.h"

#ifdef __cplusplus
extern "C"
{
#endif

	/* Transform the code in the file called "input" by replacing
	 * all scops by corresponding CUDA code.
	 * The names of the output files are derived from "input".
	 * The return value is the one produced by generate_gpu.
	 */
	int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
										const char *input);

#ifdef __cplusplus
}
#endif

#endif


================================================
FILE: src/ppcg_files/cuda_common.c
================================================
/*
 * Copyright 2010      INRIA Saclay
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
 * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
 * 91893 Orsay, France
 */

#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>

#include "cuda_common.h"
#include "ppcg.h"

/* Append "suffix" to the first "len" characters of "name" and
 * open the resulting file for writing.
 * "name" is left holding the complete file name.
 * If the file cannot be opened, report the failure and exit.
 */
static FILE *cuda_open_output(char *name, int len, const char *suffix)
{
    FILE *file;

    strcpy(name + len, suffix);
    file = fopen(name, "w");
    if (!file) {
        perror(name);
        exit(EXIT_FAILURE);
    }

    return file;
}

/* Open the host .cu file and the kernel .hu and .cu files for writing.
 * Add the necessary includes.
 *
 * The file names are formed by appending "_host.cu", "_kernel.cu" and
 * "_kernel.hu" to the base name extracted from "input".
 *
 * This function has no way of reporting failure to its caller.
 * If the longest name does not fit in the buffer or if any of the files
 * cannot be opened, an error is therefore printed to stderr and
 * the program exits, instead of writing to a NULL stream or
 * past the end of the buffer.
 */
void cuda_open_files(struct cuda_info *info, const char *input)
{
    char name[PATH_MAX];
    int len;

    len = ppcg_extract_base_name(name, input);
    /* "_kernel.cu" and "_kernel.hu" are the longest suffixes;
     * sizeof includes the terminating NUL.
     */
    if (len < 0 || (size_t) len + sizeof("_kernel.cu") > sizeof(name)) {
        fprintf(stderr, "%s: output file name too long\n", input);
        exit(EXIT_FAILURE);
    }

    info->host_c = cuda_open_output(name, len, "_host.cu");
    info->kernel_c = cuda_open_output(name, len, "_kernel.cu");
    /* Opened last so that "name" holds the header name used below. */
    info->kernel_h = cuda_open_output(name, len, "_kernel.hu");

    fprintf(info->host_c, "#include <assert.h>\n");
    fprintf(info->host_c, "#include <stdio.h>\n");
    fprintf(info->host_c, "#include \"%s\"\n", name);
    fprintf(info->kernel_c, "#include \"%s\"\n", name);
    fprintf(info->kernel_h, "#include \"cuda.h\"\n\n");
}

/* Close the three output files in "info":
 * the kernel source, the kernel header and the host source, in that order.
 */
void cuda_close_files(struct cuda_info *info)
{
    FILE *outputs[3];
    int i;

    outputs[0] = info->kernel_c;
    outputs[1] = info->kernel_h;
    outputs[2] = info->host_c;

    for (i = 0; i < 3; ++i)
        fclose(outputs[i]);
}


================================================
FILE: src/ppcg_files/cuda_common.h
================================================
#ifndef _CUDA_COMMON_H_
#define _CUDA_COMMON_H_

#include <stdio.h>

/* The output files used during CUDA code generation.
 * They are opened by cuda_open_files and closed by cuda_close_files.
 */
struct cuda_info
{
	FILE *host_c;	/* host code, "<base>_host.cu" */
	FILE *kernel_c;	/* kernel code, "<base>_kernel.cu" */
	FILE *kernel_h;	/* kernel header, "<base>_kernel.hu" */
};

void cuda_open_files(struct cuda_info *info, const char *input);
void cuda_close_files(struct cuda_info *info);

#endif


================================================
FILE: src/ppcg_files/gpu.c
================================================
/*
 * Copyright 2010-2011 INRIA Saclay
 * Copyright 2012-2013 Ecole Normale Superieure
 * Copyright 2015-2016 Sven Verdoolaege
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
 * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
 * 91893 Orsay, France
 * and Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
 */

#include <stdlib.h>
#include <string.h>

#include <isl/polynomial.h>
#include <isl/union_set.h>
#include <isl/aff.h>
#include <isl/ilp.h>
#include <isl/flow.h>
#include <isl/schedule.h>
#include <isl/schedule_node.h>
#include <isl/options.h>
#include <isl/ast_build.h>

#include "cpu.h"
#include "gpu.h"
#include "gpu_array_tile.h"
#include "gpu_group.h"
#include "gpu_hybrid.h"
#include "gpu_tree.h"
#include "hybrid.h"
#include "schedule.h"
#include "ppcg_options.h"
#include "print.h"
#include "util.h"

struct gpu_array_info;

/* Return the name of the outer array (of structs) accessed by "access".
 *
 * The range space of "access" is unwrapped, keeping the domain
 * of the wrapped relation each time, until it is no longer wrapping.
 * The tuple name of the remaining space is returned.
 */
static const char *get_outer_array_name(__isl_keep isl_map *access)
{
	isl_space *range;
	const char *outer_name;

	range = isl_space_range(isl_map_get_space(access));
	for (;;) {
		if (!range || !isl_space_is_wrapping(range))
			break;
		range = isl_space_domain(isl_space_unwrap(range));
	}
	outer_name = isl_space_get_tuple_name(range, isl_dim_set);
	isl_space_free(range);

	return outer_name;
}

/* Collect all references to the given array and store pointers to them
 * in array->refs.  The number of references is stored in array->n_ref.
 *
 * A first pass over all accesses of all statements counts the references
 * to "array"; a second pass stores them in the newly allocated array.
 * An access refers to "array" if the name of its outer array
 * is equal to array->name.
 *
 * Return isl_stat_error if the array of references cannot be allocated.
 * A NULL result for a request of zero elements is not treated
 * as an allocation failure, since the allocator is allowed to return NULL
 * in that case.
 */
static isl_stat collect_references(struct gpu_prog *prog,
	struct gpu_array_info *array)
{
	int i;
	int n;

	n = 0;
	for (i = 0; i < prog->n_stmts; ++i) {
		struct gpu_stmt *stmt = &prog->stmts[i];
		struct gpu_stmt_access *access;

		for (access = stmt->accesses; access; access = access->next) {
			const char *name;
			name = get_outer_array_name(access->access);
			if (name && !strcmp(array->name, name))
				n++;
		}
	}

	array->refs = isl_alloc_array(prog->ctx, struct gpu_stmt_access *, n);
	if (n > 0 && !array->refs)
		return isl_stat_error;
	array->n_ref = n;

	n = 0;
	for (i = 0; i < prog->n_stmts; ++i) {
		struct gpu_stmt *stmt = &prog->stmts[i];
		struct gpu_stmt_access *access;

		for (access = stmt->accesses; access; access = access->next) {
			const char *name;
			name = get_outer_array_name(access->access);
			if (!name || strcmp(array->name, name))
				continue;

			array->refs[n++] = access;
		}
	}

	return isl_stat_ok;
}

/* Compute and return the extent of "array", taking into account the set of
 * accessed elements "accessed".
 *
 * The extent in the outer dimension is taken from "accessed",
 * while the extents in the remaining dimensions are taken
 * from array->extent.  The result carries the tuple identifier
 * of "accessed".
 *
 * The outer dimension of array->extent is not used because
 * it may be unbounded and, even if it is bounded, it may be larger
 * than the piece of the array that is actually accessed.
 *
 * For a zero-dimensional "accessed", a copy of array->extent is returned.
 */
static __isl_give isl_set *compute_extent(struct pet_array *array,
	__isl_keep isl_set *accessed)
{
	int n_dim;
	isl_set *first, *result;

	result = isl_set_copy(array->extent);

	n_dim = isl_set_dim(accessed, isl_dim_set);
	if (n_dim == 0)
		return result;

	/* Inner dimensions of the declared extent. */
	result = isl_set_project_out(result, isl_dim_set, 0, 1);
	/* Outer dimension of the accessed elements. */
	first = isl_set_project_out(isl_set_copy(accessed),
					isl_dim_set, 1, n_dim - 1);
	result = isl_set_flat_product(first, result);

	return isl_set_set_tuple_id(result, isl_set_get_tuple_id(accessed));
}

/* Is the array "array" being extracted a read-only scalar?
 *
 * That is, is "array" a scalar that is never possibly written to.
 * An array containing structures is never considered to be a scalar.
 */
static int is_read_only_scalar(struct gpu_array_info *array,
	struct gpu_prog *prog)
{
	isl_set *space;
	isl_union_map *write;
	int empty;

	if (array->has_compound_element)
		return 0;
	if (array->n_index != 0)
		return 0;

	write = isl_union_map_copy(prog->may_write);
	space = isl_set_universe(isl_space_copy(array->space));
	write = isl_union_map_intersect_range(write,
						isl_union_set_from_set(space));
	empty = isl_union_map_is_empty(write);
	isl_union_map_free(write);

	return empty;
}

/* Is "array" only accessed as individual, fixed elements?
 * That is, is the fixed_element flag set on every reference to "array"?
 * An array without references trivially satisfies this condition.
 */
static isl_bool only_fixed_element_accessed(struct gpu_array_info *array)
{
	int pos = 0;

	while (pos < array->n_ref) {
		if (!array->refs[pos]->fixed_element)
			return isl_bool_false;
		++pos;
	}

	return isl_bool_true;
}

/* Compute bounds on the host array "pa" based on the corresponding
 * accessed elements in "arrays"
 * and collect all references to the array.
 * Store the results in "info".
 *
 * If the array is zero-dimensional and does not contain structures,
 * i.e., if the array is a scalar, we check whether it is read-only.
 * We also check whether the array is accessed at all.
 *
 * Return isl_stat_error if the emptiness of the set of accessed elements
 * cannot be determined, if the bounds cannot be computed or
 * if the references cannot be collected.  In these cases, "info"
 * is left partially initialized.
 */
static isl_stat extract_array_info(struct gpu_prog *prog,
	struct gpu_array_info *info, struct pet_array *pa,
	__isl_keep isl_union_set *arrays)
{
	int empty;
	const char *name;
	int n_index;
	isl_multi_pw_aff *bounds;
	isl_set *accessed, *extent;

	n_index = isl_set_dim(pa->extent, isl_dim_set);
	name = isl_set_get_tuple_name(pa->extent);

	/* NOTE(review): "name" is assumed to be non-NULL and the results of
	 * the strdup calls below are not checked.
	 */
	info->space = isl_set_get_space(pa->extent);
	info->name = strdup(name);
	info->n_index = n_index;
	info->linearize = prog->scop->options->linearize_device_arrays;

	info->type = strdup(pa->element_type);
	info->size = pa->element_size;
	info->local = pa->declared && !pa->exposed;
	info->has_compound_element = pa->element_is_record;
	/* is_read_only_scalar reads the space, n_index and
	 * has_compound_element fields, so it needs to be called
	 * after those have been set.
	 */
	info->read_only_scalar = is_read_only_scalar(info, prog);

	info->declared_extent = isl_set_copy(pa->extent);
	/* The elements of this array that appear in "arrays". */
	accessed = isl_union_set_extract_set(arrays,
					    isl_space_copy(info->space));
	empty = isl_set_is_empty(accessed);
	extent = compute_extent(pa, accessed);
	isl_set_free(accessed);
	/* Store the extent before checking "empty", so that it is
	 * owned by "info" on the error path as well.
	 */
	info->extent = extent;
	if (empty < 0)
		return isl_stat_error;
	info->accessed = !empty;
	/* Derive the sizes from the extent and simplify them
	 * with respect to the context of the program.
	 */
	bounds = ppcg_size_from_extent(isl_set_copy(extent));
	bounds = isl_multi_pw_aff_gist(bounds, isl_set_copy(prog->context));
	if (!bounds)
		return isl_stat_error;
	/* An array with non-constant bounds is always linearized. */
	if (!isl_multi_pw_aff_is_cst(bounds))
		info->linearize = 1;
	info->bound = bounds;

	if (collect_references(prog, info) < 0)
		return isl_stat_error;
	/* Requires info->refs and info->n_ref set by collect_references. */
	info->only_fixed_element = only_fixed_element_accessed(info);

	return isl_stat_ok;
}

/* Remove independences from the order constraints "order" on array "array".
 * Since the pairs of iterations in the filter relation of an independence
 * are guaranteed to be completely independent by the user, there is
 * no need to ensure that live ranges are ordered along those pairs.
 * We make an exception for local variables, though, as the independence
 * guarantee does not apply to those.  That is, the filter of
 * an independence is only subtracted from "order" if "array"
 * does not belong to the local variables of that independence.
 *
 * The order constraints are used in two places.
 * Those on scalars are used in check_scalar_live_ranges to check if
 * we need to force the scalar to be private.  Any non-local scalar
 * should not be forced private if it only appears in independent loops.
 * Those on non-scalars are added to the coincidence constraints
 * in compute_schedule because we do not support any array expansion.
 * Accesses to non-local arrays should not prevent a loop from being
 * considered coincident so we should indeed remove those constraints
 * from the order constraints.
 */
static __isl_give isl_union_map *remove_independences(struct gpu_prog *prog,
	struct gpu_array_info *array, __isl_take isl_union_map *order)
{
	int i;

	for (i = 0; i < prog->scop->pet->n_independence; ++i) {
		struct pet_independence *indep;
		isl_union_map *filter;

		indep = prog->scop->pet->independences[i];
		if (!isl_union_set_contains(indep->local, array->space)) {
			filter = isl_union_map_copy(indep->filter);
			order = isl_union_map_subtract(order, filter);
		}
	}

	return order;
}

/* For each array in "prog", store the (untagged) order dependences
 * derived from the array in array->dep_order.
 * In particular, consider all references that access the given array
 * and take the order dependences that have one of these references
 * as source.  (Since an order dependence relates two references to
 * the same array, the target of these order dependences will also
 * be one of these references.)
 * Additionally, store the union of these array->dep_order relations
 * for all arrays that cannot be mapped to private memory in prog->array_order.
 */
void collect_order_dependences(struct gpu_prog *prog)
{
	int i;
	isl_space *space;
	isl_union_map *accesses;

	space = isl_union_map_get_space(prog->read);
	prog->array_order = isl_union_map_empty(space);

	accesses = isl_union_map_copy(prog->scop->tagged_reads);
	accesses = isl_union_map_union(accesses,
			    isl_union_map_copy(prog->scop->tagged_may_writes));
	accesses = isl_union_map_universe(accesses);
	accesses = isl_union_map_apply_range(accesses,
					    isl_union_map_copy(prog->to_outer));

	for (i = 0; i < prog->n_array; ++i) {
		struct gpu_array_info *array = &prog->array[i];
		isl_set *set;
		isl_union_set *uset;
		isl_union_map *order;

		set = isl_set_universe(isl_space_copy(array->space));
		uset = isl_union_set_from_set(set);
		uset = isl_union_map_domain(
		    isl_union_map_intersect_range(isl_union_map_copy(accesses),
						    uset));
		order = isl_union_map_copy(prog->scop->tagged_dep_order);
		order = isl_union_map_intersect_domain(order, uset);
		order = isl_union_map_zip(order);
		order = isl_union_set_unwrap(isl_union_map_domain(order));
		order = remove_independences(prog, array, order);
		array->dep_order = order;

		if (gpu_array_can_be_private(array))
			continue;

		prog->array_order = isl_union_map_union(prog->array_order,
					isl_union_map_copy(array->dep_order));
	}

	isl_union_map_free(accesses);
}

/* Construct a gpu_array_info for each array referenced by prog->scop and
 * collect them in prog->array.
 *
 * The sizes are based on the extents and the set of possibly accessed
 * elements by "prog".
 * If there are any member accesses involved, then they are first mapped
 * to the outer arrays of structs.
 * Only extract gpu_array_info entries for these outer arrays.
 *
 * If we are allowing live range reordering, then also set
 * the dep_order field.  Otherwise leave it NULL.
 */
static isl_stat collect_array_info(struct gpu_prog *prog)
{
	int i;
	isl_stat r = isl_stat_ok;
	isl_union_set *arrays;

	prog->n_array = 0;
	prog->array = isl_calloc_array(prog->ctx,
			     struct gpu_array_info, prog->scop->pet->n_array);
	if (!prog->array)
		return isl_stat_error;

	arrays = isl_union_map_range(isl_union_map_copy(prog->read));
	arrays = isl_union_set_union(arrays,
		    isl_union_map_range(isl_union_map_copy(prog->may_write)));

	arrays = isl_union_set_apply(arrays,
					isl_union_map_copy(prog->to_outer));

	arrays = isl_union_set_coalesce(arrays);

	for (i = 0; i < prog->scop->pet->n_array; ++i) {
		isl_bool field;

		field = isl_set_is_wrapping(prog->scop->pet->arrays[i]->extent);
		if (field < 0)
			break;
		if (field)
			continue;
		if (extract_array_info(prog, &prog->array[prog->n_array++],
					prog->scop->pet->arrays[i], arrays) < 0)
			r = isl_stat_error;
	}
	if (i < prog->scop->pet->n_array)
		r = isl_stat_error;

	isl_union_set_free(arrays);

	if (prog->scop->options->live_range_reordering)
		collect_order_dependences(prog);

	return r;
}

static void free_array_info(struct gpu_prog *prog)
{
	int i;

	for (i = 0; i < prog->n_array; ++i) {
		free(prog->array[i].type);
		free(prog->array[i].name);
		isl_multi_pw_aff_free(prog->array[i].bound);
		isl_ast_expr_free(prog->array[i].bound_expr);
		isl_space_free(prog->array[i].space);
		isl_set_free(prog->array[i].declared_extent);
		isl_set_free(prog->array[i].extent);
		isl_ast_expr_free(prog->array[i].declared_size);
		free(prog->array[i].refs);
		isl_union_map_free(prog->array[i].dep_order);
	}
	free(prog->array);
}

/* Check if a gpu array is a scalar.  A scalar is a value that is not stored
 * as an array or through a pointer reference, but as a single data element.
 * At the moment, scalars are represented as zero-dimensional arrays,
 * so this amounts to checking that "array" has no index dimensions.
 * Note that the single data element may be an entire structure.
 */
int gpu_array_is_scalar(struct gpu_array_info *array)
{
	if (array->n_index != 0)
		return 0;

	return 1;
}

/* Can "array" be mapped to private memory?
 * That is, is it only accessed as individual elements with
 * constant index expressions?
 * Return isl_bool_error if "array" is NULL.
 */
isl_bool gpu_array_can_be_private(struct gpu_array_info *array)
{
	return array ? array->only_fixed_element : isl_bool_error;
}

/* Is "array" a read-only scalar?
 * This simply reports the read_only_scalar field of "array".
 */
int gpu_array_is_read_only_scalar(struct gpu_array_info *array)
{
	int read_only = array->read_only_scalar;

	return read_only;
}

/* Does "array" need to be allocated on the device?
 * If it is a read-only scalar, then it will be passed as an argument
 * to the kernel and therefore does not require any allocation.
 * If the "global" field of "array" is not set, then it does not
 * need to be allocated either.
 */
int gpu_array_requires_device_allocation(struct gpu_array_info *array)
{
	return !gpu_array_is_read_only_scalar(array) && array->global;
}

/* Return the set of parameter values for which the array has a positive
 * size in all dimensions.
 * If the sizes are only valid for some parameter values, then those
 * constraints are also taken into account.
 *
 * For each dimension, the set where the bound is positive is computed
 * as the set where it is non-negative minus the set where it is zero.
 * Return NULL if "array" is NULL.
 */
__isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array)
{
	int dim;
	isl_set *guard;

	if (!array)
		return NULL;

	guard = isl_set_universe(
			isl_space_params(isl_space_copy(array->space)));

	for (dim = 0; dim < array->n_index; ++dim) {
		isl_pw_aff *size;
		isl_set *nonneg, *is_zero, *positive;

		size = isl_multi_pw_aff_get_pw_aff(array->bound, dim);
		nonneg = isl_pw_aff_nonneg_set(isl_pw_aff_copy(size));
		is_zero = isl_pw_aff_zero_set(size);
		positive = isl_set_subtract(nonneg, is_zero);
		guard = isl_set_intersect(guard, positive);
	}

	return guard;
}

/* Internal data structure for extract_size_of_type.
 * "type" specifies the name of the space that we want to extract.
 * "res" is used to store the subset of that space.
 * It is initialized to NULL by extract_sizes and remains NULL
 * if no set with a matching name is found.
 */
struct ppcg_extract_size_data {
	const char *type;
	isl_set *res;
};

/* This function is called for each set in a union_set.
 * If the name of the set matches data->type, then ownership of the set
 * is transferred to data->res and isl_stat_error is returned
 * (presumably to stop the iteration; the caller in extract_sizes
 * ignores the return value of the iteration).
 * Otherwise, the set is freed and isl_stat_ok is returned.
 */
static isl_stat extract_size_of_type(__isl_take isl_set *size, void *user)
{
	struct ppcg_extract_size_data *data = user;
	const char *set_name = isl_set_get_tuple_name(size);

	if (!set_name || strcmp(set_name, data->type) != 0) {
		isl_set_free(size);
		return isl_stat_ok;
	}

	data->res = size;
	return isl_stat_error;
}

/* Given a union map { kernel[i] -> *[...] },
 * return the range in the space called "type" for the kernel with
 * sequence number "id".
 *
 * The singleton set { kernel[id] } is constructed and mapped through
 * "sizes"; the piece of the result living in the space called "type"
 * is then picked out by extract_size_of_type.
 * Return NULL if "sizes" is NULL or if there is no such piece.
 */
static __isl_give isl_set *extract_sizes(__isl_keep isl_union_map *sizes,
	const char *type, int id)
{
	struct ppcg_extract_size_data data;
	isl_space *kernel_space;
	isl_set *kernel_set;
	isl_union_set *kernel_sizes;

	if (!sizes)
		return NULL;

	data.type = type;
	data.res = NULL;

	kernel_space = isl_union_map_get_space(sizes);
	kernel_space = isl_space_set_from_params(kernel_space);
	kernel_space = isl_space_add_dims(kernel_space, isl_dim_set, 1);
	kernel_space = isl_space_set_tuple_name(kernel_space,
						isl_dim_set, "kernel");
	kernel_set = isl_set_universe(kernel_space);
	kernel_set = isl_set_fix_si(kernel_set, isl_dim_set, 0, id);

	kernel_sizes = isl_union_set_apply(isl_union_set_from_set(kernel_set),
					isl_union_map_copy(sizes));
	isl_union_set_foreach_set(kernel_sizes, &extract_size_of_type, &data);
	isl_union_set_free(kernel_sizes);

	return data.res;
}

/* Given a singleton set, extract the first (at most *len) elements
 * of the single integer tuple into *sizes and update *len if needed,
 * i.e., reduce *len to the dimension of "set" if that is smaller.
 *
 * If "set" is NULL, then the "sizes" array is not updated.
 * "set" is freed in all cases.
 * Return isl_stat_error if the value of some dimension
 * cannot be obtained.
 */
static isl_stat read_sizes_from_set(__isl_take isl_set *set, int *sizes,
	int *len)
{
	isl_stat status = isl_stat_ok;
	int n_dim;
	int pos;

	if (!set)
		return isl_stat_ok;

	n_dim = isl_set_dim(set, isl_dim_set);
	if (n_dim < *len)
		*len = n_dim;

	for (pos = 0; pos < *len; ++pos) {
		isl_val *fixed;

		fixed = isl_set_plain_get_val_if_fixed(set, isl_dim_set, pos);
		if (!fixed) {
			status = isl_stat_error;
			break;
		}
		sizes[pos] = isl_val_get_num_si(fixed);
		isl_val_free(fixed);
	}

	isl_set_free(set);
	return status;
}

/* Add the map { kernel[id] -> type[sizes] } to gen->used_sizes,
 * if the option debug->dump_sizes is set.
 * "sizes" contains "len" elements.
 */
static void set_used_sizes(struct gpu_gen *gen, const char *type, int id,
	int *sizes, int len)
{
	int pos;
	isl_space *map_space;
	isl_map *used;

	if (!gen->options->debug->dump_sizes)
		return;

	/* Domain space kernel[i]. */
	map_space = isl_union_map_get_space(gen->used_sizes);
	map_space = isl_space_set_from_params(map_space);
	map_space = isl_space_add_dims(map_space, isl_dim_set, 1);
	map_space = isl_space_set_tuple_name(map_space, isl_dim_set, "kernel");
	/* Range space type[...] with "len" dimensions. */
	map_space = isl_space_from_domain(map_space);
	map_space = isl_space_add_dims(map_space, isl_dim_out, len);
	map_space = isl_space_set_tuple_name(map_space, isl_dim_out, type);

	used = isl_map_fix_si(isl_map_universe(map_space), isl_dim_in, 0, id);
	for (pos = 0; pos < len; ++pos)
		used = isl_map_fix_si(used, isl_dim_out, pos, sizes[pos]);

	gen->used_sizes = isl_union_map_add_map(gen->used_sizes, used);
}

/* Extract user specified "tile" sizes from the "sizes" command line option,
 * defaulting to option->tile_size in each dimension.
 * *tile_len contains the maximum number of tile sizes needed.
 * Update *tile_len to the number of specified tile sizes, if any, and
 * return a pointer to the tile sizes (or NULL on error).
 * The returned array is allocated here.
 * Add the effectively used sizes to gen->used_sizes.
 */
static int *read_tile_sizes(struct gpu_gen *gen, int *tile_len)
{
	int pos;
	int *sizes;
	isl_set *specified;

	sizes = isl_alloc_array(gen->ctx, int, *tile_len);
	if (!sizes)
		return NULL;
	for (pos = 0; pos < *tile_len; ++pos)
		sizes[pos] = gen->options->tile_size;

	specified = extract_sizes(gen->sizes, "tile", gen->kernel_id);
	if (read_sizes_from_set(specified, sizes, tile_len) < 0) {
		free(sizes);
		return NULL;
	}
	set_used_sizes(gen, "tile", gen->kernel_id, sizes, *tile_len);

	return sizes;
}

/* Extract user specified "block" sizes from the "sizes" command line option,
 * after filling in some potentially useful defaults.
 *
 * The number of block dimensions is first limited to three.
 * The defaults are 512 for a single dimension, 32x16 for
 * two dimensions and 32x4x4 otherwise.
 */
static isl_stat read_block_sizes(struct ppcg_kernel *kernel,
	__isl_keep isl_union_map *sizes)
{
	isl_set *specified;

	if (kernel->n_block > 3)
		kernel->n_block = 3;

	if (kernel->n_block == 1) {
		kernel->block_dim[0] = 512;
	} else if (kernel->n_block == 2) {
		kernel->block_dim[0] = 32;
		kernel->block_dim[1] = 16;
	} else {
		kernel->block_dim[0] = 32;
		kernel->block_dim[1] = 4;
		kernel->block_dim[2] = 4;
	}

	specified = extract_sizes(sizes, "block", kernel->id);
	return read_sizes_from_set(specified, kernel->block_dim,
					&kernel->n_block);
}

/* Extract user specified "grid" sizes from the "sizes" command line option,
 * after filling in some potentially useful defaults.
 *
 * The number of grid dimensions is first limited to two.
 * The defaults are 32768 for a single dimension and 256x256 otherwise.
 */
static isl_stat read_grid_sizes(struct ppcg_kernel *kernel,
	__isl_keep isl_union_map *sizes)
{
	isl_set *specified;

	if (kernel->n_grid > 2)
		kernel->n_grid = 2;

	if (kernel->n_grid == 1) {
		kernel->grid_dim[0] = 32768;
	} else {
		kernel->grid_dim[0] = 256;
		kernel->grid_dim[1] = 256;
	}

	specified = extract_sizes(sizes, "grid", kernel->id);
	return read_sizes_from_set(specified, kernel->grid_dim,
					&kernel->n_grid);
}

/* Extract user specified grid and block sizes from the gen->sizes
 * command line option after filling in some potentially useful defaults.
 * Store the extracted sizes in "kernel".
 * Add the effectively used sizes to gen->used_sizes.
 *
 * The block sizes are read before the grid sizes and the grid sizes
 * are not read if reading the block sizes fails.
 */
static isl_stat read_grid_and_block_sizes(struct ppcg_kernel *kernel,
	struct gpu_gen *gen)
{
	if (read_block_sizes(kernel, gen->sizes) < 0 ||
	    read_grid_sizes(kernel, gen->sizes) < 0)
		return isl_stat_error;

	set_used_sizes(gen, "block", kernel->id,
			kernel->block_dim, kernel->n_block);
	set_used_sizes(gen, "grid", kernel->id,
			kernel->grid_dim, kernel->n_grid);

	return isl_stat_ok;
}

/* Free the "n" statements in "stmts", including the linked list
 * of accesses of each statement, as well as the "stmts" array itself.
 * Always return NULL.
 */
static void *free_stmts(struct gpu_stmt *stmts, int n)
{
	int i;

	if (!stmts)
		return NULL;

	for (i = 0; i < n; ++i) {
		struct gpu_stmt_access *cur = stmts[i].accesses;

		while (cur) {
			/* Fetch the successor before "cur" is freed. */
			struct gpu_stmt_access *following = cur->next;

			isl_id_free(cur->ref_id);
			isl_map_free(cur->access);
			isl_map_free(cur->tagged_access);
			free(cur);
			cur = following;
		}

		isl_id_free(stmts[i].id);
	}
	free(stmts);

	return NULL;
}

/* Add parameters p[i] with identifiers "ids" to "set",
 * with bounds 0 <= p[i] < size[i].
 * The new parameters are added after the existing ones and
 * "size" needs to have at least as many elements as "ids".
 */
__isl_give isl_set *add_bounded_parameters(__isl_take isl_set *set,
	int *size, __isl_keep isl_id_list *ids)
{
	int i, n_id;
	unsigned first;

	n_id = isl_id_list_n_id(ids);
	first = isl_set_dim(set, isl_dim_param);
	set = isl_set_add_dims(set, isl_dim_param, n_id);

	for (i = 0; i < n_id; ++i) {
		unsigned pos = first + i;
		isl_id *id = isl_id_list_get_id(ids, i);

		set = isl_set_set_dim_id(set, isl_dim_param, pos, id);
		set = isl_set_lower_bound_si(set, isl_dim_param, pos, 0);
		set = isl_set_upper_bound_si(set, isl_dim_param, pos,
						size[i] - 1);
	}

	return set;
}

/* Add parameters p[i] with identifiers "ids", one for each
 * element of "size", and intersect "set" with
 *
 *	{ : 0 <= p[i] < size[i] }
 *
 * or an overapproximation.
 *
 * The new parameters are added after the existing parameters of "set".
 */
static __isl_give isl_set *add_bounded_parameters_dynamic(
	__isl_take isl_set *set, __isl_keep isl_multi_pw_aff *size,
	__isl_keep isl_id_list *ids)
{
	int i, len;
	unsigned nparam;
	isl_space *space;
	isl_local_space *ls;

	len = isl_multi_pw_aff_dim(size, isl_dim_out);
	nparam = isl_set_dim(set, isl_dim_param);
	set = isl_set_add_dims(set, isl_dim_param, len);

	/* Attach the identifiers to the new parameters. */
	for (i = 0; i < len; ++i) {
		isl_id *id;

		id = isl_id_list_get_id(ids, i);
		set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id);
	}

	/* Parameter space that includes the new parameters. */
	space = isl_space_params(isl_set_get_space(set));
	ls = isl_local_space_from_space(space);
	for (i = 0; i < len; ++i) {
		isl_pw_aff *param, *size_i, *zero;
		isl_set *bound;

		param = isl_pw_aff_var_on_domain(isl_local_space_copy(ls),
						isl_dim_param, nparam + i);

		/* p[i] < size[i], replaced by its simple hull, which is
		 * presumably where the overapproximation may come from.
		 */
		size_i = isl_multi_pw_aff_get_pw_aff(size, i);
		bound = isl_pw_aff_lt_set(isl_pw_aff_copy(param), size_i);
		bound = isl_set_from_basic_set(isl_set_simple_hull(bound));
		set = isl_set_intersect_params(set, bound);

		/* p[i] >= 0 */
		zero = isl_pw_aff_zero_on_domain(isl_local_space_copy(ls));
		bound = isl_pw_aff_ge_set(param, zero);
		set = isl_set_intersect_params(set, bound);
	}
	isl_local_space_free(ls);

	return set;
}

/* Return the union of the tagged access relations of
 * all references in the group.
 */
static __isl_give isl_union_map *group_tagged_access_relation(
	struct gpu_array_ref_group *group)
{
	int pos;
	isl_union_map *tagged;

	tagged = isl_union_map_empty(isl_map_get_space(group->access));
	for (pos = 0; pos < group->n_ref; ++pos) {
		isl_union_map *ref_access;

		ref_access = isl_union_map_from_map(
			isl_map_copy(group->refs[pos]->tagged_access));
		tagged = isl_union_map_union(tagged, ref_access);
	}

	return tagged;
}

/* Return the extent of "array", recomputed from the bounds.
 * The recomputed extent may be simpler than the original extent.
 *
 * The result lives in the space of array->extent and constrains
 * each index i to satisfy 0 <= index_i < bound_i, with bound_i
 * the corresponding element of array->bound.
 */
static __isl_give isl_set *array_extent(struct gpu_array_info *array)
{
	int i;
	isl_id *id;
	isl_space *space;
	isl_local_space *ls;
	isl_set *extent;

	id = isl_set_get_tuple_id(array->extent);
	space = isl_set_get_space(array->extent);
	extent = isl_set_universe(isl_space_copy(space));
	ls = isl_local_space_from_space(space);
	for (i = 0; i < array->n_index; ++i) {
		isl_pw_aff *bound;
		isl_aff *aff;
		isl_pw_aff *index;
		isl_set *lt;

		/* 0 <= index_i */
		extent = isl_set_lower_bound_si(extent, isl_dim_set, i, 0);

		/* The index in dimension "i", as a function on the array space. */
		aff = isl_aff_var_on_domain(isl_local_space_copy(ls),
						isl_dim_set, i);
		index = isl_pw_aff_from_aff(aff);
		/* Turn the bound into a function on a domain with
		 * array->n_index dimensions and the tuple identifier
		 * of the array, so that it can be compared to "index".
		 */
		bound = isl_multi_pw_aff_get_pw_aff(array->bound, i);
		bound = isl_pw_aff_from_range(bound);
		bound = isl_pw_aff_add_dims(bound, isl_dim_in, array->n_index);
		bound = isl_pw_aff_set_tuple_id(bound, isl_dim_in,
						isl_id_copy(id));
		/* index_i < bound_i */
		lt = isl_pw_aff_lt_set(index, bound);
		extent = isl_set_intersect(extent, lt);
	}
	isl_local_space_free(ls);
	isl_id_free(id);

	return extent;
}

/* Return a map from the first group->shared_tile->depth dimensions
 * of the computed schedule to the array tile in
 * global memory that corresponds to the shared memory copy.
 *
 * In particular, return a map
 *
 *	{ D[i] -> A[a] }
 *
 * with constraints
 *
 *	tile_offset(i) <= a <= tile_offset(i) + tile_size - 1		(1)
 *
 * and
 *
 *	0 <= a <= array_size - 1					(2)
 *
 * Note that if some stride has been detected (i.e., when
 * group->shared_tile->bound[i].shift is set), then a in (1) refers
 * to the shifted and scaled down version.
 *
 * Constraints (1) are obtained by mapping the size constraints on the
 * shared/private memory tile back to the access relation.
 * Constraints (2) are obtained from the (recomputed) extent.
 */
static __isl_give isl_map *group_tile(struct gpu_array_ref_group *group)
{
	int i;
	int n_index = group->array->n_index;
	isl_map *tile;
	isl_space *space;
	isl_set *local;
	isl_set *extent;

	/* Set of positions within the tile: 0 <= l_i <= size_i - 1,
	 * in the range space of the tiling.
	 */
	space = isl_multi_aff_get_space(group->shared_tile->tiling);
	space = isl_space_range(space);
	local = isl_set_universe(space);
	for (i = 0; i < n_index; ++i) {
		isl_val *bound;

		local = isl_set_lower_bound_si(local, isl_dim_set, i, 0);
		bound = isl_val_copy(group->shared_tile->bound[i].size);
		bound = isl_val_sub_ui(bound, 1);
		local = isl_set_upper_bound_val(local, isl_dim_set, i, bound);
	}
	/* Pull these constraints back through the tiling and unwrap
	 * the resulting set into a map (constraints (1)).
	 */
	local = isl_set_preimage_multi_aff(local,
				isl_multi_aff_copy(group->shared_tile->tiling));
	tile = isl_set_unwrap(local);
	/* Restrict the range to the recomputed extent (constraints (2)). */
	extent = array_extent(group->array);
	tile = isl_map_intersect_range(tile, extent);

	return tile;
}

/* Turn "iterator_map", which maps the AST schedule to a domain,
 * into a map from the AST schedule to the outer
 * kernel->copy_schedule_dim dimensions of the schedule that PPCG
 * computed for this kernel.
 *
 * kernel->copy_schedule_dim is at least as large as the largest depth
 * of any array reference group of the kernel.  This matters because
 * transform_index extracts the outer tile->depth dimensions
 * from the result.
 */
static __isl_give isl_pw_multi_aff *compute_sched_to_copy(
	struct ppcg_kernel *kernel, __isl_take isl_pw_multi_aff *iterator_map)
{
	isl_space *target;
	isl_union_pw_multi_aff *copy_schedule;
	isl_pw_multi_aff *to_copy;

	/* { domain -> [copy_schedule_dim dims] }, where "domain"
	 * is the range of "iterator_map".
	 */
	target = isl_pw_multi_aff_get_space(iterator_map);
	target = isl_space_from_domain(isl_space_range(target));
	target = isl_space_add_dims(target, isl_dim_out,
				    kernel->copy_schedule_dim);

	copy_schedule = isl_union_pw_multi_aff_copy(kernel->copy_schedule);
	to_copy = isl_union_pw_multi_aff_extract_pw_multi_aff(copy_schedule,
							      target);
	isl_union_pw_multi_aff_free(copy_schedule);

	return isl_pw_multi_aff_pullback_pw_multi_aff(to_copy, iterator_map);
}

/* Enforce the max_shared_memory option on "kernel", unless it is
 * negative (which means there is no limit).
 *
 * The groups mapped to shared memory are visited in order and each one
 * is charged against the remaining budget.  A group that no longer fits
 * loses its shared tile, i.e., it stays in global memory (greedy).
 *
 * This function should be called after any function that may
 * change the decision on whether a reference group ends up
 * in private, shared or global memory.
 */
static void check_shared_memory_bound(struct ppcg_kernel *kernel)
{
	int a, g;
	isl_val *budget;

	if (kernel->options->max_shared_memory < 0)
		return;

	budget = isl_val_int_from_si(kernel->ctx,
				     kernel->options->max_shared_memory);

	for (a = 0; a < kernel->n_array; ++a) {
		struct gpu_local_array_info *local = &kernel->array[a];

		for (g = 0; g < local->n_group; ++g) {
			struct gpu_array_ref_group *group = local->groups[g];
			isl_val *needed;

			if (gpu_array_ref_group_type(group) !=
			    ppcg_access_shared)
				continue;

			/* Number of tile elements times the element size. */
			needed = gpu_array_tile_size(group->shared_tile);
			needed = isl_val_mul_ui(needed, local->array->size);

			if (isl_val_le(needed, budget)) {
				budget = isl_val_sub(budget, needed);
			} else {
				isl_val_free(needed);
				group->shared_tile =
					gpu_array_tile_free(group->shared_tile);
			}
		}
	}

	isl_val_free(budget);
}

/* Flag every array of "kernel" that has at least one array reference
 * group without a (private or shared) tile as accessing
 * the corresponding global device memory.
 * The flag is set both on the local array and on the underlying array.
 */
static void mark_global_arrays(struct ppcg_kernel *kernel)
{
	int a;

	for (a = 0; a < kernel->n_array; ++a) {
		struct gpu_local_array_info *local = &kernel->array[a];
		int g = 0;

		if (local->global)
			continue;

		/* Skip over the leading groups that do have a tile. */
		while (g < local->n_group &&
		       gpu_array_ref_group_tile(local->groups[g]))
			++g;

		if (g >= local->n_group)
			continue;

		local->global = 1;
		local->array->global = 1;
	}
}

/* Call gpu_array_ref_group_compute_tiling on each array reference group
 * of each array in "kernel".
 */
static void compute_group_tilings(struct ppcg_kernel *kernel)
{
	int a;

	for (a = 0; a < kernel->n_array; ++a) {
		struct gpu_local_array_info *local = &kernel->array[a];
		int g;

		for (g = 0; g < local->n_group; ++g)
			gpu_array_ref_group_compute_tiling(local->groups[g]);
	}
}

/* Return the effective grid size, one element per grid dimension.
 *
 * The grid size requested by the user (or chosen by default in
 * read_grid_sizes()) and enforced by the block filter may cover
 * blocks that never execute anything.  The size returned here
 * is the smallest one that still contains every block
 * that actually executes code.
 *
 * The block ids are parameters of kernel->block_filter.
 * After intersecting "domain" with that filter, only the parameter
 * constraints are kept and each block id parameter is converted
 * into a set dimension.  The size in each dimension is then
 * one more than the maximal value of the block id, simplified
 * with respect to the parameter constraints of kernel->context.
 */
static __isl_give isl_multi_pw_aff *extract_grid_size(
	struct ppcg_kernel *kernel, __isl_take isl_union_set *domain)
{
	int dim;
	isl_set *blocks;
	isl_set *params;
	isl_multi_pw_aff *extent;

	domain = isl_union_set_intersect(domain,
				    isl_union_set_copy(kernel->block_filter));
	blocks = isl_set_from_params(isl_union_set_params(domain));
	blocks = isl_set_add_dims(blocks, isl_dim_set, kernel->n_grid);

	for (dim = 0; dim < kernel->n_grid; ++dim) {
		isl_id *block_id;
		int param;

		if (!blocks)
			return NULL;

		block_id = isl_id_list_get_id(kernel->block_ids, dim);
		param = isl_set_find_dim_by_id(blocks, isl_dim_param,
					       block_id);
		isl_id_free(block_id);
		if (param < 0)
			isl_die(isl_set_get_ctx(blocks), isl_error_internal,
				"missing constraints on block identifier",
				blocks = isl_set_free(blocks));

		/* Move the parameter into set dimension "dim". */
		blocks = isl_set_equate(blocks, isl_dim_param, param,
					isl_dim_set, dim);
		blocks = isl_set_project_out(blocks, isl_dim_param, param, 1);
	}

	extent = ppcg_size_from_extent(isl_set_coalesce(blocks));
	params = isl_set_params(isl_set_copy(kernel->context));

	return isl_multi_pw_aff_gist(extent, params);
}

/* Store in "size" the dimensions of a fixed bounding box that contains
 * both the origin and "set".
 * "set" is assumed to only contain non-negative elements, so the size
 * in each direction is the maximal value attained by "set"
 * in that direction plus one.
 */
static void extract_fixed_size(__isl_take isl_set *set, int *size)
{
	int dir;
	int n_dir = isl_set_dim(set, isl_dim_set);
	isl_aff *pick;

	pick = isl_aff_zero_on_domain(
			isl_local_space_from_space(isl_set_get_space(set)));
	for (dir = 0; dir < n_dir; ++dir) {
		isl_val *largest;

		/* Select coordinate "dir", maximize it and
		 * clear the selection again for the next round.
		 */
		pick = isl_aff_set_coefficient_si(pick, isl_dim_in, dir, 1);
		largest = isl_set_max_val(set, pick);
		size[dir] = isl_val_get_num_si(largest) + 1;
		isl_val_free(largest);
		pick = isl_aff_set_coefficient_si(pick, isl_dim_in, dir, 0);
	}
	isl_set_free(set);
	isl_aff_free(pick);
}

/* Determine the effective block size, one element per block dimension,
 * and store it in kernel->block_dim.
 *
 * The block size requested by the user (or chosen by default in
 * read_block_sizes()) and enforced by the thread filter may cover
 * threads that never execute anything.  kernel->block_dim is therefore
 * updated to the smallest block size that still contains every thread
 * that actually executes code.
 *
 * The possible thread id values are derived from "domain" and
 * kernel->thread_filter.  All parameters are eliminated, such that
 * the size is a fixed constant in each dimension.
 * Parametric sizes could in principle be computed as well, but
 * that would require projecting out all b%d and t%d parameters.
 */
static isl_stat extract_block_size(struct ppcg_kernel *kernel,
	__isl_take isl_union_set *domain)
{
	int dim;
	isl_set *threads;

	domain = isl_union_set_intersect(domain,
				    isl_union_set_copy(kernel->thread_filter));
	threads = isl_set_from_params(isl_union_set_params(domain));
	threads = isl_set_add_dims(threads, isl_dim_set, kernel->n_block);

	for (dim = 0; dim < kernel->n_block; ++dim) {
		isl_id *thread_id;
		int param;

		if (!threads)
			return isl_stat_error;

		thread_id = isl_id_list_get_id(kernel->thread_ids, dim);
		param = isl_set_find_dim_by_id(threads, isl_dim_param,
					       thread_id);
		isl_id_free(thread_id);
		if (param < 0)
			isl_die(isl_set_get_ctx(threads), isl_error_internal,
				"missing constraints on thread identifier",
				threads = isl_set_free(threads));
		threads = isl_set_equate(threads, isl_dim_param, param,
					 isl_dim_set, dim);
	}

	/* Drop every parameter, the thread ids included. */
	threads = isl_set_project_out(threads, isl_dim_param, 0,
				      isl_set_dim(threads, isl_dim_param));
	if (!threads)
		return isl_stat_error;

	extract_fixed_size(threads, kernel->block_dim);

	return isl_stat_ok;
}

struct ppcg_kernel *ppcg_kernel_free(struct ppcg_kernel *kernel)
{
	int i, j;

	if (!kernel)
		return NULL;

	isl_id_list_free(kernel->block_ids);
	isl_id_list_free(kernel->thread_ids);
	isl_multi_pw_aff_free(kernel->grid_size);
	isl_ast_expr_free(kernel->grid_size_expr);
	isl_set_free(kernel->context);
	isl_union_set_free(kernel->core);
	isl_union_set_free(kernel->arrays);
	isl_union_pw_multi_aff_free(kernel->contraction);
	isl_union_set_free(kernel->expanded_domain);
	isl_space_free(kernel->space);
	isl_ast_node_free(kernel->tree);
	isl_union_set_free(kernel->block_filter);
	isl_union_set_free(kernel->thread_filter);
	isl_union_pw_multi_aff_free(kernel->copy_schedule);
	isl_union_set_free(kernel->sync_writes);

	for (i = 0; i < kernel->n_array; ++i) {
		struct gpu_local_array_info *array = &kernel->array[i];

		for (j = 0; j < array->n_group; ++j)
			gpu_array_ref_group_free(array->groups[j]);
		free(array->groups);

		isl_multi_pw_aff_free(array->bound);
		isl_ast_expr_free(array->bound_expr);
	}
	free(kernel->array);

	for (i = 0; i < kernel->n_var; ++i) {
		free(kernel->var[i].name);
		isl_vec_free(kernel->var[i].size);
	}
	free(kernel->var);

	free(kernel);

	return NULL;
}

/* Adapter with the signature expected by isl_id_set_free_user:
 * "user" is a struct ppcg_kernel that gets passed to ppcg_kernel_free.
 */
static void ppcg_kernel_free_wrap(void *user)
{
	ppcg_kernel_free((struct ppcg_kernel *) user);
}

/* Initialize "var" as the kernel variable for "group".
 *
 * The variable records the array and access type of the group,
 * takes its name from gpu_array_ref_group_print_name and
 * gets one size element per array index, copied from the sizes
 * of the tile of the group.
 * The group is expected to have a tile.
 */
static void create_kernel_var(isl_ctx *ctx, struct gpu_array_ref_group *group,
	struct ppcg_kernel_var *var)
{
	int dim;
	int n_index = group->array->n_index;
	struct gpu_array_tile *tile;
	isl_printer *printer;

	var->array = group->array;
	var->type = gpu_array_ref_group_type(group);
	tile = gpu_array_ref_group_tile(group);

	/* Print the group name into a string owned by "var". */
	printer = isl_printer_to_str(ctx);
	printer = gpu_array_ref_group_print_name(group, printer);
	var->name = isl_printer_get_str(printer);
	isl_printer_free(printer);

	var->size = isl_vec_alloc(ctx, n_index);
	for (dim = 0; dim < n_index; ++dim) {
		isl_val *extent = isl_val_copy(tile->bound[dim].size);

		var->size = isl_vec_set_element_val(var->size, dim, extent);
	}
}

/* Create the kernel variables of "kernel", one for each array reference
 * group that is not of type ppcg_access_global,
 * and store them in kernel->var, with kernel->n_var set to their number.
 *
 * Return isl_stat_error if the array of variables cannot be allocated
 * and isl_stat_ok otherwise.
 */
static isl_stat create_kernel_vars(struct ppcg_kernel *kernel)
{
	int i, j, n;

	/* First pass: count the groups that need a variable. */
	n = 0;
	for (i = 0; i < kernel->n_array; ++i) {
		struct gpu_local_array_info *array = &kernel->array[i];

		for (j = 0; j < array->n_group; ++j) {
			struct gpu_array_ref_group *group = array->groups[j];
			enum ppcg_group_access_type type;

			type = gpu_array_ref_group_type(group);
			if (type != ppcg_access_global)
				++n;
		}
	}

	kernel->var = isl_calloc_array(kernel->ctx, struct ppcg_kernel_var, n);
	/* A request for zero elements may legitimately yield NULL
	 * (the result of a zero-sized calloc is implementation-defined),
	 * so NULL only signals a failure if elements were requested.
	 */
	if (n > 0 && !kernel->var)
		return isl_stat_error;
	kernel->n_var = n;

	/* Second pass: fill in the variables in the same order. */
	n = 0;
	for (i = 0; i < kernel->n_array; ++i) {
		struct gpu_local_array_info *array = &kernel->array[i];

		for (j = 0; j < array->n_group; ++j) {
			struct gpu_array_ref_group *group = array->groups[j];
			enum ppcg_group_access_type type;

			type = gpu_array_ref_group_type(group);
			if (type == ppcg_access_global)
				continue;
			create_kernel_var(kernel->ctx, group, &kernel->var[n]);
			++n;
		}
	}

	return isl_stat_ok;
}

/* Discard "pa" and return the function that is zero everywhere
 * on the universe of the domain space of "pa".
 */
static __isl_give isl_pw_aff *set_universally_zero(__isl_take isl_pw_aff *pa)
{
	isl_local_space *domain;

	domain = isl_local_space_from_space(
			isl_space_domain(isl_pw_aff_get_space(pa)));
	isl_pw_aff_free(pa);

	return isl_pw_aff_from_aff(isl_aff_zero_on_domain(domain));
}

/* Specialize the host array sizes to "kernel".
 *
 * The sizes computed by extract_array_info may depend on the parameters.
 * They are simplified here using the parameter constraints
 * that hold at "host_domain" and the results are stored in kernel->array.
 *
 * Only arrays accessed by the kernel need such localized bounds;
 * an array counts as accessed if it has at least one reference group.
 *
 * A simplified size may turn out to be defined nowhere, when the access
 * function cannot possibly access anything inside the kernel.
 * Such a size is replaced by the zero function.  Since nothing can
 * be accessed anyway, printing the array size as zero is harmless.
 */
static void localize_bounds(struct ppcg_kernel *kernel,
	__isl_keep isl_set *host_domain)
{
	int a;
	isl_set *params;

	params = isl_set_params(isl_set_copy(host_domain));

	for (a = 0; a < kernel->n_array; ++a) {
		struct gpu_local_array_info *local = &kernel->array[a];
		isl_multi_pw_aff *sizes;
		int dim, n_dim;

		if (local->n_group == 0)
			continue;

		n_dim = local->array->n_index;
		sizes = isl_multi_pw_aff_copy(local->array->bound);

		for (dim = 0; dim < n_dim; ++dim) {
			isl_pw_aff *size;
			int nowhere;

			size = isl_multi_pw_aff_get_pw_aff(sizes, dim);
			size = isl_pw_aff_gist(size, isl_set_copy(params));
			nowhere = isl_pw_aff_is_empty(size);
			if (nowhere > 0)
				size = set_universally_zero(size);
			else if (nowhere < 0)
				size = isl_pw_aff_free(size);
			sizes = isl_multi_pw_aff_set_pw_aff(sizes, dim, size);
		}

		local->bound = sizes;
		local->n_index = n_dim;
	}
	isl_set_free(params);
}

/* Create the array of gpu_local_array_info structures "array"
 * inside "kernel".  The number of elements in this array is
 * the same as the number of arrays in "prog".
 * Initialize the "array" field of each local array to point
 * to the corresponding array in "prog".
 */
static struct ppcg_kernel *ppcg_kernel_create_local_arrays(
	struct ppcg_kernel *kernel, struct gpu_prog *prog)
{
	int i;
	isl_ctx *ctx;

	if (!kernel)
		return NULL;

	ctx = isl_set_get_ctx(prog->context);
	kernel->array = isl_calloc_array(ctx,
			    struct gpu_local_array_info, prog->n_array);
	if (!kernel->array)
		return ppcg_kernel_free(kernel);
	kernel->n_array = prog->n_array;

	for (i = 0; i < prog->n_array; ++i)
		kernel->array[i].array = &prog->array[i];

	return kernel;
}

/* Does "kernel" need to be passed an argument corresponding to array "i"?
 *
 * The argument is only needed if the kernel accesses this device memory.
 */
int ppcg_kernel_requires_array_argument(struct ppcg_kernel *kernel, int i)
{
	return kernel->array[i].global;
}

/* Look up the element of prog->stmts whose identifier is "id"
 * (compared by pointer).
 * Return NULL if there is no such gpu_stmt.
 */
static struct gpu_stmt *find_stmt(struct gpu_prog *prog, __isl_keep isl_id *id)
{
	int pos;

	for (pos = 0; pos < prog->n_stmts; ++pos) {
		struct gpu_stmt *candidate = &prog->stmts[pos];

		if (candidate->id == id)
			return candidate;
	}

	return NULL;
}

/* Free the struct ppcg_kernel_stmt "user", along with the AST expressions
 * it holds for its statement type.  A NULL pointer is ignored.
 * The void pointer argument makes this function usable
 * as an isl_id_set_free_user callback.
 */
void ppcg_kernel_stmt_free(void *user)
{
	struct ppcg_kernel_stmt *stmt = user;

	if (stmt == NULL)
		return;

	if (stmt->type == ppcg_kernel_copy) {
		isl_ast_expr_free(stmt->u.c.index);
		isl_ast_expr_free(stmt->u.c.local_index);
	} else if (stmt->type == ppcg_kernel_domain) {
		isl_id_to_ast_expr_free(stmt->u.d.ref2expr);
	}
	/* Nothing extra is released for ppcg_kernel_sync statements. */

	free(stmt);
}

/* Walk the linked list "accesses" and return the gpu_stmt_access
 * whose reference identifier is "ref_id" (compared by pointer),
 * or NULL if the list does not contain such an access.
 */
static struct gpu_stmt_access *find_access(struct gpu_stmt_access *accesses,
	__isl_keep isl_id *ref_id)
{
	struct gpu_stmt_access *cur = accesses;

	while (cur && cur->ref_id != ref_id)
		cur = cur->next;

	return cur;
}

/* Return the position in kernel->array of the array called "name",
 * or -1 if there is no array with that name.
 */
static int find_array_index(struct ppcg_kernel *kernel, const char *name)
{
	int pos;

	for (pos = 0; pos < kernel->n_array; ++pos) {
		const char *candidate = kernel->array[pos].array->name;

		if (strcmp(name, candidate) == 0)
			return pos;
	}

	return -1;
}

/* Internal data structure for the index and AST expression transformation
 * callbacks for pet_stmt_build_ast_exprs.
 *
 * "kernel" is the kernel for which we are computing AST expressions and
 * may be NULL if we are not inside a kernel.
 * "accesses" is the list of gpu_stmt_access in the statement.
 * "iterator_map" expresses the statement iterators in terms of
 * the AST loop iterators.
 * "sched2copy" expresses the outer copy_schedule_dim dimensions of
 * the kernel schedule in terms of the AST loop iterators and
 * may be NULL if we are not inside a kernel.
 *
 * The following fields are set in transform_index and used in transform_expr.
 * "array" is the array that is being accessed.
 * "global" is set if the global array is accessed (rather than
 * shared/private memory).
 * "local_array" refers to information on the array specialized
 * to the current kernel.
 */
struct ppcg_transform_data {
	/* Inputs, filled in by the caller of pet_stmt_build_ast_exprs. */
	struct ppcg_kernel *kernel;
	struct gpu_stmt_access *accesses;
	isl_pw_multi_aff *iterator_map;
	isl_pw_multi_aff *sched2copy;

	/* State passed from transform_index to transform_expr. */
	struct gpu_array_info *array;
	int global;
	struct gpu_local_array_info *local_array;
};

/* Search the array reference groups of "local" for the one that
 * has "access" among its references (compared by pointer).
 * Return that group, or NULL if no group contains "access".
 */
static struct gpu_array_ref_group *find_ref_group(
	struct gpu_local_array_info *local, struct gpu_stmt_access *access)
{
	int g;

	for (g = 0; g < local->n_group; ++g) {
		struct gpu_array_ref_group *group = local->groups[g];
		int r = 0;

		while (r < group->n_ref) {
			if (group->refs[r] == access)
				return group;
			++r;
		}
	}

	return NULL;
}

/* Apply "tiling", of the form
 *
 *	[L -> A] -> T
 *
 * to the outer array of the index expression "index", of the form
 *
 *	L -> F(A),
 *
 * where L are the AST loop iterators and F(A) is either A itself or
 * some subfield of A.  The result is of the form
 *
 *	L -> T(A)
 *
 * If F(A) is a subfield of A, i.e., if the range of "index" is wrapping,
 * then the member access is split into the base index expression and
 * the field index expression.  The tiling is applied (recursively)
 * to the base and the result is recombined with the field.
 *
 * If F(A) is A, then "index" is first extended to keep track
 * of the iterators
 *
 *	L -> [L -> A]
 *
 * and then plugged into the tiling, resulting in a tiled index
 * expression in terms of the AST loop iterators
 *
 *	L -> T
 */
static __isl_give isl_multi_pw_aff *tile_outer(
	__isl_take isl_multi_pw_aff *index, __isl_take isl_multi_pw_aff *tiling)
{
	isl_bool nested;
	isl_space *iter_space;
	isl_multi_pw_aff *id;

	nested = isl_multi_pw_aff_range_is_wrapping(index);
	if (nested < 0) {
		isl_multi_pw_aff_free(index);
		isl_multi_pw_aff_free(tiling);
		return NULL;
	}

	if (nested) {
		isl_multi_pw_aff *member, *base;

		member = isl_multi_pw_aff_range_factor_range(
					isl_multi_pw_aff_copy(index));
		base = isl_multi_pw_aff_range_factor_domain(index);
		base = tile_outer(base, tiling);
		return isl_multi_pw_aff_range_product(base, member);
	}

	/* L -> [L -> A] */
	iter_space = isl_space_domain(isl_multi_pw_aff_get_space(index));
	id = isl_multi_pw_aff_identity(isl_space_map_from_set(iter_space));
	index = isl_multi_pw_aff_range_product(id, index);

	/* L -> T */
	return isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index);
}

/* Index transformation callback for pet_stmt_build_ast_exprs.
 *
 * "index" expresses the array indices in terms of statement iterators.
 * "ref_id" identifies the reference; it is used to look up
 * the corresponding gpu_stmt_access.
 * "user" points to a struct ppcg_transform_data.
 *
 * We first reformulate "index" in terms of the AST loop iterators.
 * Then we check if we are accessing the global array or
 * a shared/private copy.  In particular, if we are not inside a kernel
 * then we must be accessing a global array.
 * In the former case, we simply return
 * the updated index.  If "index" is an affine expression rather
 * than an array access, then we also return the updated index here.
 *
 * If no reference groups have been computed for the array,
 * then we can only be accessing the global array.
 *
 * Otherwise, we apply the tiling to the index.
 * This tiling is of the form
 *
 *	[D -> A] -> T
 *
 * where D corresponds to the outer tile->depth dimensions of
 * the kernel schedule.
 * The index is of the form
 *
 *	L -> A
 *
 * We update the tiling to refer to the AST loop iterators
 *
 *	[L -> A] -> T
 *
 * and combine it with the index to obtain a tiled index expression in terms
 * of the AST loop iterators
 *
 *	L -> T
 *
 * Note that while the tiling applies directly to an outer array,
 * the index may refer to some subfield of this outer array.
 * In such cases, the result will refer to the same subfield of the tile.
 * That is, an index expression of the form  L -> F(A) will be transformed
 * into an index expression of the form L -> F(T).
 *
 * As a side effect, the "array", "global" and "local_array" fields
 * of the transform data are updated for use by transform_expr.
 */
static __isl_give isl_multi_pw_aff *transform_index(
	__isl_take isl_multi_pw_aff *index, __isl_keep isl_id *ref_id,
	void *user)
{
	struct ppcg_transform_data *data = user;
	struct gpu_stmt_access *access;
	struct gpu_array_ref_group *group;
	struct gpu_array_tile *tile;
	isl_pw_multi_aff *iterator_map;
	int i;
	int dim;
	const char *name;
	isl_space *space;
	isl_multi_pw_aff *tiling;
	isl_pw_multi_aff *pma;
	isl_pw_multi_aff *sched2depth;

	/* transform_expr leaves the expression untouched
	 * for as long as data->array is NULL.
	 */
	data->array = NULL;

	/* Express the index in terms of the AST loop iterators. */
	iterator_map = isl_pw_multi_aff_copy(data->iterator_map);
	index = isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map);

	if (!data->kernel)
		return index;

	access = find_access(data->accesses, ref_id);
	if (!access)
		return index;
	/* No tuple name on the range: not handled as an array access. */
	if (!isl_map_has_tuple_name(access->access, isl_dim_out))
		return index;

	name = get_outer_array_name(access->access);
	if (!name)
		return isl_multi_pw_aff_free(index);
	i = find_array_index(data->kernel, name);
	if (i < 0)
		isl_die(isl_multi_pw_aff_get_ctx(index), isl_error_internal,
			"cannot find array",
			return isl_multi_pw_aff_free(index));
	data->local_array = &data->kernel->array[i];
	data->array = data->local_array->array;

	group = find_ref_group(data->local_array, access);
	if (!group) {
		data->global = 1;
		return index;
	}

	/* A group without a tile is accessed in global memory. */
	tile = gpu_array_ref_group_tile(group);
	data->global = !tile;
	if (!tile)
		return index;

	/* Identity on A, the array part of the domain [D -> A] of the tiling. */
	space = isl_space_domain(isl_multi_aff_get_space(tile->tiling));
	space = isl_space_range(isl_space_unwrap(space));
	space = isl_space_map_from_set(space);
	pma = isl_pw_multi_aff_identity(space);
	/* Keep only the outer tile->depth dimensions of sched2copy. */
	sched2depth = isl_pw_multi_aff_copy(data->sched2copy);
	dim = isl_pw_multi_aff_dim(sched2depth, isl_dim_out);
	sched2depth = isl_pw_multi_aff_drop_dims(sched2depth, isl_dim_out,
					    tile->depth, dim - tile->depth);
	/* [L -> A] -> [D -> A], used to rewrite the tiling as [L -> A] -> T. */
	pma = isl_pw_multi_aff_product(sched2depth, pma);
	tiling = isl_multi_pw_aff_from_multi_aff(
				    isl_multi_aff_copy(tile->tiling));
	tiling = isl_multi_pw_aff_pullback_pw_multi_aff(tiling, pma);

	index = tile_outer(index, tiling);

	return index;
}

/* Turn "expr", which is assumed not to have any indices yet,
 * into an access to its element [0].
 *
 * If the base of "expr" is a member access, then it is the structure
 * argument of that member access that gets dereferenced (recursively).
 */
static __isl_give isl_ast_expr *dereference(__isl_take isl_ast_expr *expr)
{
	isl_ast_expr *base, *zero;
	isl_ast_expr_list *indices;
	int is_member;

	base = isl_ast_expr_get_op_arg(expr, 0);
	if (!base)
		return isl_ast_expr_free(expr);

	is_member = isl_ast_expr_get_type(base) == isl_ast_expr_op &&
		    isl_ast_expr_get_op_type(base) == isl_ast_op_member;
	if (is_member) {
		isl_ast_expr *inner;

		inner = dereference(isl_ast_expr_get_op_arg(base, 0));
		base = isl_ast_expr_set_op_arg(base, 0, inner);
		return isl_ast_expr_set_op_arg(expr, 0, base);
	}

	/* Build base[0] from the base of "expr". */
	zero = isl_ast_expr_from_val(isl_val_zero(isl_ast_expr_get_ctx(expr)));
	indices = isl_ast_expr_list_from_ast_expr(zero);
	base = isl_ast_expr_access(base, indices);
	isl_ast_expr_free(expr);

	return base;
}

/* Linearize the index expression "expr" based on the array bounds
 * of "array".
 *
 * That is, transform expression
 *
 *	A[i_0][i_1]...[i_n]
 *
 * to
 *
 *	A[(..((i_0 * b_1 + i_1) ... ) * b_n + i_n]
 *
 * where b_0, b_1, ..., b_n are the bounds on the array.
 *
 * If the base of "expr" is a member access, then the linearization needs
 * to be applied to the structure argument of this member access.
 *
 * In the base case, if "expr" has no arguments (other than the name of
 * the array), then we are passing an entire array to a function.
 * In this case, there is nothing to linearize.
 * Note that at this point an expression with no arguments can
 * only be an entire array because the scalar case and
 * the case of single struct are handled by the caller.
 *
 * If the number of specified index expressions in "expr"
 * is smaller than the dimension of the accessed array,
 * then the missing i_j also do not appear in the linearized expression.
 * Furthermore, since such an expression does not refer to a single
 * element while the default linearized expression would refer to
 * a single element, we return the expression
 *
 *	A + (..((i_0 * b_1 + i_1) ... ) * b_l + i_l)
 *
 * instead.  Note that because of the special case handling above,
 * we can assume here that there is at least one index expression.
 *
 * "expr" is consumed; the linearized expression is returned.
 */
__isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
	struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr)
{
	int i, n;
	isl_ast_expr *arg0;
	isl_ast_expr *res;
	isl_ast_expr_list *list;

	/* Operand 0 is the base of the access. */
	arg0 = isl_ast_expr_get_op_arg(expr, 0);
	if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op &&
	    isl_ast_expr_get_op_type(arg0) == isl_ast_op_member) {
		isl_ast_expr *arg;

		/* Linearize the structure argument and plug it back in. */
		arg = isl_ast_expr_get_op_arg(arg0, 0);
		arg = gpu_local_array_info_linearize_index(array, arg);
		arg0 = isl_ast_expr_set_op_arg(arg0, 0, arg);
		expr = isl_ast_expr_set_op_arg(expr, 0, arg0);

		return expr;
	}
	isl_ast_expr_free(arg0);

	/* Only the base is present: an entire array, nothing to do. */
	if (isl_ast_expr_get_op_n_arg(expr) == 1)
		return expr;

	/* "n" counts the base plus the index expressions that are present. */
	n = isl_ast_expr_get_op_n_arg(expr);
	/* Start from i_0. */
	res = isl_ast_expr_get_op_arg(expr, 1);
	for (i = 1; i < array->n_index; ++i) {
		isl_ast_expr *expr_i;

		/* Multiply by b_i, taken from operand 1 + i of bound_expr
		 * (presumably operand 0 of bound_expr is the array itself;
		 * confirm where bound_expr is constructed).
		 */
		expr_i = isl_ast_expr_get_op_arg(array->bound_expr, 1 + i);
		res = isl_ast_expr_mul(res, expr_i);

		/* Index i_i was not specified: it contributes nothing. */
		if (i + 1 >= n)
			continue;
		expr_i = isl_ast_expr_get_op_arg(expr, i + 1);
		res = isl_ast_expr_add(res, expr_i);
	}

	if (1 + array->n_index > n) {
		/* Fewer indices than dimensions: return A + offset. */
		res = isl_ast_expr_add(isl_ast_expr_get_op_arg(expr, 0), res);
	} else {
		/* All indices present: return A[offset]. */
		list = isl_ast_expr_list_from_ast_expr(res);
		res = isl_ast_expr_get_op_arg(expr, 0);
		res = isl_ast_expr_access(res, list);
	}

	isl_ast_expr_free(expr);

	return res;
}

/* AST expression transformation callback for pet_stmt_build_ast_exprs.
 *
 * The array being accessed was recorded by transform_index in
 * the struct ppcg_transform_data that "user" points to.
 * If no array was recorded, "expr" is returned unchanged.
 *
 * If the array is not accessed at all, then the value of the expression
 * is not used and zero (a NULL pointer) is printed instead.
 *
 * A global scalar that is not a read-only scalar has had its address
 * passed to the kernel, so it needs to be dereferenced.
 *
 * An access to a global array is linearized, if the array calls for it,
 * using the bounds in data->local_array.
 */
static __isl_give isl_ast_expr *transform_expr(__isl_take isl_ast_expr *expr,
	__isl_keep isl_id *id, void *user)
{
	struct ppcg_transform_data *data = user;
	struct gpu_array_info *array = data->array;

	if (!array)
		return expr;

	if (!array->accessed) {
		isl_ctx *ctx = isl_ast_expr_get_ctx(expr);

		isl_ast_expr_free(expr);
		return isl_ast_expr_from_val(isl_val_zero(ctx));
	}

	/* Read-only scalars and shared/private copies are left alone. */
	if (gpu_array_is_read_only_scalar(array) || !data->global)
		return expr;

	if (array->n_index == 0)
		return dereference(expr);

	if (array->linearize)
		return gpu_local_array_info_linearize_index(data->local_array,
							    expr);

	return expr;
}

/* Annotate the AST node "node" of an instance of the user statement
 * "gpu_stmt" inside "kernel".
 * "kernel" may be NULL if we are not inside a kernel.
 *
 * The annotation is an identifier called "user" that carries
 * a struct ppcg_kernel_stmt of type ppcg_kernel_domain holding
 * an AST expression for each access of the statement.
 * These expressions are derived from two mappings:
 * iterator_map, expressing the domain elements in terms of
 * the generated loops, and sched2copy, expressing the outer
 * copy_schedule_dim dimensions of the kernel schedule computed by PPCG
 * in terms of the generated loops (NULL outside of a kernel).
 */
static __isl_give isl_ast_node *create_domain_leaf(
	struct ppcg_kernel *kernel, __isl_take isl_ast_node *node,
	__isl_keep isl_ast_build *build, struct gpu_stmt *gpu_stmt)
{
	struct ppcg_transform_data data;
	struct ppcg_kernel_stmt *stmt;
	isl_ctx *ctx;
	isl_id *annotation;
	isl_map *sched;
	isl_pw_multi_aff *iterator_map;
	isl_pw_multi_aff *sched2copy = NULL;

	if (!node)
		return NULL;
	ctx = isl_ast_node_get_ctx(node);

	stmt = isl_calloc_type(ctx, struct ppcg_kernel_stmt);
	if (!stmt)
		return isl_ast_node_free(node);

	/* Invert the schedule of the build: loops -> domain elements. */
	sched = isl_map_from_union_map(isl_ast_build_get_schedule(build));
	iterator_map = isl_pw_multi_aff_from_map(isl_map_reverse(sched));
	if (kernel)
		sched2copy = compute_sched_to_copy(kernel,
					isl_pw_multi_aff_copy(iterator_map));

	stmt->type = ppcg_kernel_domain;
	stmt->u.d.stmt = gpu_stmt;

	data.kernel = kernel;
	data.accesses = gpu_stmt->accesses;
	data.iterator_map = iterator_map;
	data.sched2copy = sched2copy;
	stmt->u.d.ref2expr = pet_stmt_build_ast_exprs(gpu_stmt->stmt, build,
					    &transform_index, &data,
					    &transform_expr, &data);

	isl_pw_multi_aff_free(iterator_map);
	isl_pw_multi_aff_free(sched2copy);

	/* Once attached, the identifier is responsible for freeing "stmt". */
	annotation = isl_id_alloc(ctx, "user", stmt);
	annotation = isl_id_set_free_user(annotation, &ppcg_kernel_stmt_free);
	if (!annotation)
		ppcg_kernel_stmt_free(stmt);
	return isl_ast_node_set_annotation(node, annotation);
}

/* This function is called for each statement node in the AST
 * for copying to or from shared/private memory.
 * Attach a pointer to a ppcg_kernel_stmt representing the copy
 * statement to the node.
 * The statement name is "read" or "write", depending on whether we are
 * reading from global memory or writing to global memory.
 *
 * The schedule is of the form
 *
 *	type[D -> A] -> L
 *
 * where D corresponds to the outer tile->depth dimensions of
 * the kernel schedule, A to the global array and L to the outer
 * generated AST schedule.
 * We compute the inverse and strip off the type, resulting in
 *
 *	L -> [D -> A]
 *
 * We combine this mapping with on the one hand the projection
 *
 *	[D -> A] -> A
 *
 * and on the other hand the group tiling
 *
 *	[D -> A] -> T
 *
 * resulting in
 *
 *	L -> A		and 	L -> T
 *
 * and store the corresponding expressions in stmt->index and stmt->local_index,
 * where stmt points to the ppcg_kernel_stmt that is attached to the node.
 * stmt->index is linearized if the global memory array is linearized.
 */
static __isl_give isl_ast_node *create_access_leaf(struct ppcg_kernel *kernel,
	struct gpu_array_ref_group *group, __isl_take isl_ast_node *node,
	__isl_keep isl_ast_build *build)
{
	struct ppcg_kernel_stmt *copy_stmt;
	struct gpu_array_tile *tile;
	isl_id *annotation;
	isl_ast_expr *global_index;
	isl_space *wrapped;
	isl_map *sched;
	isl_pw_multi_aff *sched2access, *to_array, *to_tile;
	const char *name;

	copy_stmt = isl_calloc_type(kernel->ctx, struct ppcg_kernel_stmt);
	if (!copy_stmt)
		return isl_ast_node_free(node);
	copy_stmt->type = ppcg_kernel_copy;
	copy_stmt->u.c.array = group->array;
	copy_stmt->u.c.local_array = group->local_array;

	/* type[D -> A] -> L; the tuple name tells reads from writes. */
	sched = isl_map_from_union_map(isl_ast_build_get_schedule(build));
	name = isl_map_get_tuple_name(sched, isl_dim_in);
	copy_stmt->u.c.read = name && strcmp(name, "read") == 0;

	/* L -> [D -> A], with the type stripped off. */
	sched2access = isl_pw_multi_aff_from_map(isl_map_reverse(sched));
	sched2access = isl_pw_multi_aff_reset_tuple_id(sched2access,
							isl_dim_out);

	/* L -> A: index into the global array. */
	wrapped = isl_pw_multi_aff_get_space(sched2access);
	wrapped = isl_space_unwrap(isl_space_range(wrapped));
	to_array = isl_pw_multi_aff_range_map(wrapped);
	to_array = isl_pw_multi_aff_pullback_pw_multi_aff(to_array,
				    isl_pw_multi_aff_copy(sched2access));
	global_index = isl_ast_build_access_from_pw_multi_aff(build, to_array);
	if (group->array->linearize)
		global_index = gpu_local_array_info_linearize_index(
					group->local_array, global_index);
	copy_stmt->u.c.index = global_index;

	/* L -> T: index into the tile of the group. */
	tile = gpu_array_ref_group_tile(group);
	to_tile = isl_pw_multi_aff_from_multi_aff(
					isl_multi_aff_copy(tile->tiling));
	to_tile = isl_pw_multi_aff_pullback_pw_multi_aff(to_tile,
							sched2access);
	copy_stmt->u.c.local_index =
		isl_ast_build_access_from_pw_multi_aff(build, to_tile);

	/* The annotation takes over ownership of the statement. */
	annotation = isl_id_alloc(kernel->ctx, "copy", copy_stmt);
	annotation = isl_id_set_free_user(annotation, &ppcg_kernel_stmt_free);
	if (!annotation)
		ppcg_kernel_stmt_free(copy_stmt);
	return isl_ast_node_set_annotation(node, annotation);
}

/* Allocate a ppcg_kernel_stmt of type ppcg_kernel_sync and attach it
 * to "node" as an annotation called "sync".
 * The annotation is set up to release the statement through
 * ppcg_kernel_stmt_free.
 * If the statement cannot be allocated, then "node" is freed.
 */
static __isl_give isl_ast_node *create_sync_leaf(
	struct ppcg_kernel *kernel, __isl_take isl_ast_node *node,
	__isl_keep isl_ast_build *build)
{
	struct ppcg_kernel_stmt *sync;
	isl_id *annotation;

	sync = isl_calloc_type(kernel->ctx, struct ppcg_kernel_stmt);
	if (!sync)
		return isl_ast_node_free(node);
	sync->type = ppcg_kernel_sync;

	annotation = isl_id_alloc(kernel->ctx, "sync", sync);
	annotation = isl_id_set_free_user(annotation, &ppcg_kernel_stmt_free);
	if (!annotation)
		ppcg_kernel_stmt_free(sync);

	return isl_ast_node_set_annotation(node, annotation);
}

/* Use "build" to construct AST expressions for
 * - the device array sizes (array->bound_expr) of all arrays in "prog"
 *   that require allocation on the device, and
 * - the original array sizes (array->declared_size) of all arrays
 *   that need to be declared on the host.
 * Return "node" on success.  "node" is freed in case of error.
 */
static __isl_give isl_ast_node *build_array_bounds(
	__isl_take isl_ast_node *node, struct gpu_prog *prog,
	__isl_keep isl_ast_build *build)
{
	int pos;
	struct gpu_array_info *info;

	/* First pass: arrays that are allocated on the device. */
	for (pos = 0; pos < prog->n_array; ++pos) {
		info = &prog->array[pos];
		if (!gpu_array_requires_device_allocation(info))
			continue;

		info->bound_expr = ppcg_build_size_expr(
				isl_multi_pw_aff_copy(info->bound), build);
		if (!info->bound_expr)
			return isl_ast_node_free(node);
	}

	/* Second pass: arrays that are declared on the host. */
	for (pos = 0; pos < prog->n_array; ++pos) {
		isl_multi_pw_aff *extent_size;

		info = &prog->array[pos];
		if (!info->declare_local)
			continue;

		extent_size = ppcg_size_from_extent(
				isl_set_copy(info->declared_extent));
		info->declared_size = ppcg_build_size_expr(extent_size, build);
		if (!info->declared_size)
			return isl_ast_node_free(node);
	}

	return node;
}

/* Internal data structure for at_domain.
 * The same object is also passed as "user" to before_mark and after_mark.
 *
 * "prog" represents the entire scop.
 * "kernel" points to the kernel to which the current schedule node
 * belongs.  It is set by before_mark and reset by after_mark.
 * It may be NULL if we are outside any kernel.
 */
struct ppcg_at_domain_data {
	/* The program being generated; initialized by generate_code. */
	struct gpu_prog *prog;
	/* The kernel currently being traversed, or NULL. */
	struct ppcg_kernel *kernel;
};

/* This function is called for each instance of a user statement
 * in the kernel.  This may be one of the original user statements
 * or a statement introduced by PPCG.
 *
 * We first check if the statement id corresponds to a gpu statement,
 * which indicates the statement is an original user statement. Any statement
 * that is not an original user statement has been introduced by PPCG and
 * requires special handling.
 *
 * If the user statement is one of the original user statements, then we call
 * create_domain_leaf.  If it is "init_device", then we call
 * build_array_bounds.  Otherwise, we check if it is a copy or synchronization
 * statement and call the appropriate functions.  Statements that copy an array
 * to/from the device do not need any further treatment.
 * Neither does "clear_device".
 *
 * A statement introduced by PPCG can only be recognized by its name.
 * If no name is available (e.g., because the identifier could not be
 * extracted from "node"), then "node" is freed and NULL is returned
 * instead of passing a NULL pointer to the string comparisons.
 * "node" is also freed if the statement type is unknown or
 * if the synchronization check failed.
 */
static __isl_give isl_ast_node *at_domain(__isl_take isl_ast_node *node,
	__isl_keep isl_ast_build *build, void *user)
{
	struct ppcg_at_domain_data *data = user;
	struct gpu_stmt *gpu_stmt;
	isl_ast_expr *expr, *arg;
	isl_id *id;
	int is_sync;
	const char *name;
	void *p;

	/* The first argument of the user expression identifies
	 * the statement.
	 */
	expr = isl_ast_node_user_get_expr(node);
	arg = isl_ast_expr_get_op_arg(expr, 0);
	id = isl_ast_expr_get_id(arg);
	name = isl_id_get_name(id);
	p = isl_id_get_user(id);
	isl_ast_expr_free(expr);
	isl_ast_expr_free(arg);

	gpu_stmt = find_stmt(data->prog, id);
	is_sync = gpu_tree_id_is_sync(id, data->kernel);
	/* NOTE(review): "name" is still used after this point; this
	 * presumably relies on "node" keeping the identifier alive - confirm.
	 */
	isl_id_free(id);

	if (gpu_stmt)
		return create_domain_leaf(data->kernel, node, build, gpu_stmt);

	/* All remaining cases are distinguished by name. */
	if (!name)
		return isl_ast_node_free(node);

	if (!prefixcmp(name, "to_device_") || !prefixcmp(name, "from_device_"))
		return node;
	if (!strcmp(name, "init_device"))
		return build_array_bounds(node, data->prog, build);
	if (!strcmp(name, "clear_device"))
		return node;
	if (is_sync < 0)
		return isl_ast_node_free(node);
	if (!strcmp(name, "read") || !strcmp(name, "write")) {
		/* The user pointer of a copy statement is its group. */
		struct gpu_array_ref_group *group = p;
		return create_access_leaf(data->kernel, group, node, build);
	}
	if (!is_sync)
		isl_die(data->prog->ctx, isl_error_internal,
			"unknown statement type",
			return isl_ast_node_free(node));
	return create_sync_leaf(data->kernel, node, build);
}

/* Given a set of wrapped references "ref", return the corresponding
 * access relations based on the tagged access relations "tagged".
 *
 * The elements of "ref" are of the form
 *
 *	[D -> R]
 *
 * with D an iteration domain and R a reference.
 * The elements of "tagged" are of the form
 *
 *	[D -> R] -> A
 *
 * with A an array.
 *
 * Extend "tagged" to include the iteration domain in the range, i.e.,
 *
 *	[D -> R] -> [D -> A]
 *
 * apply the result to "ref" and then unwrap the resulting set
 * to obtain relations of the form
 *
 *	D -> A
 */
static __isl_give isl_union_map *wrapped_reference_to_access(
	__isl_take isl_union_set *ref, __isl_take isl_union_map *tagged)
{
	isl_union_set *tag_domain, *accessed;
	isl_union_map *dom2ref, *tag2dom, *tag2access;

	/* Universe of the wrapped references [D -> R] in "tagged". */
	tag_domain = isl_union_map_domain(
		isl_union_map_universe(isl_union_map_copy(tagged)));
	/* D -> R */
	dom2ref = isl_union_set_unwrap(tag_domain);
	/* [D -> R] -> D */
	tag2dom = isl_union_map_domain_map(dom2ref);
	/* [D -> R] -> [D -> A] */
	tag2access = isl_union_map_range_product(tag2dom, tagged);

	/* Map "ref" to wrapped accesses [D -> A] and unwrap to D -> A. */
	accessed = isl_union_set_coalesce(ref);
	accessed = isl_union_set_apply(accessed, tag2access);

	return isl_union_set_unwrap(accessed);
}

/* Given an access relation "access" from one or more array reference groups,
 * remove those reads (if "read" is 1) or writes (if "read" is 0)
 * that are only needed to communicate data within
 * the same iteration of "sched".
 * The domain of "sched" corresponds to the original statement instances,
 * i.e., those that appear in the domains of the access relations.
 * "tagged" contains all tagged access relations to all
 * the array reference groups accessed by "access" from statement
 * instances scheduled by "sched".
 *
 * If the access is a read then it is either an element of
 *
 *	live_in union (range flow)
 *
 * where live_in and flow may be overapproximations, or
 * it reads an uninitialized value (that is not live-in because
 * there is an intermediate kill) or it reads a value that was
 * written within the same (compound) statement instance.
 * If the access is a write then it is either an element of
 *
 *	live_out union (domain flow)
 *
 * or it writes a value that is never read (and is not live-out
 * because of an intermediate kill) or only
 * within the same (compound) statement instance.
 * In both cases, the access relation is also a subset of
 * the group access relation.
 *
 * The cases where an uninitialized value is read or a value is written
 * that is never read or where the dataflow occurs within a statement
 * instance are also considered local and may also be removed.
 *
 * Essentially, we compute the intersection of "access" with either
 *
 *	live_in union (range non-local-flow)
 *
 * or
 *
 *	live_out union (domain non-local-flow)
 *
 * We first construct a relation "local"
 *
 *	[[D -> R] -> [D' -> R']]
 *
 * of pairs of domain iterations accessing the reference group
 * and references in the group that are coscheduled by "sched".
 *
 * If this relation does not intersect the dataflow dependences,
 * then there is nothing we can possibly remove, unless the dataflow
 * dependences themselves only relate a subset of the accesses.
 * In particular, the accesses may not be involved in any dataflow
 * dependences, either because they are uninitialized reads/dead writes
 * or because the dataflow occurs inside a statement instance.
 *
 * Since the computation below may break up the access relation
 * into smaller pieces, we only perform the intersection with
 * the non-local dependent accesses if the local pairs
 * intersect the dataflow dependences.  Otherwise, we intersect
 * with the universe of the non-local dependent accesses.
 * This should at least remove accesses from statements that
 * do not participate in any dependences.
 *
 * In particular, we remove the "local" dataflow dependences from
 * the set of all dataflow dependences, or at least those
 * that may contribute to a domain/range that intersects
 * the domain of "access".
 * Note that if the potential dataflow dependences are an overapproximation
 * of the actual dataflow dependences, then the result remains an
 * overapproximation of the non-local dataflow dependences.
 * Copying to/from global memory is only needed for the references
 * in the domain/range of the result or for accesses that are live out/in
 * for the entire scop.
 *
 * We therefore map the domain/range of the "external" relation
 * to the corresponding access relation and take the union with
 * the live out/in relation.
 */
static __isl_give isl_union_map *remove_local_accesses(
	struct gpu_prog *prog, __isl_take isl_union_map *tagged,
	__isl_take isl_union_map *access, __isl_take isl_union_map *sched,
	int read)
{
	int empty;
	isl_union_pw_multi_aff *tagger;
	isl_union_set *domain, *access_domain;
	isl_union_map *local, *external, *universe;
	isl_union_set *tag_set;

	/* Nothing can be removed from an empty access relation;
	 * release the inputs that are not returned.
	 * Any non-zero result of the emptiness test takes this branch.
	 */
	if (isl_union_map_is_empty(access)) {
		isl_union_map_free(sched);
		isl_union_map_free(tagged);
		return access;
	}

	/* Restrict the tagger to the tagged references in "tagged" and
	 * use it to express "sched" in terms of tagged instances
	 * (presumably [D -> R] instead of D).
	 */
	tagger = isl_union_pw_multi_aff_copy(prog->scop->tagger);
	domain = isl_union_map_domain(isl_union_map_copy(tagged));
	tagger = isl_union_pw_multi_aff_intersect_domain(tagger,
					isl_union_set_copy(domain));
	sched = isl_union_map_preimage_domain_union_pw_multi_aff(sched, tagger);

	/* "local": pairs of tagged instances that are coscheduled by "sched"
	 * and that are also related by a dataflow dependence.
	 */
	local = isl_union_map_apply_range(sched,
			    isl_union_map_reverse(isl_union_map_copy(sched)));
	local = isl_union_map_intersect(local,
			isl_union_map_copy(prog->scop->tagged_dep_flow));

	/* Remember whether there is any local dataflow at all;
	 * "local" itself is consumed by the subtraction below.
	 */
	empty = isl_union_map_is_empty(local);

	/* Restrict the dataflow dependences to those whose sink (for reads)
	 * or source (for writes) is a tagged reference of a statement
	 * that appears in the domain of "access".
	 */
	external = isl_union_map_copy(prog->scop->tagged_dep_flow);
	universe = isl_union_map_universe(isl_union_map_copy(access));
	access_domain = isl_union_map_domain(universe);
	domain = isl_union_set_universe(domain);
	universe = isl_union_set_unwrap(domain);
	universe = isl_union_map_intersect_domain(universe, access_domain);
	domain = isl_union_map_wrap(universe);
	if (read)
		external = isl_union_map_intersect_range(external, domain);
	else
		external = isl_union_map_intersect_domain(external, domain);
	external = isl_union_map_intersect_params(external,
				isl_set_copy(prog->scop->context));
	/* What remains after removing the local dependences
	 * is (an overapproximation of) the non-local dataflow.
	 */
	external = isl_union_map_subtract(external, local);

	/* Map the tagged references involved in non-local dataflow back
	 * to access relations and add the accesses that are live in/out.
	 */
	if (read) {
		tag_set = isl_union_map_range(external);
		external = wrapped_reference_to_access(tag_set, tagged);
		external = isl_union_map_union(external,
				isl_union_map_copy(prog->scop->live_in));
	} else {
		tag_set = isl_union_map_domain(external);
		external = wrapped_reference_to_access(tag_set, tagged);
		external = isl_union_map_union(external,
				isl_union_map_copy(prog->scop->live_out));
	}

	/* On error, drop the result.  If there is no local dataflow,
	 * only intersect with the universe of the non-local accesses
	 * to avoid needlessly breaking up "access".
	 */
	if (empty < 0)
		external = isl_union_map_free(external);
	else if (empty)
		external = isl_union_map_universe(external);

	access = isl_union_map_intersect(access, external);

	return access;
}

/* Given an access relation "access" from "group", remove those reads
 * (if "read" is 1) or writes (if "read" is 0) that are only needed to
 * communicate data within the same iteration of the schedule "prefix"
 * at the position where the copying of the group is inserted.
 * That is, the output dimension of "prefix"
 * is equal to tile->depth.
 * The domain of "prefix" corresponds to the original statement instances,
 * i.e., those that appear in the domains of the access relations.
 *
 * If "access" is not empty, hand the tagged access relation of "group"
 * and a copy of "prefix" to remove_local_accesses.
 * Otherwise, "access" is returned as is.
 */
static __isl_give isl_union_map *remove_local_accesses_group(
	struct ppcg_kernel *kernel, struct gpu_array_ref_group *group,
	__isl_take isl_union_map *access, __isl_keep isl_union_map *prefix,
	int read)
{
	isl_union_map *group_tagged, *prefix_copy;

	if (!isl_union_map_is_empty(access)) {
		group_tagged = group_tagged_access_relation(group);
		prefix_copy = isl_union_map_copy(prefix);

		return remove_local_accesses(kernel->prog, group_tagged,
					access, prefix_copy, read);
	}

	return access;
}

/* Use "build" to construct an access AST expression for the effective
 * grid size of "kernel" and store it in kernel->grid_size_expr.
 * The expression is built from a copy of kernel->grid_size
 * with its output tuple named "grid".
 * Return isl_stat_error if no expression could be constructed.
 */
static isl_stat build_grid_size(struct ppcg_kernel *kernel,
	__isl_keep isl_ast_build *build)
{
	isl_multi_pw_aff *grid;

	grid = isl_multi_pw_aff_set_tuple_name(
			isl_multi_pw_aff_copy(kernel->grid_size),
			isl_dim_out, "grid");
	kernel->grid_size_expr = ppcg_build_size_expr(grid, build);

	return kernel->grid_size_expr ? isl_stat_ok : isl_stat_error;
}

/* Use "build" to construct access AST expressions for the localized
 * array sizes of "kernel" and store them in local->bound_expr.
 * Arrays without any reference group are skipped, i.e., this is
 * only done for arrays for which localized bounds have been computed.
 * Return isl_stat_error as soon as one expression cannot be constructed.
 */
static isl_stat build_local_array_sizes(struct ppcg_kernel *kernel,
	__isl_keep isl_ast_build *build)
{
	int pos;

	for (pos = 0; pos < kernel->n_array; ++pos) {
		struct gpu_local_array_info *info = &kernel->array[pos];

		if (info->n_group != 0) {
			info->bound_expr = ppcg_build_size_expr(
				isl_multi_pw_aff_copy(info->bound), build);
			if (!info->bound_expr)
				return isl_stat_error;
		}
	}

	return isl_stat_ok;
}

/* Use "build" to construct the access AST expressions for
 * the effective grid size and then for the localized array sizes
 * of "kernel".
 * The array sizes are not attempted if the grid size fails.
 */
static isl_stat build_grid_and_local_array_sizes(struct ppcg_kernel *kernel,
	__isl_keep isl_ast_build *build)
{
	isl_stat status;

	status = build_grid_size(kernel, build);
	if (status >= 0)
		status = build_local_array_sizes(kernel, build);

	return status < 0 ? isl_stat_error : isl_stat_ok;
}

/* This function is called before the AST generator starts traversing
 * the schedule subtree of a node with mark "mark".
 * "user" points to a ppcg_at_domain_data object.
 *
 * Marks other than "kernel" are ignored.
 * For a "kernel" mark, the kernel pointer attached to the mark is
 * stored in data->kernel for use in at_domain and the AST expressions
 * for the grid size and the localized array sizes are built.
 */
static isl_stat before_mark(__isl_keep isl_id *mark,
	__isl_keep isl_ast_build *build, void *user)
{
	struct ppcg_at_domain_data *data = user;
	struct ppcg_kernel *kernel;

	if (!mark)
		return isl_stat_error;
	if (strcmp(isl_id_get_name(mark), "kernel") != 0)
		return isl_stat_ok;

	kernel = isl_id_get_user(mark);
	data->kernel = kernel;
	if (build_grid_and_local_array_sizes(kernel, build) < 0)
		return isl_stat_error;

	return isl_stat_ok;
}

/* This function is called after the AST generator has finished traversing
 * the schedule subtree of a mark node.  "node" points to the corresponding
 * mark AST node and "user" to a ppcg_at_domain_data object.
 *
 * Nodes whose mark is not called "kernel" are returned unchanged, and
 * so is "node" if no kernel is currently active.
 * Otherwise, "node" is replaced by a user node that "calls" the kernel,
 * representing the launch of the kernel.  The new node is annotated
 * with the mark identifier.
 * The AST below the original mark node is stored inside the kernel object
 * (together with the schedule space of "build") so that
 * it can be used to print the device code.
 * Note that this assumes that a kernel is only launched once.
 * data->kernel is cleared in the process.
 */
static __isl_give isl_ast_node *after_mark(__isl_take isl_ast_node *node,
	__isl_keep isl_ast_build *build, void *user)
{
	struct ppcg_at_domain_data *data = user;
	struct ppcg_kernel *kernel;
	isl_ctx *ctx;
	isl_id *mark;
	isl_ast_expr *call;
	isl_ast_expr_list *args;
	int is_kernel;

	ctx = isl_ast_node_get_ctx(node);
	mark = isl_ast_node_mark_get_id(node);
	if (!mark)
		return isl_ast_node_free(node);

	is_kernel = strcmp(isl_id_get_name(mark), "kernel") == 0;
	if (!is_kernel || !data->kernel) {
		isl_id_free(mark);
		return node;
	}

	/* Move the device AST into the kernel and leave the kernel. */
	kernel = data->kernel;
	data->kernel = NULL;
	kernel->space = isl_ast_build_get_schedule_space(build);
	kernel->tree = isl_ast_node_mark_get_node(node);
	isl_ast_node_free(node);

	/* Build a call without arguments to the mark identifier. */
	call = isl_ast_expr_from_id(isl_id_copy(mark));
	args = isl_ast_expr_list_alloc(ctx, 0);
	call = isl_ast_expr_call(call, args);
	node = isl_ast_node_alloc_user(call);

	return isl_ast_node_set_annotation(node, mark);
}

/* Callback used by generate_code during a top-down traversal of
 * the schedule tree.
 * "user" points to an int holding the largest schedule depth of
 * the leaf nodes seen so far.  If "node" is a leaf with a greater
 * schedule depth, then this value is updated.
 * Return isl_bool_false on leaf nodes and isl_bool_true on all other nodes.
 */
static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user)
{
	int *max_depth = user;
	int leaf_depth;

	if (isl_schedule_node_get_type(node) == isl_schedule_node_leaf) {
		leaf_depth = isl_schedule_node_get_schedule_depth(node);
		if (*max_depth < leaf_depth)
			*max_depth = leaf_depth;
		return isl_bool_false;
	}

	return isl_bool_true;
}

/* Use isl to generate code for both the host and the device
 * from "schedule".
 * The device code is marked by "kernel" mark nodes in the schedule tree,
 * containing a pointer to a ppcg_kernel object.
 * The returned AST only contains the AST for the host code.
 * The ASTs for the device code are embedded in ppcg_kernel objects
 * attached to the leaf nodes that call "kernel".
 *
 * The iterators are named "c" followed by their position, with as many
 * names as the maximal schedule depth of the leaves of "schedule".
 * at_domain, before_mark and after_mark are installed as callbacks,
 * sharing a single ppcg_at_domain_data object.
 */
static __isl_give isl_ast_node *generate_code(struct gpu_gen *gen,
	__isl_take isl_schedule *schedule)
{
	struct gpu_prog *prog = gen->prog;
	struct ppcg_at_domain_data data;
	isl_ast_build *build;
	isl_ast_node *ast;
	isl_id_list *names;
	int max_depth = 0;

	data.prog = prog;
	data.kernel = NULL;

	/* A failed traversal invalidates the schedule. */
	if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth,
						&max_depth) < 0)
		schedule = isl_schedule_free(schedule);

	build = isl_ast_build_alloc(prog->ctx);
	names = ppcg_scop_generate_names(prog->scop, max_depth, "c");
	build = isl_ast_build_set_iterators(build, names);
	build = isl_ast_build_set_at_each_domain(build, &at_domain, &data);
	build = isl_ast_build_set_before_each_mark(build, &before_mark, &data);
	build = isl_ast_build_set_after_each_mark(build, &after_mark, &data);

	if (prog->scop->options->debug->dump_final_schedule)
		isl_schedule_dump(schedule);

	ast = isl_ast_build_node_from_schedule(build, schedule);
	isl_ast_build_free(build);

	return ast;
}

/* Read an isl_union_map from the string "str" in the context "ctx".
 * Return NULL if "str" is NULL.
 */
__isl_give isl_union_map *extract_sizes_from_str(isl_ctx *ctx, const char *str)
{
	return str ? isl_union_map_read_from_str(ctx, str) : NULL;
}

/* Can "node" be tiled and then mapped to block and thread identifiers?
 * That is, is it a permutable band node with at least one member
 * such that the first member is coincident?
 *
 * Return isl_bool_error if "node" is NULL or if any of the queries
 * on the band fails.  In particular, a failed query is not
 * mistaken for a positive answer.
 */
static isl_bool is_permutable(__isl_keep isl_schedule_node *node)
{
	isl_bool permutable;
	int n_member;

	if (!node)
		return isl_bool_error;

	if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
		return isl_bool_false;

	/* Either an error or a definite "no" ends the test here. */
	permutable = isl_schedule_node_band_get_permutable(node);
	if (permutable < 0 || !permutable)
		return permutable;

	n_member = isl_schedule_node_band_n_member(node);
	if (n_member < 0)
		return isl_bool_error;
	if (n_member < 1)
		return isl_bool_false;

	/* The result (including an error) is the final answer. */
	return isl_schedule_node_band_member_get_coincident(node, 0);
}

/* Is "node" something other than a suitably permutable band?
 * This is the negation of is_permutable, in the form of a callback
 * ("user" is not used).
 */
static isl_bool not_permutable(__isl_keep isl_schedule_node *node, void *user)
{
	isl_bool permutable = is_permutable(node);

	return isl_bool_not(permutable);
}

/* Does the subtree rooted at "node" have any suitably permutable band nodes?
 * That is, does it have any nodes that are permutable and that
 * have at least one coincident dimension?
 *
 * This is the case if not every descendant satisfies not_permutable.
 */
static isl_bool subtree_has_permutable_bands(__isl_keep isl_schedule_node *node)
{
	return isl_bool_not(isl_schedule_node_every_descendant(node,
						&not_permutable, NULL));
}

/* Is there any permutable band with at least one coincident member
 * anywhere in "schedule"?
 * The test is performed on the subtree rooted at the root of "schedule".
 */
static isl_bool has_any_permutable_node(__isl_keep isl_schedule *schedule)
{
	isl_schedule_node *top;
	isl_bool found;

	top = isl_schedule_get_root(schedule);
	found = subtree_has_permutable_bands(top);
	isl_schedule_node_free(top);

	return found;
}

/* Is "node" a candidate for mapping to block and thread identifiers?
 * In particular, is it permutable with at least one coincident dimension?
 * Alternatively, does the subtree rooted at "node" not contain
 * any such permutable node?  Filter nodes are skipped in this case,
 * because a band node will be inserted in front of the returned
 * node and this is not possible for filter nodes that are children
 * of set or sequence nodes.
 * Leaf nodes are always candidates.
 * Return a negative value on error.
 */
static int is_candidate(__isl_keep isl_schedule_node *node)
{
	enum isl_schedule_node_type type;
	isl_bool res;

	type = isl_schedule_node_get_type(node);
	if (type == isl_schedule_node_leaf)
		return 1;

	/* Both an error and a positive answer are final. */
	res = is_permutable(node);
	if (res)
		return res;

	if (type == isl_schedule_node_filter)
		return 0;

	res = subtree_has_permutable_bands(node);
	return res < 0 ? -1 : !res;
}

/* Is "node" the outermost node in its branch that can be tiled
 * and then mapped to block and thread identifiers?
 * If there are no such nodes in the subtree at "node" and
 * if "node" is not a filter node, then it is accepted too.
 *
 * That is, "node" needs to be a candidate itself, while none of
 * its ancestors is allowed to be one.
 * Return 1 if so, 0 if not and -1 on error.
 */
static int is_outer_tilable(__isl_keep isl_schedule_node *node)
{
	int candidate;
	isl_schedule_node *pos;

	candidate = is_candidate(node);
	if (candidate < 0)
		return -1;
	if (candidate == 0)
		return 0;

	/* Walk up until a candidate ancestor (or an error) is found. */
	candidate = 0;
	pos = isl_schedule_node_copy(node);
	while (isl_schedule_node_has_parent(pos)) {
		pos = isl_schedule_node_parent(pos);
		candidate = is_candidate(pos);
		if (candidate)
			break;
	}
	isl_schedule_node_free(pos);

	if (candidate < 0)
		return -1;
	return !candidate;
}

/* Return the references to all writes in "group".
 * Each reference is represented by a universe set in a space
 *
 *	[S[i,j] -> R[]]
 *
 * with S[i,j] the statement instance space and R[] the array reference.
 * This space is the domain of the tagged access relation
 * of the reference.
 */
static __isl_give isl_union_set *group_tagged_writes(
	struct gpu_array_ref_group *group)
{
	int pos;
	isl_union_set *result;

	result = isl_union_set_empty(isl_map_get_space(group->access));
	for (pos = 0; pos < group->n_ref; ++pos) {
		isl_space *tagged_space;

		if (group->refs[pos]->write) {
			tagged_space = isl_map_get_space(
					group->refs[pos]->tagged_access);
			tagged_space = isl_space_domain(tagged_space);
			result = isl_union_set_add_set(result,
					isl_set_universe(tagged_space));
		}
	}

	return result;
}

/* Is there any write access in "group" that requires synchronization
 * on a write to global memory?
 * We currently take into account all writes that would require
 * synchronization at the thread level depth, but if the copying
 * for this group is performed at an outer level, then we do not
 * actually need to take into account dependences at intermediate levels.
 *
 * That is, do the writes of "group" intersect kernel->sync_writes?
 * Return 1 if so, 0 if not and -1 on error.
 */
static int any_sync_writes_in_group(struct ppcg_kernel *kernel,
	struct gpu_array_ref_group *group)
{
	isl_union_set *group_writes;
	int no_sync, separate;

	/* Without any synchronized writes, there is nothing to intersect. */
	no_sync = isl_union_set_is_empty(kernel->sync_writes);
	if (no_sync)
		return no_sync < 0 ? -1 : 0;

	group_writes = group_tagged_writes(group);
	separate = isl_union_set_is_disjoint(kernel->sync_writes,
						group_writes);
	isl_union_set_free(group_writes);

	if (separate < 0)
		return -1;
	return !separate;
}

/* Return the references to all writes in "kernel" that write directly
 * to global or shared memory, i.e., that are not mapped to private memory.
 * Each reference is represented by a universe set in a space
 *
 *	[S[i,j] -> R[]]
 *
 * with S[i,j] the statement instance space and R[] the array reference.
 * Groups without any writes are skipped.
 */
static __isl_give isl_union_set *collect_non_private_tagged_writes(
	struct ppcg_kernel *kernel)
{
	isl_union_set *result;
	int a, g;

	result = isl_union_set_empty(isl_union_set_get_space(kernel->arrays));

	for (a = 0; a < kernel->n_array; ++a) {
		struct gpu_local_array_info *local = &kernel->array[a];

		for (g = 0; g < local->n_group; ++g) {
			struct gpu_array_ref_group *group = local->groups[g];

			if (!group->write ||
			    gpu_array_ref_group_type(group) ==
							ppcg_access_private)
				continue;
			result = isl_union_set_union(result,
						group_tagged_writes(group));
		}
	}

	return result;
}

/* Are there any writes in "kernel" that are not mapped to private memory
 * and that require synchronization?
 * That is, do the writes collected by collect_non_private_tagged_writes
 * intersect kernel->sync_writes?
 * Return 1 if so, 0 if not and -1 on error.
 */
static int any_global_or_shared_sync_writes(struct ppcg_kernel *kernel)
{
	isl_union_set *direct_writes;
	int no_sync, separate;

	/* Without any synchronized writes, there is nothing to intersect. */
	no_sync = isl_union_set_is_empty(kernel->sync_writes);
	if (no_sync)
		return no_sync < 0 ? -1 : 0;

	direct_writes = collect_non_private_tagged_writes(kernel);
	separate = isl_union_set_is_disjoint(kernel->sync_writes,
						direct_writes);
	isl_union_set_free(direct_writes);

	if (separate < 0)
		return -1;
	return !separate;
}

/* Turn the integers in "tile_size" into an isl_multi_val
 * in the space of the band node "node", for use as tile sizes
 * for tiling "node".
 * Return NULL if "node" is NULL.
 */
static __isl_give isl_multi_val *construct_band_tiles_sizes(
	__isl_keep isl_schedule_node *node, int *tile_size)
{
	isl_space *band_space;

	if (node) {
		band_space = isl_schedule_node_band_get_space(node);
		return ppcg_multi_val_from_int_list(band_space, tile_size);
	}

	return NULL;
}

/* Replace the partial schedule S of the band node "node" by
 *
 *	floor(S/f)
 *
 * or
 *
 *	f * floor(S/f)
 *
 * if scale_tile_loops is set, with f the integers in "factor".
 * The list that "factor" points to is assumed to contain at least
 * as many elements as the number of members in the band.
 *
 * That is, the band is first scaled down by f and then, if requested,
 * scaled up again by the same f.
 */
static __isl_give isl_schedule_node *snap_band_to_sizes(
	__isl_take isl_schedule_node *node, int *factor,
	struct ppcg_options *options)
{
	isl_multi_val *factors;

	factors = construct_band_tiles_sizes(node, factor);
	node = isl_schedule_node_band_scale_down(node,
						isl_multi_val_copy(factors));
	if (!options->scale_tile_loops) {
		isl_multi_val_free(factors);
		return node;
	}

	/* The last use of "factors" takes over the reference. */
	return isl_schedule_node_band_scale(node, factors);
}

/* Tile the band node "node" with tile sizes specified by "sizes".
 *
 * Since the tile loops will be mapped to block ids, we forcibly
 * turn off tile loop scaling.  We may want to enable tile loop scaling
 * at some later point, but then we would have to support the detection
 * of strides during the mapping to block ids.
 * Similarly, since the point loops will be mapped to thread ids,
 * we forcibly shift the point loops so that they start at zero.
 *
 * The original values of both options are restored after the tiling.
 */
static __isl_give isl_schedule_node *tile_band(
	__isl_take isl_schedule_node *node, __isl_take isl_multi_val *sizes)
{
	isl_ctx *ctx = isl_schedule_node_get_ctx(node);
	int saved_scale, saved_shift;

	saved_scale = isl_options_get_tile_scale_tile_loops(ctx);
	saved_shift = isl_options_get_tile_shift_point_loops(ctx);

	isl_options_set_tile_scale_tile_loops(ctx, 0);
	isl_options_set_tile_shift_point_loops(ctx, 1);
	node = isl_schedule_node_band_tile(node, sizes);
	isl_options_set_tile_scale_tile_loops(ctx, saved_scale);
	isl_options_set_tile_shift_point_loops(ctx, saved_shift);

	return node;
}

/* Extract the set of parameter values and outer schedule dimensions
 * for which any statement instance
 * in the kernel inserted at "node" needs to be executed.
 * Intersect the set of parameter values derived from the host schedule
 * relation with the context of "prog".
 *
 * If the range of the prefix schedule relation is empty, then
 * an empty set is constructed explicitly, with as many set dimensions
 * as the schedule depth of "node".
 */
static __isl_give isl_set *extract_context(__isl_keep isl_schedule_node *node,
	struct gpu_prog *prog)
{
	isl_union_set *outer;
	isl_set *context;
	isl_space *space;
	int is_empty;

	outer = isl_union_map_range(
			isl_schedule_node_get_prefix_schedule_relation(node));
	is_empty = isl_union_set_is_empty(outer);
	if (is_empty < 0) {
		isl_union_set_free(outer);
		return NULL;
	}

	if (!is_empty) {
		context = isl_set_from_union_set(outer);
	} else {
		space = isl_union_set_get_space(outer);
		isl_union_set_free(outer);
		space = isl_space_set_from_params(space);
		space = isl_space_add_dims(space, isl_dim_set,
				isl_schedule_node_get_schedule_depth(node));
		context = isl_set_empty(space);
	}

	return isl_set_intersect_params(context, isl_set_copy(prog->context));
}

/* Return the set of outer array elements accessed
 * by the statement instances in "domain" in "prog".
 * The instances in "domain" are those that appear
 * in the domains of the access relations in "prog".
 *
 * Both the reads and the may-writes of "prog" are taken into account and
 * the accessed elements are mapped through prog->to_outer.
 */
static __isl_give isl_union_set *accessed_by_domain(
	__isl_take isl_union_set *domain, struct gpu_prog *prog)
{
	isl_union_map *accesses;
	isl_union_set *elements;

	accesses = isl_union_map_copy(prog->read);
	accesses = isl_union_map_union(accesses,
				isl_union_map_copy(prog->may_write));
	accesses = isl_union_map_intersect_domain(accesses, domain);
	elements = isl_union_map_range(accesses);

	return isl_union_set_apply(elements,
				isl_union_map_copy(prog->to_outer));
}

/* Return the number of outer band members of the band node "node"
 * that are marked coincident, i.e., the position of the first member
 * that is not marked coincident (or the total number of members
 * if there is no such member).
 */
static int n_outer_coincidence(__isl_keep isl_schedule_node *node)
{
	int pos = 0;
	int n_member = isl_schedule_node_band_n_member(node);

	while (pos < n_member &&
	    isl_schedule_node_band_member_get_coincident(node, pos))
		++pos;

	return pos;
}

/* Split off the first "n" members of the band node "node"
 * if it has more than "n" members.
 * Otherwise, "node" is returned unchanged.
 */
static __isl_give isl_schedule_node *split_band(
	__isl_take isl_schedule_node *node, int n)
{
	if (isl_schedule_node_band_n_member(node) > n)
		return isl_schedule_node_band_split(node, n);

	return node;
}

/* Scale a band node that may have been split by split_band.
 * "sizes" are the scaling factors for the original node.
 * "node" either points to the original band node, or the outer
 * of the two pieces after splitting.
 *
 * If "node" has fewer members than there are elements in "sizes",
 * then some splitting has occurred and "sizes" is split in the same way:
 * the trailing elements are used to scale the child of "node",
 * while the leading elements are used to scale "node" itself.
 */
static __isl_give isl_schedule_node *scale_band(
	__isl_take isl_schedule_node *node, __isl_take isl_multi_val *sizes)
{
	int n_size, n_member;
	isl_multi_val *inner_sizes;

	n_size = isl_multi_val_dim(sizes, isl_dim_set);
	n_member = isl_schedule_node_band_n_member(node);
	if (n_size <= n_member)
		return isl_schedule_node_band_scale(node, sizes);

	inner_sizes = isl_multi_val_drop_dims(isl_multi_val_copy(sizes),
						isl_dim_set, 0, n_member);
	sizes = isl_multi_val_drop_dims(sizes, isl_dim_set,
					n_member, n_size - n_member);

	/* Scale the inner piece and move back to the outer piece. */
	node = isl_schedule_node_child(node, 0);
	node = isl_schedule_node_band_scale(node, inner_sizes);
	node = isl_schedule_node_parent(node);

	return isl_schedule_node_band_scale(node, sizes);
}

/* Return an isl_multi_aff, with as elements the parameters in "space"
 * that have the names specified by the elements in "names".
 * If (some of) these parameters do not already appear in "space",
 * then they are added first.
 *
 * "space" is taken over by this function, while "names" is only inspected.
 * If "names" is NULL, then "space" is freed, so that
 * the construction below fails.
 *
 * "space" is consulted for the parameter positions until the very end,
 * so only copies of it are handed to functions that take ownership
 * and the reference held by this function is released explicitly.
 */
static __isl_give isl_multi_aff *parameter_vector(__isl_take isl_space *space,
	__isl_keep isl_id_list *names)
{
	int i, n;
	isl_local_space *ls;
	isl_multi_aff *ma;

	if (!names)
		space = isl_space_free(space);

	/* Make sure each name is available as a parameter of "space". */
	n = isl_id_list_n_id(names);
	for (i = 0; i < n; ++i) {
		int pos;
		isl_id *id;

		id = isl_id_list_get_id(names, i);
		pos = isl_space_find_dim_by_id(space, isl_dim_param, id);
		if (pos >= 0) {
			isl_id_free(id);
			continue;
		}
		pos = isl_space_dim(space, isl_dim_param);
		space = isl_space_add_dims(space, isl_dim_param, 1);
		/* "id" is handed over to "space" here. */
		space = isl_space_set_dim_id(space, isl_dim_param, pos, id);
	}

	ma = isl_multi_aff_zero(isl_space_copy(space));
	ls = isl_local_space_from_space(
				isl_space_domain(isl_space_copy(space)));

	/* Set element i to the parameter called names[i]. */
	for (i = 0; i < n; ++i) {
		int pos;
		isl_id *id;
		isl_aff *aff;

		id = isl_id_list_get_id(names, i);
		pos = isl_space_find_dim_by_id(space, isl_dim_param, id);
		isl_id_free(id);
		aff = isl_aff_var_on_domain(isl_local_space_copy(ls),
					    isl_dim_param, pos);
		ma = isl_multi_aff_set_aff(ma, i, aff);
	}
	isl_local_space_free(ls);
	isl_space_free(space);

	return ma;
}

/* Construct the set of domain elements for which the parameters
 * called "names" are equal to the partial schedule of the band node
 * "node", taken modulo the integers in "size".
 *
 * "size" is expected to have as many elements as "names".
 * The band may have fewer members than there are names; in that case
 * the leading names are equated to zero and the remaining ones
 * are matched against the band members.
 *
 * If "names" is empty, the universe domain of "node" is returned.
 */
static __isl_give isl_union_set *set_schedule_modulo(
	__isl_keep isl_schedule_node *node, __isl_keep isl_id_list *names,
	int *size)
{
	int n_name, n_pad;
	isl_space *space;
	isl_multi_aff *ma;
	isl_multi_val *tile_sizes;
	isl_multi_union_pw_aff *sched, *tmp;
	isl_union_set *universe;

	if (!node)
		return NULL;
	n_name = isl_id_list_n_id(names);
	if (n_name == 0)
		return isl_schedule_node_get_universe_domain(node);
	n_pad = n_name - isl_schedule_node_band_n_member(node);

	/* Partial schedule modulo the sizes of the matched names. */
	sched = isl_schedule_node_band_get_partial_schedule(node);
	tile_sizes = construct_band_tiles_sizes(node, size + n_pad);
	sched = isl_multi_union_pw_aff_mod_multi_val(sched, tile_sizes);

	/* Zero expressions for the "n_pad" leading names. */
	space = isl_space_params(isl_multi_union_pw_aff_get_space(sched));
	space = isl_space_set_from_params(space);
	space = isl_space_add_dims(space, isl_dim_set, n_pad);
	ma = isl_multi_aff_zero(space);

	/* Prefix the schedule with those zeros. */
	universe = isl_schedule_node_get_universe_domain(node);
	tmp = isl_multi_union_pw_aff_multi_aff_on_domain(
					isl_union_set_copy(universe), ma);
	sched = isl_multi_union_pw_aff_range_product(tmp, sched);

	/* Subtract the parameters and keep the elements where
	 * the difference is zero.
	 */
	ma = parameter_vector(isl_multi_union_pw_aff_get_space(sched), names);
	tmp = isl_multi_union_pw_aff_multi_aff_on_domain(universe, ma);
	sched = isl_multi_union_pw_aff_sub(sched, tmp);

	return isl_multi_union_pw_aff_zero_union_set(sched);
}

/* Insert a context node at "node" that introduces the block and
 * thread identifiers together with their bounds.
 * The bounds are taken from kernel->grid_size (for kernel->block_ids)
 * and kernel->block_dim (for kernel->thread_ids).
 *
 * The bounds on the block identifiers may implicitly constrain
 * the parameters.  A guard ensuring that those constraints hold
 * at "node" is inserted separately by insert_guard.
 */
static __isl_give isl_schedule_node *insert_context(struct ppcg_kernel *kernel,
	__isl_take isl_schedule_node *node)
{
	isl_space *space;
	isl_set *bounds;

	space = isl_set_get_space(kernel->context);
	bounds = isl_set_universe(space);
	bounds = add_bounded_parameters_dynamic(bounds,
					kernel->grid_size, kernel->block_ids);
	bounds = add_bounded_parameters(bounds,
					kernel->block_dim, kernel->thread_ids);

	return isl_schedule_node_insert_context(node, bounds);
}

/* Insert a guard at "node" that prevents kernel launches for which
 * the kernel obviously has no work to do, in particular launches
 * with obviously zero blocks.
 *
 * The guard is derived from the same block size constraints ("size")
 * that are used to construct the context, such that every constraint
 * implicit in that context is also imposed by the guard.
 * To this end, temporary parameters bounded by "size" are introduced
 * and projected out again.
 *
 * The constraints in "context", which hold for every executed instance,
 * are included as well, but only through their simple hull such that
 * no disjunction is introduced.
 */
static __isl_give isl_schedule_node *insert_guard(
	__isl_take isl_schedule_node *node, __isl_keep isl_set *context,
	__isl_keep isl_multi_pw_aff *size, struct ppcg_scop *scop)
{
	unsigned n_param, n_size;
	isl_set *guard;
	isl_id_list *tmp_ids;

	guard = isl_set_compute_divs(isl_set_copy(context));
	guard = isl_set_from_basic_set(isl_set_simple_hull(guard));

	n_param = isl_set_dim(guard, isl_dim_param);
	n_size = isl_multi_pw_aff_dim(size, isl_dim_out);

	/* The temporary parameters end up at positions
	 * n_param .. n_param + n_size - 1 and are removed again below.
	 */
	tmp_ids = ppcg_scop_generate_names(scop, n_size, "__ppcg_tmp");
	guard = add_bounded_parameters_dynamic(guard, size, tmp_ids);
	isl_id_list_free(tmp_ids);
	guard = isl_set_project_out(guard, isl_dim_param, n_param, n_size);

	return isl_schedule_node_insert_guard(node, guard);
}

/* Check whether the mapping of at least one array reference group
 * of "kernel" requires the band that is mapped to threads to be unrolled.
 * Return 1 if so and 0 otherwise.
 */
static int kernel_requires_unroll(struct ppcg_kernel *kernel)
{
	int a, g;

	for (a = 0; a < kernel->n_array; ++a) {
		struct gpu_local_array_info *local = &kernel->array[a];

		for (g = 0; g < local->n_group; ++g)
			if (gpu_array_ref_group_requires_unroll(
							local->groups[g]))
				return 1;
	}

	return 0;
}

/* Request unrolling of the band node "node" from the AST generator and
 * push the band down to the leaves of the schedule tree.
 * The sinking is only valid because all members of "node" are assumed
 * to be coincident.
 */
static __isl_give isl_schedule_node *unroll(__isl_take isl_schedule_node *node)
{
	return isl_schedule_node_band_sink(
		ppcg_set_schedule_node_type(node, isl_ast_loop_unroll));
}

/* Make sure there is a synchronization node after the core computation
 * of "kernel" at the level of the band that is mapped to threads.
 * "node" is assumed to point to the kernel node and it points there
 * again on return.
 *
 * No synchronization is added if the thread level coincides with
 * the level of the band mapped to blocks, or if the core computation
 * performs no writes to global or shared memory that require
 * synchronization.
 *
 * If there are writes to shared memory and the shared memory copying
 * happens at the same level, then synchronization between the core
 * and the copying is needed anyway, so it may as well be added here.
 * If the copying happens at a higher level, then different iterations
 * of the intermediate schedule dimensions may map shared memory
 * elements to threads differently, so synchronization is required
 * after the core.
 *
 * If the shared and the thread mark point to the same node, then
 * the synchronization needs to end up outside of the shared mark.
 */
static __isl_give isl_schedule_node *add_sync(struct ppcg_kernel *kernel,
	__isl_take isl_schedule_node *node)
{
	int sync_needed;
	int thread_depth, kernel_depth;

	sync_needed = any_global_or_shared_sync_writes(kernel);
	if (sync_needed < 0)
		return isl_schedule_node_free(node);
	if (sync_needed == 0)
		return node;

	/* Compare the schedule depth at the thread mark with that
	 * at the kernel node.
	 */
	node = gpu_tree_move_down_to_thread(node, kernel->core);
	thread_depth = isl_schedule_node_get_schedule_depth(node);
	node = gpu_tree_move_up_to_kernel(node);
	kernel_depth = isl_schedule_node_get_schedule_depth(node);
	if (thread_depth == kernel_depth)
		return node;

	node = gpu_tree_move_down_to_depth(node, thread_depth, kernel->core);
	node = gpu_tree_ensure_following_sync(node, kernel);

	return gpu_tree_move_up_to_kernel(node);
}

/* Return a read ("read" is 1) or write access relation for "group"
 * with those accesses removed that are only needed to communicate data
 * within the subtree of the schedule rooted at "node".
 * Furthermore, include the prefix schedule at "node".
 * That is, return a relation of the form
 *
 *	S -> [D -> A]
 *
 * with D the outer schedule dimensions at "node".
 *
 * "node" is only inspected by this function: it is neither freed
 * nor returned, so ownership remains with the caller.
 */
static __isl_give isl_union_map *anchored_non_local_accesses(
	struct ppcg_kernel *kernel, struct gpu_array_ref_group *group,
	__isl_keep isl_schedule_node *node, int read)
{
	isl_union_map *access;
	isl_union_map *prefix;

	/* Prefix schedule at "node", expressed in terms of
	 * the statement instances before contraction.
	 */
	prefix = isl_schedule_node_get_prefix_schedule_relation(node);
	prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix,
			    isl_union_pw_multi_aff_copy(kernel->contraction));
	/* Exactly one of the (read, write) flags is set. */
	access = gpu_array_ref_group_access_relation(group, read, !read);
	/* NOTE(review): "prefix" is used again below, so this relies on
	 * remove_local_accesses_group not taking ownership of it.
	 */
	access = remove_local_accesses_group(kernel, group, access, prefix,
						read);
	access = isl_union_map_range_product(prefix, access);

	return access;
}

/* Construct the identity mapping
 *
 *	type[D -> A] -> [D -> A]
 *
 * for the array reference group "group", where "type" is "read"
 * if "read" is set and "write" otherwise.
 * A is the space of the array of "group" and D corresponds to
 * the outer tile->depth dimensions of the kernel schedule.
 * The identifier of the "type" tuple carries "group" as user pointer.
 */
static __isl_give isl_multi_aff *create_from_access(isl_ctx *ctx,
	struct gpu_array_ref_group *group, int read)
{
	struct gpu_array_tile *tile = gpu_array_ref_group_tile(group);
	const char *name = read ? "read" : "write";
	isl_space *space;
	isl_id *id;

	/* [D -> A] */
	space = isl_space_from_range(isl_space_copy(group->array->space));
	space = isl_space_add_dims(space, isl_dim_in, tile->depth);
	/* [D -> A] -> [D -> A] */
	space = isl_space_map_from_set(isl_space_wrap(space));

	/* type[D -> A] -> [D -> A] */
	id = isl_id_alloc(ctx, name, group);
	space = isl_space_set_tuple_id(space, isl_dim_in, id);

	return isl_multi_aff_identity(space);
}

/* Take care of the synchronization required by the writes of "group".
 *
 * If any of these writes require synchronization, then make sure that
 * there is a synchronization node for "kernel" after the node
 * following "node" in a sequence.
 *
 * Otherwise, if "shared" is set, then add synchronization before
 * the core computation to protect shared memory from being overwritten
 * by the next iteration of the core computation.
 * No additional synchronization is needed to protect against
 * the next copy into shared memory because each element of
 * the shared memory tile is always copied by the same thread.
 *
 * If neither applies, "node" is returned unchanged.
 */
static __isl_give isl_schedule_node *add_group_write_sync(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel,
	struct gpu_array_ref_group *group, int shared)
{
	struct gpu_array_tile *tile;
	int sync_needed;

	sync_needed = any_sync_writes_in_group(kernel, group);
	if (sync_needed < 0)
		return isl_schedule_node_free(node);

	if (sync_needed) {
		/* Step over to the next element of the sequence. */
		node = isl_schedule_node_parent(node);
		node = isl_schedule_node_next_sibling(node);
		node = isl_schedule_node_child(node, 0);
		return gpu_tree_ensure_following_sync(node, kernel);
	}

	if (!shared)
		return node;

	tile = gpu_array_ref_group_tile(group);
	node = isl_schedule_node_parent(isl_schedule_node_parent(node));
	node = gpu_tree_move_down_to_depth(node, tile->depth, kernel->core);

	return gpu_tree_move_left_to_sync(node, kernel);
}

/* Add copy statements to the schedule tree of "node"
 * for reading from global memory to private memory (if "read" is set) or
 * for writing back from private memory to global memory
 * (if "read" is not set) for the array reference group "group" that
 * is mapped to private memory.
 * On input, "node" points to the kernel node, and it is moved
 * back there on output.
 * If there is nothing to copy, then the tree is left untouched.
 * NULL is returned in case of error.
 *
 * The copies are performed in the order of the array elements.
 * The copy statement instances include a reference to the outer
 * tile->depth dimensions of the kernel schedule for ease of
 * combining them with the group tiling.
 *
 * That is, the extra schedule is of the form
 *
 *	type[D -> A] -> A
 *
 * where D corresponds to the outer tile->depth dimensions of
 * the kernel schedule and A to the global array.
 * This schedule is unrolled because registers are not addressable.
 *
 * The copying is inserted in the schedule tree through an extension
 * of the form
 *
 *	D -> type[D -> A]
 *
 * where the extra domain elements type[D -> A] are those accessed
 * by the group.
 * A filter is inserted on type[D -> A] to ensure that the element
 * is read/written by the same thread that needs the element.
 * This filter is obtained by applying
 *
 *	S -> type[D -> A]
 *
 * to the thread filter for the core statements.
 *
 * The extension is inserted before the core computation in case of a read
 * and after the core computation in case of a write.
 * In the latter case, we also make sure that there is a synchronization
 * node after the write to global memory, unless this write is performed
 * at the outer level of the kernel.
 * In principle, this synchronization could be inserted higher
 * in the schedule tree depending on where the corresponding reads
 * from global memory are performed.
 */
static __isl_give isl_schedule_node *add_copies_group_private(
	struct ppcg_kernel *kernel, struct gpu_array_ref_group *group,
	__isl_take isl_schedule_node *node, int read)
{
	struct gpu_array_tile *tile;
	isl_union_map *access;
	isl_union_set *domain;
	isl_space *space;
	isl_multi_aff *from_access;
	isl_multi_pw_aff *mpa;
	isl_multi_union_pw_aff *mupa;
	isl_union_pw_multi_aff *contraction;
	isl_schedule_node *graft;
	isl_union_set *filter;
	int kernel_depth;
	int empty;

	/* Record the schedule depth at the kernel node before moving down
	 * to the depth at which the tile is computed.
	 */
	kernel_depth = isl_schedule_node_get_schedule_depth(node);
	tile = gpu_array_ref_group_tile(group);
	node = gpu_tree_move_down_to_depth(node, tile->depth, kernel->core);

	/* S -> [D -> A], without the accesses that are local to the subtree. */
	access = anchored_non_local_accesses(kernel, group, node, read);
	empty = isl_union_map_is_empty(access);
	if (empty < 0 || empty) {
		/* Error or nothing to copy: release "access" and bail out. */
		isl_union_map_free(access);
		if (empty < 0)
			return isl_schedule_node_free(node);
		return gpu_tree_move_up_to_kernel(node);
	}

	/* Set the "global" flag on both the array and its kernel-local
	 * counterpart.
	 * NOTE(review): presumably this records that the array is accessed
	 * in global memory by the kernel; confirm against the struct
	 * definitions.
	 */
	group->array->global = 1;
	group->local_array->global = 1;

	/* Rewrite the range of "access" in terms of the copy statement
	 * instances: S -> type[D -> A].
	 * "space" is the space of type[D -> A]; it is used below to
	 * construct the schedule of the copy statements.
	 */
	from_access = create_from_access(kernel->ctx, group, read);
	space = isl_space_domain(isl_multi_aff_get_space(from_access));
	access = isl_union_map_preimage_range_multi_aff(access, from_access);

	/* Derive the thread filter on the copy instances by applying
	 * S -> type[D -> A] to the thread filter of the core statements
	 * (after pulling the latter back over the contraction).
	 */
	filter = isl_union_set_copy(kernel->thread_filter);
	contraction = isl_union_pw_multi_aff_copy(kernel->contraction);
	filter = isl_union_set_preimage_union_pw_multi_aff(filter, contraction);
	filter = isl_union_set_apply(filter, isl_union_map_copy(access));
	filter = isl_union_set_detect_equalities(filter);
	filter = isl_union_set_coalesce(filter);

	/* Construct the extension D -> type[D -> A] from the copy instances
	 * that are actually accessed.
	 */
	domain = isl_union_map_range(access);
	access = isl_union_set_wrapped_domain_map(domain);
	access = isl_union_map_reverse(access);
	access = isl_union_map_coalesce(access);
	graft = isl_schedule_node_from_extension(access);

	/* Schedule type[D -> A] -> A, i.e., the range factor of
	 * the identity on type[D -> A].
	 */
	space = isl_space_map_from_set(space);
	mpa = isl_multi_pw_aff_identity(space);
	mpa = isl_multi_pw_aff_range_factor_range(mpa);
	mupa = isl_multi_union_pw_aff_from_multi_pw_aff(mpa);

	/* Insert the schedule band below the extension and unroll it. */
	graft = isl_schedule_node_child(graft, 0);
	graft = isl_schedule_node_insert_partial_schedule(graft, mupa);
	graft = unroll(graft);

	graft = isl_schedule_node_insert_filter(graft, filter);

	/* Move back up to the extension node before grafting. */
	graft = isl_schedule_node_parent(graft);

	if (read)
		node = isl_schedule_node_graft_before(node, graft);
	else {
		node = isl_schedule_node_graft_after(node, graft);
		/* Only synchronize after the write if it is not performed
		 * at the outer level of the kernel.
		 */
		if (kernel_depth < tile->depth)
			node = add_group_write_sync(node, kernel, group, 0);
	}

	node = gpu_tree_move_up_to_kernel(node);

	return node;
}

/* Add copy statements to the schedule tree of "node"
 * for reading from global memory to shared memory (if "read" is set) or
 * for writing back from shared memory to global memory
 * (if "read" is not set) for the array reference group "group" that
 * is mapped to shared memory.
 * On input, "node" points to the kernel node, and it is moved
 * back there on output.
 * If there is nothing to copy, then the tree is left untouched.
 * NULL is returned in case of error.
 *
 * The copies are performed in the order of the corresponding shared
 * memory tile.
 * The copy statement instances include a reference to the outer
 * tile->depth dimensions of the kernel schedule for ease of
 * combining them with the group tiling.
 *
 * If we are performing a read from global memory to shared memory and
 * if the array involved is not a scalar, then we copy
 * the entire tile to shared memory.  This may result in some extra
 * elements getting copied, but it should lead to simpler code
 * (which means that fewer registers may be needed) and less divergence.
 *
 * Otherwise, we only copy the elements that will be read or have been written
 * in the kernel.
 *
 * That is, the extra schedule is of the form
 *
 *	type[D -> A] -> T
 *
 * where D corresponds to the outer tile->depth dimensions of
 * the kernel schedule, A to the global array and T is the corresponding
 * shared memory tile.
 *
 * The copying is inserted in the schedule tree through an extension
 * of the form
 *
 *	D -> type[D -> A]
 *
 * where the extra domain elements type[D -> A] are those accessed
 * by the group.  In the case of read from a non-scalar, this set
 * is replaced by the entire shared memory tile.
 *
 * If the "unroll_copy_shared" option is set, then the AST generator
 * is instructed to unroll the copying code.
 *
 * A filter is inserted on type[D -> A] to map the copy instances
 * to the threads.  In particular, the thread identifiers are
 * equated to the position inside the shared memory tile (T)
 * modulo the block size.
 * We try to align the innermost tile dimension with the innermost
 * thread identifier (x) as a heuristic to improve coalescing.
 * In particular, if the dimension of the tile is greater than
 * the dimension of the block, then the schedule mapping to the tile
 * is broken up into two pieces and the filter is applied to the inner part.
 * If, on the other hand, the dimension of the tile is smaller than
 * the dimension of the block, then the initial thread identifiers
 * are equated to zero and the remaining thread identifiers are
 * matched to the memory tile.
 *
 * The extension is inserted before the core computation in case of a read
 * and after the core computation in case of a write.
 * In the case of a read, we first need to make sure there is some
 * synchronization before the core computation such that we can put the read
 * from global memory to shared memory before that synchronization.
 * This ensures that all threads have finished copying into shared memory
 * before the shared memory is used.
 * We also need to make sure that there is a synchronization node after
 * the core computation to ensure that the next load into shared memory
 * only happens after all data has been used.  There is no need for
 * this synchronization if we are at the outer level since then there
 * won't be a next load.
 * In the case of a write, we need to make sure there is some synchronization
 * after the core computation such that we can put the write from shared
 * memory to global memory after that synchronization.
 * Unless we are at the outer level, we also need a synchronization node
 * after the write to ensure the data is saved to global memory
 * before the next iteration writes to the same shared memory.
 * It also makes sure the data has arrived in global memory before
 * it is read in a subsequent iteration.
 */
static __isl_give isl_schedule_node *add_copies_group_shared(
	struct ppcg_kernel *kernel, struct gpu_array_ref_group *group,
	__isl_take isl_schedule_node *node, int read)
{
	struct gpu_array_tile *tile;
	isl_union_map *access;
	isl_union_set *domain;
	isl_multi_aff *ma;
	isl_multi_aff *from_access;
	isl_multi_pw_aff *mpa;
	isl_multi_union_pw_aff *mupa;
	isl_schedule_node *graft;
	isl_union_set *filter;
	int skip;
	int kernel_depth;
	int empty;

	/* Record the schedule depth at the kernel node before moving down
	 * to the depth at which the tile is computed.
	 */
	tile = gpu_array_ref_group_tile(group);
	kernel_depth = isl_schedule_node_get_schedule_depth(node);
	node = gpu_tree_move_down_to_depth(node, tile->depth, kernel->core);

	/* S -> [D -> A], without the accesses that are local to the subtree. */
	access = anchored_non_local_accesses(kernel, group, node, read);
	empty = isl_union_map_is_empty(access);
	if (empty < 0 || empty) {
		/* Error or nothing to copy: release "access" and bail out. */
		isl_union_map_free(access);
		if (empty < 0)
			return isl_schedule_node_free(node);
		return gpu_tree_move_up_to_kernel(node);
	}

	/* Set the "global" flag on both the array and its kernel-local
	 * counterpart.
	 * NOTE(review): presumably this records that the array is accessed
	 * in global memory by the kernel; confirm against the struct
	 * definitions.
	 */
	group->array->global = 1;
	group->local_array->global = 1;

	from_access = create_from_access(kernel->ctx, group, read);

	/* Schedule type[D -> A] -> T, obtained by composing the tiling
	 * with the mapping from the copy statement instances.
	 */
	ma = isl_multi_aff_copy(tile->tiling);
	ma = isl_multi_aff_pullback_multi_aff(ma,
					    isl_multi_aff_copy(from_access));
	mpa = isl_multi_pw_aff_from_multi_aff(ma);
	mupa = isl_multi_union_pw_aff_from_multi_pw_aff(mpa);

	/* The accessed [D -> A] elements. */
	domain = isl_union_map_range(access);

	/* When reading a non-scalar, copy the entire tile instead of
	 * only the elements that are accessed.
	 */
	if (read && !gpu_array_is_scalar(group->array)) {
		isl_map *map;
		isl_union_set_free(domain);
		map = group_tile(group);
		domain = isl_union_set_from_set(isl_map_wrap(map));
	}

	/* Construct the extension D -> type[D -> A]. */
	domain = isl_union_set_preimage_multi_aff(domain, from_access);
	access = isl_union_set_wrapped_domain_map(domain);
	access = isl_union_map_reverse(access);
	access = isl_union_map_coalesce(access);
	graft = isl_schedule_node_from_extension(access);

	graft = isl_schedule_node_child(graft, 0);

	graft = isl_schedule_node_insert_partial_schedule(graft, mupa);
	if (kernel->options->unroll_copy_shared)
		graft = ppcg_set_schedule_node_type(graft, isl_ast_loop_unroll);

	/* If the tile has more dimensions than the block, split off
	 * the outer tile->n - kernel->n_block members and continue
	 * with the inner part, which is matched to the thread identifiers.
	 */
	if (tile->n > kernel->n_block && kernel->n_block > 0) {
		graft = isl_schedule_node_band_split(graft,
						tile->n - kernel->n_block);
		graft = isl_schedule_node_child(graft, 0);
	}
	/* Number of leading block dimensions without a matching
	 * tile dimension.
	 */
	if (tile->n < kernel->n_block)
		skip = kernel->n_block - tile->n;
	else
		skip = 0;
	/* Equate the thread identifiers to the tile position modulo
	 * the block size.
	 */
	filter = set_schedule_modulo(graft, kernel->thread_ids,
					kernel->block_dim);
	/* NOTE(review): presumably restricts the band to the block sizes
	 * when wrapping is disabled; confirm against snap_band_to_sizes.
	 */
	if (!kernel->options->wrap)
		graft = snap_band_to_sizes(graft, kernel->block_dim + skip,
			    kernel->options);
	/* Undo the move to the inner part such that the filter
	 * is inserted at the outer part of the split band.
	 */
	if (tile->n > kernel->n_block && kernel->n_block > 0)
		graft = isl_schedule_node_parent(graft);
	graft = isl_schedule_node_insert_filter(graft, filter);

	/* Move back to the root of the graft. */
	while (graft && isl_schedule_node_has_parent(graft))
		graft = isl_schedule_node_parent(graft);

	if (read) {
		/* A synchronization after the core is only needed
		 * if there can be a next load, i.e., if we are not
		 * at the outer level of the kernel.
		 */
		if (kernel_depth < tile->depth)
			node = gpu_tree_ensure_sync_after_core(node, kernel);
		node = gpu_tree_move_left_to_sync(node, kernel);
		node = isl_schedule_node_graft_before(node, graft);
	} else {
		node = gpu_tree_move_right_to_sync(node, kernel);
		node = isl_schedule_node_graft_after(node, graft);
		/* Only synchronize after the write if it is not performed
		 * at the outer level of the kernel.
		 */
		if (kernel_depth < tile->depth)
			node = add_group_write_sync(node, kernel, group, 1);
	}

	node = gpu_tree_move_up_to_kernel(node);

	return node;
}

/* Add the copy statements for the array reference group "group"
 * to the schedule tree of "node", depending on where the group is mapped.
 * If it is mapped to private or shared memory, then the statements
 * for reading from global memory (if "read" is set) or for writing
 * back to global memory (if "read" is not set) are added
 * by the corresponding specialized function.
 * For any other type of group, "node" is returned unchanged.
 * On input, "node" points to the kernel node, and it is moved
 * back there on output.
 */
static __isl_give isl_schedule_node *add_copies_group(
	struct ppcg_kernel *kernel, struct gpu_array_ref_group *group,
	__isl_take isl_schedule_node *node, int read)
{
	switch (gpu_array_ref_group_type(group)) {
	case ppcg_access_private:
		return add_copies_group_private(kernel, group, node, read);
	case ppcg_access_shared:
		return add_copies_group_shared(kernel, group, node, read);
	default:
		return node;
	}
}

/* Visit every array reference group of every array of "kernel" and
 * add the copy statements to the schedule tree of "node"
 * for those groups that are mapped to private or shared memory.
 * For each group, the copying from global memory is added first,
 * followed by the copying back to global memory.
 * On input, "node" points to the kernel node, and it is moved
 * back there on output.  NULL is returned as soon as a step fails.
 */
static __isl_give isl_schedule_node *add_copies(struct ppcg_kernel *kernel,
	__isl_take isl_schedule_node *node)
{
	int a, g, read;

	for (a = 0; a < kernel->n_array; ++a) {
		struct gpu_local_array_info *local = &kernel->array[a];

		for (g = 0; g < local->n_group; ++g) {
			struct gpu_array_ref_group *group = local->groups[g];

			/* read = 1 (copy in), then read = 0 (copy out) */
			for (read = 1; read >= 0; --read) {
				node = add_copies_group(kernel, group,
							node, read);
				if (!node)
					return NULL;
			}
		}
	}

	return node;
}

/* Set the AST loop type of every member of the band node "node"
 * to atomic.
 */
static __isl_give isl_schedule_node *atomic(__isl_take isl_schedule_node *node)
{
	node = ppcg_set_schedule_node_type(node, isl_ast_loop_atomic);

	return node;
}

/* Mark every band node among the ancestors of "node" atomic.
 * The tree is climbed recursively up to the root, remembering
 * the child position at each step such that the way back down
 * can be retraced.
 * Return a pointer to "node" in the updated schedule tree.
 */
static __isl_give isl_schedule_node *atomic_ancestors(
	__isl_take isl_schedule_node *node)
{
	int child_pos;
	isl_schedule_node *parent;

	if (!node)
		return NULL;
	if (!isl_schedule_node_has_parent(node))
		return node;

	child_pos = isl_schedule_node_get_child_position(node);
	parent = isl_schedule_node_parent(node);
	if (isl_schedule_node_get_type(parent) == isl_schedule_node_band)
		parent = atomic(parent);
	parent = atomic_ancestors(parent);

	return isl_schedule_node_child(parent, child_pos);
}

/* Collect all write references that require synchronization.
 * "node" is assumed to point to the kernel node.
 * Each reference is represented by a universe set in a space
 *
 *	[S[i,j] -> R[]]
 *
 * with S[i,j] the statement instance space and R[] the array reference.
 *
 * This function should be called before block and thread filters are added.
 *
 * Synchronization is needed after a write if there is a subsequent read
 * within the same block that may not be performed by the same thread.
 * There should not be any dependences between different blocks,
 * so we start with the flow dependences within the same kernel invocation
 * and we subtract from these those dependences that are mapped
 * to the same iteration of the bands where synchronization is inserted.
 * We do not remove pairs of instances that are known to map to
 * the same thread across different iterations of the intermediate
 * bands because the read may be performed by a different thread
 * than the one that needs the value if shared memory is involved.
 *
 * We also consider all pairs of possible writes that access the same
 * memory location and that may be mapped to the same block but not
 * to the same iteration of the intermediate bands.
 * In theory, it would be possible for one thread to still be in
 * a previous iteration of a loop in these bands.
 * A write to global memory in this delayed thread could then overwrite
 * a write from another thread that has already moved on to
 * the next iteration.
 *
 * After computing the above writes paired off with reads or writes
 * that depend on them, we project onto the domain writes.
 * Synchronization is needed after writes to global memory
 * through these references.
 */
static __isl_give isl_union_set *compute_sync_writes(
	struct ppcg_kernel *kernel, __isl_keep isl_schedule_node *node)
{
	isl_union_map *local;
	isl_union_map *may_writes, *shared_access;
	isl_union_map *kernel_prefix, *thread_prefix;
	isl_union_map *equal;
	isl_union_set *wrap;
	isl_union_set *domain;
	isl_union_pw_multi_aff *contraction;

	/* Prefix schedules at the kernel node and at the thread mark.
	 * "node" is only kept by this function, so the move down
	 * to the thread mark is performed on a copy that is freed again.
	 */
	kernel_prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
	node = isl_schedule_node_copy(node);
	node = gpu_tree_move_down_to_thread(node, kernel->core);
	thread_prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
	isl_schedule_node_free(node);

	/* Express both prefix schedules in terms of the statement
	 * instances before contraction.
	 */
	contraction = kernel->contraction;
	kernel_prefix = isl_union_map_preimage_domain_union_pw_multi_aff(
		    kernel_prefix, isl_union_pw_multi_aff_copy(contraction));
	thread_prefix = isl_union_map_preimage_domain_union_pw_multi_aff(
		    thread_prefix, isl_union_pw_multi_aff_copy(contraction));
	/* Universe of the spaces of the expanded domain of the kernel. */
	domain = isl_union_set_copy(kernel->expanded_domain);
	domain = isl_union_set_universe(domain);

	/* Restrict the tagged may-writes to the statements of the kernel.
	 * Currying moves the statement instance to the domain so that
	 * it can be intersected with "domain".
	 */
	may_writes = isl_union_map_copy(kernel->prog->scop->tagged_may_writes);
	may_writes = isl_union_map_curry(may_writes);
	may_writes = isl_union_map_intersect_domain(may_writes, domain);
	may_writes = isl_union_map_uncurry(may_writes);
	/* Pairs of (tagged) writes that may access the same element. */
	shared_access = isl_union_map_copy(may_writes);
	shared_access = isl_union_map_apply_range(shared_access,
					isl_union_map_reverse(may_writes));

	/* Combine the flow dependences with the write pairs and zip
	 * the result such that the pairs of statement instances
	 * end up in the domain.
	 */
	local = isl_union_map_copy(kernel->prog->scop->tagged_dep_flow);
	local = isl_union_map_union(local, shared_access);
	local = isl_union_map_zip(local);

	/* Keep only the pairs with the same kernel prefix schedule ... */
	equal = isl_union_map_apply_range(kernel_prefix,
		    isl_union_map_reverse(isl_union_map_copy(kernel_prefix)));
	wrap = isl_union_map_wrap(equal);
	local = isl_union_map_intersect_domain(local, wrap);
	/* ... and remove those with the same thread prefix schedule. */
	equal = isl_union_map_apply_range(thread_prefix,
		    isl_union_map_reverse(isl_union_map_copy(thread_prefix)));
	wrap = isl_union_map_wrap(equal);
	local = isl_union_map_subtract_domain(local, wrap);

	/* Undo the zip, forget the specific instances (universe) and
	 * project onto the domain, i.e., the tagged writes.
	 */
	local = isl_union_map_zip(local);
	local = isl_union_map_universe(local);

	return isl_union_map_domain(local);
}

/* Collect the domain elements that reach "node" into a single space
 * called kernelX, where X is the kernel sequence number "kernel_id".
 * Return NULL if "node" is NULL.
 */
static __isl_give isl_schedule_node *group_statements(
	__isl_take isl_schedule_node *node, int kernel_id)
{
	char name[20];
	isl_ctx *ctx;
	isl_id *id;

	if (!node)
		return NULL;

	ctx = isl_schedule_node_get_ctx(node);
	snprintf(name, sizeof(name), "kernel%d", kernel_id);
	id = isl_id_alloc(ctx, name, NULL);

	return isl_schedule_node_group(node, id);
}

/* Create a ppcg_kernel representing the domain instances that reach "node"
 * and insert a mark node pointing to the ppcg_kernel before "node".
 * The band that "node" points to is the band that needs to be mapped
 * to block identifiers.  The band that needs to be mapped to thread
 * identifiers should be marked by a "thread" mark by the caller.
 * The linear branch between the current node and the "thread" mark
 * may also have a "shared" mark.  If present, the mapping to shared
 * memory is computed at that point.
 * Both marks are removed by this function.
 * If "scale" is set, then the band that "node" points to is scaled
 * by "sizes".
 *
 * Mark all outer band nodes as atomic to ensure each kernel is only
 * scheduled once.
 * If the domain elements that reach "node" live in more than one space,
 * then group the domain elements into a single space, named kernelX,
 * with X the kernel sequence number.
 *
 * Insert a guard node governing the kernel node to ensure that
 * no kernels with zero blocks are launched.
 *
 * Insert a context node describing the block and thread
 * identifiers inside the kernel mark.
 * The context node needs to be inserted after the effective block size
 * has been determined such that the bounds on the thread identifiers
 * would reflect the effective block size.
 * Insert a filter node inside the context node mapping the statement
 * instances to block identifiers.  In particular, the block identifiers
 * are equated to the partial schedule of band that was marked for mapping
 * to blocks modulo the grid size.
 * Insert a filter node inside the "thread" mark mapping the statement
 * instances to thread identifiers.  In particular, the thread identifiers
 * are equated to the partial schedule of band that was marked for mapping
 * to threads modulo the block size.
 *
 * Compute array reference groups for all arrays, set the local
 * array bounds based on the set of domain instances that reach
 * the kernel node, check the total amount of shared memory used
 * and compute all group tilings.
 * The array reference groups are computed after the block filter
 * has been inserted because it affects the mapping to shared or
 * private memory.  This computation also requires the thread filter
 * (in the ppcg_kernel object), but this thread filter should not
 * have been added to the schedule tree yet since the computation
 * requires the schedule of the band that needs to be mapped to
 * threads before the privatization is applied.
 *
 * If any array reference group requires the band mapped to threads
 * to be unrolled, then we perform the required unrolling.
 *
 * We save a copy of the schedule that may influence the mappings
 * to shared or private memory in kernel->copy_schedule.
 *
 * Finally, we add synchronization and copy statements to the schedule tree,
 * remove the "thread" mark and create representations for the local
 * variables in the kernel.
 *
 * We keep a copy of the isl_id that points to the kernel to ensure
 * that the kernel does not get destroyed if the schedule node
 * is freed due to some error condition.
 *
 * The kernel sequence number is taken from gen->kernel_id, which is
 * incremented by this function.
 * The updated node is returned.  Intermediate failures replace "node"
 * by the result of isl_schedule_node_free, which is then propagated
 * through the remaining calls and returned.
 */
__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen,
	__isl_take isl_schedule_node *node, int scale,
	__isl_keep isl_multi_val *sizes)
{
	struct ppcg_kernel *kernel;
	isl_id *id;
	isl_schedule_node *node_thread;
	isl_union_map *host_schedule;
	isl_union_pw_multi_aff *contraction;
	isl_set *host_domain;
	isl_union_set *domain, *expanded;
	int single_statement;

	node = gpu_tree_insert_shared_before_thread(node);
	if (!node)
		return NULL;

	kernel = isl_calloc_type(gen->ctx, struct ppcg_kernel);
	kernel = ppcg_kernel_create_local_arrays(kernel, gen->prog);
	if (!kernel)
		return isl_schedule_node_free(node);

	domain = isl_schedule_node_get_domain(node);
	single_statement = isl_union_set_n_set(domain) == 1;

	kernel->ctx = gen->ctx;
	kernel->prog = gen->prog;
	kernel->options = gen->options;
	kernel->context = extract_context(node, gen->prog);
	kernel->core = isl_union_set_universe(isl_union_set_copy(domain));
	contraction = isl_schedule_node_get_subtree_contraction(node);
	kernel->contraction = isl_union_pw_multi_aff_copy(contraction);
	expanded = isl_union_set_copy(domain);
	expanded = isl_union_set_preimage_union_pw_multi_aff(expanded,
						contraction);
	kernel->expanded_domain = isl_union_set_copy(expanded);
	kernel->arrays = accessed_by_domain(expanded, gen->prog);
	kernel->n_grid = n_outer_coincidence(node);
	/* Peek at the band below the "thread" mark on a private copy
	 * of the node so that "node" itself keeps its position.
	 */
	node_thread = isl_schedule_node_copy(node);
	node_thread = gpu_tree_move_down_to_thread(node_thread, kernel->core);
	node_thread = isl_schedule_node_child(node_thread, 0);
	kernel->n_block = n_outer_coincidence(node_thread);
	isl_schedule_node_free(node_thread);
	kernel->id = gen->kernel_id++;
	if (read_grid_and_block_sizes(kernel, gen) < 0)
		node = isl_schedule_node_free(node);

	kernel->sync_writes = compute_sync_writes(kernel, node);

	host_schedule = isl_schedule_node_get_prefix_schedule_union_map(node);
	host_domain = isl_set_from_union_set(isl_union_map_range(
								host_schedule));

	node = atomic_ancestors(node);

	/* From here on the kernel is owned by "id" (through the
	 * free_user callback); the mark node holds a second reference.
	 */
	id = isl_id_alloc(gen->ctx, "kernel", kernel);
	id = isl_id_set_free_user(id, &ppcg_kernel_free_wrap);
	node = isl_schedule_node_insert_mark(node, isl_id_copy(id));

	if (!single_statement)
		node = group_statements(node, kernel->id);

	node = isl_schedule_node_child(node, 0);
	node = split_band(node, kernel->n_grid);
	kernel->block_ids = ppcg_scop_generate_names(gen->prog->scop,
						kernel->n_grid, "b");
	kernel->block_filter = set_schedule_modulo(node, kernel->block_ids,
						kernel->grid_dim);
	kernel->grid_size = extract_grid_size(kernel,
						isl_union_set_copy(domain));
	if (!kernel->options->wrap)
		node = snap_band_to_sizes(node, kernel->grid_dim,
						kernel->options);
	if (scale)
		node = scale_band(node, isl_multi_val_copy(sizes));
	node = isl_schedule_node_parent(node);
	/* Step over the extra node introduced by group_statements. */
	if (!single_statement)
		node = isl_schedule_node_parent(node);
	node = insert_guard(node, kernel->context, kernel->grid_size,
				gen->prog->scop);
	node = gpu_tree_move_down_to_thread(node, kernel->core);
	node = isl_schedule_node_child(node, 0);
	node = split_band(node, kernel->n_block);
	kernel->thread_ids = ppcg_scop_generate_names(gen->prog->scop,
						kernel->n_block, "t");
	kernel->thread_filter = set_schedule_modulo(node, kernel->thread_ids,
						kernel->block_dim);
	if (extract_block_size(kernel, domain) < 0)
		node = isl_schedule_node_free(node);

	node = gpu_tree_move_up_to_kernel(node);
	node = isl_schedule_node_child(node, 0);
	node = insert_context(kernel, node);
	node = isl_schedule_node_child(node, 0);
	node = isl_schedule_node_insert_filter(node,
				    isl_union_set_copy(kernel->block_filter));

	node = gpu_tree_move_up_to_kernel(node);

	if (gpu_group_references(kernel, node) < 0)
		node = isl_schedule_node_free(node);
	localize_bounds(kernel, host_domain);
	isl_set_free(host_domain);

	check_shared_memory_bound(kernel);
	mark_global_arrays(kernel);
	compute_group_tilings(kernel);

	node = gpu_tree_move_down_to_thread(node, kernel->core);
	node = isl_schedule_node_child(node, 0);
	if (!kernel->options->wrap)
		node = snap_band_to_sizes(node, kernel->block_dim,
						kernel->options);
	node = isl_schedule_node_insert_filter(node,
				    isl_union_set_copy(kernel->thread_filter));
	if (kernel_requires_unroll(kernel)) {
		node = isl_schedule_node_child(node, 0);
		node = unroll(node);
	}

	node = gpu_tree_move_up_to_thread(node);
	kernel->copy_schedule_dim = isl_schedule_node_get_schedule_depth(node);
	kernel->copy_schedule =
		isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(node);
	contraction = isl_union_pw_multi_aff_copy(kernel->contraction);
	kernel->copy_schedule =
		isl_union_pw_multi_aff_pullback_union_pw_multi_aff(
					    kernel->copy_schedule, contraction);

	node = gpu_tree_move_up_to_kernel(node);

	node = add_sync(kernel, node);
	node = add_copies(kernel, node);

	node = gpu_tree_move_down_to_shared(node, kernel->core);
	node = isl_schedule_node_delete(node);

	node = gpu_tree_move_down_to_thread(node, kernel->core);
	node = isl_schedule_node_delete(node);

	node = gpu_tree_move_up_to_kernel(node);

	if (create_kernel_vars(kernel) < 0)
		node = isl_schedule_node_free(node);

	if (!single_statement)
		node = isl_schedule_node_parent(node);
	node = isl_schedule_node_parent(node);

	/* If no identifier could be created, then nothing owns the kernel
	 * and it has to be released explicitly.  Test "id" before dropping
	 * our reference so that the pointer is never inspected after
	 * it has been released.
	 */
	if (!id)
		ppcg_kernel_free(kernel);
	isl_id_free(id);
	return node;
}

/* Insert a permutable band without any members at "node".
 * The band is defined over the parameter space of the domain
 * of the schedule that "node" belongs to.
 */
static __isl_give isl_schedule_node *insert_empty_permutable_band(
	__isl_take isl_schedule_node *node)
{
	isl_schedule *sched;
	isl_union_set *all;
	isl_space *params;
	isl_multi_union_pw_aff *zero;

	sched = isl_schedule_node_get_schedule(node);
	all = isl_schedule_get_domain(sched);
	isl_schedule_free(sched);
	params = isl_union_set_get_space(all);
	isl_union_set_free(all);

	zero = isl_multi_union_pw_aff_zero(isl_space_set_from_params(params));
	node = isl_schedule_node_insert_partial_schedule(node, zero);

	return isl_schedule_node_band_set_permutable(node, 1);
}

/* Attempt hybrid tiling on "node" combined with its parent.
 * On success, the updated schedule tree is returned.
 * If hybrid tiling does not apply, the original schedule tree
 * is returned unchanged.
 * NULL is returned on error.
 *
 * The basic structural requirements on "node" and its parent
 * are checked first.
 * If they are met, the relative dependence distances of "node"
 * with respect to its parent are computed and checked for
 * being sufficiently bounded.
 * Only then is hybrid tiling applied, with user specified tile sizes.
 *
 * The tile sizes are read before the dependence distance bounds are
 * computed because the user may have specified fewer sizes than
 * there are dimensions.  The excess schedule dimensions are then
 * split off and the dependence distances need to be computed
 * on what remains after this split.
 *
 * The tentative modifications are performed on a copy of "node"
 * such that the original can still be returned if the bounds
 * turn out not to be valid.
 */
static __isl_give isl_schedule_node *try_hybrid_tile(struct gpu_gen *gen,
	__isl_take isl_schedule_node *node)
{
	int n_size;
	int *sizes;
	isl_bool status;
	isl_schedule_node *orig;
	ppcg_ht_bounds *bounds;

	status = ppcg_ht_parent_has_input_pattern(node);
	if (status < 0)
		return isl_schedule_node_free(node);
	if (!status)
		return node;

	n_size = 1 + isl_schedule_node_band_n_member(node);
	sizes = read_tile_sizes(gen, &n_size);
	if (!sizes)
		return isl_schedule_node_free(node);

	orig = node;
	node = isl_schedule_node_copy(orig);
	node = split_band(node, n_size - 1);
	node = isl_schedule_node_parent(node);
	bounds = ppcg_ht_compute_bounds(gen->prog->scop, node);
	node = isl_schedule_node_child(node, 0);

	status = ppcg_ht_bounds_is_valid(bounds);
	if (status > 0)
		node = gpu_hybrid_tile(gen, node, bounds, sizes);
	else
		ppcg_ht_bounds_free(bounds);
	free(sizes);

	if (status == 0) {
		/* Not applicable: drop the working copy. */
		isl_schedule_node_free(node);
		return orig;
	}
	isl_schedule_node_free(orig);
	if (status < 0)
		return isl_schedule_node_free(node);
	return node;
}

/* If "node" is the outermost permutable band that can be mapped to block and
 * thread identifiers in its branch (or the root of a subtree with
 * no such outer bands),
 * then mark the band as such, attaching a ppcg_kernel to the mark.
 *
 * If hybrid tiling is allowed, then first try and apply it
 * to "node" and its parent.  If try_hybrid_tile returns anything
 * other than the original node (a tiled tree or NULL), then
 * that result is returned directly.
 *
 * If "node" is the root of a subtree without permutable bands,
 * then insert a zero-dimensional permutable band such that
 * we can assume that "node" always points to a band node.
 * This includes the case where "node" already points to a band node,
 * but one without any coincident dimension.  In this case,
 * the extra node ensures that this original node does not get tiled.
 *
 * Tile "node" using user specified tile sizes, after splitting the band
 * if the number of specified tile sizes is smaller than the dimension
 * of the band.  Mark the point band of this tiling as the band that
 * needs to be mapped to threads and instruct the AST generator to unroll
 * the band if the "unroll_gpu_tile" option is set.
 * Create a kernel representing the domain instances that reach "node" and
 * insert a mark node pointing to the ppcg_kernel before the band node.
 *
 * "user" points to the struct gpu_gen.
 */
static __isl_give isl_schedule_node *mark_outer_permutable(
	__isl_take isl_schedule_node *node, void *user)
{
	struct gpu_gen *gen = user;
	int outer;
	int scale;
	int tile_len;
	int *tile_size;
	isl_id *id;
	isl_multi_val *sizes;

	outer = is_outer_tilable(node);
	if (outer < 0)
		return isl_schedule_node_free(node);
	if (!outer)
		return node;

	if (gen->options->hybrid) {
		isl_schedule_node *saved = isl_schedule_node_copy(node);
		int changed;

		node = try_hybrid_tile(gen, node);
		/* Perform the comparison while "saved" still holds
		 * its reference, such that a pointer that may already
		 * have been released is never inspected.
		 */
		changed = node != saved;
		isl_schedule_node_free(saved);
		if (changed)
			return node;
	}

	if (isl_schedule_node_get_type(node) != isl_schedule_node_band ||
	    !isl_schedule_node_band_member_get_coincident(node, 0))
		node = insert_empty_permutable_band(node);

	tile_len = isl_schedule_node_band_n_member(node);
	tile_size = read_tile_sizes(gen, &tile_len);
	if (!tile_size)
		return isl_schedule_node_free(node);
	if (tile_len < isl_schedule_node_band_n_member(node))
		node = isl_schedule_node_band_split(node, tile_len);
	sizes = construct_band_tiles_sizes(node, tile_size);
	node = tile_band(node, isl_multi_val_copy(sizes));
	/* Move to the point band and put the "thread" mark in front of it. */
	node = isl_schedule_node_child(node, 0);
	if (gen->options->unroll_gpu_tile)
		node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
	id = isl_id_alloc(gen->ctx, "thread", NULL);
	node = isl_schedule_node_insert_mark(node, id);
	node = isl_schedule_node_parent(node);

	scale = gen->options->scale_tile_loops;
	node = gpu_create_kernel(gen, node, scale, sizes);
	isl_multi_val_free(sizes);
	free(tile_size);

	return node;
}

/* Given a set or sequence node, return the union of the filters of
 * those direct subtrees that do not contain any suitably permutable
 * bands (according to subtree_has_permutable_bands).
 * If "only_initial" is set, then only the children preceding the first
 * child that does have such bands are considered.
 * Otherwise, all children are considered.
 *
 * The filter of the first child is only used to obtain the space
 * in which the (initially empty) result is constructed.
 */
static __isl_give isl_union_set *get_non_parallel_subtree_filters(
	__isl_keep isl_schedule_node *node, int only_initial)
{
	int pos, n_child;
	isl_space *space;
	isl_union_set *filter;

	n_child = isl_schedule_node_n_children(node);
	if (n_child < 0)
		return NULL;

	node = isl_schedule_node_copy(node);
	node = isl_schedule_node_child(node, 0);
	filter = isl_schedule_node_filter_get_filter(node);
	node = isl_schedule_node_parent(node);
	space = isl_union_set_get_space(filter);
	isl_union_set_free(filter);
	filter = isl_union_set_empty(space);

	for (pos = 0; pos < n_child; ++pos) {
		int has_bands;

		node = isl_schedule_node_child(node, pos);
		has_bands = subtree_has_permutable_bands(node);
		if (has_bands > 0 && only_initial)
			break;
		if (has_bands < 0)
			filter = isl_union_set_free(filter);
		if (has_bands == 0)
			filter = isl_union_set_union(filter,
				    isl_schedule_node_filter_get_filter(node));
		node = isl_schedule_node_parent(node);
	}

	isl_schedule_node_free(node);

	return filter;
}

/* Given a set or sequence node, return the union of the filters of
 * the direct subtrees that do not contain any suitably permutable bands
 * (according to subtree_has_permutable_bands).
 *
 * This is get_non_parallel_subtree_filters with "only_initial" cleared,
 * such that every child of "node" is taken into account.
 */
static __isl_give isl_union_set *get_all_non_parallel_subtree_filters(
	__isl_keep isl_schedule_node *node)
{
	return get_non_parallel_subtree_filters(node, 0);
}

/* Given a set or sequence node, return the union of the filters of
 * the initial direct subtrees that do not contain any suitably permutable
 * bands (according to subtree_has_permutable_bands).
 *
 * This is get_non_parallel_subtree_filters with "only_initial" set,
 * such that the scan stops at the first child that does contain
 * such bands.
 */
static __isl_give isl_union_set *get_initial_non_parallel_subtree_filters(
	__isl_keep isl_schedule_node *node)
{
	return get_non_parallel_subtree_filters(node, 1);
}

/* Flag every array that is local to "prog" and that is accessed by
 * the statement instances in "domain" as requiring a declaration
 * in the host code.
 * "domain" describes (a subset of) the instances that are active
 * at "node"; it is expanded through the subtree contraction at "node"
 * before the accessed arrays are determined.
 * Nothing is done if the scop has no hidden declarations.
 * "node" itself is returned unmodified, unless an error occurs,
 * in which case NULL is returned.
 */
static __isl_give isl_schedule_node *declare_accessed_local_variables(
	__isl_take isl_schedule_node *node, struct gpu_prog *prog,
	__isl_keep isl_union_set *domain)
{
	int i;
	isl_union_pw_multi_aff *contraction;
	isl_union_set *arrays;

	if (!ppcg_scop_any_hidden_declarations(prog->scop))
		return node;

	contraction = isl_schedule_node_get_subtree_contraction(node);
	domain = isl_union_set_preimage_union_pw_multi_aff(
			isl_union_set_copy(domain), contraction);
	arrays = accessed_by_domain(domain, prog);

	for (i = 0; i < prog->n_array; ++i) {
		struct gpu_array_info *array = &prog->array[i];
		isl_set *accessed;
		int untouched;

		if (!array->local)
			continue;
		accessed = isl_union_set_extract_set(arrays,
					isl_set_get_space(array->extent));
		untouched = isl_set_plain_is_empty(accessed);
		isl_set_free(accessed);
		if (untouched < 0)
			break;
		if (!untouched)
			array->declare_local = 1;
	}

	isl_union_set_free(arrays);
	/* The loop is only left early in case of error. */
	if (i < prog->n_array)
		return isl_schedule_node_free(node);
	return node;
}

/* Separate the children of "node" that have suitably permutable bands
 * from those that do not.
 *
 * If "node" points to a set node, then all children without such bands
 * are collected and the schedule tree is adjusted such that
 * they are executed after the other children.
 * If "node" points to a sequence node, then only the initial children
 * without such bands are collected and they are kept in front of
 * the remaining subsequence.
 * In both cases, a pointer to the group of children that do have
 * such bands is returned, assuming there are any.
 * Any other type of node is returned as is.
 *
 * All local variables in "prog" that are accessed by the group
 * without permutable bands are marked as requiring a declaration
 * on the host.
 */
static __isl_give isl_schedule_node *isolate_permutable_subtrees(
	__isl_take isl_schedule_node *node, struct gpu_prog *prog)
{
	isl_union_set *serial;

	if (!node)
		return NULL;

	switch (isl_schedule_node_get_type(node)) {
	case isl_schedule_node_set:
		serial = get_all_non_parallel_subtree_filters(node);
		node = declare_accessed_local_variables(node, prog, serial);
		node = isl_schedule_node_order_after(node, serial);
		break;
	case isl_schedule_node_sequence:
		serial = get_initial_non_parallel_subtree_filters(node);
		node = declare_accessed_local_variables(node, prog, serial);
		node = isl_schedule_node_order_before(node, serial);
		break;
	default:
		break;
	}

	return node;
}

/* Overapproximate the range of "copy" on a per array basis:
 * whenever some element of an array of "prog" appears in the range,
 * the corresponding domain elements are made to refer
 * to all elements of that array (as defined by its extent) instead.
 */
static __isl_give isl_union_map *approximate_copy_out(
	__isl_take isl_union_map *copy, struct gpu_prog *prog)
{
	int a;
	isl_union_map *approx;

	approx = isl_union_map_empty(isl_union_map_get_space(copy));

	for (a = 0; a < prog->n_array; ++a) {
		isl_union_set *any_element, *whole_array, *instances;
		isl_union_map *touching;

		/* Select the part of "copy" that refers to this array. */
		any_element = isl_union_set_from_set(
			isl_set_universe(isl_space_copy(prog->array[a].space)));
		touching = isl_union_map_intersect_range(
				isl_union_map_copy(copy), any_element);
		instances = isl_union_map_domain(touching);

		/* Let its domain refer to the entire extent instead. */
		whole_array = isl_union_set_from_set(
				isl_set_copy(prog->array[a].extent));
		approx = isl_union_map_union(approx,
			    isl_union_map_from_domain_and_range(instances,
								whole_array));
	}

	isl_union_map_free(copy);

	return approx;
}

/* Insert "kernel" marks that point to a ppcg_kernel structure
 * in front of all outermost tilable bands that (by construction)
 * have at least one parallel loop.
 *
 * The descendants of "node" are visited bottom-up and
 * mark_outer_permutable is called on each of them,
 * with "gen" passed along as user pointer.
 */
static __isl_give isl_schedule_node *mark_kernels(struct gpu_gen *gen,
	__isl_take isl_schedule_node *node)
{
	return isl_schedule_node_map_descendant_bottom_up(node,
						&mark_outer_permutable, gen);
}

/* Build the schedule constraints for "prog" from the dependences
 * in prog->scop and the array order dependences in prog->array_order.
 *
 * Without live range reordering, the union of the flow and
 * the false dependences (coalesced) serves as validity, proximity and
 * coincidence constraints alike.
 *
 * With live range reordering, live ranges on arrays must not be run
 * in parallel since that would require array expansion.
 * The array order dependences are therefore added to the coincidence
 * constraints, such that a non-zero array order dependence prevents
 * a schedule dimension from being considered parallel.
 * Live ranges derived from scalars may be run in parallel
 * since the scalars are forced to be mapped to private memory in
 * check_scalar_live_ranges.
 * The false dependences are not added to the validity constraints
 * in this case, as that would prevent reordering.
 * Instead, the external false dependences that enforce that reads
 * from potentially live-in data precede any later write and
 * that writes of potentially live-out data follow any other earlier write
 * are added to both the validity and the coincidence constraints.
 * The false dependences are still part of the proximity constraints
 * for consistency with the case without live range reordering.
 * The coincidence constraints then consist of flow dependences,
 * external false dependences and array order dependences.
 * The independences are filtered out from the first two sets.
 * They have already been filtered out from the array order dependences
 * on a per array basis in collect_order_dependences.
 * The other two sets do not need a per array treatment
 * as there should be no flow or external false dependence on local
 * variables that can be filtered out.
 */
static __isl_give isl_schedule_constraints *construct_schedule_constraints(
	struct gpu_prog *prog)
{
	isl_union_map *validity, *proximity, *coincidence;
	isl_schedule_constraints *sc;

	sc = isl_schedule_constraints_on_domain(
				isl_union_set_copy(prog->scop->domain));
	sc = isl_schedule_constraints_set_context(sc,
				isl_set_copy(prog->scop->context));

	if (!prog->scop->options->live_range_reordering) {
		validity = isl_union_map_copy(prog->scop->dep_false);
		validity = isl_union_map_union(validity,
				isl_union_map_copy(prog->scop->dep_flow));
		validity = isl_union_map_coalesce(validity);
		proximity = isl_union_map_copy(validity);
		coincidence = isl_union_map_copy(validity);
	} else {
		sc = isl_schedule_constraints_set_conditional_validity(sc,
			isl_union_map_copy(prog->scop->tagged_dep_flow),
			isl_union_map_copy(prog->scop->tagged_dep_order));

		validity = isl_union_map_copy(prog->scop->dep_flow);
		validity = isl_union_map_union(validity,
			    isl_union_map_copy(prog->scop->dep_forced));

		proximity = isl_union_map_copy(prog->scop->dep_flow);
		proximity = isl_union_map_union(proximity,
			    isl_union_map_copy(prog->scop->dep_false));

		coincidence = isl_union_map_copy(validity);
		coincidence = isl_union_map_subtract(coincidence,
			isl_union_map_copy(prog->scop->independence));
		coincidence = isl_union_map_union(coincidence,
				isl_union_map_copy(prog->array_order));
	}

	sc = isl_schedule_constraints_set_validity(sc, validity);
	sc = isl_schedule_constraints_set_coincidence(sc, coincidence);
	return isl_schedule_constraints_set_proximity(sc, proximity);
}

/* Compute a schedule for the program of "gen".
 *
 * Schedule constraints are derived from the dependences
 * in gen->prog->scop and handed to ppcg_compute_schedule,
 * together with the input schedule and the options,
 * in order to obtain a schedule that has a parallel loop
 * in each tilable band.
 * During the schedule construction, some statement instances
 * may be grouped first based on the input schedule.
 */
static __isl_give isl_schedule *compute_schedule(struct gpu_gen *gen)
{
	isl_schedule_constraints *constraints;

	constraints = construct_schedule_constraints(gen->prog);

	return ppcg_compute_schedule(constraints, gen->prog->scop->schedule,
					gen->options);
}

/* Mark the band node "node" permutable if it consists of
 * a single member.  Bands of any other size are left untouched.
 * "sc" is not used.
 */
static __isl_give isl_schedule_node *band_set_permutable(
	__isl_take isl_schedule_node *node,
	__isl_keep isl_schedule_constraints *sc)
{
	if (isl_schedule_node_band_n_member(node) != 1)
		return node;

	return isl_schedule_node_band_set_permutable(node, 1);
}

/* Extract from the coincidence constraints of "sc" those that relate
 * pairs of instances that are scheduled together by the ancestors
 * of "node", i.e., pairs that are assigned the same value
 * by the prefix schedule at "node".
 * At schedule depth zero, the prefix schedule carries no information
 * and the constraints are instead restricted to those that have
 * both domain and range among the domain elements reaching "node".
 * In both cases, the subtree contraction at "node" is taken
 * into account.
 */
static __isl_give isl_union_map *get_local_coincidence(
	__isl_keep isl_schedule_node *node,
	__isl_keep isl_schedule_constraints *sc)
{
	isl_union_map *coincidence;
	isl_union_pw_multi_aff *contraction;
	isl_union_set *reaching;

	coincidence = isl_schedule_constraints_get_coincidence(sc);
	contraction = isl_schedule_node_get_subtree_contraction(node);

	if (isl_schedule_node_get_schedule_depth(node) != 0) {
		isl_multi_union_pw_aff *outer;

		outer =
		    isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(
									node);
		outer = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(
							outer, contraction);
		return isl_union_map_eq_at_multi_union_pw_aff(coincidence,
								outer);
	}

	reaching = isl_schedule_node_get_domain(node);
	reaching = isl_union_set_preimage_union_pw_multi_aff(reaching,
							contraction);
	coincidence = isl_union_map_intersect_domain(coincidence,
					    isl_union_set_copy(reaching));

	return isl_union_map_intersect_range(coincidence, reaching);
}

/* Determine, for each member of the band node "node", whether it is
 * coincident with respect to the outer nodes and record the result
 * in the band.
 *
 * A member is coincident if every coincidence constraint between
 * a pair of instances that are scheduled together by the outer nodes
 * has its domain and range assigned the same value by that member.
 * This is checked by additionally imposing equal values for the member
 * on the set of local coincidence constraints and verifying that
 * no element is lost in the process.
 *
 * NULL is returned if any of the checks fails with an error.
 */
static __isl_give isl_schedule_node *band_set_coincident(
	__isl_take isl_schedule_node *node,
	__isl_keep isl_schedule_constraints *sc)
{
	int pos, n_member;
	isl_union_map *coincidence;
	isl_multi_union_pw_aff *partial;

	coincidence = get_local_coincidence(node, sc);

	partial = isl_schedule_node_band_get_partial_schedule(node);
	partial = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(partial,
			    isl_schedule_node_get_subtree_contraction(node));

	n_member = isl_schedule_node_band_n_member(node);
	for (pos = 0; pos < n_member; ++pos) {
		int preserved;
		isl_union_map *restricted;
		isl_multi_union_pw_aff *member;

		member = isl_multi_union_pw_aff_from_union_pw_aff(
			isl_multi_union_pw_aff_get_union_pw_aff(partial, pos));
		restricted = isl_union_map_eq_at_multi_union_pw_aff(
				isl_union_map_copy(coincidence), member);
		preserved = isl_union_map_is_subset(coincidence, restricted);
		isl_union_map_free(restricted);

		if (preserved < 0) {
			node = isl_schedule_node_free(node);
			break;
		}
		node = isl_schedule_node_band_member_set_coincident(node, pos,
								preserved);
	}

	isl_multi_union_pw_aff_free(partial);
	isl_union_map_free(coincidence);

	return node;
}

/* Set the properties of "node" in case it is a band node
 * with at least one member.  Any other node is returned as is.
 *
 * A band with exactly one member is marked permutable and
 * the band members are marked coincident based on the coincidence
 * constraints of the schedule constraints that "user" points to.
 */
static __isl_give isl_schedule_node *set_band_properties(
	__isl_take isl_schedule_node *node, void *user)
{
	isl_schedule_constraints *sc = user;

	if (isl_schedule_node_get_type(node) != isl_schedule_node_band ||
	    isl_schedule_node_band_n_member(node) == 0)
		return node;

	return band_set_coincident(band_set_permutable(node, sc), sc);
}

/* Return a copy of the original schedule in which the band properties
 * have been set by set_band_properties, i.e., with single-member bands
 * marked permutable and band members marked coincident based on
 * the coincidence constraints.
 * The bands are explicitly marked permutable so that they will be
 * considered by mark_outer_permutable.
 */
static __isl_give isl_schedule *determine_properties_original_schedule(
	struct gpu_gen *gen)
{
	isl_schedule_constraints *constraints;
	isl_schedule *annotated;

	constraints = construct_schedule_constraints(gen->prog);
	annotated = isl_schedule_map_schedule_node_bottom_up(
				isl_schedule_copy(gen->prog->scop->schedule),
				&set_band_properties, constraints);
	isl_schedule_constraints_free(constraints);

	return annotated;
}

/* Depending on the value of the "reschedule" option, either
 * compute a new schedule or only determine the properties
 * of the original schedule.
 * "user" points to the struct gpu_gen.
 */
static __isl_give isl_schedule *compute_or_set_properties(void *user)
{
	struct gpu_gen *gen = user;

	if (!gen->options->reschedule)
		return determine_properties_original_schedule(gen);

	return compute_schedule(gen);
}

/* Obtain a schedule for the scop, by reading it from
 * a file, by computing one or by determining the properties
 * of the original schedule.
 *
 * The selection is delegated to ppcg_get_schedule, which is handed
 * compute_or_set_properties as callback, with "gen" as its argument.
 */
static __isl_give isl_schedule *get_schedule(struct gpu_gen *gen)
{
	return ppcg_get_schedule(gen->ctx, gen->options,
				&compute_or_set_properties, gen);
}

/* Return the string "<a>_<b>", constructed through
 * an isl string printer created in "ctx".
 */
static char *concat(isl_ctx *ctx, const char *a, const char *b)
{
	isl_printer *printer;
	char *joined;

	printer = isl_printer_to_str(ctx);
	printer = isl_printer_print_str(
		    isl_printer_print_str(
			isl_printer_print_str(printer, a), "_"), b);
	joined = isl_printer_get_str(printer);
	isl_printer_free(printer);

	return joined;
}

/* Construct a list with one zero-dimensional universe set for each
 * array in "prog" that is not a read only scalar and of which
 * some element appears in "accessed".
 * The tuple identifier of such a set is called
 * "<prefix>_<name of array>" and has a user pointer pointing
 * to the array (gpu_array_info).
 *
 * Each selected array is marked global and, if it is local to "prog",
 * it is also marked as requiring a declaration in the host code.
 *
 * The list is freed if the emptiness check fails for some array.
 */
static __isl_give isl_union_set_list *create_copy_filters(struct gpu_prog *prog,
	const char *prefix, __isl_take isl_union_set *accessed)
{
	int a;
	isl_ctx *ctx = prog->ctx;
	isl_union_set_list *list;

	list = isl_union_set_list_alloc(ctx, 0);
	for (a = 0; a < prog->n_array; ++a) {
		struct gpu_array_info *info = &prog->array[a];
		isl_set *elements;
		isl_space *stmt_space;
		isl_id *stmt_id;
		char *stmt_name;
		int untouched;

		if (gpu_array_is_read_only_scalar(info))
			continue;

		elements = isl_union_set_extract_set(accessed,
					isl_space_copy(info->space));
		untouched = isl_set_plain_is_empty(elements);
		isl_set_free(elements);
		if (untouched < 0) {
			list = isl_union_set_list_free(list);
			break;
		}
		if (untouched)
			continue;

		info->global = 1;
		if (info->local)
			info->declare_local = 1;

		stmt_name = concat(ctx, prefix, info->name);
		if (stmt_name)
			stmt_id = isl_id_alloc(ctx, stmt_name, info);
		else
			stmt_id = NULL;
		free(stmt_name);

		stmt_space = isl_space_set_alloc(ctx, 0, 0);
		stmt_space = isl_space_set_tuple_id(stmt_space, isl_dim_set,
							stmt_id);
		list = isl_union_set_list_add(list,
			isl_union_set_from_set(isl_set_universe(stmt_space)));
	}
	isl_union_set_free(accessed);

	return list;
}

/* Ensure that the statements in "filters", which copy arrays
 * to or from the device, only get code generated for them
 * when the corresponding array has a positive size.
 * To this end, a set node with "filters" as children is added
 * underneath "graft" and each child is given a guard that selects
 * the parameter values for which the corresponding array
 * has a positive size.
 * The array is found in the user pointer of the statement identifier.
 * "depth" is the schedule depth of the position where "graft"
 * will be added.
 */
static __isl_give isl_schedule_node *insert_positive_size_guards(
	__isl_take isl_schedule_node *graft,
	__isl_take isl_union_set_list *filters, int depth)
{
	int pos, n_filter;

	graft = isl_schedule_node_child(graft, 0);
	graft = isl_schedule_node_insert_set(graft, filters);
	n_filter = isl_schedule_node_n_children(graft);
	for (pos = 0; pos < n_filter; ++pos) {
		isl_set *stmt, *positive;
		isl_id *stmt_id;
		struct gpu_array_info *info;

		graft = isl_schedule_node_child(graft, pos);

		/* Recover the array from the statement identifier. */
		stmt = isl_set_from_union_set(
				isl_schedule_node_filter_get_filter(graft));
		stmt_id = isl_set_get_tuple_id(stmt);
		isl_set_free(stmt);
		info = isl_id_get_user(stmt_id);
		isl_id_free(stmt_id);

		positive = isl_set_from_params(
				gpu_array_positive_size_guard(info));
		positive = isl_set_add_dims(positive, isl_dim_set, depth);

		/* Insert the guard below the filter and move back up
		 * to the set node.
		 */
		graft = isl_schedule_node_child(graft, 0);
		graft = isl_schedule_node_insert_guard(graft, positive);
		graft = isl_schedule_node_parent(graft);
		graft = isl_schedule_node_parent(graft);
	}

	return isl_schedule_node_parent(graft);
}

/* Construct a graft that copies arrays to or from the device,
 * guarded such that the copy only happens when the array has
 * a strictly positive size.
 * Each statement is called "<prefix>_<name of array>" and
 * its identifier has a user pointer pointing to the array.
 * The graft is meant to be added at the position specified by "node".
 * "copy" contains the array elements that need to be copied.
 * Only arrays of which some elements need to be copied
 * get a corresponding statement in the graft.
 * Note though that each such statement copies the entire array.
 */
static __isl_give isl_schedule_node *create_copy_device(struct gpu_prog *prog,
	__isl_keep isl_schedule_node *node, const char *prefix,
	__isl_take isl_union_set *copy)
{
	isl_ctx *ctx = prog->ctx;
	int depth;
	isl_space *outer;
	isl_union_set *stmts;
	isl_union_set_list *filters;
	isl_union_map *extension;
	isl_schedule_node *graft;

	depth = isl_schedule_node_get_schedule_depth(node);
	filters = create_copy_filters(prog, prefix, copy);
	stmts = isl_union_set_list_union(isl_union_set_list_copy(filters));

	/* The extension maps the outer schedule space to the statements. */
	if (depth < 0)
		outer = NULL;
	else
		outer = isl_space_set_alloc(ctx, 0, depth);
	extension = isl_union_map_from_domain_and_range(
			isl_union_set_from_set(isl_set_universe(outer)), stmts);
	graft = isl_schedule_node_from_extension(extension);

	if (!filters)
		return isl_schedule_node_free(graft);
	if (isl_union_set_list_n_union_set(filters) != 0)
		return insert_positive_size_guards(graft, filters, depth);

	/* Nothing to copy: return the empty extension as is. */
	isl_union_set_list_free(filters);
	return graft;
}

/* Collect (the universe sets of) those arrays that are declared
 * inside the scop corresponding to "prog" and for which every
 * potential write inside the scop is performed by an element of "domain".
 */
static __isl_give isl_union_set *extract_local_accesses(struct gpu_prog *prog,
	__isl_keep isl_union_set *domain)
{
	int pos;
	isl_union_set *result;

	result = isl_union_set_empty(isl_union_set_get_space(domain));

	for (pos = 0; pos < prog->n_array; ++pos) {
		struct gpu_array_info *info = &prog->array[pos];
		isl_set *universe;
		isl_union_map *outer_map, *writes;
		isl_union_set *members, *writers;
		int covered;

		if (!info->local)
			continue;

		universe = isl_set_universe(isl_space_copy(info->space));

		/* All (inner) spaces that map to this outer array. */
		outer_map = isl_union_map_intersect_range(
				isl_union_map_copy(prog->to_outer),
				isl_union_set_from_set(isl_set_copy(universe)));
		members = isl_union_map_domain(outer_map);

		/* Statement instances that may write to any of them. */
		writes = isl_union_map_intersect_range(
				isl_union_map_copy(prog->may_write), members);
		writers = isl_union_map_domain(writes);
		covered = isl_union_set_is_subset(writers, domain);
		isl_union_set_free(writers);

		if (covered < 0) {
			isl_set_free(universe);
			return isl_union_set_free(result);
		}
		if (covered)
			result = isl_union_set_add_set(result, universe);
		else
			isl_set_free(universe);
	}

	return result;
}

/* Internal data structure for node_may_persist.
 * It is filled in by node_may_persist and updated by
 * update_may_persist_at while visiting the ancestors of a node top-down.
 *
 * "tagger" maps tagged iteration domains to the corresponding untagged
 *	iteration domain.
 *
 * "may_persist_flow" is the set of all tagged dataflow dependences
 * with those dependences removed that either precede or follow
 * the kernel launch in a sequence.
 * "inner_band_flow" is the set of all tagged dataflow dependences
 * that are local to a given iteration of the outer band nodes
 * with respect to the current node.
 * "local_flow" is equal to "inner_band_flow", except that the domain
 * and the range have been intersected with intermediate filters
 * on children of sets or sequences.
 */
struct ppcg_may_persist_data {
	/* Not owned by this structure; copied before each use. */
	isl_union_pw_multi_aff *tagger;

	/* Restricted at band nodes and at filter children of
	 * set/sequence nodes.
	 */
	isl_union_map *local_flow;
	/* Snapshot of "local_flow" taken at the most recent band node. */
	isl_union_map *inner_band_flow;
	/* Dependences that may still cross a kernel launch. */
	isl_union_map *may_persist_flow;
};

/* Update the information in "data" based on the band ancestor "node".
 *
 * The dependences in data->local_flow are restricted to those
 * where the source and the sink are scheduled in the same iteration
 * of the given band node.  data->inner_band_flow is then replaced
 * by a copy of the updated data->local_flow.
 * Bands without members do not change anything.
 */
static int update_may_persist_at_band(__isl_keep isl_schedule_node *node,
	struct ppcg_may_persist_data *data)
{
	isl_multi_union_pw_aff *sched;

	if (isl_schedule_node_band_n_member(node) == 0)
		return 0;

	/* Express the band schedule in terms of tagged leaf instances. */
	sched = isl_schedule_node_band_get_partial_schedule(node);
	sched = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(sched,
			isl_schedule_node_get_subtree_contraction(node));
	sched = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(sched,
			isl_union_pw_multi_aff_copy(data->tagger));

	data->local_flow = isl_union_map_eq_at_multi_union_pw_aff(
						data->local_flow, sched);

	isl_union_map_free(data->inner_band_flow);
	data->inner_band_flow = isl_union_map_copy(data->local_flow);

	return 0;
}

/* Given a set of local reaching domain elements "domain",
 * first expand them to the corresponding leaf domain elements
 * by taking the preimage under "contraction" and then introduce
 * the array reference tags by taking the preimage under data->tagger.
 */
static __isl_give isl_union_set *expand_and_tag(
	__isl_take isl_union_set *domain,
	__isl_take isl_union_pw_multi_aff *contraction,
	struct ppcg_may_persist_data *data)
{
	isl_union_pw_multi_aff *tagger;

	domain = isl_union_set_preimage_union_pw_multi_aff(domain,
							    contraction);
	tagger = isl_union_pw_multi_aff_copy(data->tagger);

	return isl_union_set_preimage_union_pw_multi_aff(domain, tagger);
}

/* Given a filter node that is the child of a set or sequence node,
 * restrict both the domain and the range of data->local_flow
 * to the elements selected by the filter of the node.
 * "contraction" maps the leaf domain elements of the schedule tree
 * to the corresponding domain elements at (the parent of) "node".
 */
static int filter_flow(__isl_keep isl_schedule_node *node,
	struct ppcg_may_persist_data *data,
	__isl_take isl_union_pw_multi_aff *contraction)
{
	isl_union_set *selected;

	selected = isl_schedule_node_filter_get_filter(node);
	selected = expand_and_tag(selected, contraction, data);

	data->local_flow = isl_union_map_intersect_domain(data->local_flow,
					isl_union_set_copy(selected));
	data->local_flow = isl_union_map_intersect_range(data->local_flow,
					selected);

	return 0;
}

/* Given a filter node "node", walk over all preceding siblings
 * (which are also filter nodes), add their filters to "filters" and
 * return the result.
 */
static __isl_give isl_union_set *add_previous_filters(
	__isl_take isl_union_set *filters, __isl_keep isl_schedule_node *node)
{
	isl_schedule_node *cur;

	for (cur = isl_schedule_node_copy(node);
	     cur && isl_schedule_node_has_previous_sibling(cur); ) {
		cur = isl_schedule_node_previous_sibling(cur);
		filters = isl_union_set_union(filters,
				isl_schedule_node_filter_get_filter(cur));
	}

	/* A NULL cursor means that moving between siblings failed. */
	if (!cur)
		return isl_union_set_free(filters);
	isl_schedule_node_free(cur);

	return filters;
}

/* Given a filter node "node", walk over all following siblings
 * (which are also filter nodes), add their filters to "filters" and
 * return the result.
 */
static __isl_give isl_union_set *add_next_filters(
	__isl_take isl_union_set *filters, __isl_keep isl_schedule_node *node)
{
	isl_schedule_node *cur;

	for (cur = isl_schedule_node_copy(node);
	     cur && isl_schedule_node_has_next_sibling(cur); ) {
		cur = isl_schedule_node_next_sibling(cur);
		filters = isl_union_set_union(filters,
				isl_schedule_node_filter_get_filter(cur));
	}

	/* A NULL cursor means that moving between siblings failed. */
	if (!cur)
		return isl_union_set_free(filters);
	isl_schedule_node_free(cur);

	return filters;
}

/* Remove from data->may_persist_flow those flow dependences
 * that flow between elements of "domain" within the same iteration
 * of all outer band nodes, i.e., the dependences of data->local_flow
 * that have both their source and their sink in "domain".
 * "contraction" maps the leaf domain elements of the schedule tree
 * to the corresponding elements "domain".
 */
static void remove_external_flow(struct ppcg_may_persist_data *data,
	__isl_take isl_union_set *domain,
	__isl_keep isl_union_pw_multi_aff *contraction)
{
	isl_union_set *tagged;
	isl_union_map *inside;

	tagged = expand_and_tag(domain,
			isl_union_pw_multi_aff_copy(contraction), data);

	inside = isl_union_map_copy(data->local_flow);
	inside = isl_union_map_intersect_domain(inside,
						isl_union_set_copy(tagged));
	inside = isl_union_map_intersect_range(inside, tagged);

	data->may_persist_flow = isl_union_map_subtract(data->may_persist_flow,
							inside);
}

/* Update the information in "data" based on the filter ancestor "node".
 * Nothing needs to be done unless the filter is the child
 * of a set or sequence node.
 *
 * In the case of a sequence, the dependences between
 * statement instances that are both executed either before or
 * after the subtree that will be mapped to a kernel, within
 * the same iteration of outer bands, are removed from
 * data->may_persist_flow.
 *
 * For both sets and sequences, data->local_flow is restricted
 * to the current child.
 */
static int update_may_persist_at_filter(__isl_keep isl_schedule_node *node,
	struct ppcg_may_persist_data *data)
{
	enum isl_schedule_node_type parent_type;
	isl_schedule_node *up;
	isl_union_pw_multi_aff *contraction;

	parent_type = isl_schedule_node_get_parent_type(node);
	if (parent_type != isl_schedule_node_sequence &&
	    parent_type != isl_schedule_node_set)
		return 0;

	up = isl_schedule_node_parent(isl_schedule_node_copy(node));
	contraction = isl_schedule_node_get_subtree_contraction(up);
	isl_schedule_node_free(up);

	if (parent_type == isl_schedule_node_sequence) {
		isl_space *space;
		isl_union_set *own, *earlier, *later;

		/* Only the space of the filter is needed here. */
		own = isl_schedule_node_filter_get_filter(node);
		space = isl_union_set_get_space(own);
		isl_union_set_free(own);

		earlier = isl_union_set_empty(space);
		later = isl_union_set_copy(earlier);
		earlier = add_previous_filters(earlier, node);
		later = add_next_filters(later, node);

		remove_external_flow(data, earlier, contraction);
		remove_external_flow(data, later, contraction);
	}

	/* filter_flow takes ownership of "contraction". */
	return filter_flow(node, data, contraction);
}

/* Update the information in "data" (passed through "user")
 * based on the ancestor "node".
 * Only band and filter nodes have an effect; an error node
 * results in an error.
 */
static isl_stat update_may_persist_at(__isl_keep isl_schedule_node *node,
	void *user)
{
	struct ppcg_may_persist_data *data = user;
	int r = 0;

	switch (isl_schedule_node_get_type(node)) {
	case isl_schedule_node_error:
		return isl_stat_error;
	case isl_schedule_node_band:
		r = update_may_persist_at_band(node, data);
		break;
	case isl_schedule_node_filter:
		r = update_may_persist_at_filter(node, data);
		break;
	case isl_schedule_node_context:
	case isl_schedule_node_domain:
	case isl_schedule_node_expansion:
	case isl_schedule_node_extension:
	case isl_schedule_node_guard:
	case isl_schedule_node_leaf:
	case isl_schedule_node_mark:
	case isl_schedule_node_sequence:
	case isl_schedule_node_set:
		break;
	}

	return r < 0 ? isl_stat_error : isl_stat_ok;
}

/* Determine the set of array elements that may need to be preserved
 * by a kernel constructed from the subtree at "node".
 * This includes the set of array elements that may need to be preserved
 * by the entire scop (prog->may_persist) and the elements for which
 * there is a potential flow dependence that may cross a kernel launch.
 *
 * To determine the second set, we start from all flow dependences.
 * From this set of dependences, we remove those that cannot possibly
 * require data to be preserved by a kernel launch.
 * In particular, we consider the following sets of dependences.
 * - dependences of which the write occurs inside the kernel.
 *   If the data is needed outside the kernel, then it will
 *   be copied out immediately after the kernel launch, so there
 *   is no need for any special care.
 * - dependences of which the read occurs inside the kernel and the
 *   corresponding write occurs inside the same iteration of the
 *   outer band nodes.  This means that the data is needed in
 *   the first kernel launch after the write, which is already
 *   taken care of by the standard copy-in.  That is, the data
 *   do not need to be preserved by any intermediate call to
 *   the same kernel.
 * - dependences of which the write and the read either both occur
 *   before the kernel launch or both occur after the kernel launch,
 *   within the same iteration of the outer band nodes with respect
 *   to the sequence that determines the ordering of the dependence
 *   and the kernel launch.  Such flow dependences cannot cross
 *   any kernel launch.
 *
 * For the remaining (tagged) dependences, we take the domain
 * (i.e., the tagged writes) and apply the tagged access relation
 * to obtain the accessed data elements.
 * These are then combined with the elements that may need to be
 * preserved by the entire scop.
 */
static __isl_give isl_union_set *node_may_persist(
	__isl_keep isl_schedule_node *node, struct gpu_prog *prog)
{
	struct ppcg_may_persist_data data;
	isl_union_set *kernel, *result;
	isl_union_map *crossing, *inner;

	/* All three flow relations start out as the full set
	 * of tagged flow dependences.
	 */
	data.tagger = prog->scop->tagger;
	data.may_persist_flow =
			isl_union_map_copy(prog->scop->tagged_dep_flow);
	data.local_flow = isl_union_map_copy(data.may_persist_flow);
	data.inner_band_flow = isl_union_map_copy(data.may_persist_flow);

	if (isl_schedule_node_foreach_ancestor_top_down(node,
					&update_may_persist_at, &data) < 0)
		data.may_persist_flow =
				    isl_union_map_free(data.may_persist_flow);
	isl_union_map_free(data.local_flow);
	crossing = data.may_persist_flow;

	/* Tagged leaf instances executed inside the kernel. */
	kernel = isl_schedule_node_get_domain(node);
	kernel = isl_union_set_preimage_union_pw_multi_aff(kernel,
			isl_schedule_node_get_subtree_contraction(node));
	kernel = isl_union_set_preimage_union_pw_multi_aff(kernel,
			isl_union_pw_multi_aff_copy(data.tagger));

	/* Drop writes inside the kernel and reads inside the kernel
	 * that are fed from the same outer band iteration.
	 */
	crossing = isl_union_map_subtract_domain(crossing,
						isl_union_set_copy(kernel));
	inner = isl_union_map_intersect_range(data.inner_band_flow, kernel);
	crossing = isl_union_map_subtract(crossing, inner);

	/* Map the remaining tagged writes to the written elements. */
	result = isl_union_map_domain(crossing);
	result = isl_union_set_apply(result,
			isl_union_map_copy(prog->scop->tagged_may_writes));

	return isl_union_set_union(result,
			isl_union_set_copy(prog->may_persist));
}

/* Add nodes for copying outer arrays in and out of the device
 * before and after the subtree "node", which contains one or more kernels.
 * "domain" contains the original statement instances, i.e.,
 * those that correspond to the domains of the access relations in "prog".
 * In particular, the domain has not been contracted in any way.
 * "prefix" contains the prefix schedule at that point, in terms
 * of the same original statement instances.
 *
 * We first compute the sets of outer array elements that need
 * to be copied in and out and then graft in the nodes for
 * performing this copying.
 *
 * In particular, for each array that is possibly written anywhere in
 * the subtree "node" and that may be used after "node"
 * or that may be visible outside the corresponding scop,
 * we copy out its entire extent.
 *
 * Any array element that is read without first being written inside
 * the subtree "node" needs to be copied in.
 * Furthermore, if there are any array elements that
 * are copied out, but that may not be written inside "node", then
 * they also need to be copied in to ensure that the value after execution
 * is the same as the value before execution, at least for those array
 * elements that may have their values preserved by the scop or that
 * may be written before "node" and read after "node".
 * In case the array elements are structures, we need to take into
 * account that all members of the structures need to be written
 * by "node" before we can avoid copying the data structure in.
 *
 * Note that the may_write relation is intersected with the domain,
 * which has been intersected with the context.
 * This helps in those cases where the arrays are declared with a fixed size,
 * while the accesses are parametric and the context assigns a fixed value
 * to the parameters.
 *
 * If an element from a local array is read without first being written,
 * then there is no point in copying it in since it cannot have been
 * written prior to the scop.  Warn about the uninitialized read instead.
 *
 * "node", "domain" and "prefix" are consumed; the updated node is returned.
 */
static __isl_give isl_schedule_node *add_to_from_device(
	__isl_take isl_schedule_node *node, __isl_take isl_union_set *domain,
	__isl_take isl_union_map *prefix, struct gpu_prog *prog)
{
	isl_union_set *local;
	isl_union_set *may_persist;
	isl_union_map *may_write, *must_write, *copy_out, *not_written;
	isl_union_map *read, *copy_in;
	isl_union_map *tagged;
	isl_union_map *local_uninitialized;
	isl_schedule_node *graft;

	/* All tagged accesses: reads together with may-writes. */
	tagged = isl_union_map_copy(prog->scop->tagged_reads);
	tagged = isl_union_map_union(tagged,
			    isl_union_map_copy(prog->scop->tagged_may_writes));

	/* Compute the outer array elements to copy out ("copy_out"),
	 * expressed in terms of the prefix schedule.
	 */
	may_write = isl_union_map_copy(prog->may_write);
	may_write = isl_union_map_intersect_domain(may_write,
					isl_union_set_copy(domain));
	may_write = remove_local_accesses(prog,
					isl_union_map_copy(tagged), may_write,
					isl_union_map_copy(prefix), 0);
	may_write = isl_union_map_apply_range(may_write,
					isl_union_map_copy(prog->to_outer));
	may_write = isl_union_map_apply_domain(may_write,
					isl_union_map_copy(prefix));
	may_write = approximate_copy_out(may_write, prog);
	copy_out = isl_union_map_copy(may_write);
	/* Elements that are copied out, may persist and are not
	 * definitely written need to be copied in as well ("not_written").
	 */
	may_write = isl_union_map_apply_range(may_write,
					isl_union_map_copy(prog->to_inner));
	must_write = isl_union_map_copy(prog->must_write);
	must_write = isl_union_map_apply_domain(must_write,
					isl_union_map_copy(prefix));
	may_persist = node_may_persist(node, prog);
	may_write = isl_union_map_intersect_range(may_write, may_persist);
	not_written = isl_union_map_subtract(may_write, must_write);

	/* Reads that are not preceded by a write inside "node";
	 * "domain" and "tagged" are consumed here.
	 */
	local = extract_local_accesses(prog, domain);
	read = isl_union_map_copy(prog->read);
	read = isl_union_map_intersect_domain(read, domain);
	read = remove_local_accesses(prog, tagged, read,
					isl_union_map_copy(prefix), 1);
	/* Live-in reads of local arrays are reported rather than copied in. */
	local = isl_union_set_apply(local, isl_union_map_copy(prog->to_inner));
	local_uninitialized = isl_union_map_copy(prog->scop->live_in);
	local_uninitialized = isl_union_map_intersect_range(local_uninitialized,
							    local);
	local_uninitialized = isl_union_map_intersect(local_uninitialized,
						    isl_union_map_copy(read));
	if (!isl_union_map_is_empty(local_uninitialized)) {
		fprintf(stderr,
			"possibly uninitialized reads (not copied in):\n");
		isl_union_map_dump(local_uninitialized);
	}
	read = isl_union_map_subtract(read, local_uninitialized);
	/* "prefix" is consumed here. */
	read = isl_union_map_apply_domain(read, prefix);
	copy_in = isl_union_map_union(read, not_written);
	copy_in = isl_union_map_apply_range(copy_in,
				    isl_union_map_copy(prog->to_outer));

	/* Graft the copy statements before and after "node". */
	graft = create_copy_device(prog, node, "to_device",
						isl_union_map_range(copy_in));
	node = isl_schedule_node_graft_before(node, graft);
	graft = create_copy_device(prog, node, "from_device",
						isl_union_map_range(copy_out));
	node = isl_schedule_node_graft_after(node, graft);

	return node;
}

/* Graft a zero-dimensional "init_device" statement before "node" and
 * a zero-dimensional "clear_device" statement after "node",
 * for initializing and clearing the device.
 */
static __isl_give isl_schedule_node *add_init_clear_device(
	__isl_take isl_schedule_node *node)
{
	isl_ctx *ctx = isl_schedule_node_get_ctx(node);
	isl_space *stmt;
	isl_schedule_node *extra;

	stmt = isl_space_set_alloc(ctx, 0, 0);
	stmt = isl_space_set_tuple_name(stmt, isl_dim_set, "init_device");
	extra = isl_schedule_node_from_domain(
			isl_union_set_from_set(isl_set_universe(stmt)));
	node = isl_schedule_node_graft_before(node, extra);

	stmt = isl_space_set_alloc(ctx, 0, 0);
	stmt = isl_space_set_tuple_name(stmt, isl_dim_set, "clear_device");
	extra = isl_schedule_node_from_domain(
			isl_union_set_from_set(isl_set_universe(stmt)));
	node = isl_schedule_node_graft_after(node, extra);

	return node;
}

/* Update "schedule" for mapping to a GPU device.
 *
 * In particular, insert a context node, create kernels for
 * each outermost tilable band and introduce nodes for copying arrays
 * in and out of the device and for initializing and clearing the device.
 * If the child of the initial root points to a set node,
 * then children of this node that do not contain any tilable bands
 * are separated from the other children and are not mapped to
 * the device.
 *
 * The GPU code is generated in a context where at least one
 * statement instance is executed.  The corresponding guard is inserted
 * around the entire schedule.
 *
 * "schedule" is consumed and the updated schedule is returned.
 * Note that gen->prog->context is intersected with the guard
 * as a side effect.
 */
static __isl_give isl_schedule *map_to_device(struct gpu_gen *gen,
	__isl_take isl_schedule *schedule)
{
	isl_schedule_node *node;
	isl_set *context;
	isl_set *guard;
	isl_union_set *domain;
	isl_union_map *prefix;
	isl_union_pw_multi_aff *contraction;
	struct gpu_prog *prog;

	/* The context node uses the program context as it is
	 * before the guard is added to it below.
	 */
	context = isl_set_copy(gen->prog->context);
	context = isl_set_from_params(context);
	schedule = isl_schedule_insert_context(schedule, context);

	/* The guard holds the parameter values for which
	 * the iteration domain is non-empty.
	 */
	prog = gen->prog;
	guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain));
	prog->context = isl_set_intersect(prog->context, isl_set_copy(guard));
	guard = isl_set_from_params(guard);

	/* Move down two levels from the root,
	 * past the context node inserted above.
	 */
	node = isl_schedule_get_root(schedule);
	isl_schedule_free(schedule);
	node = isl_schedule_node_child(node, 0);
	node = isl_schedule_node_child(node, 0);
	node = isolate_permutable_subtrees(node, gen->prog);
	/* Express the domain and the prefix schedule in terms
	 * of the original (uncontracted) statement instances.
	 */
	domain = isl_schedule_node_get_domain(node);
	contraction = isl_schedule_node_get_subtree_contraction(node);
	domain = isl_union_set_preimage_union_pw_multi_aff(domain,
				    isl_union_pw_multi_aff_copy(contraction));
	prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
	prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix,
				    contraction);
	node = mark_kernels(gen, node);
	node = add_to_from_device(node, domain, prefix, gen->prog);
	/* Go back to the same position below the root, wrap everything
	 * in the guard and add device initialization/clearing inside it.
	 */
	node = isl_schedule_node_root(node);
	node = isl_schedule_node_child(node, 0);
	node = isl_schedule_node_child(node, 0);
	node = isl_schedule_node_insert_guard(node, guard);
	node = isl_schedule_node_child(node, 0);
	node = add_init_clear_device(node);
	schedule = isl_schedule_node_get_schedule(node);
	isl_schedule_node_free(node);

	return schedule;
}

/* Internal data structure for extract_access.
 * "next_access" points to the end of a linked list that is extended
 * by extract_access.
 * "single_expression" is set if the access expressions belong to
 * an expression statement (i.e., a statement without internal control).
 * "any_to_outer" maps all intermediate arrays to their outer arrays.
 */
struct ppcg_extract_access_data {
	/* Location where the next extracted access is to be stored. */
	struct gpu_stmt_access **next_access;
	/* Non-zero if the statement body is a single expression. */
	int single_expression;
	/* Only copied by extract_access, never freed by it. */
	isl_union_map *any_to_outer;
};

/* Given a tagged access relation to a single array "tagged", extract it
 * as a map, taking into account that the input may be empty.
 * If the access relation is empty, then it does not contain
 * any space information, so we try to recover it from the index
 * expression.
 * The space of the index expression is of the form I -> A,
 * with I the statement instances and A the array, or [I -> F] -> A,
 * with F the filters corresponding to arguments.
 * We first drop F, if present, obtaining I -> A.
 * Then we construct I -> R, with R the reference tag,
 * combine the two into I -> [R -> A] and uncurry to obtain
 * the final result [I -> R] -> A.
 * Note that the index expression may have a lower dimension
 * than that of the array, but this dimension is not used
 * if the access relation is empty.
 *
 * "tagged" is consumed on every path.  NULL is returned on error.
 */
static __isl_give isl_map *extract_single_tagged_access(
	__isl_take isl_union_map *tagged, __isl_keep pet_expr *expr)
{
	int empty;
	isl_id *id;
	isl_space *space, *space2;
	isl_multi_pw_aff *index;

	empty = isl_union_map_is_empty(tagged);
	if (empty < 0)
		goto error;
	if (!empty)
		return isl_map_from_union_map(tagged);
	isl_union_map_free(tagged);

	index = pet_expr_access_get_index(expr);
	space = isl_multi_pw_aff_get_space(index);
	isl_multi_pw_aff_free(index);
	if (isl_space_domain_is_wrapping(space))
		space = isl_space_domain_factor_domain(space);
	/* isl_space_domain consumes its argument, so hand it a copy and
	 * keep "space" (I -> A) alive for the range product below.
	 */
	space2 = isl_space_domain(isl_space_copy(space));
	space2 = isl_space_from_domain(space2);
	id = pet_expr_access_get_ref_id(expr);
	space2 = isl_space_set_tuple_id(space2, isl_dim_out, id);
	space = isl_space_range_product(space2, space);
	space = isl_space_uncurry(space);

	return isl_map_empty(space);
error:
	isl_union_map_free(tagged);
	return NULL;
}

/* Does the index expression "index" of "expr" represent an access
 * to a single element?
 * That is, is "index" completely specified?
 *
 * If "expr" accesses elements from different spaces (i.e., fields
 * of a structure), then it does not access a single element.
 * Otherwise, if the single space of the access matches the space
 * of "index", then the index expression is completely specified
 * (no pointer to a lower-dimensional slice of the accessed array)
 * and a single element is being accessed.
 */
static isl_bool complete_index(__isl_keep pet_expr *expr,
	__isl_keep isl_multi_pw_aff *index)
{
	isl_union_map *accesses, *writes;
	isl_map *single;
	isl_space *acc_space, *idx_space;
	isl_bool same;

	/* Combine all may-accesses (reads and writes) of "expr". */
	accesses = pet_expr_access_get_may_read(expr);
	writes = pet_expr_access_get_may_write(expr);
	accesses = isl_union_map_union(accesses, writes);
	if (!accesses)
		return isl_bool_error;
	if (isl_union_map_n_map(accesses) != 1) {
		isl_union_map_free(accesses);
		return isl_bool_false;
	}

	single = isl_map_from_union_map(accesses);
	acc_space = isl_map_get_space(single);
	isl_map_free(single);
	idx_space = isl_multi_pw_aff_get_space(index);

	/* Compare the accessed array tuple with the indexed tuple. */
	same = isl_space_tuple_is_equal(acc_space, isl_dim_out,
					idx_space, isl_dim_out);
	isl_space_free(idx_space);
	isl_space_free(acc_space);

	return same;
}

/* Does "expr" access a single, fixed element (independently of the statement
 * instance)?
 * That is, does it have a completely specified constant index expression?
 *
 * Every output dimension of the index expression needs to consist
 * of a single piece that is constant.
 * Note that it is not sufficient for the index expression to be
 * piecewise constant.  isl_multi_pw_aff_is_cst can therefore not be used.
 *
 * Return isl_bool_error if the index expression cannot be obtained
 * or if any of the checks fails.
 */
static isl_bool accesses_fixed_element(__isl_keep pet_expr *expr)
{
	int i, n;
	isl_multi_pw_aff *index;
	isl_bool fixed = isl_bool_true;

	index = pet_expr_access_get_index(expr);
	/* "index" is a pointer; failure is signalled by NULL. */
	if (!index)
		return isl_bool_error;
	n = isl_multi_pw_aff_dim(index, isl_dim_out);
	for (i = 0; i < n; ++i) {
		isl_pw_aff *pa;

		/* Inspect the i-th index expression, not just the first. */
		pa = isl_multi_pw_aff_get_pw_aff(index, i);
		fixed = isl_pw_aff_n_piece(pa) == 1;
		if (fixed)
			fixed = isl_pw_aff_is_cst(pa);
		isl_pw_aff_free(pa);
		if (fixed < 0 || !fixed)
			break;
	}
	/* Constant indices only identify a single element if they
	 * also specify all dimensions of the accessed array.
	 */
	if (fixed >= 0 && fixed)
		fixed = complete_index(expr, index);
	isl_multi_pw_aff_free(index);

	return fixed;
}

/* Extract a gpu_stmt_access from "expr", append it to the list
 * that ends in *data->next_access and update the end of the list.
 * If the access expression performs a write, then it is considered
 * exact only if it appears in a single expression statement and
 * if its may access relation is equal to its must access relation.
 *
 * The combined set of may accesses may be a union if member accesses
 * are involved, but the entire set is derived from a single reference and
 * therefore from a single index expression.  These accesses therefore
 * all map to the same outer array.
 *
 * Return 0 on success and -1 on error.  The new access is appended
 * to the list even if one of its fields could not be computed.
 */
static int extract_access(__isl_keep pet_expr *expr, void *user)
{
	struct ppcg_extract_access_data *data = user;
	isl_ctx *ctx = pet_expr_get_ctx(expr);
	struct gpu_stmt_access *acc;
	isl_union_map *tagged;
	isl_multi_pw_aff *index;

	acc = isl_alloc_type(ctx, struct gpu_stmt_access);
	if (!acc)
		return -1;
	acc->next = NULL;
	acc->read = pet_expr_access_is_read(expr);
	acc->write = pet_expr_access_is_write(expr);

	/* Tagged may-accesses, expressed in terms of outer arrays. */
	tagged = pet_expr_access_get_tagged_may_read(expr);
	tagged = isl_union_map_union(tagged,
				pet_expr_access_get_tagged_may_write(expr));
	tagged = isl_union_map_apply_range(tagged,
					isl_union_map_copy(data->any_to_outer));

	if (acc->write && data->single_expression) {
		isl_union_map *must, *may;

		/* Compare the untagged may-accesses with the must-writes. */
		may = isl_union_map_domain_factor_domain(
						isl_union_map_copy(tagged));
		must = pet_expr_access_get_must_write(expr);
		acc->exact_write = isl_union_map_is_equal(must, may);
		isl_union_map_free(must);
		isl_union_map_free(may);
	} else {
		/* Pure reads are trivially exact; writes inside
		 * a statement with control are never exact.
		 */
		acc->exact_write = !acc->write;
	}

	index = pet_expr_access_get_index(expr);
	acc->n_index = isl_multi_pw_aff_dim(index, isl_dim_out);
	isl_multi_pw_aff_free(index);

	acc->ref_id = pet_expr_access_get_ref_id(expr);
	acc->tagged_access = extract_single_tagged_access(tagged, expr);
	acc->access = isl_map_domain_factor_domain(
					isl_map_copy(acc->tagged_access));
	acc->fixed_element = accesses_fixed_element(expr);

	/* Link the access in before checking for errors. */
	*data->next_access = acc;
	data->next_access = &acc->next;

	return (acc->access && acc->fixed_element >= 0) ? 0 : -1;
}

/* Build the linked list stmt->accesses of gpu_stmt_access objects,
 * with one element for each access expression in the statement body.
 * "any_to_outer" maps all intermediate arrays to their outer arrays.
 *
 * The return value is that of pet_tree_foreach_access_expr.
 */
static int pet_stmt_extract_accesses(struct gpu_stmt *stmt,
	__isl_keep isl_union_map *any_to_outer)
{
	struct ppcg_extract_access_data info;

	stmt->accesses = NULL;

	info.any_to_outer = any_to_outer;
	info.next_access = &stmt->accesses;
	info.single_expression =
		pet_tree_get_type(stmt->stmt->body) == pet_tree_expr;

	return pet_tree_foreach_access_expr(stmt->stmt->body,
						&extract_access, &info);
}

/* Has statement "stmt" been killed from "scop"?
 * That is, does the instance set of "scop" no longer contain
 * any instances of "stmt"?
 *
 * Return isl_bool_error if either argument is NULL.
 */
static isl_bool is_stmt_killed(struct ppcg_scop *scop, struct pet_stmt *stmt)
{
	isl_set *remaining;
	isl_bool gone;

	if (!scop || !stmt)
		return isl_bool_error;

	remaining = isl_union_set_extract_set(scop->domain,
					isl_set_get_space(stmt->domain));
	gone = isl_set_plain_is_empty(remaining);
	isl_set_free(remaining);

	return gone;
}

/* Return an array of gpu_stmt representing the statements in "scop",
 * in the same order as scop->pet->stmts.
 * Array accesses are not collected for statements that have been killed.
 * On error, the partially constructed array is released through
 * free_stmts and its result is returned.
 */
static struct gpu_stmt *extract_stmts(isl_ctx *ctx, struct ppcg_scop *scop,
	__isl_keep isl_union_map *any_to_outer)
{
	int pos;
	struct gpu_stmt *result;

	result = isl_calloc_array(ctx, struct gpu_stmt, scop->pet->n_stmt);
	if (!result)
		return NULL;

	for (pos = 0; pos < scop->pet->n_stmt; ++pos) {
		struct pet_stmt *pstmt = scop->pet->stmts[pos];
		struct gpu_stmt *entry = &result[pos];
		isl_bool killed;

		entry->id = isl_set_get_tuple_id(pstmt->domain);
		entry->stmt = pstmt;

		killed = is_stmt_killed(scop, pstmt);
		if (killed < 0 ||
		    (!killed &&
		     pet_stmt_extract_accesses(entry, any_to_outer) < 0))
			return free_stmts(result, pos + 1);
	}

	return result;
}

/* Generate CUDA code for "scop" and print it to "p".
 * After generating an AST for the transformed scop as explained below,
 * we call "gen->print" to print the AST in the desired output format
 * to "p".
 *
 * If it turns out that it does not make sense to generate GPU code,
 * then we generate CPU code instead.
 *
 * The declarations of the arrays that are visible outside of the scop
 * are printed outside of the code generated from the schedule,
 * because the generated code may involve a guard around the entire code.
 *
 * We first compute a schedule that respects the dependences
 * of the original program and select the outermost bands
 * of tilable dimensions that have at least one parallel loop.
 * If the --load-schedule option is specified, then the loaded schedule
 * is used instead of a computed schedule.
 *
 * Each of these bands B is then tiled according to "tile" sizes, resulting
 * in two nested bands, with a kernel marker on top
 *
 *		K
 *		|
 *		T
 *		|
 *		P
 *
 * We then split off at most 2 parallel dimensions from the T band and
 * at most 3 parallel dimensions from the P band
 *
 *		K
 *		|
 *		T1
 *		|
 *		T2
 *		|
 *		P1
 *		|
 *		P2
 *
 * A filter is introduced in front of T1 that maps the domain instances
 * to block identifiers.  Similarly, a filter is introduced in front of P1
 * that maps the domain instances to thread identifiers.
 *
 * For each iteration of the T2 band and for each array, we compute
 * the array elements accessed by that iteration, construct a rectangular
 * box around it and shift it to the origin.  The result is used
 * as shared memory for the array.
 *
 * Copying and synchronization statements are added to this schedule tree.
 * In principle, these are added in front of the P1 band, but some of
 * them may get hoisted up to higher levels.
 *
 * The entire AST is then generated from the single resulting schedule tree.
 * During the generation the subtrees at kernel nodes (K) are saved
 * aside and replaced by kernel calls.  The result is printed as host code
 * while the saved subtrees are printed as device code.
 */
static __isl_give isl_printer *generate(__isl_take isl_printer *p,
	struct gpu_gen *gen, struct ppcg_scop *scop,
	struct ppcg_options *options)
{
	struct gpu_prog *prog;
	isl_schedule *schedule;
	isl_bool permutable;

	if (!scop)
		return isl_printer_free(p);

	prog = gpu_prog_alloc(isl_printer_get_ctx(p), scop);
	if (!prog)
		return isl_printer_free(p);

	gen->prog = prog;
	schedule = get_schedule(gen);
	permutable = has_any_permutable_node(schedule);

	if (permutable < 0) {
		/* The test itself failed: give up on the printer. */
		p = isl_printer_free(p);
		isl_schedule_free(schedule);
	} else if (!permutable) {
		/* No permutable node in the schedule: emit CPU code. */
		p = print_cpu(p, scop, options);
		isl_schedule_free(schedule);
	} else {
		/* "schedule" is handed over to map_to_device and
		 * generate_code; only the resulting AST is freed here.
		 */
		schedule = map_to_device(gen, schedule);
		gen->tree = generate_code(gen, schedule);
		p = ppcg_set_macro_names(p);
		p = ppcg_print_exposed_declarations(p, prog->scop);
		p = gen->print(p, gen->prog, gen->tree, &gen->types,
				    gen->print_user);
		isl_ast_node_free(gen->tree);
	}

	gpu_prog_free(prog);

	return p;
}

/* Adapter that lets generate() be used as a ppcg_transform callback:
 * "user" is the gpu_gen, which also supplies the options.
 */
static __isl_give isl_printer *generate_wrap(__isl_take isl_printer *p,
	struct ppcg_scop *scop, void *user)
{
	struct gpu_gen *gen;

	gen = user;
	return generate(p, gen, scop, gen->options);
}

/* Replace every scop in the file called "input" by corresponding GPU code
 * and write the result to "out".
 * "print" (with argument "user") is the callback that prints
 * the generated AST in the desired output format.
 * If the dump_sizes debug option is set, then the effectively used
 * sizes are collected and dumped afterwards.
 * Return the result of ppcg_transform.
 */
int generate_gpu(isl_ctx *ctx, const char *input, FILE *out,
	struct ppcg_options *options,
	__isl_give isl_printer *(*print)(__isl_take isl_printer *p,
		struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
		struct gpu_types *types, void *user), void *user)
{
	struct gpu_gen gen;
	int status;
	int t;

	gen.ctx = ctx;
	gen.options = options;
	gen.print = print;
	gen.print_user = user;
	gen.kernel_id = 0;
	gen.types.name = NULL;
	gen.types.n = 0;
	gen.sizes = extract_sizes_from_str(ctx, options->sizes);

	/* "used_sizes" is only set up (and later released)
	 * when the sizes need to be dumped.
	 */
	if (options->debug->dump_sizes)
		gen.used_sizes =
			isl_union_map_empty(isl_space_params_alloc(ctx, 0));

	status = ppcg_transform(ctx, input, out, options,
				&generate_wrap, &gen);

	if (options->debug->dump_sizes) {
		isl_union_map_dump(gen.used_sizes);
		isl_union_map_free(gen.used_sizes);
	}

	isl_union_map_free(gen.sizes);

	/* Release the type names recorded while printing. */
	for (t = 0; t < gen.types.n; ++t)
		free(gen.types.name[t]);
	free(gen.types.name);

	return status;
}

/* Compute the set of inner array elements whose values may be
 * preserved by "prog".
 * Start from the extents of all arrays that are not local to "prog",
 * restricted to the context and mapped to the inner arrays, and
 * remove the elements that are definitely killed or definitely written.
 */
static __isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog)
{
	int a;
	isl_union_set *persist, *kill_range, *write_range;

	persist = isl_union_set_empty(isl_set_get_space(prog->context));
	for (a = 0; a < prog->n_array; ++a) {
		if (!prog->array[a].local)
			persist = isl_union_set_add_set(persist,
				isl_set_copy(prog->array[a].extent));
	}

	persist = isl_union_set_intersect_params(persist,
					isl_set_copy(prog->context));
	persist = isl_union_set_apply(persist,
					isl_union_map_copy(prog->to_inner));

	kill_range = isl_union_map_range(
			isl_union_map_copy(prog->tagged_must_kill));
	write_range = isl_union_map_range(
			isl_union_map_copy(prog->must_write));
	kill_range = isl_union_set_union(kill_range, write_range);

	return isl_union_set_subtract(persist, kill_range);
}

/* Construct a gpu_prog for "scop".
 * The access relations and the context are copied from "scop",
 * the array mappings are derived from the pet scop and
 * the statements and array information are extracted.
 * prog->any_to_outer is extended with an identity mapping
 * on the anonymous 1D space.
 * Return NULL if "scop" is NULL or if anything goes wrong.
 */
struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop)
{
	struct gpu_prog *prog;
	isl_space *anon;

	if (!scop)
		return NULL;

	prog = isl_calloc_type(ctx, struct gpu_prog);
	if (!prog)
		return NULL;

	prog->ctx = ctx;
	prog->scop = scop;
	prog->n_stmts = scop->pet->n_stmt;
	prog->context = isl_set_copy(scop->context);

	prog->any_to_outer = isl_union_map_reverse(
				pet_scop_compute_outer_to_any(scop->pet));
	/* Build the identity map on the anonymous 1D space in
	 * the parameter space of any_to_outer and add it.
	 */
	anon = isl_union_map_get_space(prog->any_to_outer);
	anon = isl_space_set_from_params(anon);
	anon = isl_space_add_dims(anon, isl_dim_set, 1);
	anon = isl_space_map_from_set(anon);
	prog->any_to_outer = isl_union_map_add_map(prog->any_to_outer,
						isl_map_identity(anon));

	prog->stmts = extract_stmts(ctx, scop, prog->any_to_outer);

	prog->read = isl_union_map_copy(scop->reads);
	prog->may_write = isl_union_map_copy(scop->may_writes);
	prog->must_write = isl_union_map_copy(scop->must_writes);
	prog->tagged_must_kill = isl_union_map_copy(scop->tagged_must_kills);

	prog->to_inner = pet_scop_compute_outer_to_inner(scop->pet);
	prog->to_outer = isl_union_map_reverse(
				isl_union_map_copy(prog->to_inner));

	/* The array information is only collected if
	 * the statements were extracted successfully.
	 */
	if (!prog->stmts || collect_array_info(prog) < 0)
		return gpu_prog_free(prog);

	prog->may_persist = compute_may_persist(prog);

	return prog;
}

/* Release "prog" together with the array information, the statements and
 * all isl objects it holds.  A NULL "prog" is accepted.
 * Always return NULL, so that the call can be used directly
 * in a return statement on an error path.
 */
void *gpu_prog_free(struct gpu_prog *prog)
{
	if (prog) {
		free_array_info(prog);
		free_stmts(prog->stmts, prog->n_stmts);

		isl_union_map_free(prog->any_to_outer);
		isl_union_map_free(prog->to_outer);
		isl_union_map_free(prog->to_inner);

		isl_union_map_free(prog->read);
		isl_union_map_free(prog->may_write);
		isl_union_map_free(prog->must_write);
		isl_union_map_free(prog->tagged_must_kill);
		isl_union_map_free(prog->array_order);

		isl_union_set_free(prog->may_persist);
		isl_set_free(prog->context);

		free(prog);
	}

	return NULL;
}


================================================
FILE: src/ppcg_files/gpu.h
================================================
#ifndef _GPU_H
#define _GPU_H

#include <isl/ast.h>
#include <isl/id.h>
#include <isl/id_to_ast_expr.h>

#include <pet.h>

#include "ppcg.h"
#include "ppcg_options.h"

#ifdef __cplusplus
extern "C"
{
#endif

	/* An access to an outer array element or an iterator.
	 * Accesses to iterators have an access relation that maps
	 * to an unnamed space.
	 * An access may be both read and write.
	 * If the access relation is empty, then the output dimension may
	 * not be equal to the dimension of the corresponding array.
	 *
	 * The objects form a singly linked list (through "next"),
	 * with one list per statement.
	 */
	struct gpu_stmt_access
	{
		/* Access reads elements */
		int read;
		/* Access writes elements */
		int write;
		/* All writes are definite writes. */
		int exact_write;
		/* Is a single, fixed element being accessed?
		 * Stored as an isl_bool, so it may also hold isl_bool_error.
		 */
		isl_bool fixed_element;
		/* The number of index expressions specified in the access. */
		int n_index;

		/* May access relation */
		isl_map *access;
		/* May access relation with as domain a mapping
		 * from iteration domain to a reference identifier.
		 */
		isl_map *tagged_access;
		/* The reference id of the corresponding pet_expr. */
		isl_id *ref_id;

		/* Next access of the same statement, or NULL at the end. */
		struct gpu_stmt_access *next;
	};

	/* A representation of a user statement.
	 * "stmt" points to the corresponding pet statement.
	 * "id" is the identifier of the instance set of the statement.
	 * "accesses" is a linked list of accesses performed by the statement.
	 * If the statement has been killed, i.e., if it will not be scheduled,
	 * then this linked list may be empty even if the actual statement does
	 * perform accesses.
	 */
	struct gpu_stmt
	{
		isl_id *id;
		struct pet_stmt *stmt;

		/* Head of the list of accesses; NULL if the list is empty. */
		struct gpu_stmt_access *accesses;
	};

	/* Represents an outer array possibly accessed by a gpu_prog.
	 */
	struct gpu_array_info
	{
		/* The array data space. */
		isl_space *space;
		/* Element type. */
		char *type;
		/* Element size. */
		int size;
		/* Name of the array. */
		char *name;
		/* Declared extent of original array. */
		isl_set *declared_extent;
		/* AST expression for declared size of original array. */
		isl_ast_expr *declared_size;
		/* Extent of the array that needs to be copied. */
		isl_set *extent;
		/* Number of indices. */
		unsigned n_index;
		/* For each index, a bound on "extent" in that direction. */
		isl_multi_pw_aff *bound;
		/* The corresponding access AST expression, if the array needs
		 * to be allocated on the device.
		 */
		isl_ast_expr *bound_expr;

		/* All references to this array; the "n_ref" entries of "refs"
		 * point to elements of a linked list.
		 */
		int n_ref;
		struct gpu_stmt_access **refs;

		/* Is this array accessed at all by the program? */
		int accessed;

		/* Is this a scalar that is read-only within the entire program? */
		int read_only_scalar;

		/* Are the elements of the array structures? */
		int has_compound_element;

		/* Are the elements only accessed through constant index expressions? */
		int only_fixed_element;

		/* Is the array local to the scop? */
		int local;
		/* Is the array local and should it be declared on the host? */
		int declare_local;

		/* Is the corresponding global device memory accessed in any way? */
		int global;

		/* Should the array be linearized? */
		int linearize;

		/* Order dependences on this array.
		 * Only used if live_range_reordering option is set.
		 * It is set to NULL otherwise.
		 */
		isl_union_map *dep_order;
	};

	/* Represents an outer array accessed by a ppcg_kernel, localized
	 * to the context of this kernel.
	 *
	 * "array" points to the corresponding array in the gpu_prog.
	 * The "n_group" "groups" are the reference groups associated to the array.
	 * If "force_private" is set, then the array (in practice a scalar)
	 * must be mapped to a register.
	 * "global" is set if the global device memory corresponding
	 * to this array is accessed by the kernel.
	 * "n_index" is the number of indices (presumably equal to
	 * array->n_index -- TODO confirm against the code that fills it in).
	 * "bound" is equal to array->bound specialized to the current kernel.
	 * "bound_expr" is the corresponding access AST expression.
	 */
	struct gpu_local_array_info
	{
		struct gpu_array_info *array;

		int n_group;
		struct gpu_array_ref_group **groups;

		int force_private;
		int global;

		unsigned n_index;
		isl_multi_pw_aff *bound;
		isl_ast_expr *bound_expr;
	};

	__isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
			struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr);

	/* A sequence of "n" names of types.
	 * "name" is an array of "n" strings; it is NULL as long as
	 * no names have been recorded.
	 */
	struct gpu_types
	{
		int n;
		char **name;
	};

	/* "read", "may_write" and "must_write" contain the original access
	 * relations, possibly involving member accesses.
	 *
	 * The elements of "array" only refer to the outer arrays
	 * of any possible member accesses.
	 *
	 * NOTE(review): an earlier version of this comment also mentioned
	 * "write", "copy_in" and "copy_out"; no such fields exist
	 * in this structure.
	 */
	struct gpu_prog
	{
		isl_ctx *ctx;

		/* The scop from which this program was constructed. */
		struct ppcg_scop *scop;

		/* Set of parameter values */
		isl_set *context;

		/* All potential read accesses in the entire program */
		isl_union_map *read;

		/* All potential write accesses in the entire program */
		isl_union_map *may_write;
		/* All definite write accesses in the entire program */
		isl_union_map *must_write;
		/* All tagged definite kills in the entire program */
		isl_union_map *tagged_must_kill;

		/* The set of inner array elements that may be preserved. */
		isl_union_set *may_persist;

		/* A mapping from all innermost arrays to their outer arrays. */
		isl_union_map *to_outer;
		/* A mapping from the outer arrays to all corresponding inner arrays. */
		isl_union_map *to_inner;
		/* A mapping from all intermediate arrays to their outer arrays,
		 * including an identity mapping from the anonymous 1D space
		 * to itself.
		 */
		isl_union_map *any_to_outer;

		/* Order dependences on non-scalars. */
		isl_union_map *array_order;

		/* Array of "n_stmts" statements */
		int n_stmts;
		struct gpu_stmt *stmts;

		/* Array of "n_array" outer arrays */
		int n_array;
		struct gpu_array_info *array;
	};

	/* State of the GPU code generator, shared by the processing
	 * of all scops of a single input file.
	 * "prog" and "tree" refer to the scop that is currently
	 * being handled.
	 */
	struct gpu_gen
	{
		isl_ctx *ctx;
		struct ppcg_options *options;

		/* Callback for printing of AST in appropriate format.
		 * It is passed "print_user" as its final argument.
		 */
		__isl_give isl_printer *(*print)(__isl_take isl_printer *p,
																		 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
																		 struct gpu_types *types, void *user);
		void *print_user;

		struct gpu_prog *prog;
		/* The generated AST. */
		isl_ast_node *tree;

		/* The sequence of types for which a definition has been printed. */
		struct gpu_types types;

		/* User specified tile, grid and block sizes for each kernel */
		isl_union_map *sizes;

		/* Effectively used tile, grid and block sizes for each kernel.
		 * Only maintained if the dump_sizes debug option is set.
		 */
		isl_union_map *used_sizes;

		/* Identifier of the next kernel. */
		int kernel_id;
	};

	/* The kind of memory that an array reference group is mapped to:
	 * global memory, shared memory or private memory.
	 */
	enum ppcg_group_access_type
	{
		ppcg_access_global,
		ppcg_access_shared,
		ppcg_access_private
	};

	/* The kind of statement represented by a ppcg_kernel_stmt:
	 * a copy statement, a statement of the input program (domain) or
	 * a synchronization statement.
	 */
	enum ppcg_kernel_stmt_type
	{
		ppcg_kernel_copy,
		ppcg_kernel_domain,
		ppcg_kernel_sync
	};

	/* Representation of special statements, in particular copy statements
	 * and __syncthreads statements, inside a kernel.
	 *
	 * type represents the kind of statement
	 *
	 *
	 * for ppcg_kernel_copy statements we have (in u.c)
	 *
	 * read is set if the statement should copy data from global memory
	 * to shared memory or registers.
	 *
	 * index expresses an access to the array element that needs to be copied
	 * local_index expresses the corresponding element in the tile
	 *
	 * array refers to the original array being copied
	 * local_array is a pointer to the appropriate element in the "array"
	 *	array of the ppcg_kernel to which this copy access belongs
	 *
	 *
	 * for ppcg_kernel_domain statements we have (in u.d)
	 *
	 * stmt is the corresponding input statement
	 *
	 * ref2expr is an isl_id_to_ast_expr association; presumably it maps
	 * the reference identifiers of the accesses in stmt to the AST
	 * expressions to be printed for them -- TODO confirm.
	 *
	 * NOTE(review): an earlier version of this comment described fields
	 * "n_access" and "access", which do not exist in this structure.
	 */
	struct ppcg_kernel_stmt
	{
		enum ppcg_kernel_stmt_type type;

		union {
			struct
			{
				int read;
				isl_ast_expr *index;
				isl_ast_expr *local_index;
				struct gpu_array_info *array;
				struct gpu_local_array_info *local_array;
			} c;
			struct
			{
				struct gpu_stmt *stmt;
				isl_id_to_ast_expr *ref2expr;
			} d;
		} u;
	};

	/* Representation of a local variable in a kernel.
	 *
	 * "array" is the array that the variable is associated with.
	 * "type" is the kind of memory used for the variable.
	 * "name" is the name of the variable.
	 * "size" presumably holds the size of the variable in each
	 * dimension -- TODO confirm against the code that fills it in.
	 */
	struct ppcg_kernel_var
	{
		struct gpu_array_info *array;
		enum ppcg_group_access_type type;
		char *name;
		isl_vec *size;
	};

	/* Representation of a kernel.
 *
 * prog describes the original code from which the kernel is extracted.
 *
 * id is the sequence number of the kernel.
 *
 * block_ids contains the list of block identifiers for this kernel.
 * thread_ids contains the list of thread identifiers for this kernel.
 *
 * the first n_grid elements of grid_dim represent the specified size
 * of the grid.
 * the first n_block elements of block_dim represent the specified or
 * effective size of the block.
 * Note that in the input file, the sizes of the grid and the blocks
 * are specified in the order x, y, z, but internally, the sizes
 * are stored in reverse order, so that the last element always
 * refers to the x dimension.
 *
 * grid_size reflects the effective grid size.
 * grid_size_expr contains a corresponding access AST expression, built within
 * the context where the launch appears.
 *
 * context contains the values of the parameters and outer schedule dimensions
 * for which any statement instance in this kernel needs to be executed.
 *
 * n_sync is the number of synchronization operations that have
 * been introduced in the schedule tree corresponding to this kernel (so far).
 *
 * core contains the spaces of the statement domains that form
 * the core computation of the kernel.  It is used to navigate
 * the tree during the construction of the device part of the schedule
 * tree in gpu_create_kernel.
 *
 * expanded_domain contains the original statement instances,
 * i.e., those that appear in the domains of access relations,
 * that are involved in the kernel.
 * contraction maps those original statement instances to
 * the statement instances that are active at the point
 * in the schedule tree where the kernel is created.
 *
 * arrays is the set of possibly accessed outer array elements.
 *
 * space is the schedule space of the AST context.  That is, it represents
 * the loops of the generated host code containing the kernel launch.
 *
 * n_array is the total number of arrays in the input program and also
 * the number of element in the array array.
 * array contains information about each array that is local
 * to the current kernel.  If an array is not used in a kernel,
 * then the corresponding entry does not contain any information.
 *
 * any_force_private is set if any array in the kernel is marked force_private
 *
 * block_filter contains constraints on the domain elements in the kernel
 * that encode the mapping to block identifiers, where the block identifiers
 * are represented by "n_grid" parameters with as names the elements
 * of "block_ids".
 *
 * thread_filter contains constraints on the domain elements in the kernel
 * that encode the mapping to thread identifiers, where the thread identifiers
 * are represented by "n_block" parameters with as names the elements
 * of "thread_ids".
 *
 * copy_schedule corresponds to the schedule dimensions of
 * the (tiled) schedule for this kernel that have been taken into account
 * for computing private/shared memory tiles.
 * The domain corresponds to the original statement instances, i.e.,
 * those that appear in the leaves of the schedule tree.
 * copy_schedule_dim is the dimension of this schedule.
 *
 * sync_writes contains write references that require synchronization.
 * Each reference is represented by a universe set in a space [S[i,j] -> R[]]
 * with S[i,j] the statement instance space and R[] the array reference.
 */
	struct ppcg_kernel
	{
		isl_ctx *ctx;
		struct ppcg_options *options;

		/* The program from which the kernel is extracted. */
		struct gpu_prog *prog;

		/* Sequence number of the kernel. */
		int id;

		isl_id_list *block_ids;
		isl_id_list *thread_ids;

		/* Only the first n_grid/n_block entries of
		 * grid_dim/block_dim are meaningful; the sizes are stored
		 * in reverse order (last element is the x dimension).
		 */
		int n_grid;
		int n_block;
		int grid_dim[2];
		int block_dim[3];

		isl_multi_pw_aff *grid_size;
		isl_ast_expr *grid_size_expr;
		isl_set *context;

		int n_sync;
		isl_union_set *core;
		isl_union_set *arrays;

		isl_union_pw_multi_aff *contraction;
		isl_union_set *expanded_domain;

		isl_space *space;

		/* One entry per array of the input program. */
		int n_array;
		struct gpu_local_array_info *array;

		/* The "n_var" local variables of the kernel. */
		int n_var;
		struct ppcg_kernel_var *var;

		int any_force_private;

		isl_union_set *block_filter;
		isl_union_set *thread_filter;
		isl_union_pw_multi_aff *copy_schedule;
		int copy_schedule_dim;

		isl_union_set *sync_writes;

		/* NOTE(review): presumably the AST generated for the device
		 * part of this kernel -- confirm against the code generator.
		 */
		isl_ast_node *tree;
	};

	int gpu_array_is_scalar(struct gpu_array_info *array);
	int gpu_array_is_read_only_scalar(struct gpu_array_info *array);
	int gpu_array_requires_device_allocation(struct gpu_array_info *array);
	__isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array);
	isl_bool gpu_array_can_be_private(struct gpu_array_info *array);

	struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop);
	void *gpu_prog_free(struct gpu_prog *prog);

	int ppcg_kernel_requires_array_argument(struct ppcg_kernel *kernel, int i);

	int generate_gpu(isl_ctx *ctx, const char *input, FILE *out,
									 struct ppcg_options *options,
									 __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
																										struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
																										struct gpu_types *types, void *user),
									 void *user);

	__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen,
																									__isl_take isl_schedule_node *node, int scale,
																									__isl_keep isl_multi_val *sizes);

#ifdef __cplusplus
}
#endif

#endif


================================================
FILE: src/ppcg_files/gpu_array_tile.c
================================================
#include <isl/aff.h>
#include <isl/map.h>

#include "gpu_array_tile.h"

/* Free "tile", including the per-index bounds and the tiling.
 * A NULL "tile" is accepted.  Always return NULL.
 */
struct gpu_array_tile *gpu_array_tile_free(struct gpu_array_tile *tile)
{
	int pos;

	if (tile) {
		pos = 0;
		while (pos < tile->n) {
			isl_val_free(tile->bound[pos].size);
			isl_val_free(tile->bound[pos].stride);
			isl_aff_free(tile->bound[pos].lb);
			isl_aff_free(tile->bound[pos].shift);
			++pos;
		}
		free(tile->bound);
		isl_multi_aff_free(tile->tiling);
		free(tile);
	}

	return NULL;
}

/* Create a gpu_array_tile for an array of dimension "n_index".
 *
 * All fields of the "n_index" bounds are initialized to NULL, so that
 * the result can be passed to gpu_array_tile_free at any point.
 * Return NULL on allocation failure.
 *
 * For "n_index" equal to zero (a scalar), the requested bound array
 * has size zero and the allocator may legitimately return NULL.
 * That is not an error: the bound array is never dereferenced
 * when tile->n is zero and free(NULL) is a no-op.
 */
struct gpu_array_tile *gpu_array_tile_create(isl_ctx *ctx, int n_index)
{
	int i;
	struct gpu_array_tile *tile;

	tile = isl_calloc_type(ctx, struct gpu_array_tile);
	if (!tile)
		return NULL;

	tile->ctx = ctx;
	tile->bound = isl_alloc_array(ctx, struct gpu_array_bound, n_index);
	/* Only treat a NULL result as a failure if memory was requested. */
	if (!tile->bound && n_index != 0)
		return gpu_array_tile_free(tile);

	/* tile->n is only set once "bound" is known to be usable,
	 * so that gpu_array_tile_free never walks an unallocated array.
	 */
	tile->n = n_index;

	for (i = 0; i < n_index; ++i) {
		tile->bound[i].size = NULL;
		tile->bound[i].lb = NULL;
		tile->bound[i].stride = NULL;
		tile->bound[i].shift = NULL;
	}

	return tile;
}

/* Return the number of elements in "tile", i.e., the product
 * of the sizes of its bounds (one for a tile without any index).
 * Return NULL if "tile" is NULL.
 */
__isl_give isl_val *gpu_array_tile_size(struct gpu_array_tile *tile)
{
	int dim;
	isl_val *total;

	if (!tile)
		return NULL;

	total = isl_val_one(tile->ctx);
	dim = 0;
	while (dim < tile->n) {
		isl_val *extent = isl_val_copy(tile->bound[dim].size);

		total = isl_val_mul(total, extent);
		++dim;
	}

	return total;
}


================================================
FILE: src/ppcg_files/gpu_array_tile.h
================================================
#ifndef GPU_ARRAY_TILE_H
#define GPU_ARRAY_TILE_H

#include <isl/aff_type.h>
#include <isl/map_type.h>
#include <isl/val.h>

/* The bound of a tile in the direction of a single array index.
 *
 * The current index is such that if you add "shift",
 * then the result is always a multiple of "stride",
 * where "stride" may be equal to 1.
 * "size" is the (constant) size of the tile in this direction and
 * "lb" is the corresponding lower bound (offset), both expressed
 * in terms of the index after the shift and stride have been removed.
 * Let D represent the initial tile->depth dimensions of the computed schedule.
 * The spaces of "lb" and "shift" are of the form
 *
 *	D -> [b]
 */
struct gpu_array_bound
{
	isl_val *size;
	isl_aff *lb;

	isl_val *stride;
	isl_aff *shift;
};

/* A tile of an outer array.
 *
 * ctx is the isl_ctx in which the tile was created.
 *
 * requires_unroll is set if the schedule dimensions that are mapped
 * to threads need to be unrolled for this (private) tile to be used.
 *
 * "depth" reflects the number of schedule dimensions that affect the tile.
 * The copying into and/or out of the tile is performed at that depth.
 *
 * n is the dimension of the array.
 * bound is an array of size "n" representing the lower bound
 *	and size for each index.
 *
 * tiling maps a tile in the global array to the corresponding
 * shared/private memory tile and is of the form
 *
 *	{ [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] }
 *
 * where D represents the initial "depth" dimensions
 * of the computed schedule.
 */
struct gpu_array_tile
{
	isl_ctx *ctx;
	int requires_unroll;
	int depth;
	int n;
	struct gpu_array_bound *bound;
	isl_multi_aff *tiling;
};

struct gpu_array_tile *gpu_array_tile_create(isl_ctx *ctx, int n_index);
struct gpu_array_tile *gpu_array_tile_free(struct gpu_array_tile *tile);

__isl_give isl_val *gpu_array_tile_size(struct gpu_array_tile *tile);

#endif


================================================
FILE: src/ppcg_files/gpu_group.c
================================================
/*
 * Copyright 2010-2011 INRIA Saclay
 * Copyright 2012-2014 Ecole Normale Superieure
 * Copyright 2015      Sven Verdoolaege
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
 * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
 * 91893 Orsay, France
 * and Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
 */

#include <isl/aff.h>
#include <isl/map.h>
#include <isl/constraint.h>

#include "gpu_array_tile.h"
#include "gpu_group.h"
#include "gpu_tree.h"
#include "schedule.h"

/* Print the name of the local copy of a given group of array references.
 * The name of the array is prefixed by "private_" or "shared_" for
 * groups mapped to private or shared memory.  For such groups,
 * the group number is appended if the array has more than one group.
 * Groups in global memory are printed using the plain array name.
 */
__isl_give isl_printer *gpu_array_ref_group_print_name(
	struct gpu_array_ref_group *group, __isl_take isl_printer *p)
{
	const char *prefix;

	switch (gpu_array_ref_group_type(group)) {
	case ppcg_access_private:
		prefix = "private_";
		break;
	case ppcg_access_shared:
		prefix = "shared_";
		break;
	default:
		/* Global memory: no prefix and no group suffix. */
		prefix = NULL;
		break;
	}

	if (prefix)
		p = isl_printer_print_str(p, prefix);
	p = isl_printer_print_str(p, group->array->name);
	if (prefix && group->local_array->n_group > 1) {
		p = isl_printer_print_str(p, "_");
		p = isl_printer_print_int(p, group->nr);
	}

	return p;
}

/* Collect the access relations of the references in "group".
 * A reference is included if it is a read and "read" is set or
 * if it is a write and "write" is set.
 */
__isl_give isl_union_map *gpu_array_ref_group_access_relation(
	struct gpu_array_ref_group *group, int read, int write)
{
	int r;
	isl_union_map *result;

	result = isl_union_map_empty(isl_map_get_space(group->access));
	for (r = 0; r < group->n_ref; ++r) {
		int selected;

		selected = (read && group->refs[r]->read) ||
			   (write && group->refs[r]->write);
		if (selected)
			result = isl_union_map_union(result,
				isl_union_map_from_map(
				    isl_map_copy(group->refs[r]->access)));
	}

	return result;
}

/* Determine the kind of memory that "group" is mapped to.
 * Without any tile, the group lives in global memory.
 * With a single tile, the kind of that tile is used.
 * With both a private and a shared tile, the one with the smallest
 * depth wins, with the private tile being preferred on equal depth.
 */
enum ppcg_group_access_type gpu_array_ref_group_type(
	struct gpu_array_ref_group *group)
{
	if (!group->private_tile)
		return group->shared_tile ? ppcg_access_shared :
					    ppcg_access_global;

	if (group->shared_tile &&
	    group->shared_tile->depth < group->private_tile->depth)
		return ppcg_access_shared;

	return ppcg_access_private;
}


/* Return the effective gpu_array_tile associated to "group" or
 * NULL if there is no such gpu_array_tile.
 *
 * The effective tile is the shared tile or the private tile,
 * depending on the result of gpu_array_ref_group_type.
 * Groups in global memory have no tile.
 */
struct gpu_array_tile *gpu_array_ref_group_tile(
	struct gpu_array_ref_group *group)
{
	switch (gpu_array_ref_group_type(group)) {
	case ppcg_access_global:
		return NULL;
	case ppcg_access_shared:
		return group->shared_tile;
	case ppcg_access_private:
		return group->private_tile;
	}

	/* Not reached for valid enum values, but make sure control never
	 * falls off the end of this non-void function.
	 */
	return NULL;
}

/* Return the requires_unroll flag of the effective tile of "group",
 * i.e., whether the schedule dimensions mapped to threads need
 * to be unrolled.  A group without a tile does not require unrolling.
 * Note that this can only happen for private tiles.
 */
int gpu_array_ref_group_requires_unroll(struct gpu_array_ref_group *group)
{
	struct gpu_array_tile *effective = gpu_array_ref_group_tile(group);

	return effective ? effective->requires_unroll : 0;
}

/* Given an array access "access", check if for any index i there is
 * a shift a(p) and a stride g such that
 *
 *	a(p) + i = 0 mod g
 *
 * The stride and shift of every index are recorded in
 * tile->bound[i].stride and tile->bound[i].shift, as derived
 * from the stride information computed by isl.
 * Return isl_bool_true if any stride greater than one was found.
 *
 * Note that the stride info returned by isl_map_get_range_stride_info
 * is of the form
 *
 *	i = o(p) + g n
 *
 * a(p) can therefore be taken to be equal to -o(p).
 */
static isl_bool detect_strides(struct gpu_array_tile *tile,
	__isl_keep isl_map *access)
{
	int pos;
	isl_bool found = isl_bool_false;

	for (pos = 0; pos < tile->n; ++pos) {
		struct gpu_array_bound *b = tile->bound + pos;
		isl_stride_info *info;

		info = isl_map_get_range_stride_info(access, pos);
		b->stride = isl_stride_info_get_stride(info);
		b->shift = isl_aff_neg(isl_stride_info_get_offset(info));
		isl_stride_info_free(info);

		/* Once a non-trivial stride has been seen,
		 * the remaining strides no longer need to be tested.
		 */
		if (found)
			continue;
		found = isl_val_gt_si(b->stride, 1);
		if (found < 0)
			return isl_bool_error;
	}

	return found;
}

/* Given an array access "access", remove the strides based
 * on the information in tile->bound[i].stride and tile->bound[i].shift.
 *
 * In particular let the access be A[a] and
 * let the shifts s_i(p) and the strides g_i be such that
 *
 *  S(p) + a = 0 mod G
 *
 * Replace the access by
 *
 *  A[(a + S(p))/G]
 *
 * The shifts are gathered in an isl_multi_aff that is added
 * to the access, while the strides define the scaling A[i] -> A[G i]
 * over which the preimage of the range is taken.
 */
static __isl_give isl_map *remove_strides(__isl_take isl_map *access,
	struct gpu_array_tile *tile)
{
	int pos;
	isl_space *map_space, *range_space;
	isl_multi_aff *shifts, *scaling;
	isl_multi_val *strides;

	map_space = isl_map_get_space(access);
	shifts = isl_multi_aff_zero(isl_space_copy(map_space));
	range_space = isl_space_range(map_space);
	strides = isl_multi_val_zero(isl_space_copy(range_space));
	scaling = isl_multi_aff_identity(isl_space_map_from_set(range_space));

	for (pos = 0; pos < tile->n; ++pos) {
		struct gpu_array_bound *b = tile->bound + pos;

		shifts = isl_multi_aff_set_aff(shifts, pos,
						isl_aff_copy(b->shift));
		strides = isl_multi_val_set_val(strides, pos,
						isl_val_copy(b->stride));
	}
	scaling = isl_multi_aff_scale_multi_val(scaling, strides);

	access = isl_map_sum(access, isl_map_from_multi_aff(shifts));
	return isl_map_preimage_range_multi_aff(access, scaling);
}

/* Check if we can find a memory tile for the given array
 * based on the given accesses, and if so, put the results in "tile".
 *
 * Any strides in the accesses are detected (and recorded in "tile") and
 * removed first.  A simple fixed box hull of the range of the result
 * is then computed.  If this box is valid, its size and offset
 * in each direction are stored in the bounds of "tile".
 *
 * tile->depth is initialized to the input dimension of the computed bounds.
 *
 * Return the validity of the box, or isl_bool_error on failure.
 */
static isl_bool can_tile(__isl_keep isl_map *access,
	struct gpu_array_tile *tile)
{
	int dim;
	isl_bool strided, ok;
	isl_fixed_box *hull;
	isl_map *work;

	if (!tile)
		return isl_bool_error;

	/* NOTE(review): the result of this call is discarded immediately;
	 * its purpose is not evident from this function.
	 */
	isl_map_free(isl_map_detect_equalities(isl_map_copy(access)));

	strided = detect_strides(tile, access);
	if (strided < 0)
		return isl_bool_error;

	tile->depth = isl_map_dim(access, isl_dim_in);

	/* Work on a private copy; "access" itself is only kept. */
	work = isl_map_copy(access);
	if (strided)
		work = remove_strides(work, tile);

	hull = isl_map_get_range_simple_fixed_box_hull(work);
	isl_map_free(work);

	ok = isl_fixed_box_is_valid(hull);
	if (ok > 0) {
		isl_multi_aff *lower = isl_fixed_box_get_offset(hull);
		isl_multi_val *extent = isl_fixed_box_get_size(hull);

		for (dim = 0; dim < tile->n; ++dim) {
			tile->bound[dim].size =
				isl_multi_val_get_val(extent, dim);
			tile->bound[dim].lb =
				isl_multi_aff_get_aff(lower, dim);
		}
		isl_multi_aff_free(lower);
		isl_multi_val_free(extent);
	}
	isl_fixed_box_free(hull);

	return ok;
}

/* Internal data structure for gpu_group_references.
 *
 * The schedules are all formulated in terms of the original statement
 * instances, i.e., those that appear in the domains of the access
 * relations.
 */
struct gpu_group_data {
	/* The input scop. */
	struct ppcg_scop *scop;
	/* The schedule depth where the kernel launch will be introduced,
	 * i.e., the depth of the band that is mapped to blocks.
	 */
	int kernel_depth;
	/* The schedule depth at which the copying to/from shared memory
	 * is computed.  The copy operation may then later be hoisted
	 * to a higher level.
	 */
	int shared_depth;
	/* The schedule depth where the thread mark is located,
	 * i.e., the depth of the band that is mapped to threads and also
	 * the schedule depth at which the copying to/from private memory
	 * is computed.  The copy operation may then later be hoisted to
	 * a higher level.
	 */
	int thread_depth;
	/* The number of schedule dimensions in the band that
	 * is mapped to threads.
	 */
	int n_thread;
	/* Lives in the range of thread_sched (i.e., it is of dimension
	 * thread_depth + n_thread) and encodes the mapping
	 * to thread identifiers (as parameters).
	 */
	isl_set *privatization;
	/* The kernel_depth dimensions of the host schedule. */
	isl_union_map *host_sched;
	/* The first shared_depth dimensions of the kernel schedule. */
	isl_union_map *shared_sched;
	/* The first thread_depth dimensions of the kernel schedule. */
	isl_union_map *copy_sched;
	/* The first (thread_depth + n_thread) dimensions
	 * of the kernel schedule.
	 */
	isl_union_map *thread_sched;
	/* A union_map representation of the entire kernel schedule. */
	isl_union_map *full_sched;
};

/* Build the map from "domain_space" to itself that adds one to
 * the dimension at position "pos" and keeps every other dimension
 * unchanged.
 */
static __isl_give isl_map *next(__isl_take isl_space *domain_space, int pos)
{
	isl_multi_aff *ma;
	isl_aff *incremented;

	ma = isl_multi_aff_identity(isl_space_map_from_set(domain_space));
	incremented = isl_multi_aff_get_aff(ma, pos);
	incremented = isl_aff_add_constant_si(incremented, 1);
	ma = isl_multi_aff_set_aff(ma, pos, incremented);

	return isl_map_from_multi_aff(ma);
}

/* Is the given access coalesced (or is there no point in trying
 * to coalesce it by mapping the array to shared memory)?
 * That is, does incrementing the dimension that will get
 * wrapped over the last thread index result in incrementing
 * the last array index?
 *
 * If "access" never touches two consecutive array elements,
 * then mapping the corresponding array to shared memory will not
 * improve coalescing.  In fact, the copying will likely be performed
 * by a single thread.  Such an access is reported as coalesced so that
 * the caller will not try and map the array to shared memory just
 * to improve coalescing.
 *
 * This function is only called for access relations without reuse and
 * kernels with at least one thread identifier.
 *
 * A negative value is returned if one of the isl checks fails.
 */
static int access_is_coalesced(struct gpu_group_data *data,
	__isl_keep isl_union_map *access)
{
	int n_index, last_thread;
	int empty, coalesced;
	isl_space *array_space, *sched_space;
	isl_set *elements;
	isl_map *sched_access;
	isl_map *step_element, *step_thread;
	isl_map *pairs;

	/* Express the access in terms of the full kernel schedule. */
	access = isl_union_map_copy(access);
	access = isl_union_map_apply_domain(access,
				isl_union_map_copy(data->full_sched));
	sched_access = isl_map_from_union_map(access);

	/* Relation between an array element and its successor
	 * along the last array index (empty for scalars).
	 */
	array_space = isl_space_range(isl_map_get_space(sched_access));
	n_index = isl_space_dim(array_space, isl_dim_set);
	step_element = n_index == 0 ?
		isl_map_empty(isl_space_map_from_set(array_space)) :
		next(array_space, n_index - 1);

	/* Are any two consecutive elements accessed at all? */
	elements = isl_map_range(isl_map_copy(sched_access));
	pairs = isl_map_copy(step_element);
	pairs = isl_map_intersect_domain(pairs, isl_set_copy(elements));
	pairs = isl_map_intersect_range(pairs, elements);
	empty = isl_map_is_empty(pairs);
	isl_map_free(pairs);

	if (empty != 0) {
		isl_map_free(step_element);
		isl_map_free(sched_access);
		return empty;
	}

	/* Pairs of elements accessed by consecutive values of
	 * the last thread dimension.
	 */
	sched_space = isl_space_domain(isl_map_get_space(sched_access));
	last_thread = data->thread_depth + data->n_thread - 1;
	step_thread = next(sched_space, last_thread);

	pairs = isl_map_apply_domain(step_thread,
				isl_map_copy(sched_access));
	pairs = isl_map_apply_range(pairs, sched_access);

	coalesced = isl_map_is_subset(pairs, step_element);

	isl_map_free(pairs);
	isl_map_free(step_element);

	return coalesced;
}

/* Turn the host schedule dimensions in the access relation "access"
 * into parameters, such that they are considered fixed when checking
 * for reuse (within a kernel) or whether two consecutive elements
 * are accessed (within a kernel).
 */
static __isl_give isl_union_map *localize_access(struct gpu_group_data *data,
	__isl_take isl_union_map *access)
{
	int host_depth = data->kernel_depth;
	isl_union_map *host;
	isl_set *fixed;
	isl_id_list *names;

	host = isl_union_map_copy(data->host_sched);
	names = ppcg_scop_generate_names(data->scop, host_depth,
					"__ppcg_host_");
	fixed = parametrization(isl_union_map_get_space(host),
					host_depth, 0, names);
	isl_id_list_free(names);

	/* Only keep the statement instances scheduled at the fixed
	 * host schedule point.
	 */
	host = isl_union_map_intersect_range(host,
						isl_union_set_from_set(fixed));
	return isl_union_map_intersect_domain(access,
						isl_union_map_domain(host));
}

/* Given an access relation formulated in terms of at least
 * data->thread_depth initial dimensions of the computed schedule,
 * check whether it is bijective for fixed values of
 * the first data->thread_depth dimensions.
 * The check is performed by equating these dimensions to parameters.
 *
 * The result of isl_map_is_bijective is returned, which is negative
 * on error.
 */
static int access_is_bijective(struct gpu_group_data *data,
	__isl_keep isl_map *access)
{
	int bijective;
	int n_in;
	isl_set *fixed;
	isl_space *params;
	isl_id_list *names;
	isl_map *restricted;

	restricted = isl_map_copy(access);
	params = isl_space_params(isl_map_get_space(restricted));
	names = ppcg_scop_generate_names(data->scop, data->thread_depth, "s");
	n_in = isl_map_dim(restricted, isl_dim_in);
	fixed = parametrization(params, n_in, 0, names);
	isl_id_list_free(names);

	restricted = isl_map_intersect_domain(restricted, fixed);
	bijective = isl_map_is_bijective(restricted);
	isl_map_free(restricted);

	return bijective;
}

/* Compute the number of outer schedule tile dimensions that affect
 * the offset of "tile".
 * If there is no such dimension, then return the index
 * of the first kernel dimension, i.e., data->kernel_depth.
 */
static int compute_tile_depth(struct gpu_group_data *data,
	struct gpu_array_tile *tile)
{
	int i, j;

	for (j = tile->depth - 1; j >= data->kernel_depth; --j) {
		for (i = 0; i < tile->n; ++i) {
			isl_aff *lb;
			isl_aff *shift;

			lb = tile->bound[i].lb;
			if (isl_aff_involves_dims(lb, isl_dim_in, j, 1))
				break;

			shift = tile->bound[i].shift;
			if (!shift)
				continue;
			if (isl_aff_involves_dims(shift, isl_dim_in, j, 1))
				break;
		}
		if (i < tile->n)
			break;
	}

	return ++j;
}

/* Return the lowest depth between data->kernel_depth and data->thread_depth
 * at which every array element accessed through "acc" is accessed
 * by a single thread.  The input dimension of "acc" is
 * data->thread_depth + data->n_thread, where the final data->n_thread
 * dimensions are those that will be mapped to threads.
 * If the values for these dimensions are uniquely determined
 * by the array index and a given number of outer dimensions, then
 * there is only one thread accessing that array element within those
 * outer dimensions.
 *
 * The input space of "acc" is first split up, such that it has the form
 *
 *	[O -> T] -> A
 *
 * with O the outer dimensions, T the dimensions that will be mapped to threads
 * and A the array index.
 *
 * Then the positions of T and A are interchanged to simplify the test
 * whether T uniquely depends on O and A.
 * In particular, the above access relation is first combined with
 *
 *	[O -> T] -> T
 *
 * to form
 *
 *	[O -> T] -> [A -> T]
 *
 * from which
 *
 *	O -> [A -> T]
 *
 * is extracted, which is then uncurried to
 *
 *	[O -> A] -> T
 *
 * Finally, the final dimensions of O are projected out one by one
 * until T is no longer uniquely determined by A and the remaining
 * dimensions in O.  The value returned is that of the last dimension
 * that was successfully projected out.
 * Note that there is no need to test whether [O -> A] -> T itself
 * is single-valued as that was already tested in access_is_bijective.
 */
static int compute_accessed_by_single_thread_depth(struct gpu_group_data *data,
	__isl_keep isl_map *acc)
{
	int depth;
	isl_space *space;
	isl_map *map;
	isl_bool sv;

	/* No outer dimensions that could be projected out. */
	if (data->thread_depth == data->kernel_depth)
		return data->thread_depth;

	acc = isl_map_copy(acc);

	/* Split up the input space into [O -> T]. */
	space = isl_space_params(isl_map_get_space(acc));
	space = isl_space_set_from_params(space);
	space = isl_space_add_dims(space, isl_dim_set, data->thread_depth);
	space = isl_space_from_domain(space);
	space = isl_space_add_dims(space, isl_dim_out, data->n_thread);
	space = isl_space_wrap(space);
	map = isl_set_flatten_map(isl_set_universe(space));
	acc = isl_map_apply_range(map, acc);

	/* Combine with [O -> T] -> T, drop T from the domain and
	 * uncurry to obtain [O -> A] -> T.
	 */
	space = isl_space_domain(isl_map_get_space(acc));
	map = isl_map_range_map(isl_map_universe(isl_space_unwrap(space)));
	acc = isl_map_range_product(acc, map);
	acc = isl_map_domain_factor_domain(acc);
	acc = isl_map_uncurry(acc);

	/* Project out the final dimensions of O one by one for as long as
	 * T remains uniquely determined.  "depth" is the number of
	 * dimensions of O that are still present before each projection.
	 */
	for (depth = data->thread_depth; depth > data->kernel_depth; --depth) {
		acc = isl_map_project_out(acc, isl_dim_in, depth - 1, 1);
		sv = isl_map_is_single_valued(acc);
		if (sv < 0) {
			isl_map_free(acc);
			return -1;
		}
		if (!sv)
			break;
	}

	isl_map_free(acc);

	return depth;
}

/* Adjust the fields of "tile" to reflect the new input dimension "depth".
 * The dimension beyond "depth" are assumed not to affect the tile,
 * so they can simply be dropped.
 */
static int tile_adjust_depth(struct gpu_array_tile *tile, int depth)
{
	int i;

	if (tile->depth == depth)
		return 0;

	for (i = 0; i < tile->n; ++i) {
		tile->bound[i].lb = isl_aff_drop_dims(tile->bound[i].lb,
					isl_dim_in, depth, tile->depth - depth);
		if (!tile->bound[i].lb)
			return -1;
		if (!tile->bound[i].shift)
			continue;
		tile->bound[i].shift = isl_aff_drop_dims(tile->bound[i].shift,
					isl_dim_in, depth, tile->depth - depth);
		if (!tile->bound[i].shift)
			return -1;
	}

	tile->depth = depth;

	return 0;
}

/* Compute the number of schedule dimensions that affect the offset of the
 * shared or private tile "tile" and store the result in tile->depth, with
 * a lower bound of data->kernel_depth.
 * The fields of the tile are adjusted as well such that they only refer
 * to the tile->depth outer schedule dimensions.
 */
static isl_stat tile_set_depth(struct gpu_group_data *data,
	struct gpu_array_tile *tile)
{
	int depth = compute_tile_depth(data, tile);

	return tile_adjust_depth(tile, depth) < 0 ?
		isl_stat_error : isl_stat_ok;
}

/* Determine the number of schedule dimensions that affect the offset of the
 * shared tile and store the minimum of the private and shared tile depth
 * in group->min_depth, with a lower bound of data->kernel_depth.
 * If there is no tile defined on the array reference group,
 * then set group->min_depth to data->thread_depth.
 */
static int set_depth(struct gpu_group_data *data,
	struct gpu_array_ref_group *group)
{
	group->min_depth = data->thread_depth;

	if (group->private_tile) {
		if (group->private_tile->depth < group->min_depth)
			group->min_depth = group->private_tile->depth;
	}
	if (group->shared_tile) {
		if (tile_set_depth(data, group->shared_tile) < 0)
			return -1;
		if (group->shared_tile->depth < group->min_depth)
			group->min_depth = group->shared_tile->depth;
	}

	return 0;
}

/* Fill up the groups array with singleton groups, i.e., one group
 * per reference, initializing the array, access, write, n_ref and refs fields.
 * In particular the access field is initialized to the scheduled
 * access relation of the array reference.
 *
 * Return the number of elements initialized, i.e., the number of
 * active references in the current kernel.
 */
static int populate_array_references(struct gpu_local_array_info *local,
	struct gpu_array_ref_group **groups, struct gpu_group_data *data)
{
	int i;
	int n;
	isl_ctx *ctx = isl_union_map_get_ctx(data->copy_sched);

	n = 0;
	for (i = 0; i < local->array->n_ref; ++i) {
		isl_union_map *umap;
		isl_map *map;
		struct gpu_array_ref_group *group;
		struct gpu_stmt_access *access = local->array->refs[i];

		map = isl_map_copy(access->access);
		umap = isl_union_map_from_map(map);
		umap = isl_union_map_apply_domain(umap,
				isl_union_map_copy(data->copy_sched));

		if (isl_union_map_is_empty(umap)) {
			isl_union_map_free(umap);
			continue;
		}

		map = isl_map_from_union_map(umap);
		map = isl_map_detect_equalities(map);

		group = isl_calloc_type(ctx, struct gpu_array_ref_group);
		if (!group) {
			isl_map_free(map);
			return -1;
		}
		group->local_array = local;
		group->array = local->array;
		group->access = map;
		group->write = access->write;
		group->exact_write = access->exact_write;
		group->slice = access->n_index < local->array->n_index;
		group->refs = &local->array->refs[i];
		group->n_ref = 1;

		groups[n++] = group;
	}

	return n;
}

/* Free "group" along with its tiles and access relation and return NULL.
 *
 * If group->n_ref == 1, then group->refs was set by
 * populate_array_references to point directly into
 * group->array->refs and it should not be freed.
 * If group->n_ref > 1, then group->refs was set by join_groups
 * to point to a newly allocated array, which is freed here.
 */
struct gpu_array_ref_group *gpu_array_ref_group_free(
	struct gpu_array_ref_group *group)
{
	if (group) {
		gpu_array_tile_free(group->shared_tile);
		gpu_array_tile_free(group->private_tile);
		isl_map_free(group->access);
		if (group->n_ref > 1)
			free(group->refs);
		free(group);
	}

	return NULL;
}

/* Do the access relations of group1 and group2 overlap within
 * copy_sched?
 * Return -1 on error.
 */
static int accesses_overlap(struct gpu_array_ref_group *group1,
	struct gpu_array_ref_group *group2)
{
	int disjoint = isl_map_is_disjoint(group1->access, group2->access);

	return disjoint < 0 ? -1 : !disjoint;
}

/* Create a new group containing the references of both "group1" and
 * "group2".  The two input groups are left untouched.
 * Return NULL if either input is NULL or if an allocation fails.
 */
static struct gpu_array_ref_group *join_groups(
	struct gpu_array_ref_group *group1,
	struct gpu_array_ref_group *group2)
{
	int pos;
	isl_ctx *ctx;
	struct gpu_array_ref_group *joined;

	if (!group1 || !group2)
		return NULL;

	ctx = isl_map_get_ctx(group1->access);
	joined = isl_calloc_type(ctx, struct gpu_array_ref_group);
	if (!joined)
		return NULL;

	joined->local_array = group1->local_array;
	joined->array = group1->array;
	joined->access = isl_map_union(isl_map_copy(group1->access),
					isl_map_copy(group2->access));
	joined->write = group1->write || group2->write;
	joined->exact_write = group1->exact_write && group2->exact_write;
	joined->slice = group1->slice || group2->slice;
	joined->n_ref = group1->n_ref + group2->n_ref;
	joined->refs = isl_alloc_array(ctx, struct gpu_stmt_access *,
					joined->n_ref);
	if (!joined->refs)
		return gpu_array_ref_group_free(joined);

	/* The references of group1 come first, followed by those of group2. */
	for (pos = 0; pos < joined->n_ref; ++pos) {
		if (pos < group1->n_ref)
			joined->refs[pos] = group1->refs[pos];
		else
			joined->refs[pos] = group2->refs[pos - group1->n_ref];
	}

	return joined;
}

/* Merge "group1" and "group2" into a single new group and
 * free the two original groups, also if the merge fails.
 */
static struct gpu_array_ref_group *join_groups_and_free(
	struct gpu_array_ref_group *group1,
	struct gpu_array_ref_group *group2)
{
	struct gpu_array_ref_group *joined = join_groups(group1, group2);

	gpu_array_ref_group_free(group1);
	gpu_array_ref_group_free(group2);

	return joined;
}

/* Print a message on stdout stating that the array reference group
 * with access relation "access" is not mapped to shared memory
 * in "kernel" because it does not exhibit any reuse and
 * is considered to be coalesced.
 */
static void report_no_reuse_and_coalesced(struct ppcg_kernel *kernel,
	__isl_keep isl_union_map *access)
{
	isl_printer *printer;

	printer = isl_printer_to_file(isl_union_map_get_ctx(access), stdout);
	printer = isl_printer_print_str(printer, "Array reference group ");
	printer = isl_printer_print_union_map(printer, access);
	printer = isl_printer_print_str(printer,
	    " not considered for mapping to shared memory in kernel");
	printer = isl_printer_print_int(printer, kernel->id);
	printer = isl_printer_print_str(printer,
	    " because it exhibits no reuse and is considered to be coalesced");
	printer = isl_printer_end_line(printer);
	isl_printer_free(printer);
}

/* Given an access relation in terms of the data->thread_depth initial
 * dimensions of the computed schedule and the thread identifiers
 * (as parameters), does the use of the corresponding private tile
 * require unrolling?
 *
 * No unrolling is required if the private tile is created because
 * we are forced to.
 * Otherwise, unrolling is required if "access" is not bijective.
 * Note that the access relation has already been determined to be
 * bijective before the introduction of the thread identifiers and
 * the removal of the schedule dimensions that are mapped to these threads.
 * If the access relation is no longer bijective, then more than one value
 * of one of those schedule dimensions is mapped to the same thread and
 * unrolling is therefore required.
 *
 * Return -1 on error.
 */
static int check_requires_unroll(struct gpu_group_data *data,
	__isl_keep isl_map *access, int force_private)
{
	int bijective;

	if (force_private)
		return 0;

	bijective = access_is_bijective(data, access);

	return bijective < 0 ? -1 : !bijective;
}

/* Express the domain of "access" in terms of the outer
 * data->shared_depth schedule dimensions.
 * If data->shared_depth is equal to data->thread_depth, then
 * the result is already available in group->access and a copy
 * of that relation is returned instead.
 */
static __isl_give isl_map *shared_access(struct gpu_array_ref_group *group,
	__isl_keep isl_union_map *access, struct gpu_group_data *data)
{
	isl_union_map *scheduled;

	if (data->shared_depth != data->thread_depth) {
		scheduled = isl_union_map_copy(access);
		scheduled = isl_union_map_apply_domain(scheduled,
				isl_union_map_copy(data->shared_sched));
		return isl_map_from_union_map(scheduled);
	}

	return isl_map_copy(group->access);
}

/* Compute the private and/or shared memory tiles for the array
 * reference group "group" of array "array".
 * Return isl_stat_ok on success and isl_stat_error on error.
 *
 * If the array is a read-only scalar or if the user requested
 * not to use shared or private memory, then we do not need to do anything.
 *
 * If any reference in the reference group accesses more than one element,
 * then we would have to make sure that the layout in shared memory
 * is the same as that in global memory.  Since we do not handle this yet
 * (and it may not even be possible), we refuse to map to private or
 * shared memory in such cases.
 *
 * If the array group involves any may writes (that are not must writes),
 * then we would have to make sure that we load the data into shared/private
 * memory first in case the data is not written by the kernel
 * (but still written back out to global memory).
 * Since we don't have any such mechanism at the moment, we don't
 * compute shared/private tiles for groups involving may writes.
 *
 * We only try to compute a shared memory tile if there is any reuse
 * or if the access is not coalesced.
 * Reuse and coalescing are checked within the given kernel.
 *
 * For computing a private memory tile, we also require that there is
 * some reuse.  Moreover, we require that the access is private
 * to the thread.  That is, we check that any given array element
 * is only accessed by a single thread.
 * We compute an access relation that maps the outer
 * data->thread_depth + data->n_thread schedule dimensions.
 * The latter data->n_thread will be mapped to thread identifiers.
 * We actually check that those iterators that will be wrapped
 * partition the array space.  This check is stricter than necessary
 * since several iterations may be mapped onto the same thread
 * and then they could be allowed to access the same memory elements,
 * but our check does not allow this situation.
 *
 * For private memory tiles, the number of schedule dimensions that
 * affect the offset is computed and stored in tile->depth, with
 * a lower bound of data->kernel_depth.  If this depth is smaller
 * than the minimal depth that still ensures that every element
 * is accessed by a single thread, then the depth is raised
 * to this minimal depth.
 * The fields of the tile are then adjusted to only refer to the tile->depth
 * outer schedule dimensions.
 *
 * We also check that the index expression only depends on parallel
 * loops.  That way, we can move those loops innermost and unroll them.
 * Again, we use a test that is stricter than necessary.
 * We actually check whether the index expression only depends
 * on the iterators that are wrapped over the threads.
 * These are necessarily parallel, but there may be more parallel loops.
 *
 * Combining the injectivity of the first test with the single-valuedness
 * of the second test, we simply test for bijectivity.
 *
 * If the use of the private tile requires unrolling, but some
 * of the other arrays are forcibly mapped to private memory,
 * then we do not allow the use of this private tile since
 * we cannot move the schedule dimensions that need to be unrolled down
 * without performing some kind of expansion on those arrays
 * that are forcibly mapped to private memory.
 *
 * If the array is marked force_private, then we bypass all checks
 * and assume we can (and should) use registers only.
 *
 * If it turns out we can (or have to) use registers, we compute
 * the private memory tile size using can_tile, after introducing a dependence
 * on the thread indices.
 */
static isl_stat compute_group_bounds_core(struct ppcg_kernel *kernel,
	struct gpu_array_ref_group *group, struct gpu_group_data *data)
{
	isl_ctx *ctx = isl_space_get_ctx(group->array->space);
	isl_union_map *access, *local;
	int n_index = group->array->n_index;
	int no_reuse, coalesced = 0;
	int bijective;
	isl_map *acc;
	int force_private = group->local_array->force_private;
	int use_shared = !force_private && kernel->options->use_shared_memory &&
				data->n_thread > 0;
	int use_private = force_private || kernel->options->use_private_memory;
	isl_stat r = isl_stat_ok;
	isl_bool ok;
	int requires_unroll;
	int unique_depth;

	if (!use_shared && !use_private)
		return isl_stat_ok;
	if (gpu_array_is_read_only_scalar(group->array))
		return isl_stat_ok;
	if (!force_private && !group->exact_write)
		return isl_stat_ok;
	if (group->slice)
		return isl_stat_ok;

	access = gpu_array_ref_group_access_relation(group, 1, 1);
	local = localize_access(data, isl_union_map_copy(access));
	no_reuse = isl_union_map_is_injective(local);
	if (no_reuse < 0)
		r = isl_stat_error;
	if (r >= 0 && use_shared && no_reuse) {
		coalesced = access_is_coalesced(data, local);
		/* A failed check should not be mistaken for "coalesced". */
		if (coalesced < 0)
			r = isl_stat_error;
	}
	isl_union_map_free(local);

	if (r >= 0 && kernel->options->debug->verbose &&
	    use_shared && no_reuse && coalesced)
		report_no_reuse_and_coalesced(kernel, access);

	/* Only try to compute a shared memory tile if there is any reuse
	 * or if the access is not coalesced (and no error occurred so far).
	 */
	if (r >= 0 && use_shared && (!no_reuse || !coalesced)) {
		group->shared_tile = gpu_array_tile_create(ctx,
							group->array->n_index);
		acc = shared_access(group, access, data);
		ok = can_tile(acc, group->shared_tile);
		if (ok < 0)
			r = isl_stat_error;
		else if (!ok)
			group->shared_tile =
					gpu_array_tile_free(group->shared_tile);
		isl_map_free(acc);
	}

	if (r < 0 || (!force_private && (!use_private || no_reuse))) {
		isl_union_map_free(access);
		return r;
	}

	access = isl_union_map_apply_domain(access,
					isl_union_map_copy(data->thread_sched));

	acc = isl_map_from_union_map(access);

	if (!force_private) {
		bijective = access_is_bijective(data, acc);
		if (bijective < 0 || !bijective) {
			isl_map_free(acc);
			/* Report a failed check instead of continuing
			 * as if the access were bijective.
			 */
			return bijective < 0 ? isl_stat_error : isl_stat_ok;
		}
	}

	unique_depth = compute_accessed_by_single_thread_depth(data, acc);

	acc = isl_map_intersect_domain(acc, isl_set_copy(data->privatization));
	acc = isl_map_project_out(acc, isl_dim_in, data->thread_depth,
								data->n_thread);
	requires_unroll = check_requires_unroll(data, acc, force_private);
	if (unique_depth < 0 || requires_unroll < 0 ||
	    (requires_unroll && kernel->any_force_private)) {
		isl_map_free(acc);
		/* Both a failed depth computation and a failed unroll check
		 * are errors; only the forced-private conflict is a regular
		 * "no private tile" outcome.
		 */
		if (unique_depth < 0 || requires_unroll < 0)
			return isl_stat_error;
		return isl_stat_ok;
	}

	group->private_tile = gpu_array_tile_create(ctx, n_index);
	if (!group->private_tile) {
		isl_map_free(acc);
		return isl_stat_error;
	}
	group->private_tile->requires_unroll = requires_unroll;
	ok = can_tile(acc, group->private_tile);
	if (ok >= 0 && !ok)
		group->private_tile = gpu_array_tile_free(group->private_tile);
	isl_map_free(acc);
	if (ok < 0)
		return isl_stat_error;

	if (group->private_tile) {
		struct gpu_array_tile *tile = group->private_tile;
		int tile_depth = compute_tile_depth(data, tile);

		/* Make sure every element is still accessed
		 * by a single thread at the selected depth.
		 */
		if (tile_depth < unique_depth)
			tile_depth = unique_depth;
		if (tile_adjust_depth(tile, tile_depth) < 0)
			return isl_stat_error;
	}

	if (force_private && !group->private_tile)
		isl_die(ctx, isl_error_internal,
			"unable to map array reference group to registers",
			return isl_stat_error);

	return isl_stat_ok;
}

/* Compute the private and/or shared memory tiles for the array
 * reference group "group" and set the tile depth afterwards.
 * Return 0 on success and -1 on error (including a NULL "group").
 */
static int compute_group_bounds(struct ppcg_kernel *kernel,
	struct gpu_array_ref_group *group, struct gpu_group_data *data)
{
	if (!group)
		return -1;

	if (compute_group_bounds_core(kernel, group, data) < 0 ||
	    set_depth(data, group) < 0)
		return -1;

	return 0;
}

/* If two groups have overlapping access relations (as determined by
 * the "overlap" function) and if one of them involves a write,
 * then merge the two groups into one.
 * If "compute_bounds" is set, then call compute_group_bounds
 * on the merged groups.
 * If any group is merged into the current group, then its access
 * relation may have changed or it may have been turned into a write.
 * The combined group might therefore overlap with groups that
 * the original group did not overlap with.  The groups therefore
 * need to be checked again.
 *
 * "overlap" is expected to return a positive value if the groups overlap,
 * zero if they do not and a negative value on error.
 *
 * Return the updated number of groups.
 * Return -1 on error.
 */
static int group_writes(struct ppcg_kernel *kernel,
	int n, struct gpu_array_ref_group **groups,
	int (*overlap)(struct gpu_array_ref_group *group1,
		struct gpu_array_ref_group *group2), int compute_bounds,
	struct gpu_group_data *data)
{
	int i, j;
	int any_merge;

	/* Only move on to the next group if nothing was merged
	 * into the current one.
	 */
	for (i = 0; i < n; i += !any_merge) {
		any_merge = 0;
		for (j = n - 1; j > i; --j) {
			int overlapping;

			if (!groups[i]->write && !groups[j]->write)
				continue;

			/* A failed overlap check is an error,
			 * not a reason to merge the groups.
			 */
			overlapping = overlap(groups[i], groups[j]);
			if (overlapping < 0)
				return -1;
			if (!overlapping)
				continue;

			any_merge = 1;
			groups[i] = join_groups_and_free(groups[i], groups[j]);
			/* Fill the hole with the last group and
			 * clear the vacated slot.
			 */
			if (j != n - 1)
				groups[j] = groups[n - 1];
			groups[n - 1] = NULL;
			n--;

			if (!groups[i])
				return -1;
			if (compute_bounds &&
			    compute_group_bounds(kernel, groups[i], data) < 0)
				return -1;
		}
	}

	return n;
}

/* Merge any two groups that have overlapping access relations
 * (within the innermost loop) if at least one of them involves a write.
 * No tiles are computed for the merged groups.
 *
 * Return the updated number of groups.
 */
static int group_overlapping_writes(struct ppcg_kernel *kernel,
	int n, struct gpu_array_ref_group **groups,
	struct gpu_group_data *data)
{
	const int compute_bounds = 0;

	return group_writes(kernel, n, groups, accesses_overlap,
				compute_bounds, data);
}

/* Check if the access relations of group1 and group2 overlap within
 * the outermost min(group1->min_depth, group2->min_depth) loops.
 *
 * The schedule dimensions beyond that depth are eliminated from both
 * access relations before they are intersected.
 *
 * Return 1 if the accesses overlap, 0 if they do not and -1 on error.
 */
static int depth_accesses_overlap(struct gpu_array_ref_group *group1,
	struct gpu_array_ref_group *group2)
{
	int depth;
	int dim;
	int empty;
	isl_map *map_i, *map_j, *map;

	depth = group1->min_depth;
	if (group2->min_depth < depth)
		depth = group2->min_depth;
	map_i = isl_map_copy(group1->access);
	dim = isl_map_dim(map_i, isl_dim_in);
	map_i = isl_map_eliminate(map_i, isl_dim_in, depth, dim - depth);
	map_j = isl_map_copy(group2->access);
	map_j = isl_map_eliminate(map_j, isl_dim_in, depth, dim - depth);
	map = isl_map_intersect(map_i, map_j);
	empty = isl_map_is_empty(map);
	isl_map_free(map);

	/* Do not report a failed emptiness check as "no overlap". */
	if (empty < 0)
		return -1;

	return !empty;
}

/* Merge any two groups that have overlapping access relations
 * (within the outer depth loops) if at least one of them involves a write.
 * The tiles of the merged groups are recomputed.
 *
 * Return the updated number of groups.
 */
static int group_depth_overlapping_writes(struct ppcg_kernel *kernel,
	int n, struct gpu_array_ref_group **groups, struct gpu_group_data *data)
{
	const int compute_bounds = 1;

	return group_writes(kernel, n, groups, depth_accesses_overlap,
				compute_bounds, data);
}

/* Is the size of the tile specified by "tile" smaller than the sum of
 * the sizes of the tiles specified by "tile1" and "tile2"?
 *
 * That is, is size(tile) - size(tile1) - size(tile2) negative?
 */
static int smaller_tile(struct gpu_array_tile *tile,
	struct gpu_array_tile *tile1, struct gpu_array_tile *tile2)
{
	int negative;
	isl_val *diff;

	diff = gpu_array_tile_size(tile);
	diff = isl_val_sub(diff, gpu_array_tile_size(tile1));
	diff = isl_val_sub(diff, gpu_array_tile_size(tile2));
	negative = isl_val_is_neg(diff);
	isl_val_free(diff);

	return negative;
}

/* Given an initial grouping of array references and shared memory tiles
 * for each group that allows for a shared memory tile, merge two groups
 * if both have a shared memory tile, the merged group also has
 * a shared memory tile and the size of the tile for the merge group
 * is smaller than the sum of the tile sizes of the individual groups.
 * If any group is merged into the current group, then it may become
 * profitable to combine it with groups that were considered before
 * the merge.  The groups are therefore checked again after a merge.
 *
 * If merging two groups decreases the depth of the tile of
 * one or both of the two groups, then we need to check for overlapping
 * writes again.
 *
 * Return the number of groups after merging.
 * Return -1 on error.
 */
static int group_common_shared_memory_tile(struct ppcg_kernel *kernel,
	struct gpu_array_info *array, int n,
	struct gpu_array_ref_group **groups, struct gpu_group_data *data)
{
	int i, j;
	int recompute_overlap = 0;
	int any_merge;

	for (i = 0; i < n; i += !any_merge) {
		any_merge = 0;
		if (!groups[i]->shared_tile)
			continue;
		for (j = n - 1; j > i; --j) {
			struct gpu_array_ref_group *group;

			if (!groups[j]->shared_tile)
				continue;

			if (!depth_accesses_overlap(groups[i], groups[j]))
				continue;

			group = join_groups(groups[i], groups[j]);
			if (compute_group_bounds(kernel, group, data) < 0) {
				gpu_array_ref_group_free(group);
				return -1;
			}
			if (!group->shared_tile ||
			    !smaller_tile(group->shared_tile,
					groups[i]->shared_tile,
					groups[j]->shared_tile)) {
				gpu_array_ref_group_free(group);
				continue;
			}

			any_merge = 1;
			if (group->min_depth < groups[i]->min_depth ||
			    group->min_depth < groups[j]->min_depth)
				recompute_overlap = 1;
			gpu_array_ref_group_free(groups[i]);
			gpu_array_ref_group_free(groups[j]);
			groups[i] = group;
			if (j != n - 1)
				groups[j] = groups[n - 1];
			n--;
		}
	}

	if (recompute_overlap)
		n = group_depth_overlapping_writes(kernel, n, groups, data);
	return n;
}

/* Store "n" and "groups" in array->n_group and array->groups.
 *
 * The "nr" field of each group is set to its position in "groups".
 */
static void set_array_groups(struct gpu_local_array_info *array,
	int n, struct gpu_array_ref_group **groups)
{
	int nr;

	array->n_group = n;
	array->groups = groups;

	for (nr = 0; nr < n; nr++)
		groups[nr]->nr = nr;
}

/* Fold every group in "groups" into groups[0], starting from the last one,
 * and return the number of groups that remain.
 * The slots of the groups that have been folded away are set to NULL.
 * If "n" is at most one, then nothing is changed and "n" is returned.
 */
static int join_all_groups(int n, struct gpu_array_ref_group **groups)
{
	while (n > 1) {
		--n;
		groups[0] = join_groups_and_free(groups[0], groups[n]);
		groups[n] = NULL;
	}

	return n;
}

/* Partition the references to the array "local" into groups that
 * are considered together when deciding whether to access them
 * from private, shared or global memory.
 * Return 0 on success and -1 on error.
 *
 * Two references end up in the same group if they overlap and
 * at least one of them is a write.  A first grouping looks only
 * at the access relations.  Once the group bounds (and hence the tiles)
 * have been computed, overlapping writes are checked again,
 * this time taking into account the depth of the effective tile.
 * Finally, groups with a shared memory tile are merged if
 * the combination also admits a shared memory tile.
 *
 * Arrays of structures are not mapped to private or shared memory,
 * so for those a single group is formed without looking for tiles,
 * unless the array is required to be mapped to private memory.
 */
static int group_array_references(struct ppcg_kernel *kernel,
	struct gpu_local_array_info *local, struct gpu_group_data *data)
{
	int pos;
	int n_group;
	isl_ctx *ctx = isl_union_map_get_ctx(data->shared_sched);
	struct gpu_array_ref_group **groups;

	groups = isl_calloc_array(ctx, struct gpu_array_ref_group *,
					local->array->n_ref);
	if (!groups)
		return -1;

	n_group = populate_array_references(local, groups, data);

	if (!local->array->has_compound_element || local->force_private) {
		n_group = group_overlapping_writes(kernel, n_group,
						   groups, data);

		/* A failure to compute the bounds of any group
		 * invalidates the entire grouping.
		 */
		for (pos = 0; pos < n_group; ++pos) {
			if (compute_group_bounds(kernel, groups[pos],
						 data) >= 0)
				continue;
			n_group = -1;
			break;
		}

		n_group = group_depth_overlapping_writes(kernel, n_group,
							 groups, data);
		n_group = group_common_shared_memory_tile(kernel,
					local->array, n_group, groups, data);

		set_array_groups(local, n_group, groups);
		if (n_group >= 0)
			return 0;

		/* On error, release whatever is left in any of the slots. */
		for (pos = 0; pos < local->array->n_ref; ++pos)
			gpu_array_ref_group_free(groups[pos]);
		return -1;
	}

	n_group = join_all_groups(n_group, groups);
	set_array_groups(local, n_group, groups);
	return 0;
}

/* For each array in the input program that can be mapped to private memory,
 * check if there are any order dependences active inside the current kernel,
 * within the same iteration of the host schedule, i.e., the prefix
 * schedule at "node".
 * If so, mark the array as force_private so that its reference groups will be
 * mapped to registers.
 *
 * Nothing is done unless the live_range_reordering option is set.
 *
 * Note that the arrays that cannot be mapped to private memory have
 * had their order dependences added to prog->array_order and
 * subsequently to the coincidence constraints.
 */
static void check_can_be_private_live_ranges(struct ppcg_kernel *kernel,
	__isl_keep isl_schedule_node *node)
{
	int pos;
	isl_union_set *universe;
	isl_multi_union_pw_aff *host_prefix;

	if (!kernel->options->live_range_reordering)
		return;

	kernel->any_force_private = 0;

	/* Express the prefix schedule in terms of the expanded domain. */
	host_prefix =
	    isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node);
	host_prefix = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(
			host_prefix,
			isl_union_pw_multi_aff_copy(kernel->contraction));
	universe = isl_union_set_universe(
			isl_union_set_copy(kernel->expanded_domain));

	for (pos = 0; pos < kernel->n_array; ++pos) {
		struct gpu_local_array_info *local = &kernel->array[pos];
		isl_union_map *order;
		isl_bool empty;

		local->force_private = 0;
		if (!gpu_array_can_be_private(local->array))
			continue;

		/* Restrict the order dependences to the statements of
		 * this kernel and to pairs of instances that share
		 * the same value of the prefix schedule.
		 */
		order = isl_union_map_copy(local->array->dep_order);
		order = isl_union_map_intersect_domain(order,
					isl_union_set_copy(universe));
		order = isl_union_map_intersect_range(order,
					isl_union_set_copy(universe));
		order = isl_union_map_eq_at_multi_union_pw_aff(order,
				isl_multi_union_pw_aff_copy(host_prefix));
		empty = isl_union_map_is_empty(order);
		isl_union_map_free(order);

		if (!empty) {
			local->force_private = 1;
			kernel->any_force_private = 1;
		}
	}

	isl_multi_union_pw_aff_free(host_prefix);
	isl_union_set_free(universe);
}

/* Return the schedule "s" with its domain expanded by
 * taking the preimage under "contraction".
 * "s" is consumed, while "contraction" is left untouched.
 */
static __isl_give isl_union_map *expand(__isl_take isl_union_map *s,
	__isl_keep isl_union_pw_multi_aff *contraction)
{
	return isl_union_map_preimage_domain_union_pw_multi_aff(s,
			isl_union_pw_multi_aff_copy(contraction));
}

/* Construct a set with data->thread_depth + data->n_thread dimensions
 * in which each of the final data->n_thread dimensions, taken modulo
 * the corresponding kernel->block_dim size, is equal to
 * the corresponding thread identifier.
 * The result is stored in data->privatization.
 *
 * The parameter space is taken from kernel->thread_filter,
 * which is known to reference all thread identifiers.
 */
static void compute_privatization(struct gpu_group_data *data,
	struct ppcg_kernel *kernel)
{
	int t;
	isl_ctx *ctx = isl_union_map_get_ctx(data->shared_sched);
	isl_space *space;
	isl_local_space *ls;
	isl_set *set;

	space = isl_union_set_get_space(kernel->thread_filter);
	space = isl_space_set_from_params(space);
	space = isl_space_add_dims(space, isl_dim_set,
				    data->thread_depth + data->n_thread);
	set = isl_set_universe(space);
	ls = isl_local_space_from_space(isl_set_get_space(set));

	/* Stop adding constraints as soon as the set has become invalid. */
	for (t = 0; t < data->n_thread && set; ++t) {
		isl_aff *residue, *thread_id;
		isl_id *id;
		int param_pos;

		/* Schedule dimension modulo the block size ... */
		residue = isl_aff_var_on_domain(isl_local_space_copy(ls),
					isl_dim_set, data->thread_depth + t);
		residue = isl_aff_mod_val(residue,
			isl_val_int_from_si(ctx, kernel->block_dim[t]));

		/* ... equals the parameter holding the thread identifier. */
		id = isl_id_list_get_id(kernel->thread_ids, t);
		param_pos = isl_set_find_dim_by_id(set, isl_dim_param, id);
		isl_id_free(id);
		thread_id = isl_aff_var_on_domain(isl_local_space_copy(ls),
					isl_dim_param, param_pos);

		set = isl_set_add_constraint(set,
			isl_equality_from_aff(isl_aff_sub(residue, thread_id)));
	}

	isl_local_space_free(ls);
	data->privatization = set;
}

/* Extract the prefix schedule at "node" as a relation between
 * domain elements and schedule dimensions and return it
 * after having detected the equalities in this relation.
 */
static __isl_give isl_union_map *prefix_with_equalities(
	__isl_keep isl_schedule_node *node)
{
	return isl_union_map_detect_equalities(
		isl_schedule_node_get_prefix_schedule_relation(node));
}

/* Group references of all arrays in "kernel".
 * "node" points to the kernel mark.
 * The mapping to shared memory is computed at the "shared" mark.
 *
 * We first extract all required schedule information into
 * a gpu_group_data structure and then consider each array
 * in turn.
 *
 * Return 0 on success and the (negative) result of the first failing
 * call to group_array_references otherwise.
 */
int gpu_group_references(struct ppcg_kernel *kernel,
	__isl_keep isl_schedule_node *node)
{
	int i;
	int r = 0;
	isl_union_pw_multi_aff *contraction;
	struct gpu_group_data data;

	check_can_be_private_live_ranges(kernel, node);

	data.scop = kernel->prog->scop;

	/* Schedule information at the kernel mark itself. */
	data.kernel_depth = isl_schedule_node_get_schedule_depth(node);
	data.host_sched = isl_schedule_node_get_prefix_schedule_relation(node);

	/* "node" is only kept by this function, so move around
	 * in a private copy, which is freed again further down.
	 */
	node = isl_schedule_node_copy(node);
	node = gpu_tree_move_down_to_shared(node, kernel->core);
	data.shared_depth = isl_schedule_node_get_schedule_depth(node);
	data.shared_sched = prefix_with_equalities(node);

	/* Schedule information at the band below the "thread" mark. */
	node = gpu_tree_move_down_to_thread(node, kernel->core);
	node = isl_schedule_node_child(node, 0);
	data.thread_depth = isl_schedule_node_get_schedule_depth(node);
	data.n_thread = isl_schedule_node_band_n_member(node);
	/* If the two depths are equal, then reuse the shared schedule
	 * instead of extracting the prefix schedule again.
	 */
	if (data.thread_depth == data.shared_depth)
		data.copy_sched = isl_union_map_copy(data.shared_sched);
	else
		data.copy_sched = prefix_with_equalities(node);
	data.thread_sched = isl_union_map_copy(data.copy_sched);
	data.thread_sched = isl_union_map_flat_range_product(data.thread_sched,
		isl_schedule_node_band_get_partial_schedule_union_map(node));
	data.thread_sched = isl_union_map_detect_equalities(data.thread_sched);

	/* Expand the domains of all schedules using the kernel contraction.
	 * When the depths are equal, copy_sched is replaced by a copy
	 * of the expanded shared_sched rather than being expanded itself.
	 */
	contraction = isl_union_pw_multi_aff_copy(kernel->contraction);
	data.host_sched = expand(data.host_sched, contraction);
	data.shared_sched = expand(data.shared_sched, contraction);
	if (data.thread_depth == data.shared_depth) {
		isl_union_map_free(data.copy_sched);
		data.copy_sched = isl_union_map_copy(data.shared_sched);
	} else {
		data.copy_sched = expand(data.copy_sched, contraction);
	}
	data.thread_sched = expand(data.thread_sched, contraction);
	isl_union_pw_multi_aff_free(contraction);

	/* The full schedule is the thread schedule combined with
	 * the schedule of the subtree below the thread band.
	 */
	node = isl_schedule_node_child(node, 0);
	data.full_sched = isl_union_map_copy(data.thread_sched);
	data.full_sched = isl_union_map_flat_range_product(data.full_sched,
		isl_schedule_node_get_subtree_schedule_union_map(node));
	isl_schedule_node_free(node);

	compute_privatization(&data, kernel);

	/* Stop at the first array for which the grouping fails. */
	for (i = 0; i < kernel->n_array; ++i) {
		r = group_array_references(kernel, &kernel->array[i], &data);
		if (r < 0)
			break;
	}

	isl_union_map_free(data.host_sched);
	isl_union_map_free(data.shared_sched);
	isl_union_map_free(data.copy_sched);
	isl_union_map_free(data.thread_sched);
	isl_union_map_free(data.full_sched);
	isl_set_free(data.privatization);

	return r;
}

/* Given a description of an array tile "tile" and the "space"
 *
 *	{ D -> A }
 *
 * where D represents the first tile->depth schedule dimensions
 * and A represents the array, construct an isl_multi_aff
 *
 *	{ [D[i] -> A[a]] -> A'[a'] }
 *
 * with A' a scaled down copy of A according to the shifts and strides
 * in "tile".  In particular,
 *
 *	a' = (a + shift(i))/stride
 *
 * "insert_array" represents
 *
 *	{ [D -> A] -> D }
 *
 * and is used to insert A into the domain of functions that only
 * reference D.
 */
static __isl_give isl_multi_aff *strided_tile(
	struct gpu_array_tile *tile, __isl_keep isl_space *space,
	__isl_keep isl_multi_aff *insert_array)
{
	int i;
	isl_multi_aff *shift;
	isl_multi_val *stride;
	isl_multi_aff *tiling;

	/* Start from an all-zero stride over the array space A and
	 * an all-zero shift over { D -> A } and fill in
	 * the per-dimension values recorded in "tile".
	 */
	stride = isl_multi_val_zero(isl_space_range(isl_space_copy(space)));
	shift = isl_multi_aff_zero(isl_space_copy(space));

	for (i = 0; i < tile->n; ++i) {
		struct gpu_array_bound *bound = &tile->bound[i];
		isl_val *stride_i;
		isl_aff *shift_i;

		stride_i = isl_val_copy(bound->stride);
		shift_i = isl_aff_copy(bound->shift);

		stride = isl_multi_val_set_val(stride, i, stride_i);
		shift = isl_multi_aff_set_aff(shift, i, shift_i);
	}

	/* The shifts are pulled back through "insert_array"
	 * such that they are defined over [D -> A], just like the tiling.
	 */
	shift = isl_multi_aff_pullback_multi_aff(shift,
				    isl_multi_aff_copy(insert_array));

	/* a' = (a + shift(i))/stride */
	tiling = isl_multi_aff_range_map(isl_space_copy(space));
	tiling = isl_multi_aff_add(tiling, shift);
	tiling = isl_multi_aff_scale_down_multi_val(tiling, stride);

	return tiling;
}

/* Compute a tiling for the array reference group "group".
 *
 * The tiling is of the form
 *
 *	{ [D[i] -> A[a]] -> T[t] }
 *
 * where D represents the first tile->depth schedule dimensions,
 * A represents the global array and T represents the shared or
 * private memory tile.  The name of T is the name of the local
 * array.
 *
 * If there is any stride in the accesses, then the mapping is
 *
 *	t = (a + shift(i))/stride - lb(i)
 *
 * otherwise, it is simply
 *
 *	t = a - lb(i)
 */
void gpu_array_ref_group_compute_tiling(struct gpu_array_ref_group *group)
{
	int pos;
	int any_shift = 0;
	struct gpu_array_tile *tile = gpu_array_ref_group_tile(group);
	isl_space *space;
	isl_multi_aff *tiling, *lower, *insert_array;
	isl_printer *printer;
	char *tile_name;

	/* Nothing to do for a group without any tile. */
	if (!tile)
		return;

	/* Construct the space { D -> A } with D of dimension tile->depth. */
	space = isl_space_range(isl_map_get_space(group->access));
	space = isl_space_from_range(space);
	space = isl_space_add_dims(space, isl_dim_in, tile->depth);
	insert_array = isl_multi_aff_domain_map(isl_space_copy(space));

	/* The strided form is only needed if some dimension has a shift. */
	for (pos = 0; pos < tile->n && !any_shift; ++pos)
		any_shift = tile->bound[pos].shift != NULL;

	if (any_shift)
		tiling = strided_tile(tile, space, insert_array);
	else
		tiling = isl_multi_aff_range_map(isl_space_copy(space));

	/* Subtract the lower bounds, expressed over [D -> A]. */
	lower = isl_multi_aff_zero(space);
	for (pos = 0; pos < tile->n; ++pos)
		lower = isl_multi_aff_set_aff(lower, pos,
					isl_aff_copy(tile->bound[pos].lb));
	lower = isl_multi_aff_pullback_multi_aff(lower, insert_array);
	tiling = isl_multi_aff_sub(tiling, lower);

	/* Name the output tuple after the group. */
	printer = isl_printer_to_str(isl_multi_aff_get_ctx(tiling));
	printer = gpu_array_ref_group_print_name(group, printer);
	tile_name = isl_printer_get_str(printer);
	isl_printer_free(printer);
	tiling = isl_multi_aff_set_tuple_name(tiling, isl_dim_out, tile_name);
	free(tile_name);

	tile->tiling = tiling;
}


================================================
FILE: src/ppcg_files/gpu_group.h
================================================
#ifndef GPU_GROUP_H
#define GPU_GROUP_H

#include <isl/schedule_node.h>
#include "gpu.h"

/* A group of array references in a kernel that should be handled together.
 * If private_tile is not NULL, then it is mapped to registers.
 * Otherwise, if shared_tile is not NULL, it is mapped to shared memory.
 * Otherwise, it is accessed from global memory.
 * Note that if both private_tile and shared_tile are set, then shared_tile
 * is only used inside group_common_shared_memory_tile.
 */
struct gpu_array_ref_group
{
	/* The references in this group access this local array. */
	struct gpu_local_array_info *local_array;
	/* This is the corresponding array. */
	struct gpu_array_info *array;
	/* Position of this group in the list of reference groups of array. */
	int nr;

	/* The following fields are used during the construction of the groups.
	 * "access" is the combined access relation relative to the private
	 * memory tiling.  In particular, the domain of the map corresponds
	 * to the first thread_depth dimensions of the kernel schedule.
	 * "write" is set if any access in the group is a write.
	 * "exact_write" is set if all writes are definite writes.
	 * "slice" is set if there is at least one access in the group
	 * that refers to more than one element.
	 * "min_depth" is the minimum of the tile depths and thread_depth.
	 */
	isl_map *access;
	int write;
	int exact_write;
	int slice;
	int min_depth;

	/* The shared memory tile, NULL if none. */
	struct gpu_array_tile *shared_tile;

	/* The private memory tile, NULL if none. */
	struct gpu_array_tile *private_tile;

	/* References in this group; point to elements of a linked list.
	 * "n_ref" is the number of entries in "refs".
	 */
	int n_ref;
	struct gpu_stmt_access **refs;
};

int gpu_group_references(struct ppcg_kernel *kernel,
												 __isl_keep isl_schedule_node *node);

__isl_give isl_printer *gpu_array_ref_group_print_name(
		struct gpu_array_ref_group *group, __isl_take isl_printer *p);
void gpu_array_ref_group_compute_tiling(struct gpu_array_ref_group *group);
__isl_give isl_union_map *gpu_array_ref_group_access_relation(
		struct gpu_array_ref_group *group, int read, int write);
int gpu_array_ref_group_requires_unroll(struct gpu_array_ref_group *group);
enum ppcg_group_access_type gpu_array_ref_group_type(
		struct gpu_array_ref_group *group);
struct gpu_array_tile *gpu_array_ref_group_tile(
		struct gpu_array_ref_group *group);
struct gpu_array_ref_group *gpu_array_ref_group_free(
		struct gpu_array_ref_group *group);

#endif


================================================
FILE: src/ppcg_files/gpu_hybrid.c
================================================
/*
 * Copyright 2013      Ecole Normale Superieure
 * Copyright 2015      Sven Verdoolaege
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege,
 * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
 */

#include <string.h>

#include <isl/val.h>
#include <isl/space.h>
#include <isl/union_set.h>
#include <isl/schedule_node.h>

#include "hybrid.h"
#include "gpu_hybrid.h"
#include "gpu_tree.h"
#include "schedule.h"
#include "util.h"

/* Is the set of domain elements that reach the "node" position
 * in the schedule tree empty, i.e., have they all been filtered out
 * on the way down?
 */
static isl_bool has_empty_domain(__isl_keep isl_schedule_node *node)
{
	isl_union_set *reaching = isl_schedule_node_get_domain(node);
	isl_bool is_empty = isl_union_set_is_empty(reaching);

	isl_union_set_free(reaching);
	return is_empty;
}

/* Given a pointer to a phase in the result of hybrid tiling,
 * map the phase to the device, provided the phase is non-empty.
 * Empty phases can occur if the input schedule domain can be
 * covered by a small number of hexagons that all belong to the same phase.
 *
 * The input has the following form:
 *
 *	M - CT - P - C - ...
 *
 * with M the phase marker, CT the space tiling, P the original
 * parent band and C the original child band.
 * The (outer dimensions of the) C band need to be mapped to threads.
 * The (outer dimension of the) CT band needs to be mapped to blocks.
 * The mapping to shared memory needs to be computed between the CT and
 * the P band.
 *
 * The C band is first shifted to start at zero.
 * Then the appropriate markers are introduced and a kernel is
 * created for the tree rooted at CT.
 * If the "unroll_gpu_tile" option is set, then the AST generator
 * is instructed to unroll the P and C bands.
 */
static __isl_give isl_schedule_node *update_phase(
	__isl_take isl_schedule_node *node, void *user)
{
	struct gpu_gen *gen = user;
	int mark_depth, kernel_depth;
	isl_ctx *ctx;
	isl_bool empty;
	ppcg_ht_phase *phase;

	/* Empty phases are left untouched. */
	empty = has_empty_domain(node);
	if (empty < 0)
		return isl_schedule_node_free(node);
	if (empty)
		return node;
	if (!node)
		return NULL;

	ctx = isl_schedule_node_get_ctx(node);
	phase = ppcg_ht_phase_extract_from_mark(node);
	mark_depth = isl_schedule_node_get_tree_depth(node);

	/* Move down from the phase mark M over CT and P to C. */
	node = isl_schedule_node_child(node, 0);
	node = isl_schedule_node_child(node, 0);
	node = isl_schedule_node_child(node, 0);

	/* Shift C and put the "thread" mark on top of it. */
	node = ppcg_ht_phase_shift_space_point(phase, node);
	if (gen->options->unroll_gpu_tile)
		node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
	node = isl_schedule_node_insert_mark(node,
				isl_id_alloc(ctx, "thread", NULL));

	/* Move up to P and put the "shared" mark on top of it. */
	node = isl_schedule_node_parent(node);
	if (gen->options->unroll_gpu_tile)
		node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
	node = isl_schedule_node_insert_mark(node,
				isl_id_alloc(ctx, "shared", NULL));

	/* Move up to CT and create the kernel there. */
	node = isl_schedule_node_parent(node);
	node = gpu_create_kernel(gen, node, 0, NULL);

	/* Return to the depth of the original phase mark. */
	kernel_depth = isl_schedule_node_get_tree_depth(node);
	return isl_schedule_node_ancestor(node, kernel_depth - mark_depth);
}

/* Apply hybrid tiling on "node" and its parent based on the (valid)
 * bounds on the relative dependence distances "bounds" and
 * the tile sizes in "tile_sizes".
 * The number of elements in "tile_sizes" is at least as large
 * as the sum of the dimensions of the parent and the child node.
 *
 * Convert the tile_sizes to an isl_multi_val in the right space,
 * insert the hybrid tiling and then create a kernel inside each phase.
 * Finally, remove the phase marks.
 */
__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen,
	__isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds,
	int *tile_sizes)
{
	isl_multi_val *sizes;
	isl_space *parent_space, *child_space;

	if (node && bounds) {
		/* The tile sizes live in the product of the spaces of
		 * the parent band and the child band, in that order.
		 */
		child_space = isl_schedule_node_band_get_space(node);
		node = isl_schedule_node_parent(node);
		parent_space = isl_schedule_node_band_get_space(node);
		sizes = ppcg_multi_val_from_int_list(
			isl_space_product(parent_space, child_space),
			tile_sizes);

		node = ppcg_ht_bounds_insert_tiling(bounds, sizes, node,
						    gen->options);
		/* Create a kernel inside each phase and drop the marks. */
		node = hybrid_tile_foreach_phase(node, &update_phase, gen);
		return hybrid_tile_drop_phase_marks(node);
	}

	/* Invalid input: release both arguments. */
	isl_schedule_node_free(node);
	ppcg_ht_bounds_free(bounds);
	return NULL;
}


================================================
FILE: src/ppcg_files/gpu_hybrid.h
================================================
#ifndef GPU_HYBRID_H
#define GPU_HYBRID_H

#include <isl/schedule_node.h>

#include "gpu.h"
#include "hybrid.h"

__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen,
																							__isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds,
																							int *tile_sizes);

#endif


================================================
FILE: src/ppcg_files/gpu_print.c
================================================
/*
 * Copyright 2012      Ecole Normale Superieure
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege,
 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
 */

#include <string.h>

#include <isl/aff.h>

#include "gpu_print.h"
#include "print.h"
#include "schedule.h"

/* Print a declaration on "p" for each array of "prog" that has
 * its declare_local field set, i.e., arrays that are local to "prog"
 * but that are used on the host and therefore require a declaration.
 * If "prog" is NULL, then "p" is freed and NULL is returned.
 */
__isl_give isl_printer *gpu_print_local_declarations(__isl_take isl_printer *p,
	struct gpu_prog *prog)
{
	int idx;

	if (!prog)
		return isl_printer_free(p);

	for (idx = 0; idx < prog->n_array; ++idx) {
		struct gpu_array_info *array = &prog->array[idx];

		if (array->declare_local)
			p = ppcg_print_declaration_with_size(p, array->type,
						array->declared_size);
	}

	return p;
}

/* Print an expression for the size of "array" in bytes:
 * the product of the parenthesized bounds in each of the index dimensions
 * and the size of the element type.
 */
__isl_give isl_printer *gpu_array_info_print_size(__isl_take isl_printer *prn,
	struct gpu_array_info *array)
{
	int dim = 0;

	while (dim < array->n_index) {
		isl_ast_expr *extent;

		/* Argument 0 of bound_expr is skipped; the bounds start at 1. */
		prn = isl_printer_print_str(prn, "(");
		extent = isl_ast_expr_get_op_arg(array->bound_expr, 1 + dim);
		prn = isl_printer_print_ast_expr(prn, extent);
		isl_ast_expr_free(extent);
		prn = isl_printer_print_str(prn, ") * ");
		++dim;
	}

	prn = isl_printer_print_str(prn, "sizeof(");
	prn = isl_printer_print_str(prn, array->type);
	return isl_printer_print_str(prn, ")");
}

/* Print the declaration of a non-linearized array argument:
 * the element type, a space and the bound expression of "array".
 */
static __isl_give isl_printer *print_non_linearized_declaration_argument(
	__isl_take isl_printer *p, struct gpu_array_info *array)
{
	p = isl_printer_print_str(p, array->type);
	p = isl_printer_print_str(p, " ");
	return isl_printer_print_ast_expr(p, array->bound_expr);
}

/* Print the declaration of the argument corresponding to "array".
 * "memory_space", if not NULL, is printed as a memory space prefix.
 *
 * A read-only scalar is passed by value and gets no prefix.
 * A non-linearized array with at least one index is printed
 * with its bounds.  Anything else is printed as a pointer.
 */
__isl_give isl_printer *gpu_array_info_print_declaration_argument(
	__isl_take isl_printer *p, struct gpu_array_info *array,
	const char *memory_space)
{
	const int by_value = gpu_array_is_read_only_scalar(array);

	if (!by_value && memory_space) {
		p = isl_printer_print_str(p, memory_space);
		p = isl_printer_print_str(p, " ");
	}

	if (!by_value && array->n_index != 0 && !array->linearize)
		return print_non_linearized_declaration_argument(p, array);

	p = isl_printer_print_str(p, array->type);
	p = isl_printer_print_str(p, " ");
	if (!by_value)
		p = isl_printer_print_str(p, "*");
	return isl_printer_print_str(p, array->name);
}

/* Print the argument that is passed for "array" in a call:
 * the plain name for a read-only scalar and
 * the name prefixed by "dev_" for anything else.
 */
__isl_give isl_printer *gpu_array_info_print_call_argument(
	__isl_take isl_printer *p, struct gpu_array_info *array)
{
	if (!gpu_array_is_read_only_scalar(array))
		p = isl_printer_print_str(p, "dev_");

	return isl_printer_print_str(p, array->name);
}

/* Print the access to the element in the private/shared memory copy
 * of the copy statement "stmt".  This access is available
 * as an AST expression in stmt->u.c.local_index.
 */
static __isl_give isl_printer *stmt_print_local_index(__isl_take isl_printer *p,
	struct ppcg_kernel_stmt *stmt)
{
	isl_ast_expr *local = stmt->u.c.local_index;

	return isl_printer_print_ast_expr(p, local);
}

/* Print the access to the element in the global memory copy
 * of the copy statement "stmt".
 *
 * For a non-scalar array, the access is available as an AST expression
 * in stmt->u.c.index.  A scalar is printed by name, dereferenced
 * unless it is a read-only scalar.
 */
static __isl_give isl_printer *stmt_print_global_index(
	__isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt)
{
	struct gpu_array_info *array = stmt->u.c.array;
	isl_ast_expr *index;

	if (!gpu_array_is_scalar(array)) {
		index = isl_ast_expr_copy(stmt->u.c.index);
		p = isl_printer_print_ast_expr(p, index);
		isl_ast_expr_free(index);
		return p;
	}

	if (!gpu_array_is_read_only_scalar(array))
		p = isl_printer_print_str(p, "*");
	return isl_printer_print_str(p, array->name);
}

/* Print the copy statement "stmt" on a line of its own.
 *
 * A read copies from global memory into the local copy
 *
 *	local = global;
 *
 * while a write copies in the opposite direction
 *
 *	global = local;
 */
__isl_give isl_printer *ppcg_kernel_print_copy(__isl_take isl_printer *p,
	struct ppcg_kernel_stmt *stmt)
{
	const int to_local = stmt->u.c.read;

	p = isl_printer_start_line(p);

	/* Destination of the assignment. */
	p = to_local ? stmt_print_local_index(p, stmt)
		     : stmt_print_global_index(p, stmt);
	p = isl_printer_print_str(p, " = ");
	/* Source of the assignment. */
	p = to_local ? stmt_print_global_index(p, stmt)
		     : stmt_print_local_index(p, stmt);

	p = isl_printer_print_str(p, ";");
	return isl_printer_end_line(p);
}

/* Print the body of the domain statement "stmt" on "p" by handing
 * the underlying pet statement and the reference to expression mapping
 * stmt->u.d.ref2expr to pet_stmt_print_body.
 */
__isl_give isl_printer *ppcg_kernel_print_domain(__isl_take isl_printer *p,
	struct ppcg_kernel_stmt *stmt)
{
	struct pet_stmt *body = stmt->u.d.stmt->stmt;

	return pet_stmt_print_body(body, p, stmt->u.d.ref2expr);
}

/* This function is called for each node in a GPU AST.
 * In case of a user node, print the macro definitions required
 * for printing the AST expressions in the annotation, if any.
 * For other nodes, return true such that descendants are also
 * visited.
 *
 * In particular, for a kernel launch, print the macro definitions
 * needed for the grid size.
 * For a copy statement, print the macro definitions needed
 * for the two index expressions.
 * For an original user statement, print the macro definitions
 * needed for the substitutions.
 */
static isl_bool at_node(__isl_keep isl_ast_node *node, void *user)
{
	const char *name;
	isl_id *id;
	int is_kernel;
	struct ppcg_kernel *kernel;
	struct ppcg_kernel_stmt *stmt;
	isl_printer **p = user;

	/* Keep descending until a user node is reached. */
	if (isl_ast_node_get_type(node) != isl_ast_node_user)
		return isl_bool_true;

	/* A user node without annotation requires no macros. */
	id = isl_ast_node_get_annotation(node);
	if (!id)
		return isl_bool_false;

	/* The annotation is owned by this function, so it needs
	 * to be released on the error path as well.
	 */
	name = isl_id_get_name(id);
	if (!name) {
		isl_id_free(id);
		return isl_bool_error;
	}
	/* An annotation called "kernel" carries a ppcg_kernel;
	 * any other annotation carries a ppcg_kernel_stmt.
	 */
	is_kernel = !strcmp(name, "kernel");
	kernel = is_kernel ? isl_id_get_user(id) : NULL;
	stmt = is_kernel ? NULL : isl_id_get_user(id);
	isl_id_free(id);

	if ((is_kernel && !kernel) || (!is_kernel && !stmt))
		return isl_bool_error;

	if (is_kernel) {
		*p = ppcg_ast_expr_print_macros(kernel->grid_size_expr, *p);
	} else if (stmt->type == ppcg_kernel_copy) {
		*p = ppcg_ast_expr_print_macros(stmt->u.c.index, *p);
		*p = ppcg_ast_expr_print_macros(stmt->u.c.local_index, *p);
	} else if (stmt->type == ppcg_kernel_domain) {
		*p = ppcg_print_body_macros(*p, stmt->u.d.ref2expr);
	}
	if (!*p)
		return isl_bool_error;

	/* Do not descend into user nodes. */
	return isl_bool_false;
}

/* Print on "p" the macros required by the GPU AST "node".
 * The macros needed by the user statements inside the AST are
 * handled first, followed by those of the AST itself.
 * If the traversal of the AST fails, then "p" is freed and
 * NULL is returned.
 */
__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p,
	__isl_keep isl_ast_node *node)
{
	if (isl_ast_node_foreach_descendant_top_down(node, &at_node, &p) >= 0)
		return ppcg_print_macros(p, node);

	return isl_printer_free(p);
}

/* Does the name of "type" appear among the names recorded in "types",
 * i.e., was the definition of "type" printed before?
 * Return 1 if so and 0 otherwise.
 */
static int already_printed(struct gpu_types *types,
	struct pet_type *type)
{
	int idx = 0;

	while (idx < types->n) {
		if (strcmp(types->name[idx], type->name) == 0)
			return 1;
		++idx;
	}

	return 0;
}

/* Print the definitions of all types in prog->scop that have not been
 * printed before (according to "types") on "p".
 * Extend the list of printed types "types" with the newly printed types.
 *
 * On allocation failure, "p" is freed and NULL is returned.
 */
__isl_give isl_printer *gpu_print_types(__isl_take isl_printer *p,
	struct gpu_types *types, struct gpu_prog *prog)
{
	int i, n;
	isl_ctx *ctx;
	char **name;

	n = prog->scop->pet->n_type;

	if (n == 0)
		return p;

	/* Make room for the case where every type still needs printing. */
	ctx = isl_printer_get_ctx(p);
	name = isl_realloc_array(ctx, types->name, char *, types->n + n);
	if (!name)
		return isl_printer_free(p);
	types->name = name;

	for (i = 0; i < n; ++i) {
		struct pet_type *type = prog->scop->pet->types[i];
		char *copy;

		if (already_printed(types, type))
			continue;

		/* Never record a NULL name: already_printed passes
		 * every recorded name to strcmp.
		 */
		copy = strdup(type->name);
		if (!copy)
			return isl_printer_free(p);

		p = isl_printer_start_line(p);
		p = isl_printer_print_str(p, type->definition);
		p = isl_printer_print_str(p, ";");
		p = isl_printer_end_line(p);

		types->name[types->n++] = copy;
	}

	return p;
}


================================================
FILE: src/ppcg_files/gpu_print.h
================================================
#ifndef GPU_PRINT_H
#define GPU_PRINT_H

#include "gpu.h"

__isl_give isl_printer *gpu_print_local_declarations(__isl_take isl_printer *p,
																										 struct gpu_prog *prog);

__isl_give isl_printer *gpu_print_types(__isl_take isl_printer *p,
																				struct gpu_types *types, struct gpu_prog *prog);

__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p,
																				 __isl_keep isl_ast_node *node);

__isl_give isl_printer *gpu_array_info_print_size(__isl_take isl_printer *prn,
																									struct gpu_array_info *array);
__isl_give isl_printer *gpu_array_info_print_declaration_argument(
		__isl_take isl_printer *p, struct gpu_array_info *array,
		const char *memory_space);
__isl_give isl_printer *gpu_array_info_print_call_argument(
		__isl_take isl_printer *p, struct gpu_array_info *array);

__isl_give isl_printer *ppcg_kernel_print_copy(__isl_take isl_printer *p,
																							 struct ppcg_kernel_stmt *stmt);
__isl_give isl_printer *ppcg_kernel_print_domain(__isl_take isl_printer *p,
																								 struct ppcg_kernel_stmt *stmt);

#endif


================================================
FILE: src/ppcg_files/gpu_tree.c
================================================
/*
 * Copyright 2013      Ecole Normale Superieure
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege,
 * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
 */

#include <string.h>

#include <isl/space.h>
#include <isl/set.h>
#include <isl/union_set.h>

#include "gpu_tree.h"

/* The functions in this file are used to navigate part of a schedule tree
 * that is mapped to blocks.  Initially, this part consists of a linear
 * branch segment with a mark node with name "kernel" on the outer end
 * and a mark node with name "thread" on the inner end.
 * During the mapping to blocks, branching may be introduced, but only
 * one of the elements in each sequence contains the "thread" mark.
 * The filter of this element (and only this filter) contains
 * domain elements identified by the "core" argument of the functions
 * that move down this tree.
 *
 * Synchronization statements have a name that starts with "sync" and
 * a user pointer pointing to the kernel that contains the synchronization.
 * The functions inserting or detecting synchronizations take a ppcg_kernel
 * argument to be able to create or identify such statements.
 * They may also use two fields in this structure, the "core" field
 * to move around in the tree and the "n_sync" field to make sure that
 * each synchronization has a different name (within the kernel).
 */

/* Is "node" a mark node with an identifier called "name"?
 *
 * Return 1 if so, 0 if not and -1 if "node" is NULL or
 * its identifier cannot be obtained.
 * A mark whose identifier has no name is not considered
 * to be called "name".
 */
static int is_marked(__isl_keep isl_schedule_node *node, const char *name)
{
	isl_id *mark;
	const char *mark_name;
	int has_name;

	if (!node)
		return -1;

	if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
		return 0;

	mark = isl_schedule_node_mark_get_id(node);
	if (!mark)
		return -1;

	/* The name may be NULL, which must not be passed to strcmp. */
	mark_name = isl_id_get_name(mark);
	has_name = mark_name && !strcmp(mark_name, name);
	isl_id_free(mark);

	return has_name;
}

/* Does "node" carry the "kernel" mark?
 * The result is that of is_marked(): 1 for yes, 0 for no and
 * a negative value on error.
 */
int gpu_tree_node_is_kernel(__isl_keep isl_schedule_node *node)
{
	int is_kernel;

	is_kernel = is_marked(node, "kernel");

	return is_kernel;
}

/* Does "node" carry the "shared" mark?
 * The result is that of is_marked(): 1 for yes, 0 for no and
 * a negative value on error.
 */
static int node_is_shared(__isl_keep isl_schedule_node *node)
{
	int is_shared;

	is_shared = is_marked(node, "shared");

	return is_shared;
}

/* Does "node" carry the "thread" mark?
 * The result is that of is_marked(): 1 for yes, 0 for no and
 * a negative value on error.
 */
static int node_is_thread(__isl_keep isl_schedule_node *node)
{
	int is_thread;

	is_thread = is_marked(node, "thread");

	return is_thread;
}

/* Put a mark node named "shared" (without user pointer) directly
 * above "node" and return the result of the insertion.
 */
static __isl_give isl_schedule_node *insert_shared(
	__isl_take isl_schedule_node *node)
{
	isl_id *shared_id;

	shared_id = isl_id_alloc(isl_schedule_node_get_ctx(node),
				"shared", NULL);

	return isl_schedule_node_insert_mark(node, shared_id);
}

/* Walk down from "node" to the "thread" mark and insert a "shared" mark
 * right in front of it, unless a "shared" mark was already encountered
 * on the way down.  Afterwards, move back up to the position
 * where the walk started.
 *
 * The walk only follows single children.  Reaching a leaf without
 * finding the "thread" mark or hitting a node with more than one child
 * is reported as an error, so this function also verifies that
 * the subtree at "node" holds a "thread" mark at the end of
 * a linear branch.
 */
__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread(
	__isl_take isl_schedule_node *node)
{
	int start_depth;
	int seen_shared = 0;

	if (!node)
		return NULL;

	start_depth = isl_schedule_node_get_tree_depth(node);

	while (1) {
		int at_thread;
		int n_child;

		/* Once a "shared" mark has been seen, stop looking for one. */
		if (seen_shared == 0) {
			seen_shared = node_is_shared(node);
			if (seen_shared < 0)
				goto error;
		}

		at_thread = node_is_thread(node);
		if (at_thread < 0)
			goto error;
		if (at_thread > 0)
			break;

		n_child = isl_schedule_node_n_children(node);
		if (n_child == 0)
			isl_die(isl_schedule_node_get_ctx(node),
				isl_error_invalid,
				"no thread marker found",
				goto error);
		if (n_child > 1)
			isl_die(isl_schedule_node_get_ctx(node),
				isl_error_invalid,
				"expecting single thread marker",
				goto error);

		node = isl_schedule_node_child(node, 0);
	}

	if (!seen_shared)
		node = insert_shared(node);

	/* Undo the descent (including the possibly inserted mark). */
	return isl_schedule_node_ancestor(node,
		isl_schedule_node_get_tree_depth(node) - start_depth);
error:
	return isl_schedule_node_free(node);
}

/* Given a filter node "node", check whether its filter shares
 * any element with "core", i.e., whether it is the branch
 * that leads to the "thread" mark.
 *
 * Return 1 if the filter intersects "core", 0 if it is disjoint
 * from "core" and -1 if the disjointness test fails.
 */
static int node_is_core(__isl_keep isl_schedule_node *node,
	__isl_keep isl_union_set *core)
{
	isl_union_set *selected;
	int no_overlap;

	selected = isl_schedule_node_filter_get_filter(node);
	no_overlap = isl_union_set_is_disjoint(selected, core);
	isl_union_set_free(selected);

	return no_overlap < 0 ? -1 : !no_overlap;
}

/* Step from "node" to the child on the branch that leads
 * to the "thread" mark, with that branch being recognized through
 * the domain elements in "core".
 *
 * A node that is not a sequence is simply replaced by its first child.
 * For a sequence, the filter children are inspected in order and
 * the child of the first filter that intersects "core" is returned.
 * If no such filter exists, an internal error is raised.
 */
static __isl_give isl_schedule_node *core_child(
	__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
{
	int pos, n_child;

	if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence)
		return isl_schedule_node_child(node, 0);

	n_child = isl_schedule_node_n_children(node);
	pos = 0;
	while (pos < n_child) {
		int matches;

		node = isl_schedule_node_child(node, pos);
		matches = node_is_core(node, core);
		if (matches < 0)
			return isl_schedule_node_free(node);
		if (matches)
			return isl_schedule_node_child(node, 0);

		/* Not the right filter; go back up to the sequence. */
		node = isl_schedule_node_parent(node);
		pos++;
	}

	isl_die(isl_schedule_node_get_ctx(node), isl_error_internal,
		"core child not found", return isl_schedule_node_free(node));
}

/* Descend from "node" along the branch identified by "core"
 * until a node with the "shared" mark is reached and return that node.
 * NULL is returned if the mark test fails along the way.
 */
__isl_give isl_schedule_node *gpu_tree_move_down_to_shared(
	__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
{
	for (;;) {
		int at_shared = node_is_shared(node);

		if (at_shared < 0)
			return isl_schedule_node_free(node);
		if (at_shared)
			return node;
		node = core_child(node, core);
	}
}

/* Descend from "node" along the branch identified by "core"
 * until a node with the "thread" mark is reached and return that node.
 * NULL is returned if the mark test fails along the way.
 */
__isl_give isl_schedule_node *gpu_tree_move_down_to_thread(
	__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
{
	for (;;) {
		int at_thread = node_is_thread(node);

		if (at_thread < 0)
			return isl_schedule_node_free(node);
		if (at_thread)
			return node;
		node = core_child(node, core);
	}
}

/* Starting from a node below the "thread" mark, climb towards
 * the root until the node carrying the "thread" mark is reached and
 * return that node.
 * NULL is returned if the mark test fails along the way.
 */
__isl_give isl_schedule_node *gpu_tree_move_up_to_thread(
	__isl_take isl_schedule_node *node)
{
	for (;;) {
		int at_thread = node_is_thread(node);

		if (at_thread < 0)
			return isl_schedule_node_free(node);
		if (at_thread)
			return node;
		node = isl_schedule_node_parent(node);
	}
}

/* Starting from a node below the "kernel" mark, climb towards
 * the root until the node carrying the "kernel" mark is reached and
 * return that node.
 * NULL is returned if the mark test fails along the way.
 */
__isl_give isl_schedule_node *gpu_tree_move_up_to_kernel(
	__isl_take isl_schedule_node *node)
{
	for (;;) {
		int at_kernel = gpu_tree_node_is_kernel(node);

		if (at_kernel < 0)
			return isl_schedule_node_free(node);
		if (at_kernel)
			return node;
		node = isl_schedule_node_parent(node);
	}
}

/* Descend from the "kernel" mark (or from any node with a schedule
 * depth that does not exceed "depth") to a band node at schedule
 * depth "depth".  The "thread" mark is assumed to lie at a schedule
 * depth of at least "depth" and the branch leading to it is identified
 * by the domain elements in "core".
 *
 * If "depth" falls in the middle of a band node, then that band
 * is split in two, with the second part starting at the desired
 * schedule depth.
 *
 * Once the desired depth has been reached, the descent continues
 * until a "shared" mark, a "thread" mark or a band node is found.
 */
__isl_give isl_schedule_node *gpu_tree_move_down_to_depth(
	__isl_take isl_schedule_node *node, int depth,
	__isl_keep isl_union_set *core)
{
	/* First phase: reach the requested schedule depth. */
	for (;;) {
		int cur_depth;

		if (!node)
			break;
		cur_depth = isl_schedule_node_get_schedule_depth(node);
		if (cur_depth >= depth)
			break;

		if (isl_schedule_node_get_type(node) ==
						    isl_schedule_node_band) {
			int n_member;

			n_member = isl_schedule_node_band_n_member(node);
			if (cur_depth + n_member > depth)
				node = isl_schedule_node_band_split(node,
							depth - cur_depth);
		}
		node = core_child(node, core);
	}

	/* Second phase: stop at the first mark or band node. */
	for (;;) {
		int mark;

		mark = node_is_shared(node);
		if (mark == 0)
			mark = node_is_thread(node);
		if (mark < 0)
			return isl_schedule_node_free(node);
		if (mark > 0)
			return node;
		if (isl_schedule_node_get_type(node) == isl_schedule_node_band)
			return node;
		node = core_child(node, core);
	}
}

/* Construct a union set holding one zero-dimensional universe set
 * whose tuple identifier is named "syncX", with X the current value
 * of kernel->n_sync, and has "kernel" as user pointer.
 * kernel->n_sync is incremented as a side effect.
 */
static __isl_give isl_union_set *create_sync_domain(struct ppcg_kernel *kernel)
{
	char name[40];
	isl_space *space;
	isl_id *id;
	isl_set *universe;

	space = isl_space_set_alloc(kernel->ctx, 0, 0);
	snprintf(name, sizeof(name), "sync%d", kernel->n_sync++);
	id = isl_id_alloc(kernel->ctx, name, kernel);
	space = isl_space_set_tuple_id(space, isl_dim_set, id);
	universe = isl_set_universe(space);

	return isl_union_set_from_set(universe);
}

/* Does "id" identify a synchronization statement of "kernel"?
 * This is the case if the name of "id" begins with "sync" and
 * its user pointer is equal to "kernel".
 * An identifier without a name is never a synchronization.
 */
int gpu_tree_id_is_sync(__isl_keep isl_id *id, struct ppcg_kernel *kernel)
{
	const char *name = isl_id_get_name(id);

	if (!name || strncmp(name, "sync", 4) != 0)
		return 0;

	return isl_id_get_user(id) == kernel;
}

/* Is "domain" made up of exactly one set and does the tuple identifier
 * of that set mark it as a synchronization for "kernel"?
 */
static int domain_is_sync(__isl_keep isl_union_set *domain,
	struct ppcg_kernel *kernel)
{
	isl_set *single;
	isl_id *tuple_id;
	int result;

	if (isl_union_set_n_set(domain) != 1)
		return 0;

	single = isl_set_from_union_set(isl_union_set_copy(domain));
	tuple_id = isl_set_get_tuple_id(single);
	/* "tuple_id" is a reference of its own, so "single" can go. */
	isl_set_free(single);
	result = gpu_tree_id_is_sync(tuple_id, kernel);
	isl_id_free(tuple_id);

	return result;
}

/* Is "node" a filter node whose filter selects a synchronization
 * statement of "kernel"?
 *
 * Return -1 if "node" is NULL and 0 if it is not a filter node.
 */
static int node_is_sync_filter(__isl_keep isl_schedule_node *node,
	struct ppcg_kernel *kernel)
{
	isl_union_set *filter;
	int result;

	if (!node)
		return -1;
	if (isl_schedule_node_get_type(node) != isl_schedule_node_filter)
		return 0;

	filter = isl_schedule_node_filter_get_filter(node);
	result = domain_is_sync(filter, kernel);
	isl_union_set_free(filter);

	return result;
}

/* Does the sequence containing "node" have an earlier element
 * that is a synchronization statement of "kernel"?
 * In other words, starting from the parent of "node" (assumed to be
 * a filter), is any of the previous siblings a filter that selects
 * such a synchronization statement?
 *
 * Return 1 if so, 0 if not and -1 on error.
 * "node" itself is left untouched; the search works on a copy.
 */
static int has_preceding_sync(__isl_keep isl_schedule_node *node,
	struct ppcg_kernel *kernel)
{
	isl_schedule_node *pos;
	int found = 0;

	pos = isl_schedule_node_parent(isl_schedule_node_copy(node));
	for (;;) {
		if (found)
			break;
		if (!isl_schedule_node_has_previous_sibling(pos))
			break;
		pos = isl_schedule_node_previous_sibling(pos);
		if (!pos)
			break;
		found = node_is_sync_filter(pos, kernel);
	}
	if (!pos)
		found = -1;
	isl_schedule_node_free(pos);

	return found;
}

/* Does the sequence containing "node" have a later element
 * that is a synchronization statement of "kernel"?
 * In other words, starting from the parent of "node" (assumed to be
 * a filter), is any of the next siblings a filter that selects
 * such a synchronization statement?
 *
 * Return 1 if so, 0 if not and -1 on error.
 * "node" itself is left untouched; the search works on a copy.
 */
static int has_following_sync(__isl_keep isl_schedule_node *node,
	struct ppcg_kernel *kernel)
{
	isl_schedule_node *pos;
	int found = 0;

	pos = isl_schedule_node_parent(isl_schedule_node_copy(node));
	for (;;) {
		if (found)
			break;
		if (!isl_schedule_node_has_next_sibling(pos))
			break;
		pos = isl_schedule_node_next_sibling(pos);
		if (!pos)
			break;
		found = node_is_sync_filter(pos, kernel);
	}
	if (!pos)
		found = -1;
	isl_schedule_node_free(pos);

	return found;
}

/* Is there a synchronization statement for "kernel" inside
 * the subtree rooted at the band node "node" that comes before
 * the core computation of "kernel" (the elements of kernel->core)?
 *
 * The branch towards the "thread" mark is followed one step at a time
 * and, after each step, the preceding siblings are searched
 * for a synchronization.
 * Return 1 if one is found, 0 if the "thread" mark is reached
 * without finding any and -1 on error.
 */
static int has_sync_before_core(__isl_keep isl_schedule_node *node,
	struct ppcg_kernel *kernel)
{
	isl_schedule_node *pos;
	int found = 0;
	int at_thread;

	pos = isl_schedule_node_copy(node);
	for (;;) {
		at_thread = node_is_thread(pos);
		if (at_thread != 0)
			break;
		pos = core_child(pos, kernel->core);
		found = has_preceding_sync(pos, kernel);
		if (found != 0)
			break;
	}
	if (at_thread < 0 || !pos)
		found = -1;
	isl_schedule_node_free(pos);

	return found;
}

/* Is there a synchronization statement for "kernel" inside
 * the subtree rooted at the band node "node" that comes after
 * the core computation of "kernel" (the elements of kernel->core)?
 *
 * The branch towards the "thread" mark is followed one step at a time
 * and, after each step, the following siblings are searched
 * for a synchronization.
 * Return 1 if one is found, 0 if the "thread" mark is reached
 * without finding any and -1 on error.
 */
static int has_sync_after_core(__isl_keep isl_schedule_node *node,
	struct ppcg_kernel *kernel)
{
	isl_schedule_node *pos;
	int found = 0;
	int at_thread;

	pos = isl_schedule_node_copy(node);
	for (;;) {
		at_thread = node_is_thread(pos);
		if (at_thread != 0)
			break;
		pos = core_child(pos, kernel->core);
		found = has_following_sync(pos, kernel);
		if (found != 0)
			break;
	}
	if (at_thread < 0 || !pos)
		found = -1;
	isl_schedule_node_free(pos);

	return found;
}

/* Graft a freshly created synchronization statement for "kernel"
 * in front of "node", inserting or extending an extension
 * on top of "node" in the process.
 * Return a pointer to the original node in the updated schedule tree.
 */
static __isl_give isl_schedule_node *insert_sync_before(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
{
	isl_schedule_node *sync;

	if (!node)
		return NULL;

	sync = isl_schedule_node_from_domain(create_sync_domain(kernel));

	return isl_schedule_node_graft_before(node, sync);
}

/* Graft a freshly created synchronization statement for "kernel"
 * after "node", inserting or extending an extension
 * on top of "node" in the process.
 * Return a pointer to the original node in the updated schedule tree.
 */
static __isl_give isl_schedule_node *insert_sync_after(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
{
	isl_schedule_node *sync;

	if (!node)
		return NULL;

	sync = isl_schedule_node_from_domain(create_sync_domain(kernel));

	return isl_schedule_node_graft_after(node, sync);
}

/* Make sure "node" is preceded by a synchronization node for "kernel".
 * If the sequence containing "node" already has such a node
 * before "node", then "node" is returned unchanged.
 * Otherwise, a new synchronization is grafted in front of "node".
 */
__isl_give isl_schedule_node *gpu_tree_ensure_preceding_sync(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
{
	int present = has_preceding_sync(node, kernel);

	if (present < 0)
		return isl_schedule_node_free(node);

	return present ? node : insert_sync_before(node, kernel);
}

/* Make sure "node" is followed by a synchronization node for "kernel".
 * If the sequence containing "node" already has such a node
 * after "node", then "node" is returned unchanged.
 * Otherwise, a new synchronization is grafted after "node".
 */
__isl_give isl_schedule_node *gpu_tree_ensure_following_sync(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
{
	int present = has_following_sync(node, kernel);

	if (present < 0)
		return isl_schedule_node_free(node);

	return present ? node : insert_sync_after(node, kernel);
}

/* Make sure the core computation of "kernel" is followed
 * by a synchronization node.
 * Nothing is changed if "node" itself already contains a synchronization
 * node following the core computation or if "node" is already
 * followed by a synchronization node in its sequence.
 * Otherwise, a new synchronization is grafted after "node".
 */
__isl_give isl_schedule_node *gpu_tree_ensure_sync_after_core(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
{
	int present;

	/* Look inside "node" first and only then at its siblings. */
	present = has_sync_after_core(node, kernel);
	if (present == 0)
		present = has_following_sync(node, kernel);

	if (present < 0)
		return isl_schedule_node_free(node);
	if (present > 0)
		return node;

	return insert_sync_after(node, kernel);
}

/* Move to a synchronization node for "kernel" that lies to the left
 * of "node" in the sequence on top of "node".
 * If "node" itself already contains a synchronization node preceding
 * the core computation of "kernel", then "node" is returned as is.
 * Otherwise, a preceding synchronization node is created if there
 * is none yet and the child of the closest preceding synchronization
 * filter is returned.
 */
__isl_give isl_schedule_node *gpu_tree_move_left_to_sync(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
{
	int status;

	status = has_sync_before_core(node, kernel);
	if (status < 0)
		return isl_schedule_node_free(node);
	if (status > 0)
		return node;

	node = gpu_tree_ensure_preceding_sync(node, kernel);
	node = isl_schedule_node_parent(node);
	for (;;) {
		status = node_is_sync_filter(node, kernel);
		if (status != 0)
			break;
		node = isl_schedule_node_previous_sibling(node);
	}
	if (status < 0)
		node = isl_schedule_node_free(node);

	return isl_schedule_node_child(node, 0);
}

/* Move to a synchronization node for "kernel" that lies to the right
 * of "node" in the sequence on top of "node".
 * If "node" itself already contains a synchronization node following
 * the core computation of "kernel", then "node" is returned as is.
 * Otherwise, a following synchronization node is created if there
 * is none yet and the child of the closest following synchronization
 * filter is returned.
 */
__isl_give isl_schedule_node *gpu_tree_move_right_to_sync(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
{
	int status;

	status = has_sync_after_core(node, kernel);
	if (status < 0)
		return isl_schedule_node_free(node);
	if (status > 0)
		return node;

	node = gpu_tree_ensure_following_sync(node, kernel);
	node = isl_schedule_node_parent(node);
	for (;;) {
		status = node_is_sync_filter(node, kernel);
		if (status != 0)
			break;
		node = isl_schedule_node_next_sibling(node);
	}
	if (status < 0)
		node = isl_schedule_node_free(node);

	return isl_schedule_node_child(node, 0);
}


================================================
FILE: src/ppcg_files/gpu_tree.h
================================================
#ifndef GPU_TREE_H
#define GPU_TREE_H

#include <isl/schedule_node.h>

#include "gpu.h"

/* Helpers for navigating the part of a schedule tree that lies between
 * a "kernel" mark and a "thread" mark and for placing synchronization
 * statements in it.  The "core" arguments are the domain elements
 * that identify the branch containing the "thread" mark.
 * All functions returning a node return NULL on error.
 */

/* Insert a "shared" mark in front of the "thread" mark below "node",
 * unless one is already present on the way down, and return to
 * the original position.
 */
__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread(
		__isl_take isl_schedule_node *node);
/* Is "node" a mark node called "kernel"?  Negative on error. */
int gpu_tree_node_is_kernel(__isl_keep isl_schedule_node *node);
/* Move down the "core" branch to the "shared" mark. */
__isl_give isl_schedule_node *gpu_tree_move_down_to_shared(
		__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core);
/* Move up to the "thread" mark. */
__isl_give isl_schedule_node *gpu_tree_move_up_to_thread(
		__isl_take isl_schedule_node *node);
/* Move down the "core" branch to the "thread" mark. */
__isl_give isl_schedule_node *gpu_tree_move_down_to_thread(
		__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core);
/* Move up to the "kernel" mark. */
__isl_give isl_schedule_node *gpu_tree_move_up_to_kernel(
		__isl_take isl_schedule_node *node);
/* Move down the "core" branch to schedule depth "depth",
 * splitting a band node if needed.
 */
__isl_give isl_schedule_node *gpu_tree_move_down_to_depth(
		__isl_take isl_schedule_node *node, int depth,
		__isl_keep isl_union_set *core);

/* Does "id" have a name starting with "sync" and "kernel" as user pointer? */
int gpu_tree_id_is_sync(__isl_keep isl_id *id, struct ppcg_kernel *kernel);
/* Ensure a synchronization follows the core computation of "kernel",
 * either inside "node" or right after it.
 */
__isl_give isl_schedule_node *gpu_tree_ensure_sync_after_core(
		__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);
/* Ensure "node" is preceded by a synchronization for "kernel". */
__isl_give isl_schedule_node *gpu_tree_ensure_preceding_sync(
		__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);
/* Ensure "node" is followed by a synchronization for "kernel". */
__isl_give isl_schedule_node *gpu_tree_ensure_following_sync(
		__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);
/* Move to a synchronization on the left of "node", creating one if needed. */
__isl_give isl_schedule_node *gpu_tree_move_left_to_sync(
		__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);
/* Move to a synchronization on the right of "node", creating one if needed. */
__isl_give isl_schedule_node *gpu_tree_move_right_to_sync(
		__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);

#endif


================================================
FILE: src/ppcg_files/opencl.c
================================================
/*
 * Copyright 2013      Ecole Normale Superieure
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege and Riyadh Baghdadi,
 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
 */

#include <ctype.h>
#include <limits.h>
#include <string.h>

#include <isl/aff.h>
#include <isl/ast.h>

#include "opencl.h"
#include "gpu_print.h"
#include "gpu.h"
#include "ppcg.h"
#include "print.h"
#include "schedule.h"
#include "util.h"

/* Minimum and maximum of two values.
 * NOTE: the selected argument is evaluated twice, so do not pass
 * expressions with side effects.
 */
#define min(a, b)  (((a) < (b)) ? (a) : (b))
#define max(a, b)  (((a) > (b)) ? (a) : (b))

/* State shared by the OpenCL code generation routines in this file.
 *
 * options are the global options passed to generate_opencl.
 * input is the name of the input file.  It is used to derive the names
 *	of the generated files when no output file name was given.
 * output is the user-specified output file name and may be NULL
 *	if not specified by the user.  If set, it is the name
 *	of the host file.
 * kernel_c_name is the name of the kernel_c file, i.e., the base name
 *	of the output followed by "_kernel.cl".
 * kprinter is an isl_printer for the kernel file.  The kernel code
 *	is accumulated in this printer and only written to kernel_c
 *	when the files are closed.
 * host_c is the generated source file for the host code.  kernel_c is
 * the generated source file for the kernel.  Either may be NULL
 * if the corresponding file could not be opened.
 */
struct opencl_info {
	struct ppcg_options *options;
	const char *input;
	const char *output;
	char kernel_c_name[PATH_MAX];

	isl_printer *kprinter;

	FILE *host_c;
	FILE *kernel_c;
};

/* Try to open the file called "name" for writing.
 * On failure, complain on stderr and return NULL.
 */
static FILE *open_or_croak(const char *name)
{
	FILE *out = fopen(name, "w");

	if (out)
		return out;

	fprintf(stderr, "Failed to open \"%s\" for writing\n", name);
	return NULL;
}

/* Open the host .c file and the kernel .cl file for writing.
 * Their names are derived from info->output (or info->input if
 * the user did not specify an output file name):
 * - if info->output is set, then it is the name of the host file and
 *   the kernel file is called "<output without extension>_kernel.cl";
 * - otherwise, the files are called "<base>_host.c" and
 *   "<base>_kernel.cl", with <base> extracted from info->input.
 * Add the necessary includes to the host file and add the includes
 * specified by the user to the kernel printer.
 *
 * The file names are constructed in buffers of size PATH_MAX.
 * A base name that would not fit is rejected instead of being
 * written beyond the end of these buffers.
 *
 * Return 0 on success and -1 on failure.  On failure, info->host_c
 * and info->kernel_c are either NULL or refer to an open file,
 * so that they can safely be handed to the cleanup code.
 */
static int opencl_open_files(struct opencl_info *info)
{
	static const char host_suffix[] = "_host.c";
	static const char kernel_suffix[] = "_kernel.cl";
	char name[PATH_MAX];
	int i;
	int len;

	info->host_c = NULL;
	info->kernel_c = NULL;

	if (info->output) {
		const char *ext;
		size_t base_len;

		ext = strrchr(info->output, '.');
		base_len = ext ? (size_t) (ext - info->output) :
				 strlen(info->output);
		/* The kernel suffix is the longest one that gets appended. */
		if (base_len + sizeof(kernel_suffix) > sizeof(name)) {
			fprintf(stderr, "Output file name \"%s\" is too long\n",
				info->output);
			return -1;
		}
		len = base_len;
		memcpy(name, info->output, len);

		info->host_c = open_or_croak(info->output);
	} else {
		len = ppcg_extract_base_name(name, info->input);
		if (len < 0 ||
		    (size_t) len + sizeof(kernel_suffix) > sizeof(name)) {
			fprintf(stderr, "Input file name \"%s\" is too long\n",
				info->input);
			return -1;
		}

		strcpy(name + len, host_suffix);
		info->host_c = open_or_croak(name);
	}

	memcpy(info->kernel_c_name, name, len);
	strcpy(info->kernel_c_name + len, kernel_suffix);
	info->kernel_c = open_or_croak(info->kernel_c_name);

	if (!info->host_c || !info->kernel_c)
		return -1;

	fprintf(info->host_c, "#include <assert.h>\n");
	fprintf(info->host_c, "#include <stdio.h>\n");
	fprintf(info->host_c, "#include \"ocl_utilities.h\"\n");
	if (info->options->opencl_embed_kernel_code) {
		fprintf(info->host_c, "#include \"%s\"\n\n",
			info->kernel_c_name);
	}

	for (i = 0; i < info->options->opencl_n_include_file; ++i) {
		info->kprinter = isl_printer_print_str(info->kprinter,
					"#include <");
		info->kprinter = isl_printer_print_str(info->kprinter,
					info->options->opencl_include_files[i]);
		info->kprinter = isl_printer_print_str(info->kprinter, ">\n");
	}

	return 0;
}

/* Write the characters in the range ["str", "end") to "file",
 * putting a backslash in front of every double quote and every backslash
 * so that the output can appear inside a C string literal.
 */
static void opencl_print_escaped(const char *str, const char *end, FILE *file)
{
	const char *chunk = str;
	const char *special;

	for (;;) {
		special = strpbrk(chunk, "\"\\");
		if (!special || special >= end)
			break;
		/* Copy the plain text, then the escaped character. */
		fwrite(chunk, 1, special - chunk, file);
		fputc('\\', file);
		fputc(*special, file);
		chunk = special + 1;
	}

	if (*chunk != '\0')
		fwrite(chunk, 1, end - chunk, file);
}

/* Write "str" to "file" as a sequence of C string literals,
 * one per input line.  Each literal is placed on a new output line and,
 * for lines terminated by a newline, ends in an escaped newline.
 *
 * Any text following the last newline is printed as a final literal
 * without escaped newline, although the input is normally expected
 * to end with a newline.
 */
static void opencl_print_as_c_string(const char *str, FILE *file)
{
	const char *line = str;
	const char *eol;

	for (eol = strchr(line, '\n'); eol; eol = strchr(line, '\n')) {
		fputs("\n\"", file);
		opencl_print_escaped(line, eol, file);
		fputs("\\n\"", file);

		line = eol + 1;
	}

	if (*line == '\0')
		return;

	fputs("\n\"", file);
	opencl_print_escaped(line, line + strlen(line), file);
	fputs("\"", file);
}

/* Flush the code collected in the kernel isl_printer to the kernel file.
 * If the opencl_embed_kernel_code option is set, then the code is
 * written as the C string literal initializing "kernel_code".
 * That literal starts with an empty line such that line numbers
 * reported by the OpenCL C compiler match those of the kernel file.
 * Otherwise, the code is written out verbatim.
 *
 * Return 0 on success and -1 if the printer contents cannot be obtained.
 */
static int opencl_write_kernel_file(struct opencl_info *opencl)
{
	char *code;

	code = isl_printer_get_str(opencl->kprinter);
	if (code == NULL)
		return -1;

	if (!opencl->options->opencl_embed_kernel_code) {
		fprintf(opencl->kernel_c, "%s", code);
	} else {
		fprintf(opencl->kernel_c,
			"static const char kernel_code[] = \"\\n\"");
		opencl_print_as_c_string(code, opencl->kernel_c);
		fprintf(opencl->kernel_c, ";\n");
	}

	free(code);

	return 0;
}

/* Close all output files.  Write the kernel contents to the kernel file before
 * closing it.
 *
 * Closing a file that was opened for writing flushes the buffered output,
 * so a failing fclose() means that part of the generated code may be
 * missing.  Such a failure is therefore reported to the caller.
 * The file pointers are reset to NULL once the files have been closed.
 *
 * Return 0 on success and -1 on failure.
 */
static int opencl_close_files(struct opencl_info *info)
{
	int r = 0;

	if (info->kernel_c) {
		if (opencl_write_kernel_file(info) < 0)
			r = -1;
		if (fclose(info->kernel_c) != 0) {
			fprintf(stderr, "Failed to close \"%s\"\n",
				info->kernel_c_name);
			r = -1;
		}
		info->kernel_c = NULL;
	}
	if (info->host_c) {
		if (fclose(info->host_c) != 0) {
			fprintf(stderr, "Failed to close host file\n");
			r = -1;
		}
		info->host_c = NULL;
	}

	return r;
}

/* Print the definition of the openclCheckReturn macro used by
 * the generated host code.  The macro prints the OpenCL error string
 * and asserts if its argument is different from CL_SUCCESS.
 */
static __isl_give isl_printer *opencl_print_host_macros(
	__isl_take isl_printer *p)
{
	static const char macros[] =
		"#define openclCheckReturn(ret) \\\n"
		"  if (ret != CL_SUCCESS) {\\\n"
		"    fprintf(stderr, \"OpenCL error: %s\\n\", "
		"opencl_error_string(ret)); \\\n"
		"    fflush(stderr); \\\n"
		"    assert(ret == CL_SUCCESS);\\\n"
		"  }\n";

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, macros);

	return isl_printer_end_line(p);
}

/* Print a "cl_mem dev_<name>;" declaration for every array of "prog"
 * that requires device allocation, followed by an empty line.
 */
static __isl_give isl_printer *opencl_declare_device_arrays(
	__isl_take isl_printer *p, struct gpu_prog *prog)
{
	int i;

	for (i = 0; i < prog->n_array; ++i) {
		struct gpu_array_info *array = &prog->array[i];

		if (gpu_array_requires_device_allocation(array)) {
			p = isl_printer_start_line(p);
			p = isl_printer_print_str(p, "cl_mem dev_");
			p = isl_printer_print_str(p, array->name);
			p = isl_printer_print_str(p, ";");
			p = isl_printer_end_line(p);
		}
	}

	p = isl_printer_start_line(p);
	return isl_printer_end_line(p);
}

/* Is the positive size guard of "array" obviously the universe set,
 * i.e., is the guard expression trivial?
 */
static int is_array_positive_size_guard_trivial(struct gpu_array_info *array)
{
	isl_set *size_guard = gpu_array_positive_size_guard(array);
	int trivial = isl_set_plain_is_universe(size_guard);

	isl_set_free(size_guard);

	return trivial;
}

/* Print the code allocating a device array for "array".
 *
 * The buffer is created through clCreateBuffer and the returned
 * error code is checked by openclCheckReturn.
 * If the positive size guard of "array" is not trivial, then the size
 * is wrapped in a max-expression so that the device array can hold
 * at least one element.
 */
static __isl_give isl_printer *allocate_device_array(__isl_take isl_printer *p,
	struct gpu_array_info *array)
{
	int clamp_size;

	clamp_size = !is_array_positive_size_guard_trivial(array);
	if (clamp_size)
		p = ppcg_print_macro(isl_ast_op_max, p);

	p = ppcg_ast_expr_print_macros(array->bound_expr, p);
	p = ppcg_start_block(p);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "dev_");
	p = isl_printer_print_str(p, array->name);
	p = isl_printer_print_str(p,
		" = clCreateBuffer(context, CL_MEM_READ_WRITE, ");

	if (clamp_size) {
		p = isl_printer_print_str(p, ppcg_max);
		p = isl_printer_print_str(p, "(sizeof(");
		p = isl_printer_print_str(p, array->type);
		p = isl_printer_print_str(p, "), ");
		p = gpu_array_info_print_size(p, array);
		p = isl_printer_print_str(p, ")");
	} else {
		p = gpu_array_info_print_size(p, array);
	}

	p = isl_printer_print_str(p, ", NULL, &err);");
	p = isl_printer_end_line(p);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "openclCheckReturn(err);");
	p = isl_printer_end_line(p);

	return ppcg_end_block(p);
}

/* Print the allocation code for every array of "prog" that requires
 * device allocation, followed by an empty line.
 */
static __isl_give isl_printer *opencl_allocate_device_arrays(
	__isl_take isl_printer *p, struct gpu_prog *prog)
{
	int i;

	for (i = 0; i < prog->n_array; ++i) {
		struct gpu_array_info *array = &prog->array[i];

		if (gpu_array_requires_device_allocation(array))
			p = allocate_device_array(p, array);
	}

	p = isl_printer_start_line(p);
	return isl_printer_end_line(p);
}

/* Print a checked clReleaseMemObject call that frees
 * the device array corresponding to "array".
 */
static __isl_give isl_printer *release_device_array(__isl_take isl_printer *p,
	struct gpu_array_info *array)
{
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p,
		"openclCheckReturn(clReleaseMemObject(dev_");
	p = isl_printer_print_str(p, array->name);
	p = isl_printer_print_str(p, "));");

	return isl_printer_end_line(p);
}

/* Print the code releasing every array of "prog" that requires
 * device allocation.
 */
static __isl_give isl_printer *opencl_release_device_arrays(
	__isl_take isl_printer *p, struct gpu_prog *prog)
{
	int i;

	for (i = 0; i < prog->n_array; ++i) {
		struct gpu_array_info *array = &prog->array[i];

		if (gpu_array_requires_device_allocation(array))
			p = release_device_array(p, array);
	}

	return p;
}

/* Print "str" on a line of its own.
 */
static __isl_give isl_printer *opencl_setup_print_line(
	__isl_take isl_printer *p, const char *str)
{
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, str);

	return isl_printer_end_line(p);
}

/* Print the host code that declares and creates the OpenCL device,
 * context and command queue and that builds the kernel program.
 * The program is built either from the embedded "kernel_code" string or
 * from the kernel file, depending on the opencl_embed_kernel_code option,
 * with the user-specified compiler options (if any) passed along.
 * input is the name of the input file provided to ppcg.
 */
static __isl_give isl_printer *opencl_setup(__isl_take isl_printer *p,
	const char *input, struct opencl_info *info)
{
	static const char *const declarations[] = {
		"cl_device_id device;",
		"cl_context context;",
		"cl_program program;",
		"cl_command_queue queue;",
		"cl_int err;",
	};
	int n_decl = sizeof(declarations) / sizeof(declarations[0]);
	int i;

	for (i = 0; i < n_decl; ++i)
		p = opencl_setup_print_line(p, declarations[i]);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "device = opencl_create_device(");
	p = isl_printer_print_int(p, info->options->opencl_use_gpu);
	p = isl_printer_print_str(p, ");");
	p = isl_printer_end_line(p);

	p = opencl_setup_print_line(p,
		"context = clCreateContext(NULL, 1, &device, "
		"NULL, NULL, &err);");
	p = opencl_setup_print_line(p, "openclCheckReturn(err);");
	p = opencl_setup_print_line(p,
		"queue = clCreateCommandQueue(context, device, 0, &err);");
	p = opencl_setup_print_line(p, "openclCheckReturn(err);");

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "program = ");

	if (!info->options->opencl_embed_kernel_code) {
		p = isl_printer_print_str(p,
			"opencl_build_program_from_file(context, device, \"");
		p = isl_printer_print_str(p, info->kernel_c_name);
		p = isl_printer_print_str(p, "\", \"");
	} else {
		p = isl_printer_print_str(p,
			"opencl_build_program_from_string(context, device, "
			"kernel_code, sizeof(kernel_code), \"");
	}

	if (info->options->opencl_compiler_options)
		p = isl_printer_print_str(p,
					info->options->opencl_compiler_options);

	p = isl_printer_print_str(p, "\");");
	p = isl_printer_end_line(p);

	/* Separate the setup code from what follows by an empty line. */
	p = isl_printer_start_line(p);
	return isl_printer_end_line(p);
}

/* Print the host code that releases the command queue, the program and
 * the context, in that order, each through a checked call.
 */
static __isl_give isl_printer *opencl_release_cl_objects(
	__isl_take isl_printer *p, struct opencl_info *info)
{
	static const char *const releases[] = {
		"openclCheckReturn(clReleaseCommandQueue(queue));",
		"openclCheckReturn(clReleaseProgram(program));",
		"openclCheckReturn(clReleaseContext(context));",
	};
	int n_release = sizeof(releases) / sizeof(releases[0]);
	int i;

	for (i = 0; i < n_release; ++i) {
		p = isl_printer_start_line(p);
		p = isl_printer_print_str(p, releases[i]);
		p = isl_printer_end_line(p);
	}

	return p;
}

/* Print a checked call to the OpenCL clSetKernelArg() function
 * that sets one argument of kernel "kernel_id".
 * arg_name is the name of the kernel argument and arg_index is
 * its position, with 0 for the leftmost argument and n - 1
 * for the rightmost one, n being the total number of kernel arguments.
 * read_only_scalar indicates whether the argument is a read-only scalar.
 * If so, the host variable itself is passed.  Otherwise, the
 * corresponding "dev_" device memory object is passed.
 */
static __isl_give isl_printer *opencl_set_kernel_argument(
	__isl_take isl_printer *p, int kernel_id,
	const char *arg_name, int arg_index, int read_only_scalar)
{
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p,
		"openclCheckReturn(clSetKernelArg(kernel");
	p = isl_printer_print_int(p, kernel_id);
	p = isl_printer_print_str(p, ", ");
	p = isl_printer_print_int(p, arg_index);
	p = isl_printer_print_str(p, ", sizeof(");

	if (!read_only_scalar) {
		p = isl_printer_print_str(p, "cl_mem), (void *) &dev_");
	} else {
		p = isl_printer_print_str(p, arg_name);
		p = isl_printer_print_str(p, "), &");
	}

	p = isl_printer_print_str(p, arg_name);
	p = isl_printer_print_str(p, "));");

	return isl_printer_end_line(p);
}

/* Print the block sizes of "kernel" as a comma separated list
 * with one entry per block dimension.
 * If the kernel has no block dimensions, then a single "1" is printed.
 */
static __isl_give isl_printer *opencl_print_block_sizes(
	__isl_take isl_printer *p, struct ppcg_kernel *kernel)
{
	int i;

	if (kernel->n_block <= 0)
		return isl_printer_print_str(p, "1");

	for (i = 0; i < kernel->n_block; ++i) {
		if (i > 0)
			p = isl_printer_print_str(p, ", ");
		p = isl_printer_print_int(p, kernel->block_dim[i]);
	}

	return p;
}

/* Set the arguments of the OpenCL kernel by printing a call to the OpenCL
 * clSetKernelArg() function for each kernel argument.
 *
 * The arguments are numbered consecutively, starting from 0,
 * in the following order
 * - the arrays of "prog" that are required by the kernel
 * - the parameters of the space of kernel->arrays
 * - the set dimensions of kernel->space
 *
 * Return the updated printer.  If it cannot be determined whether
 * an array is required by the kernel, then "p" is freed and
 * NULL is returned.
 */
static __isl_give isl_printer *opencl_set_kernel_arguments(
	__isl_take isl_printer *p, struct gpu_prog *prog,
	struct ppcg_kernel *kernel)
{
	int i, n, ro;
	unsigned nparam;
	isl_space *space;
	int arg_index = 0;

	for (i = 0; i < prog->n_array; ++i) {
		int required;

		required = ppcg_kernel_requires_array_argument(kernel, i);
		if (required < 0)
			return isl_printer_free(p);
		if (!required)
			continue;
		ro = gpu_array_is_read_only_scalar(&prog->array[i]);
		/* The printer is taken by the callee, so the returned
		 * printer has to be used from here on.
		 */
		p = opencl_set_kernel_argument(p, kernel->id,
			prog->array[i].name, arg_index, ro);
		arg_index++;
	}

	space = isl_union_set_get_space(kernel->arrays);
	nparam = isl_space_dim(space, isl_dim_param);
	for (i = 0; i < nparam; ++i) {
		const char *name;

		name = isl_space_get_dim_name(space, isl_dim_param, i);
		p = opencl_set_kernel_argument(p, kernel->id, name,
			arg_index, 1);
		arg_index++;
	}
	isl_space_free(space);

	n = isl_space_dim(kernel->space, isl_dim_set);
	for (i = 0; i < n; ++i) {
		const char *name;

		name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
		p = opencl_set_kernel_argument(p, kernel->id, name,
			arg_index, 1);
		arg_index++;
	}

	return p;
}

/* Print the comma separated arguments of a kernel declaration
 * (if "types" is set, in which case each argument is preceded
 * by its type) or of a kernel call (if "types" is not set).
 *
 * The arguments appear in the following order
 * - the arrays accessed by the kernel
 * - the parameters
 * - the host loop iterators
 *
 * If it cannot be determined whether an array is required by the kernel,
 * then "p" is freed and NULL is returned.
 */
static __isl_give isl_printer *opencl_print_kernel_arguments(
	__isl_take isl_printer *p, struct gpu_prog *prog,
	struct ppcg_kernel *kernel, int types)
{
	int pos, n_iter;
	int n_printed = 0;
	unsigned n_param;
	isl_space *arrays_space;
	const char *iter_type;

	/* Arrays. */
	for (pos = 0; pos < prog->n_array; ++pos) {
		int needed;

		needed = ppcg_kernel_requires_array_argument(kernel, pos);
		if (needed < 0)
			return isl_printer_free(p);
		if (!needed)
			continue;

		if (n_printed++)
			p = isl_printer_print_str(p, ", ");

		p = types ?
			gpu_array_info_print_declaration_argument(p,
				&prog->array[pos], "__global") :
			gpu_array_info_print_call_argument(p,
				&prog->array[pos]);
	}

	/* Parameters, declared as "int". */
	arrays_space = isl_union_set_get_space(kernel->arrays);
	n_param = isl_space_dim(arrays_space, isl_dim_param);
	for (pos = 0; pos < n_param; ++pos) {
		const char *param;

		param = isl_space_get_dim_name(arrays_space,
						isl_dim_param, pos);

		if (n_printed++)
			p = isl_printer_print_str(p, ", ");
		if (types)
			p = isl_printer_print_str(p, "int ");
		p = isl_printer_print_str(p, param);
	}
	isl_space_free(arrays_space);

	/* Host loop iterators, declared with the AST iterator type. */
	n_iter = isl_space_dim(kernel->space, isl_dim_set);
	iter_type = isl_options_get_ast_iterator_type(prog->ctx);
	for (pos = 0; pos < n_iter; ++pos) {
		const char *iter;

		if (n_printed++)
			p = isl_printer_print_str(p, ", ");
		iter = isl_space_get_dim_name(kernel->space, isl_dim_set, pos);
		if (types) {
			p = isl_printer_print_str(p, iter_type);
			p = isl_printer_print_str(p, " ");
		}
		p = isl_printer_print_str(p, iter);
	}

	return p;
}

/* Print the header of "kernel" on a line of its own, i.e.,
 *
 *	__kernel void kernel<id>(<typed arguments>)
 */
static __isl_give isl_printer *opencl_print_kernel_header(
	__isl_take isl_printer *p, struct gpu_prog *prog,
	struct ppcg_kernel *kernel)
{
	const int with_types = 1;

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "__kernel void kernel");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, "(");
	p = opencl_print_kernel_arguments(p, prog, kernel, with_types);
	p = isl_printer_print_str(p, ")");

	return isl_printer_end_line(p);
}

/* Print a single declaration of type "type" that defines each of
 * the iterators in "ids" and initializes the iterator at position i
 * to "opencl_id"(i).
 * Nothing is printed if "ids" does not contain any elements.
 *
 * The iterators are printed in their original order.  This differs from
 * the equivalent function in the CUDA backend, which prints them
 * in reverse order to promote coalescing.  The OpenCL backend currently
 * does not take any coalescing considerations into account.
 */
static __isl_give isl_printer *print_iterators(__isl_take isl_printer *p,
	const char *type, __isl_keep isl_id_list *ids, const char *opencl_id)
{
	int pos;
	const int n_id = isl_id_list_n_id(ids);

	if (n_id <= 0)
		return p;

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, type);
	p = isl_printer_print_str(p, " ");
	for (pos = 0; pos < n_id; ++pos) {
		isl_id *id = isl_id_list_get_id(ids, pos);

		if (pos > 0)
			p = isl_printer_print_str(p, ", ");
		p = isl_printer_print_id(p, id);
		isl_id_free(id);

		/* " = <opencl_id>(<pos>)" */
		p = isl_printer_print_str(p, " = ");
		p = isl_printer_print_str(p, opencl_id);
		p = isl_printer_print_str(p, "(");
		p = isl_printer_print_int(p, pos);
		p = isl_printer_print_str(p, ")");
	}
	p = isl_printer_print_str(p, ";");

	return isl_printer_end_line(p);
}

/* Print the definitions of the block and thread iterators of "kernel".
 * The block iterators are initialized through get_group_id() and
 * the thread iterators through get_local_id().
 * Both use the AST iterator type configured in the isl context.
 */
static __isl_give isl_printer *opencl_print_kernel_iterators(
	__isl_take isl_printer *p, struct ppcg_kernel *kernel)
{
	const char *iter_type;

	iter_type = isl_options_get_ast_iterator_type(
				isl_ast_node_get_ctx(kernel->tree));

	p = print_iterators(p, iter_type, kernel->block_ids, "get_group_id");

	return print_iterators(p, iter_type, kernel->thread_ids,
				"get_local_id");
}

/* Print the declaration of the kernel variable "var" on a line of its own.
 * Variables of type ppcg_access_shared are qualified with "__local".
 * The element type is that of the corresponding array and
 * one "[size]" suffix is printed for each index of that array.
 */
static __isl_give isl_printer *opencl_print_kernel_var(
	__isl_take isl_printer *p, struct ppcg_kernel_var *var)
{
	int dim;

	p = isl_printer_start_line(p);
	if (var->type == ppcg_access_shared)
		p = isl_printer_print_str(p, "__local ");
	p = isl_printer_print_str(p, var->array->type);
	p = isl_printer_print_str(p, " ");
	p = isl_printer_print_str(p, var->name);

	for (dim = 0; dim < var->array->n_index; ++dim) {
		isl_val *extent;

		p = isl_printer_print_str(p, "[");
		extent = isl_vec_get_element_val(var->size, dim);
		p = isl_printer_print_val(p, extent);
		p = isl_printer_print_str(p, "]");
		isl_val_free(extent);
	}

	p = isl_printer_print_str(p, ";");

	return isl_printer_end_line(p);
}

/* Print the declarations of all the variables of "kernel",
 * in the order in which they appear in kernel->var.
 */
static __isl_give isl_printer *opencl_print_kernel_vars(
		__isl_take isl_printer *p, struct ppcg_kernel *kernel)
{
	int pos = 0;

	while (pos < kernel->n_var) {
		p = opencl_print_kernel_var(p, &kernel->var[pos]);
		++pos;
	}

	return p;
}

/* Print a sync statement in the form of a call to barrier().
 *
 * Every work-item of a work-group executing the kernel has to reach
 * the barrier() call before any of them is allowed to proceed beyond it.
 * CLK_LOCAL_MEM_FENCE makes the barrier either flush the variables stored
 * in local memory or queue a memory fence, such that memory operations
 * to local memory are correctly ordered.
 * CLK_GLOBAL_MEM_FENCE makes the barrier queue a memory fence such that
 * memory operations to global memory are correctly ordered.
 *
 * "stmt" is not used.
 */
static __isl_give isl_printer *opencl_print_sync(__isl_take isl_printer *p,
	struct ppcg_kernel_stmt *stmt)
{
	static const char barrier_call[] =
		"barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);";

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, barrier_call);

	return isl_printer_end_line(p);
}

/* Description of a function for which calls of the form
 *
 *	name(arg)
 *
 * should be rewritten to
 *
 *	opencl_name((type) (arg))
 */
struct ppcg_opencl_fn {
	const char *name;		/* name in the input program */
	const char *opencl_name;	/* name to call in OpenCL code */
	const char *type;		/* type to cast the argument to */
};

/* The functions that are rewritten: the "f" suffixed
 * single precision functions, each mapped to the function without
 * suffix applied to a float.
 */
static struct ppcg_opencl_fn opencl_fn[] = {
	{ "expf",	"exp",		"float" },
	{ "powf",	"pow",		"float" },
	{ "sqrtf",	"sqrt",		"float" },
};

/* Number of elements of "array", which has to be an actual array
 * (not a pointer).  The argument is parenthesized before it is indexed
 * such that it is treated as a single operand.
 */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* If the name of function called by "expr" matches any of those
 * in ppcg_opencl_fn, then replace the call by a cast to the corresponding
 * type in ppcg_opencl_fn and a call to corresponding OpenCL function.
 *
 * Only the first argument of the call is cast.
 * "user" is not used.
 */
static __isl_give pet_expr *map_opencl_call(__isl_take pet_expr *expr,
	void *user)
{
	const char *name;
	int i;

	name = pet_expr_call_get_name(expr);
	for (i = 0; i < ARRAY_SIZE(opencl_fn); ++i) {
		pet_expr *arg;

		if (strcmp(name, opencl_fn[i].name))
			continue;
		expr = pet_expr_call_set_name(expr, opencl_fn[i].opencl_name);
		arg = pet_expr_get_arg(expr, 0);
		arg = pet_expr_new_cast(opencl_fn[i].type, arg);
		expr = pet_expr_set_arg(expr, 0, arg);
		/* Stop at the first match.  The names in opencl_fn are
		 * distinct and "name" was obtained from "expr" before
		 * its name got replaced, so it should not be looked at
		 * again after the replacement.
		 */
		break;
	}
	return expr;
}

/* Print the body of a statement from the input program,
 * for use in OpenCL code.
 *
 * Before calling ppcg_kernel_print_domain to print the actual statement body,
 * we first modify this body to take into account that the output code
 * is OpenCL code.  In particular, if the statement calls any function
 * with a "f" suffix, then it needs to be replaced by a call to
 * the corresponding function without suffix after casting the argument
 * to a float.
 *
 * The modification is only temporary: a copy of the original body
 * is kept while printing and is put back afterwards, so the statement
 * ends up holding an unmodified body again.
 */
static __isl_give isl_printer *print_opencl_kernel_domain(
	__isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt)
{
	struct pet_stmt *ps;
	pet_tree *tree;

	ps = stmt->u.d.stmt->stmt;
	/* Keep a copy of the original body so that it can be restored. */
	tree = pet_tree_copy(ps->body);
	/* Rewrite the calls listed in opencl_fn inside the body. */
	ps->body = pet_tree_map_call_expr(ps->body, &map_opencl_call, NULL);
	p = ppcg_kernel_print_domain(p, stmt);
	/* Drop the rewritten body and reinstate the saved original. */
	pet_tree_free(ps->body);
	ps->body = tree;

	return p;
}

/* Callback for printing a user statement in the AST of a kernel,
 * i.e., a kernel body statement, a copy statement or a sync statement.
 *
 * The ppcg_kernel_stmt is extracted from the annotation of "node" and
 * the printing is delegated based on its type.
 * "print_options" is freed; "user" is not used.
 */
static __isl_give isl_printer *opencl_print_kernel_stmt(
	__isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options,
	__isl_keep isl_ast_node *node, void *user)
{
	isl_id *annotation;
	struct ppcg_kernel_stmt *stmt;

	annotation = isl_ast_node_get_annotation(node);
	stmt = isl_id_get_user(annotation);
	isl_id_free(annotation);

	isl_ast_print_options_free(print_options);

	if (stmt->type == ppcg_kernel_copy)
		return ppcg_kernel_print_copy(p, stmt);
	if (stmt->type == ppcg_kernel_sync)
		return opencl_print_sync(p, stmt);
	if (stmt->type == ppcg_kernel_domain)
		return print_opencl_kernel_domain(p, stmt);

	/* Any other type of statement: nothing is printed. */
	return p;
}

/* Does "prog" involve any doubles?
 *
 * Return 1 if the element type of any array in prog->array is
 * exactly "double" or if the definition of any of the types
 * in prog->scop->pet contains the string "double".  Return 0 otherwise.
 * The check on the type definitions is a plain substring search
 * and may therefore result in false positives.
 */
static __isl_give int any_double_elements(struct gpu_prog *prog)
{
	int pos, n_type;

	for (pos = 0; pos < prog->n_array; ++pos) {
		if (!strcmp(prog->array[pos].type, "double"))
			return 1;
	}

	n_type = prog->scop->pet->n_type;
	for (pos = 0; pos < n_type; ++pos) {
		const char *definition;

		definition = prog->scop->pet->types[pos]->definition;
		if (strstr(definition, "double"))
			return 1;
	}

	return 0;
}

/* Print the #pragma that enables support for double precision
 * floating-point, followed by an empty line.
 *
 * Double precision floating-point is an optional extension as of
 * OpenCL 1.0.  An application that wants to use double needs to include
 * the #pragma OPENCL EXTENSION cl_khr_fp64 : enable directive before
 * any double precision data type is declared in the kernel code.
 */
static __isl_give isl_printer *opencl_enable_double_support(
	__isl_take isl_printer *p)
{
	static const char fp64_pragma[] =
		"#pragma OPENCL EXTENSION cl_khr_fp64 :"
		" enable";

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, fp64_pragma);
	p = isl_printer_end_line(p);

	/* Empty separator line. */
	p = isl_printer_start_line(p);

	return isl_printer_end_line(p);
}

/* Bodies of the ppcg_min and ppcg_max macro definitions for use
 * in OpenCL kernel code.
 * Each of them forwards to the OpenCL macro/function of the same kind
 * ("min" or "max"), after casting both arguments to the type of their sum.
 * The cast is needed because the OpenCL versions are only defined
 * in case the two arguments have the same type.
 *
 * OPENCL_SAME_TYPE_CALL only serves to construct the two strings.
 */
#define OPENCL_SAME_TYPE_CALL(fn) \
	"(x,y)    " fn "((__typeof__(x + y)) x, (__typeof__(x + y)) y)"
static const char *opencl_min = OPENCL_SAME_TYPE_CALL("min");
static const char *opencl_max = OPENCL_SAME_TYPE_CALL("max");
#undef OPENCL_SAME_TYPE_CALL

/* Install the OpenCL specific definitions (opencl_min and opencl_max)
 * of the ppcg_min and ppcg_max macros on "p".
 */
static __isl_give isl_printer *set_opencl_macros(__isl_take isl_printer *p)
{
	p = ppcg_set_macros(p, opencl_min, opencl_max);

	return p;
}

/* Print the OpenCL code of "kernel" to "p" in C format.
 *
 * The output consists of the kernel header followed by a brace enclosed
 * body that is indented by two positions.  The body contains
 * the definitions of the block and thread iterators, the declarations
 * of the kernel variables, the macro definitions printed
 * by gpu_print_macros and finally the AST kernel->tree.
 * The user statements in this AST are printed
 * by opencl_print_kernel_stmt.
 */
static __isl_give isl_printer *opencl_print_kernel(struct gpu_prog *prog,
	struct ppcg_kernel *kernel, __isl_take isl_printer *p)
{
	isl_ast_print_options *options;

	options = isl_ast_print_options_alloc(
				isl_ast_node_get_ctx(kernel->tree));
	options = isl_ast_print_options_set_print_user(options,
				&opencl_print_kernel_stmt, NULL);

	/* Header and opening brace. */
	p = isl_printer_set_output_format(p, ISL_FORMAT_C);
	p = opencl_print_kernel_header(p, prog, kernel);
	p = isl_printer_print_str(p, "{");
	p = isl_printer_end_line(p);

	/* Declarations at the start of the body. */
	p = isl_printer_indent(p, 2);
	p = opencl_print_kernel_iterators(p, kernel);
	p = opencl_print_kernel_vars(p, kernel);
	p = isl_printer_end_line(p);

	/* Macros with OpenCL specific min/max, then the statements. */
	p = ppcg_set_macro_names(p);
	p = set_opencl_macros(p);
	p = gpu_print_macros(p, kernel->tree);
	p = isl_ast_node_print(kernel->tree, p, options);

	/* Closing brace. */
	p = isl_printer_indent(p, -2);
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "}");

	return isl_printer_end_line(p);
}

/* User data passed to opencl_print_host_user while printing
 * the host code.
 */
struct print_host_user_data_opencl {
	/* OpenCL specific state; its kprinter receives the kernel code. */
	struct opencl_info *opencl;
	/* The program that is being printed. */
	struct gpu_prog *prog;
};

/* Print the total number of work items in dimension "i", i.e.,
 * the i'th grid size multiplied by the i'th block size.
 *
 * The number of grid sizes need not be equal to the number of block sizes.
 * Suppose that:
 * block_sizes[dim1] is the list of blocks sizes and it contains dim1 elements.
 * grid_sizes[dim2] is the list of grid sizes and it contains dim2 elements.
 *
 * The output is:
 * If (i < dim1 and i < dim2) then "(grid_sizes[i]) * block_sizes[i]"
 * If (i >= dim2) then the output is block_sizes[i]
 * Otherwise (i >= dim1 and i < dim2) the output is grid_sizes[i]
 *
 * The i'th grid size is argument 1 + i of kernel->grid_size_expr.
 */
static __isl_give isl_printer *opencl_print_total_number_of_work_items_for_dim(
	__isl_take isl_printer *p, struct ppcg_kernel *kernel, int i)
{
	int in_grid, in_block;
	isl_ast_expr *grid_bound;

	in_grid = i < isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
	in_block = i < kernel->n_block;

	/* No grid size in this dimension: only the block size remains. */
	if (!in_grid)
		return isl_printer_print_int(p, kernel->block_dim[i]);

	grid_bound = isl_ast_expr_get_op_arg(kernel->grid_size_expr, 1 + i);
	if (in_block) {
		p = isl_printer_print_str(p, "(");
		p = isl_printer_print_ast_expr(p, grid_bound);
		p = isl_printer_print_str(p, ") * ");
		p = isl_printer_print_int(p, kernel->block_dim[i]);
	} else {
		p = isl_printer_print_ast_expr(p, grid_bound);
	}
	isl_ast_expr_free(grid_bound);

	return p;
}

/* Print a comma separated list that represents the total number of
 * work items.  The list is the element-wise product of the grid sizes
 * and the block sizes.  Suppose that:
 * block_sizes[dim1] is the list of blocks sizes and it contains dim1 elements.
 * grid_sizes[dim2] is the list of grid sizes and it contains dim2 elements.
 *
 * If either list is empty, then the output is simply "1".
 *
 * If (dim1 > dim2) then the output is the following list:
 * grid_sizes[0]*block_sizes[0], ..., grid_sizes[dim2-1]*block_sizes[dim2-1],
 * block_sizes[dim2], ..., block_sizes[dim1-2], block_sizes[dim1-1].
 *
 * If (dim2 > dim1) then the output is the following list:
 * grid_sizes[0]*block_sizes[0], ..., grid_sizes[dim1-1] * block_sizes[dim1-1],
 * grid_sizes[dim1], ..., grid_sizes[dim2-2], grid_sizes[dim2-1].
 *
 * The total number of work items is the product of the elements
 * of the printed list.
 */
static __isl_give isl_printer *opencl_print_total_number_of_work_items_as_list(
	__isl_take isl_printer *p, struct ppcg_kernel *kernel)
{
	int dim, n_dim;
	int n_grid, n_block;

	n_grid = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
	n_block = kernel->n_block;

	if (n_grid <= 0 || n_block <= 0)
		return isl_printer_print_str(p, "1");

	/* One entry for each dimension of the longer of the two lists. */
	n_dim = n_grid > n_block ? n_grid : n_block;
	for (dim = 0; dim < n_dim; ++dim) {
		if (dim)
			p = isl_printer_print_str(p, ", ");

		p = opencl_print_total_number_of_work_items_for_dim(p,
			kernel, dim);
	}

	return p;
}

/* Print a blocking transfer of "array" between the host and the device.
 * If "to_host" is 0, the array is copied to the device
 * through clEnqueueWriteBuffer.  Otherwise, it is copied back
 * from the device to the host through clEnqueueReadBuffer.
 * The device buffer is called dev_<array name>.
 * On the host side, the address of a scalar is taken explicitly.
 */
static __isl_give isl_printer *copy_array(__isl_take isl_printer *p,
	struct gpu_array_info *array, int to_host)
{
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "openclCheckReturn(");
	p = isl_printer_print_str(p, to_host ? "clEnqueueReadBuffer" :
					       "clEnqueueWriteBuffer");
	p = isl_printer_print_str(p, "(queue, dev_");
	p = isl_printer_print_str(p, array->name);
	p = isl_printer_print_str(p, ", CL_TRUE, 0, ");
	p = gpu_array_info_print_size(p, array);

	p = isl_printer_print_str(p,
				gpu_array_is_scalar(array) ? ", &" : ", ");
	p = isl_printer_print_str(p, array->name);
	p = isl_printer_print_str(p, ", 0, NULL, NULL));");

	return isl_printer_end_line(p);
}

/* Print the code that prepares the device for executing the transformed
 * code: the host macros, the declarations of the locally defined
 * variables, the declarations of the device copies of the arrays,
 * the OpenCL setup code and the allocation of the device arrays,
 * in that order.
 */
static __isl_give isl_printer *init_device(__isl_take isl_printer *p,
	struct gpu_prog *prog, struct opencl_info *opencl)
{
	p = gpu_print_local_declarations(opencl_print_host_macros(p), prog);
	p = opencl_declare_device_arrays(p, prog);
	p = opencl_setup(p, opencl->input, opencl);

	return opencl_allocate_device_arrays(p, prog);
}

/* Print the code that cleans up the device after the transformed code
 * has been executed: first release the device arrays and
 * then release the OpenCL objects.
 */
static __isl_give isl_printer *clear_device(__isl_take isl_printer *p,
	struct gpu_prog *prog, struct opencl_info *opencl)
{
	return opencl_release_cl_objects(
			opencl_release_device_arrays(p, prog), opencl);
}

/* Print a host statement that is not a kernel launch or an original user
 * statement: a copy of an array to or from the device, or
 * the initialization or clearing of the device.
 *
 * The statement is identified by the name of the identifier of
 * the first argument of the expression of "node":
 * - "init_device" initializes the device,
 * - "clear_device" clears the device,
 * - "to_device_<array name>" copies an array to the device,
 * - any other name (in practice "from_device_<array name>") copies
 *   an array back from the device.
 * For the copy statements, the user pointer of the identifier points
 * to the gpu_array_info of the array that needs to be copied.
 *
 * If the identifier has no name, or if a copy statement has no
 * associated array, then "p" is freed and NULL is returned.
 */
static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p,
	__isl_keep isl_ast_node *node, struct gpu_prog *prog,
	struct opencl_info *opencl)
{
	isl_ast_expr *call, *callee;
	isl_id *id;
	const char *name;
	struct gpu_array_info *array;
	int to_host;

	call = isl_ast_node_user_get_expr(node);
	callee = isl_ast_expr_get_op_arg(call, 0);
	id = isl_ast_expr_get_id(callee);
	name = isl_id_get_name(id);
	array = isl_id_get_user(id);
	isl_id_free(id);
	isl_ast_expr_free(callee);
	isl_ast_expr_free(call);

	if (!name)
		return isl_printer_free(p);
	if (strcmp(name, "init_device") == 0)
		return init_device(p, prog, opencl);
	if (strcmp(name, "clear_device") == 0)
		return clear_device(p, prog, opencl);
	if (!array)
		return isl_printer_free(p);

	/* Anything that does not start with "to_device" copies back. */
	to_host = prefixcmp(name, "to_device") ? 1 : 0;

	return copy_array(p, array, to_host);
}

/* Print the user statement of the host code to "p".
 *
 * The host code may contain original user statements, kernel launches,
 * statements that copy data to/from the device and statements
 * that initialize or clear the device.
 * The original user statements and the kernel launches have
 * an associated annotation, while the other statements do not.
 * The latter are handled by print_device_node.
 * The annotation on the user statements is called "user".
 *
 * In case of a kernel launch, print a block of statements that
 * defines the grid and the work group and then launches the kernel.
 * The code of the kernel itself is printed to data->opencl->kprinter.
 *
 * A grid is composed of many work groups (blocks), each work group holds
 * many work-items (threads).
 *
 * global_work_size[kernel->n_block] represents the total number of work
 * items.  It points to an array of kernel->n_block unsigned
 * values that describe the total number of work-items that will execute
 * the kernel.  The total number of work-items is computed as:
 * global_work_size[0] *...* global_work_size[kernel->n_block - 1].
 *
 * The size of each work group (i.e. the number of work-items in each work
 * group) is described using block_size[kernel->n_block].  The total
 * number of work-items in a block (work-group) is computed as:
 * block_size[0] *... * block_size[kernel->n_block - 1].
 *
 * If kernel->n_block is not positive, then a single dimension is used
 * for both arrays and for the launch.
 *
 * "print_options" is freed.  "user" points to
 * a struct print_host_user_data_opencl.
 *
 * For more information check:
 * http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
 */
static __isl_give isl_printer *opencl_print_host_user(
	__isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options,
	__isl_keep isl_ast_node *node, void *user)
{
	isl_id *id;
	int is_user;
	int work_dim;
	struct ppcg_kernel *kernel;
	struct ppcg_kernel_stmt *stmt;
	struct print_host_user_data_opencl *data;

	isl_ast_print_options_free(print_options);

	data = (struct print_host_user_data_opencl *) user;

	id = isl_ast_node_get_annotation(node);
	if (!id)
		return print_device_node(p, node, data->prog, data->opencl);

	is_user = !strcmp(isl_id_get_name(id), "user");
	kernel = is_user ? NULL : isl_id_get_user(id);
	stmt = is_user ? isl_id_get_user(id) : NULL;
	isl_id_free(id);

	if (is_user)
		return ppcg_kernel_print_domain(p, stmt);

	/* Number of dimensions used for the work size arrays and
	 * for the launch.
	 */
	work_dim = kernel->n_block > 0 ? kernel->n_block : 1;

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "{");
	p = isl_printer_end_line(p);
	p = isl_printer_indent(p, 2);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "size_t global_work_size[");
	p = isl_printer_print_int(p, work_dim);
	p = isl_printer_print_str(p, "] = {");
	p = opencl_print_total_number_of_work_items_as_list(p, kernel);
	p = isl_printer_print_str(p, "};");
	p = isl_printer_end_line(p);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "size_t block_size[");
	p = isl_printer_print_int(p, work_dim);
	p = isl_printer_print_str(p, "] = {");
	p = opencl_print_block_sizes(p, kernel);
	p = isl_printer_print_str(p, "};");
	p = isl_printer_end_line(p);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "cl_kernel kernel");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, " = clCreateKernel(program, \"kernel");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, "\", &err);");
	p = isl_printer_end_line(p);
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "openclCheckReturn(err);");
	p = isl_printer_end_line(p);

	/* The printer is taken by the callee and may have been freed
	 * on error, so continue with the printer that is returned.
	 */
	p = opencl_set_kernel_arguments(p, data->prog, kernel);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "openclCheckReturn(clEnqueueNDRangeKernel"
		"(queue, kernel");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, ", ");
	p = isl_printer_print_int(p, work_dim);

	p = isl_printer_print_str(p, ", NULL, global_work_size, "
					"block_size, "
					"0, NULL, NULL));");
	p = isl_printer_end_line(p);
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "openclCheckReturn("
					"clReleaseKernel(kernel");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, "));");
	p = isl_printer_end_line(p);
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "clFinish(queue);");
	p = isl_printer_end_line(p);
	p = isl_printer_indent(p, -2);
	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "}");
	p = isl_printer_end_line(p);

	p = isl_printer_start_line(p);
	p = isl_printer_end_line(p);

	data->opencl->kprinter = opencl_print_kernel(data->prog, kernel,
						data->opencl->kprinter);

	return p;
}

/* Print the host code corresponding to "tree" to "p".
 * The macro definitions printed by gpu_print_macros come first,
 * followed by the AST itself, with the user statements printed
 * by opencl_print_host_user.
 */
static __isl_give isl_printer *opencl_print_host_code(
	__isl_take isl_printer *p, struct gpu_prog *prog,
	__isl_keep isl_ast_node *tree, struct opencl_info *opencl)
{
	struct print_host_user_data_opencl data;
	isl_ast_print_options *options;

	data.opencl = opencl;
	data.prog = prog;

	options = isl_ast_print_options_alloc(isl_ast_node_get_ctx(tree));
	options = isl_ast_print_options_set_print_user(options,
				&opencl_print_host_user, &data);

	p = gpu_print_macros(p, tree);

	return isl_ast_node_print(tree, p, options);
}

/* Given a gpu_prog "prog" and the corresponding transformed AST
 * "tree", print the entire OpenCL code to "p".
 * "user" points to the opencl_info.
 *
 * The kernel printer is first put in C format and receives
 * the pragma enabling double support (if "prog" involves any doubles)
 * and the type definitions (if the opencl_print_kernel_types option
 * is set).  If the kernel printer is invalid at that point,
 * then "p" is freed and NULL is returned.
 * Otherwise, the host code is printed to "p".
 */
static __isl_give isl_printer *print_opencl(__isl_take isl_printer *p,
	struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
	struct gpu_types *types, void *user)
{
	struct opencl_info *opencl = user;
	isl_printer *kp = opencl->kprinter;

	kp = isl_printer_set_output_format(kp, ISL_FORMAT_C);
	if (any_double_elements(prog))
		kp = opencl_enable_double_support(kp);
	if (opencl->options->opencl_print_kernel_types)
		kp = gpu_print_types(kp, types, prog);
	opencl->kprinter = kp;

	if (!kp)
		return isl_printer_free(p);

	return opencl_print_host_code(p, prog, tree, opencl);
}

/* Replace all scops in the file called "input" by corresponding
 * OpenCL code.
 * The host code is written to "output" or, if "output" is NULL,
 * to a name derived from "input".
 * The kernel code ends up in separate files with names
 * derived from "output" or "input".
 *
 * The actual work is performed by generate_gpu, which calls back
 * print_opencl for printing the AST.
 * The output files are opened before the call to generate_gpu and
 * closed again after it has finished.
 *
 * Return the result of opening the files if that failed and
 * the result of generate_gpu otherwise, except that -1 is returned
 * whenever closing the files fails.
 */
int generate_opencl(isl_ctx *ctx, struct ppcg_options *options,
	const char *input, const char *output)
{
	struct opencl_info opencl = { options, input, output };
	int status;

	opencl.kprinter = isl_printer_to_str(ctx);

	status = opencl_open_files(&opencl);
	if (status >= 0) {
		status = generate_gpu(ctx, input, opencl.host_c, options,
				&print_opencl, &opencl);
	}

	/* The files are closed even if something went wrong before. */
	if (opencl_close_files(&opencl) < 0)
		status = -1;
	isl_printer_free(opencl.kprinter);

	return status;
}


================================================
FILE: src/ppcg_files/opencl.h
================================================
#ifndef _OPENCL_H
#define _OPENCL_H

#include <pet.h>
#include "ppcg_options.h"
#include "ppcg.h"

#ifdef __cplusplus
extern "C"
{
#endif

	/* Transform the code in the file called "input" by replacing
	 * all scops by corresponding OpenCL code.
	 * The host code is written to "output" or a name derived from
	 * "input" if "output" is NULL.
	 * Return a negative value on failure.
	 */
	int generate_opencl(isl_ctx *ctx, struct ppcg_options *options,
											const char *input, const char *output);

#ifdef __cplusplus
}
#endif

#endif


================================================
FILE: src/ppcg_options.c
================================================
/*
 * Copyright 2010-2011 INRIA Saclay
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
 * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
 * 91893 Orsay, France
 */

#include "ppcg_options.h"

/* Name to value mapping of the selectable code generation targets.
 * The entries that are commented out are not selectable.
 * The list is terminated by a zero entry.
 */
static struct isl_arg_choice target[] = {
	//{"c", PPCG_TARGET_C},
	//{"cuda", PPCG_TARGET_CUDA},
	//{"opencl", PPCG_TARGET_OPENCL},
	//{"autosa_c", AUTOSA_TARGET_C},
	{"autosa_hls_c", AUTOSA_TARGET_XILINX_HLS_C},
	{"autosa_opencl", AUTOSA_TARGET_INTEL_OPENCL},
	//{"autosa_t2s", AUTOSA_TARGET_T2S},
	{"autosa_catapult_c", AUTOSA_TARGET_CATAPULT_HLS_C},
	{"autosa_tapa", AUTOSA_TARGET_TAPA_CPP},
	{0}};

/* Name to value mapping of the systolic array types ("sync" and "async").
 * The list is terminated by a zero entry.
 */
static struct isl_arg_choice sa_type[] = {
	{"sync", AUTOSA_SA_TYPE_SYNC},
	{"async", AUTOSA_SA_TYPE_ASYNC},
	{0}};

/* Set defaults that depend on the target.
 * In particular, set --no-schedule-outer-coincidence if the target is
 * PPCG_TARGET_C and --schedule-outer-coincidence for any other target.
 *
 * The option is set by letting isl parse a synthetic command line
 * consisting of a program name and the selected flag.
 */
void ppcg_options_set_target_defaults(struct ppcg_options *options)
{
	char *argv[2];

	argv[0] = "ppcg_options_set_target_defaults";
	argv[1] = options->target == PPCG_TARGET_C ?
			"--no-schedule-outer-coincidence" :
			"--schedule-outer-coincidence";

	isl_options_parse(options->isl, 2, argv, ISL_ARG_ALL);
}

/* Callback that is called whenever the "target" option is set.
 * "opt" points to the ppcg_options, in which the target has already
 * been updated by the time of the call.  The new value "val" itself
 * is therefore not needed.
 *
 * Reset the target-dependent options
 * through ppcg_options_set_target_defaults.  Always return 0.
 */
static int set_target(void *opt, unsigned val)
{
	ppcg_options_set_target_defaults((struct ppcg_options *) opt);

	return 0;
}

/* Command line description of the debugging options stored in
 * struct ppcg_debug_options: the boolean flags --dump-schedule-constraints,
 * --dump-schedule, --dump-final-schedule, --dump-sizes and
 * --verbose (short form -v).
 */
ISL_ARGS_START(struct ppcg_debug_options, ppcg_debug_options_args)
ISL_ARG_BOOL(struct ppcg_debug_options, dump_schedule_constraints, 0,
			 "dump-schedule-constraints", 0, "dump schedule constraints")
ISL_ARG_BOOL(struct ppcg_debug_options, dump_schedule, 0,
			 "dump-schedule", 0, "dump isl computed schedule")
ISL_ARG_BOOL(struct ppcg_debug_options, dump_final_schedule, 0,
			 "dump-final-schedule", 0, "dump PPCG computed schedule")
ISL_ARG_BOOL(struct ppcg_debug_options, dump_sizes, 0,
			 "dump-sizes", 0,
			 "dump effectively used per kernel tile, grid and block sizes")
ISL_ARG_BOOL(struct ppcg_debug_options, verbose, 'v', "verbose", 0, NULL)
ISL_ARGS_END

//ISL_ARGS_START(struct ppcg_options, ppcg_opencl_options_args)
//ISL_ARG_STR(struct ppcg_options, opencl_compiler_options, 0, "compiler-options",
//			"options", NULL, "options to pass to the OpenCL compiler")
//ISL_ARG_BOOL(struct ppcg_options, opencl_use_gpu, 0, "use-gpu", 1,
//			 "use GPU device (if available)")
//ISL_ARG_STR_LIST(struct ppcg_options, opencl_n_include_file,
//				 opencl_include_files, 0, "include-file", "filename",
//				 "file to #include in generated OpenCL code")
//ISL_ARG_BOOL(struct ppcg_options, opencl_print_kernel_types, 0,
//			 "print-kernel-types", 1,
//			 "print definitions of types in the kernel file")
//ISL_ARG_BOOL(struct ppcg_options, opencl_embed_kernel_code, 0,
//			 "embed-kernel-code", 0, "embed kernel code into host code")
//ISL_ARGS_END

/* Command line options that fill in a struct autosa_options.
 *
 * Every entry names the field that is set, the short option character
 * (0 if there is none) and the long option name, followed by
 * the argument name (for string and integer options), the default value
 * and the help text.
 * "sa-type" selects one of the choices listed in the "sa_type" table and
 * defaults to AUTOSA_SA_TYPE_ASYNC.
 */
ISL_ARGS_START(struct autosa_options, autosa_options_args)
ISL_ARG_BOOL(struct autosa_options, autosa, 0, "autosa", 1,
	"generate systolic arrays using AutoSA")
ISL_ARG_BOOL(struct autosa_options, array_contraction, 0,
	"array-contraction", 1,
	"apply array contraction")
ISL_ARG_BOOL(struct autosa_options, axi_stream, 0, "axi-stream", 0,
	"generate AXI stream interface, must be used together with host serialization.")
ISL_ARG_BOOL(struct autosa_options, block_sparse, 0, "block-sparse", 0,
	"use block sparsity")
ISL_ARG_STR(struct autosa_options, block_sparse_ratio, 0,
	"block-sparse-ratio", "ratio", NULL,
	"block sparsity ratio (e.g., kernel[]->A[2,4])")
ISL_ARG_STR(struct autosa_options, config, 0, "config", "config", NULL,
	"AutoSA configuration file")
ISL_ARG_BOOL(struct autosa_options, credit_control, 0, "credit-control", 0,
	"enable credit control between different array partitions")
ISL_ARG_BOOL(struct autosa_options, data_pack, 0, "data-pack", 1,
	"enable data packing")
ISL_ARG_STR(struct autosa_options, data_pack_sizes, 0,
	"data-pack-sizes", "sizes", NULL,
	"data pack sizes upper bound (bytes) at innermost, intermediate, outermost I/O level [default: kernel[]->data_pack[8,32,64]]")
ISL_ARG_BOOL(struct autosa_options, double_buffer, 0, "double-buffer", 1,
	"enable double-buffering for data transfer")
ISL_ARG_STR(struct autosa_options, double_buffer_assignment, 0,
	"double-buffer-assign", "assignment", NULL,
	"assign arrays to be double buffered (e.g., kernel[]->A[])")
ISL_ARG_INT(struct autosa_options, double_buffer_style, 0,
	"double-buffer-style", "id", 1,
	"change double-buffering logic coding style (0: while loop 1: for loop)")
ISL_ARG_BOOL(struct autosa_options, dump_code, 0, "dump-code", 0,
	"dump the intermediate code")
ISL_ARG_BOOL(struct autosa_options, explore_loop_permute, 0,
	"explore-loop-permute", 0,
	"explore loop permutation in the step of array partitioning")
ISL_ARG_INT(struct autosa_options, loop_permute_order, 0,
	"loop-permute-order", "order", 0,
	"specify which loop ordering to be explored")
ISL_ARG_INT(struct autosa_options, fifo_depth, 0, "fifo-depth", "depth", 2,
	"default FIFO depth")
ISL_ARG_BOOL(struct autosa_options, hbm, 0, "hbm", 0,
	"use multi-port DRAM/HBM")
ISL_ARG_INT(struct autosa_options, n_hbm_port, 0, "hbm-port-num", "num", 2,
	"default HBM port number per array")
ISL_ARG_BOOL(struct autosa_options, hls, 0, "hls", 0,
	"generate Xilinx HLS host")
ISL_ARG_BOOL(struct autosa_options, host_serialize, 0, "host-serialize", 0,
	"serialize/deserialize the host data")
ISL_ARG_BOOL(struct autosa_options, insert_hls_dependence, 0,
	"insert-hls-dependence", 0,
	"insert Xilinx HLS dependence pragma (alpha version)")
ISL_ARG_INT(struct autosa_options, int_io_dir, 0, "int-io-dir", "dir", 0,
	"set the default interior I/O direction (0: [1,x] 1: [x,1])")
ISL_ARG_BOOL(struct autosa_options, io_module_embedding, 0,
	"io-module-embedding", 0,
	"embed the I/O modules inside PEs if possible")
ISL_ARG_BOOL(struct autosa_options, isl_sink, 0, "isl-sink", 1,
	"sink time loops using ISL default APIs")
ISL_ARG_BOOL(struct autosa_options, loop_infinitize, 0, "loop-infinitize", 0,
	"apply loop infinitization optimization (Intel OpenCL only)")
ISL_ARG_BOOL(struct autosa_options, local_reduce, 0, "local-reduce", 0,
	"generate non-output-stationary array with local reduction")
ISL_ARG_STR(struct autosa_options, reduce_op, 0, "reduce-op", "op", NULL,
	"reduction operator (must be used with local-reduce together)")
ISL_ARG_BOOL(struct autosa_options, lower_int_io_L1_buffer, 0,
	"lower-int-io-L1-buffer", 0,
	"lower the L1 buffer for interior I/O modules")
ISL_ARG_BOOL(struct autosa_options, lower_if_branch, 0, "lower-if-branch", 0,
	"lower if branch in the I/O module")
ISL_ARG_INT(struct autosa_options, max_local_memory, 0,
	"max-local-memory", "size", 8192,
	"maximal amount of local memory")
ISL_ARG_INT(struct autosa_options, max_sa_dim, 0, "max-sa-dim", "dim", 2,
	"maximal systolic array dimension")
ISL_ARG_STR(struct autosa_options, mem_port_map, 0, "mem-port-map", "map",
	NULL, "memory port mapping")
ISL_ARG_BOOL(struct autosa_options, non_block_fifo, 0, "non-blocking-fifo", 0,
	"use non-blocking fifo interface")
ISL_ARG_STR(struct autosa_options, output_dir, 0, "output-dir", "dir",
	"./autosa.tmp/output",
	"AutoSA Output directory")
ISL_ARG_BOOL(struct autosa_options, reverse_order, 0, "reverse-order", 1,
	"reverse latency hiding loop tiling order")
ISL_ARG_STR(struct autosa_options, select_rar_dep, 0,
	"select-rar-dep", "choice", NULL,
	"select the RAR dependence for the array access. [example: kernel[]->__pet_ref_4[1]]")
ISL_ARG_STR(struct autosa_options, sa_sizes, 0, "sa-sizes", "sizes", NULL,
	"per kernel PE optimization tile sizes")
ISL_ARG_INT(struct autosa_options, sa_tile_size, 0, "sa-tile-size", "size", 4,
	"default tile size in PE optimization")
ISL_ARG_USER_OPT_CHOICE(struct autosa_options, sa_type, 0, "sa-type", sa_type,
	NULL, AUTOSA_SA_TYPE_ASYNC, AUTOSA_SA_TYPE_ASYNC,
	"systolic array type")
ISL_ARG_STR(struct autosa_options, simd_info, 0, "simd-info", "info", NULL,
	"per kernel SIMD information")
ISL_ARG_BOOL(struct autosa_options, simd_touch_space, 0,
	"simd-touch-space", 0,
	"use space loops as SIMD vectorization loops")
ISL_ARG_INT(struct autosa_options, tuning_method, 0,
	"tuning-method", "method", -1,
	"tuning method (0: exhaustive search 1: others)")
ISL_ARG_BOOL(struct autosa_options, two_level_buffer, 0,
	"two-level-buffer", 0,
	"enable two-level buffering in I/O modules")
ISL_ARG_BOOL(struct autosa_options, t2s_tile, 0, "t2s-tile", 0,
	"generate T2S code from tiled code")
ISL_ARG_INT(struct autosa_options, t2s_tile_phase, 0,
	"t2s-tile-phase", "phase", 0,
	"T2S tiled URE codegen phase")
ISL_ARG_STR(struct autosa_options, param_names, 0, "param-names", "name",
	NULL, "customized parameter names (for tuning)")
ISL_ARG_BOOL(struct autosa_options, uram, 0, "uram", 0,
	"use Xilinx FPGA URAM")
ISL_ARG_BOOL(struct autosa_options, use_local_memory, 0, "local-memory", 1,
	"use local memory in kernel code")
ISL_ARG_BOOL(struct autosa_options, use_cplusplus_template, 0,
	"use-cplusplus-template", 0,
	"use C++ template in codegen (necessary for irregular PEs)")
ISL_ARG_BOOL(struct autosa_options, verbose, 'v', "verbose", 0,
	"print verbose compilation information")
ISL_ARG_BOOL(struct autosa_options, hcl, 0, "hcl", 0,
	"generate code for integrating with HeteroCL")
ISL_ARGS_END

/* Top-level command line options, filling in a struct ppcg_options.
 *
 * The isl, debugging and AutoSA options are handled by the child tables
 * isl_options_args, ppcg_debug_options_args and autosa_options_args.
 * The "target" option defaults to PPCG_TARGET_CUDA and calls set_target
 * whenever it is set.
 * The entries that are commented out are currently disabled, so
 * the corresponding fields of struct ppcg_options cannot be set
 * through this table.
 */
ISL_ARGS_START(struct ppcg_options, ppcg_options_args)
ISL_ARG_CHILD(struct ppcg_options, isl, "isl", &isl_options_args, "isl options")
ISL_ARG_CHILD(struct ppcg_options, debug, NULL, &ppcg_debug_options_args,
			  "debugging options")
ISL_ARG_CHILD(struct ppcg_options, autosa, "autosa", &autosa_options_args,
			  "AutoSA options")
//ISL_ARG_BOOL(struct ppcg_options, group_chains, 0, "group-chains", 1,
//			 "group chains of interdependent statements that are executed "
//			 "consecutively in the original schedule before scheduling")
ISL_ARG_BOOL(struct ppcg_options, reschedule, 0, "reschedule", 1,
			 "replace original schedule by isl computed schedule")
//ISL_ARG_BOOL(struct ppcg_options, scale_tile_loops, 0,
//			 "scale-tile-loops", 1, NULL)
//ISL_ARG_BOOL(struct ppcg_options, wrap, 0, "wrap", 1, NULL)
//ISL_ARG_BOOL(struct ppcg_options, use_shared_memory, 0, "shared-memory", 1,
//			 "use shared memory in kernel code")
//ISL_ARG_BOOL(struct ppcg_options, use_private_memory, 0, "private-memory", 1,
//			 "use private memory in kernel code")
//ISL_ARG_STR(struct ppcg_options, ctx, 0, "ctx", "context", NULL,
//			"Constraints on parameters")
//ISL_ARG_BOOL(struct ppcg_options, non_negative_parameters, 0,
//			 "assume-non-negative-parameters", 0,
//			 "assume all parameters are non-negative)")
//ISL_ARG_BOOL(struct ppcg_options, tile, 0, "tile", 0,
//			 "perform tiling (C target)")
//ISL_ARG_INT(struct ppcg_options, tile_size, 'S', "tile-size", "size", 32, NULL)
//ISL_ARG_BOOL(struct ppcg_options, isolate_full_tiles, 0, "isolate-full-tiles",
//			 0, "isolate full tiles from partial tiles (hybrid tiling)")
//ISL_ARG_STR(struct ppcg_options, sizes, 0, "sizes", "sizes", NULL,
//			"Per kernel tile, grid and block sizes")
//ISL_ARG_INT(struct ppcg_options, max_shared_memory, 0,
//			"max-shared-memory", "size", 8192, "maximal amount of shared memory")
//ISL_ARG_BOOL(struct ppcg_options, openmp, 0, "openmp", 0,
//			 "Generate OpenMP macros (only for C target)")
ISL_ARG_USER_OPT_CHOICE(struct ppcg_options, target, 0, "target", target,
						&set_target, PPCG_TARGET_CUDA, PPCG_TARGET_CUDA,
						"the target to generate code for")
ISL_ARG_BOOL(struct ppcg_options, linearize_device_arrays, 0,
			 "linearize-device-arrays", 1,
			 "linearize all device arrays, even those of fixed size")
//ISL_ARG_BOOL(struct ppcg_options, allow_gnu_extensions, 0,
//			 "allow-gnu-extensions", 1,
//			 "allow the use of GNU extensions in generated code")
ISL_ARG_BOOL(struct ppcg_options, live_range_reordering, 0,
			 "live-range-reordering", 0,
			 "allow successive live ranges on the same memory element "
			 "to be reordered")
//ISL_ARG_BOOL(struct ppcg_options, hybrid, 0, "hybrid", 0,
//			 "apply hybrid tiling whenever a suitable input pattern is found "
//			 "(GPU targets)")
//ISL_ARG_BOOL(struct ppcg_options, unroll_copy_shared, 0, "unroll-copy-shared",
//			 0, "unroll code for copying to/from shared memory")
//ISL_ARG_BOOL(struct ppcg_options, unroll_gpu_tile, 0, "unroll-gpu-tile", 0,
//			 "unroll code inside tile on GPU targets")
//ISL_ARG_GROUP("opencl", &ppcg_opencl_options_args, "OpenCL options")
//ISL_ARG_STR(struct ppcg_options, save_schedule_file, 0, "save-schedule",
//			"file", NULL, "save isl computed schedule to <file>")
//ISL_ARG_STR(struct ppcg_options, load_schedule_file, 0, "load-schedule",
//			"file", NULL, "load schedule from <file>, "
//						  "using it instead of an isl computed schedule")
ISL_ARGS_END


================================================
FILE: src/ppcg_options.h
================================================
#ifndef PPCG_OPTIONS_H
#define PPCG_OPTIONS_H

#include <isl/arg.h>
#include <isl/options.h>

#ifdef __cplusplus
extern "C"
{
#endif

	struct ppcg_debug_options
	{
		int dump_schedule_constraints;
		int dump_schedule;
		int dump_final_schedule;
		int dump_sizes;
		int verbose;
	};

	/* AutoSA specific options.
	 * The fields are filled in from the command line through
	 * the autosa_options_args table, which also defines their defaults.
	 */
	struct autosa_options
	{
		/* Generate systolic array using AutoSA. */
		int autosa;
		/* Use HBM memory. */
		int hbm;
		/* Default HBM port number per array. */
		int n_hbm_port;
		/* Enable double buffering. */
		int double_buffer;
		/* Double buffer assignment. */
		char *double_buffer_assignment;
		/* Dump the intermediate code. */
		int dump_code;
		/* Maximal systolic array dimension. */
		int max_sa_dim;
		/* Systolic array type (one of the AUTOSA_SA_TYPE_* values). */
		int sa_type;
		/* Universal tile size. */
		int sa_tile_size;
		/* Tile sizes for PE optimization. */
		char *sa_sizes;
		/* Generate T2S code from tiled program. */
		int t2s_tile;
		/* Phases of T2S codegen for tiled program. */
		int t2s_tile_phase;
		/* Take advantage of FPGA local memory. */
		int use_local_memory;
		/* Maximal amount of local memory. */
		int max_local_memory;
		/* Memory port mapping (for Intel OpenCL). */
		char *mem_port_map;
		/* Enable data pack for transferring data. */
		int data_pack;
		/* Data pack factors at different I/O levels. */
		char *data_pack_sizes;
		/* Enable credit control between different array partitions. */
		int credit_control;
		/* Enable two-level buffering in I/O modules. */
		int two_level_buffer;
		/* Configuration file. */
		char *config;
		/* Output directory. */
		char *output_dir;
		/* SIMD information file. */
		char *simd_info;
		/* Generate HLS host instead of OpenCL host. */
		int hls;
		/* Use URAM. */
		int uram;
		/* Print verbose information. */
		int verbose;
		/* Insert HLS dependence pragma. */
		int insert_hls_dependence;
		/* Embed I/O modules inside PEs. */
		int io_module_embedding;
		/* Enable loop infinitization optimization. Only for Intel. */
		int loop_infinitize;
		/* Enable data serialization/deserialization on the host side. */
		int host_serialize;
		/* Use non-blocking FIFO access. Note: Not supported. */
		int non_block_fifo;
		/* Double buffer coding style, as described by the help text of
		 * the "double-buffer-style" option.
		 * 0: while loop.
		 * 1: for loop (default).
		 */
		int double_buffer_style;
		/* Enable local reduce */
		int local_reduce;
		/* Reduce op */
		char *reduce_op;
		/* Select the RAR dependence candidate. */
		char *select_rar_dep;
		/* Interior I/O elimination direction.
		 * 0: set the first dim to 1 (default).
		 * 1: Set the last dim to 1.
		 */
		int int_io_dir;
		/* Lower the interior I/O module L1 buffer */
		int lower_int_io_L1_buffer;
		/* Use C++ template in codegen (necessary for irregular PEs) */
		int use_cplusplus_template;
		/* Default FIFO depth */
		int fifo_depth;
		/* Touch space loops in the SIMD vectorization */
		int simd_touch_space;
		/* Use block sparsity */
		int block_sparse;
		/* Block sparse ratio [nonzero, vec_len] */
		char* block_sparse_ratio;
		/* Generate code for HeteroCL integration. */
		int hcl;
		/* Apply array contraction. */
		int array_contraction;
		/* Sinking time loops using ISL default APIs. */
		int isl_sink;
		/* Reverse the loop tiling order. */
		int reverse_order;
		/* Use AXI Stream Interface. */
		int axi_stream;
		/* Tuning method: [0: Exhaustive search 1: Others].
		 * The option table uses -1 as the default value.
		 */
		int tuning_method;
		/* Explore loop permutation in the array partitioning. */
		int explore_loop_permute;
		/* Which loop ordering to explore. */
		int loop_permute_order;
		/* Parameter names */
		char *param_names;
		/* Lowering if-branch in inter-trans I/O module. */
		int lower_if_branch;
	};	

	/* Top-level options.
	 *
	 * Only "isl", "debug", "autosa", "reschedule", "target",
	 * "linearize_device_arrays" and "live_range_reordering" have
	 * an active entry in the ppcg_options_args table; the entries
	 * for the remaining fields are commented out there.
	 */
	struct ppcg_options
	{
		/* Options of the isl library. */
		struct isl_options *isl;
		/* Debugging options. */
		struct ppcg_debug_options *debug;
		/* Options to pass to the AutoSA compiler. */
		struct autosa_options *autosa;

		/* Group chains of consecutive statements before scheduling. */
		int group_chains;

		/* Use isl to compute a schedule replacing the original schedule. */
		int reschedule;
		int scale_tile_loops;
		int wrap;

		/* Assume all parameters are non-negative. */
		int non_negative_parameters;
		/* Constraints on parameters. */
		char *ctx;
		/* Per kernel tile, grid and block sizes. */
		char *sizes;

		/* Perform tiling (C target). */
		int tile;
		int tile_size;

		/* Isolate full tiles from partial tiles. */
		int isolate_full_tiles;

		/* Take advantage of private memory. */
		int use_private_memory;

		/* Take advantage of shared memory. */
		int use_shared_memory;

		/* Maximal amount of shared memory. */
		int max_shared_memory;

		/* The target we generate code for
		 * (one of the PPCG_TARGET_* or AUTOSA_TARGET_* values).
		 */
		int target;

		/* Generate OpenMP macros (C target only). */
		int openmp;

		/* Linearize all device arrays. */
		int linearize_device_arrays;

		/* Allow the use of GNU extensions in generated code. */
		int allow_gnu_extensions;

		/* Allow live range to be reordered. */
		int live_range_reordering;

		/* Allow hybrid tiling whenever a suitable input pattern is found. */
		int hybrid;

		/* Unroll the code for copying to/from shared memory. */
		int unroll_copy_shared;
		/* Unroll code inside tile on GPU targets. */
		int unroll_gpu_tile;

		/* Options to pass to the OpenCL compiler.  */
		char *opencl_compiler_options;
		/* Prefer GPU device over CPU. */
		int opencl_use_gpu;
		/* Number of files to include. */
		int opencl_n_include_file;
		/* Files to include. */
		const char **opencl_include_files;
		/* Print definitions of types in kernels. */
		int opencl_print_kernel_types;
		/* Embed OpenCL kernel code in host code. */
		int opencl_embed_kernel_code;

		/* Name of file for saving isl computed schedule or NULL. */
		char *save_schedule_file;
		/* Name of file for loading schedule or NULL. */
		char *load_schedule_file;
	};

	/* Declarations for the three option structures above.
	 * Each one pairs a structure with the argument table of
	 * the given name (ppcg_debug_options_args, autosa_options_args and
	 * ppcg_options_args).
	 */
	ISL_ARG_DECL(ppcg_debug_options, struct ppcg_debug_options,
				 ppcg_debug_options_args)
	ISL_ARG_DECL(autosa_options, struct autosa_options, autosa_options_args)
	ISL_ARG_DECL(ppcg_options, struct ppcg_options, ppcg_options_args)

#define PPCG_TARGET_C 0
#define PPCG_TARGET_CUDA 1
#define PPCG_TARGET_OPENCL 2
#define AUTOSA_TARGET_XILINX_HLS_C 3
#define AUTOSA_TARGET_INTEL_OPENCL 4
#define AUTOSA_TARGET_T2S 5
#define AUTOSA_TARGET_C 6
#define AUTOSA_TARGET_CATAPULT_HLS_C 7
#define AUTOSA_TARGET_TAPA_CPP 8

#define AUTOSA_SA_TYPE_SYNC 0
#define AUTOSA_SA_TYPE_ASYNC 1

	/* Reset the target-dependent options of "options"
	 * based on the value of options->target.
	 */
	void ppcg_options_set_target_defaults(struct ppcg_options *options);

#ifdef __cplusplus
}
#endif

#endif


================================================
FILE: src/print.c
================================================
/*
 * Copyright 2012-2013 Ecole Normale Superieure
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege,
 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
 */

#include <isl/ctx.h>
#include <isl/id.h>
#include <isl/aff.h>
#include <isl/ast.h>
#include <isl/ast_build.h>
#include <isl/printer.h>

#include "print.h"
#include "util.h"

/* Open a block on "p": print "{" on a line of its own and
 * increase the indentation by two positions.
 */
__isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p)
{
	p = isl_printer_print_str(isl_printer_start_line(p), "{");
	p = isl_printer_end_line(p);

	return isl_printer_indent(p, 2);
}

/* Close a block on "p": decrease the indentation by two positions and
 * print "}" on a line of its own.
 */
__isl_give isl_printer *ppcg_end_block(__isl_take isl_printer *p)
{
	p = isl_printer_start_line(isl_printer_indent(p, -2));
	p = isl_printer_print_str(p, "}");

	return isl_printer_end_line(p);
}

/* Names of notes that keep track of whether min/max
 * macro definitions have already been printed.
 * They are passed by print_max and print_min to print_ppcg_macro,
 * which attaches a note of that name to the printer
 * once the definition has been printed.
 */
static const char *ppcg_max_printed = "ppcg_max_printed";
static const char *ppcg_min_printed = "ppcg_min_printed";

/* Check whether "p" carries a note called "note_name", i.e.,
 * whether the macro definition tracked by that note has
 * already been printed to "p".
 * Return isl_bool_error if "p" is NULL.
 */
static isl_bool printed_before(__isl_keep isl_printer *p, const char *note_name)
{
	isl_id *note_id;
	isl_bool has_note;

	if (p == NULL)
		return isl_bool_error;

	/* A temporary identifier is only needed to perform the lookup. */
	note_id = isl_id_alloc(isl_printer_get_ctx(p), note_name, NULL);
	has_note = isl_printer_has_note(p, note_id);
	isl_id_free(note_id);

	return has_note;
}

/* Record on "p" that the macro definition tracked by "note_name"
 * has been printed, by attaching a note with that name.
 * Only the presence of the note matters.  Since its value still
 * needs to be a valid isl_id, a copy of the note identifier
 * doubles as the value.
 */
static __isl_give isl_printer *mark_printed(__isl_take isl_printer *p,
	const char *note_name)
{
	isl_id *note_id, *note_value;

	if (p == NULL)
		return NULL;

	note_id = isl_id_alloc(isl_printer_get_ctx(p), note_name, NULL);
	note_value = isl_id_copy(note_id);

	return isl_printer_set_note(p, note_id, note_value);
}

/* Print "#define <name><def>" on a line of its own to "p",
 * unless the note "note_name" shows that this has been done before.
 * After printing, attach the "note_name" note to "p" such that
 * the definition is not printed again.
 */
static __isl_give isl_printer *print_ppcg_macro(__isl_take isl_printer *p,
	const char *name, const char *def, const char *note_name)
{
	isl_bool already;

	already = printed_before(p, note_name);
	if (already < 0)
		return isl_printer_free(p);

	if (!already) {
		p = isl_printer_start_line(p);
		p = isl_printer_print_str(p, "#define ");
		p = isl_printer_print_str(p, name);
		p = isl_printer_print_str(p, def);
		p = isl_printer_end_line(p);

		p = mark_printed(p, note_name);
	}

	return p;
}

/* Structure for keeping track of definitions of some macros.
 *
 * "min" and "max" are the texts printed right after the macro name
 * in the "#define" lines for ppcg_min and ppcg_max.
 * The strings are not copied or freed by this file, so they need
 * to remain valid for as long as the structure is in use.
 */
struct ppcg_macros {
	const char *min;
	const char *max;
};

/* isl_id free_user callback that releases the struct ppcg_macros
 * pointed to by "user".
 * The strings referenced by the structure are left untouched.
 */
static void ppcg_macros_free(void *user)
{
	struct ppcg_macros *macros = user;

	free(macros);
}

/* Default macro definitions (when GNU extensions are allowed).
 *
 * These are returned by get_macros when the printer has no note
 * with macro definitions.
 * Each definition consists of the parameter list followed by
 * a GNU statement expression that copies both arguments
 * into local variables, such that each argument is evaluated only once.
 */
struct ppcg_macros ppcg_macros_default = {
	.min = "(x,y)    "
		"({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); "
		"_x < _y ? _x : _y; })",
	.max = "(x,y)    "
		"({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); "
		"_x > _y ? _x : _y; })",
};

/* Name used for the note that keeps track of macro definitions.
 * ppcg_set_macros attaches a note of this name to the printer and
 * get_macros looks it up.
 */
static const char *ppcg_macros = "ppcg_macros";

/* Make "min" and "max" the macro definitions that are printed
 * for isl_ast_op_min and isl_ast_op_max on "p".
 *
 * The two strings are stored (without being copied) in a freshly
 * allocated ppcg_macros object, which is wrapped in an anonymous isl_id
 * and attached to the printer as the "ppcg_macros" note.
 * The object is released by ppcg_macros_free, either when the isl_id
 * is freed or right away if the isl_id could not be created.
 */
__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p,
	const char *min, const char *max)
{
	isl_ctx *ctx;
	isl_id *note_id, *value_id;
	struct ppcg_macros *defs;

	if (p == NULL)
		return NULL;

	ctx = isl_printer_get_ctx(p);
	defs = isl_alloc_type(ctx, struct ppcg_macros);
	if (defs == NULL)
		return isl_printer_free(p);
	defs->min = min;
	defs->max = max;

	note_id = isl_id_alloc(ctx, ppcg_macros, NULL);
	value_id = isl_id_alloc(ctx, NULL, defs);
	if (value_id)
		value_id = isl_id_set_free_user(value_id, &ppcg_macros_free);
	else
		ppcg_macros_free(defs);

	return isl_printer_set_note(p, note_id, value_id);
}

/* Look up the macro definitions that are currently active on "p".
 * These are the ones stored in the "ppcg_macros" note of "p",
 * if there is such a note, and the default definitions otherwise.
 * Return NULL if the presence of the note cannot be determined.
 */
static struct ppcg_macros *get_macros(__isl_keep isl_printer *p)
{
	isl_id *note_id;
	isl_bool has_note;
	struct ppcg_macros *defs;

	note_id = isl_id_alloc(isl_printer_get_ctx(p), ppcg_macros, NULL);
	has_note = isl_printer_has_note(p, note_id);
	if (has_note < 0) {
		isl_id_free(note_id);
		return NULL;
	}
	if (!has_note) {
		isl_id_free(note_id);
		return &ppcg_macros_default;
	}

	/* The lookup consumes the name and returns the note itself. */
	note_id = isl_printer_get_note(p, note_id);
	defs = isl_id_get_user(note_id);
	isl_id_free(note_id);

	return defs;
}

/* Print the definition of the ppcg_max macro that is
 * currently active on "p", unless it has been printed before.
 */
static __isl_give isl_printer *print_max(__isl_take isl_printer *p)
{
	struct ppcg_macros *defs = get_macros(p);

	if (defs == NULL)
		return isl_printer_free(p);

	return print_ppcg_macro(p, ppcg_max, defs->max, ppcg_max_printed);
}

/* Print the definition of the ppcg_min macro that is
 * currently active on "p", unless it has been printed before.
 */
static __isl_give isl_printer *print_min(__isl_take isl_printer *p)
{
	struct ppcg_macros *defs = get_macros(p);

	if (defs == NULL)
		return isl_printer_free(p);

	return print_ppcg_macro(p, ppcg_min, defs->min, ppcg_min_printed);
}

/* Print a macro definition for "type" to "p".
 *
 * The specialized ppcg definitions are only used for
 * isl_ast_op_max and isl_ast_op_min, and only if the ppcg options
 * are available and allow GNU extensions.
 * In all other cases, the default isl definition is printed.
 */
__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type,
	__isl_take isl_printer *p)
{
	struct ppcg_options *options;

	if (p == NULL)
		return NULL;

	options = isl_ctx_peek_options(isl_printer_get_ctx(p),
					&ppcg_options_args);
	if (options && options->allow_gnu_extensions) {
		if (type == isl_ast_op_max)
			return print_max(p);
		if (type == isl_ast_op_min)
			return print_min(p);
	}

	return isl_ast_op_type_print_macro(type, p);
}

/* Callback for isl_ast_expr_foreach_ast_op_type and
 * isl_ast_node_foreach_ast_op_type.
 * "user" points to the printer to which the macro definition
 * for "type" is printed; the printer is updated in place.
 */
static isl_stat print_macro(enum isl_ast_op_type type, void *user)
{
	isl_printer **printer = user;

	*printer = ppcg_print_macro(type, *printer);

	return *printer ? isl_stat_ok : isl_stat_error;
}

/* Print to "p" the definitions of the macros required by "expr".
 * On failure, "p" is freed and NULL is returned.
 */
__isl_give isl_printer *ppcg_ast_expr_print_macros(
	__isl_keep isl_ast_expr *expr, __isl_take isl_printer *p)
{
	isl_stat res;

	res = isl_ast_expr_foreach_ast_op_type(expr, &print_macro, &p);
	if (res < 0)
		return isl_printer_free(p);

	return p;
}

/* Callback for isl_id_to_ast_expr_foreach.
 * Print the macro definitions required by "val" to the printer
 * that "user" points to and update that printer in place.
 * Both "key" and "val" are released before returning.
 */
static isl_stat print_expr_macros(__isl_take isl_id *key,
	__isl_take isl_ast_expr *val, void *user)
{
	isl_printer **printer = user;

	*printer = ppcg_ast_expr_print_macros(val, *printer);
	isl_id_free(key);
	isl_ast_expr_free(val);

	return *printer ? isl_stat_ok : isl_stat_error;
}

/* Print to "p" the macro definitions needed by the body of a statement
 * whose access expressions are replaced by the isl_ast_expr objects
 * stored in "ref2expr".
 * On failure, "p" is freed and NULL is returned.
 */
__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p,
	__isl_keep isl_id_to_ast_expr *ref2expr)
{
	isl_stat res;

	res = isl_id_to_ast_expr_foreach(ref2expr, &print_expr_macros, &p);
	if (res < 0)
		return isl_printer_free(p);

	return p;
}

/* Print to "p" the definitions of the macros required by "node".
 * On failure, "p" is freed and NULL is returned.
 */
__isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p,
	__isl_keep isl_ast_node *node)
{
	isl_stat res;

	res = isl_ast_node_foreach_ast_op_type(node, &print_macro, &p);
	if (res < 0)
		return isl_printer_free(p);

	return p;
}

/* Names used for the macros that may appear in a printed isl AST.
 * ppcg_set_macro_names installs them on a printer as the print names of
 * isl_ast_op_min, isl_ast_op_max and isl_ast_op_fdiv_q.
 */
const char *ppcg_min = "ppcg_min";
const char *ppcg_max = "ppcg_max";
const char *ppcg_fdiv_q = "ppcg_fdiv_q";

/* Instruct "p" to print isl_ast_op_min, isl_ast_op_max and
 * isl_ast_op_fdiv_q operations using the ppcg specific macro names
 * ppcg_min, ppcg_max and ppcg_fdiv_q.
 */
__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p)
{
	enum isl_ast_op_type types[] = {
		isl_ast_op_min, isl_ast_op_max, isl_ast_op_fdiv_q
	};
	const char *names[] = { ppcg_min, ppcg_max, ppcg_fdiv_q };
	int i;

	for (i = 0; i < 3; ++i)
		p = isl_ast_op_type_set_print_name(p, types[i], names[i]);

	return p;
}

/* Turn the domain-less multi affine expression "mpa" into one that
 * has the schedule space of "build" as domain.
 *
 * Nothing needs to change if that schedule space is a parameter space.
 * Otherwise, give "mpa" a 0D domain and pull it back over the zero
 * mapping from the schedule space of "build" to that 0D domain.
 */
__isl_give isl_multi_pw_aff *ppcg_attach_multi_pw_aff(
	__isl_take isl_multi_pw_aff *mpa, __isl_keep isl_ast_build *build)
{
	isl_bool is_params;
	isl_space *space;
	isl_multi_aff *zero;

	space = isl_ast_build_get_schedule_space(build);
	is_params = isl_space_is_params(space);
	if (is_params < 0) {
		isl_space_free(space);
		return isl_multi_pw_aff_free(mpa);
	}
	if (is_params) {
		isl_space_free(space);
		return mpa;
	}

	zero = isl_multi_aff_zero(isl_space_from_domain(space));
	mpa = isl_multi_pw_aff_from_range(mpa);

	return isl_multi_pw_aff_pullback_multi_aff(mpa, zero);
}

/* Construct an access AST expression for "size" using "build".
 * Since "size" has no domain while "build" may have a proper
 * schedule space, "size" is first attached to that schedule space.
 */
__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff *size,
	__isl_keep isl_ast_build *build)
{
	isl_multi_pw_aff *attached;

	attached = ppcg_attach_multi_pw_aff(size, build);

	return isl_ast_build_access_from_multi_pw_aff(build, attached);
}

/* Print to "p" a declaration of the form "<base_type> <size>;"
 * on a line of its own, preceded by the definitions of
 * the macros that "size" requires.
 * If "base_type" or "size" is NULL, then "p" is freed and
 * NULL is returned.
 */
__isl_give isl_printer *ppcg_print_declaration_with_size(
	__isl_take isl_printer *p, const char *base_type,
	__isl_keep isl_ast_expr *size)
{
	if (base_type == NULL || size == NULL)
		return isl_printer_free(p);

	p = ppcg_ast_expr_print_macros(size, p);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, base_type);
	p = isl_printer_print_str(p, " ");
	p = isl_printer_print_ast_expr(p, size);
	p = isl_printer_print_str(p, ";");

	return isl_printer_end_line(p);
}

/* Print a declaration for "array" to "p".
 *
 * The size is derived from the extent of the array and turned
 * into an "access expression" by "build", which may simplify it.
 * The declaration uses the element type of the array as base type.
 * If "array" is NULL, then "p" is freed and NULL is returned.
 */
__isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p,
	struct pet_array *array, __isl_keep isl_ast_build *build)
{
	isl_ast_expr *size_expr;

	if (array == NULL)
		return isl_printer_free(p);

	size_expr = isl_ast_build_access_from_multi_pw_aff(build,
			ppcg_size_from_extent(isl_set_copy(array->extent)));
	p = ppcg_print_declaration_with_size(p, array->element_type,
						size_expr);
	isl_ast_expr_free(size_expr);

	return p;
}

/* Print to "p" a declaration for each array of "scop" that
 * is declared and whose "exposed" flag is equal to "exposed"
 * (1 for exposed arrays, 0 for arrays that are not exposed).
 * The size expressions are built in the context of "scop".
 * If "scop" is NULL, then "p" is freed and NULL is returned.
 */
static __isl_give isl_printer *print_declarations(__isl_take isl_printer *p,
	struct ppcg_scop *scop, int exposed)
{
	int pos;
	isl_ast_build *build;

	if (scop == NULL)
		return isl_printer_free(p);

	build = isl_ast_build_from_context(isl_set_copy(scop->context));
	for (pos = 0; pos < scop->pet->n_array; ++pos) {
		struct pet_array *array = scop->pet->arrays[pos];

		if (array->declared && array->exposed == exposed)
			p = ppcg_print_declaration(p, array, build);
	}
	isl_ast_build_free(build);

	return p;
}

/* Print to "p" declarations for those arrays of "scop" that are
 * declared and that are exposed to the code after the scop.
 */
__isl_give isl_printer *ppcg_print_exposed_declarations(
	__isl_take isl_printer *p, struct ppcg_scop *scop)
{
	const int exposed = 1;

	return print_declarations(p, scop, exposed);
}

/* Print to "p" declarations for those arrays of "scop" that are
 * declared, but that are not exposed to the code after the scop.
 */
__isl_give isl_printer *ppcg_print_hidden_declarations(
	__isl_take isl_printer *p, struct ppcg_scop *scop)
{
	const int exposed = 0;

	return print_declarations(p, scop, exposed);
}


================================================
FILE: src/print.h
================================================
#ifndef PRINT_H
#define PRINT_H

#include <isl/ast.h>

#include "ppcg.h"

#ifdef __cplusplus
extern "C"
{
#endif

	/* Names of the macros that may appear in a printed isl AST. */
	extern const char *ppcg_min;
	extern const char *ppcg_max;
	extern const char *ppcg_fdiv_q;

	/* Print "{" and increase the indentation, or
	 * decrease the indentation and print "}".
	 */
	__isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p);
	__isl_give isl_printer *ppcg_end_block(__isl_take isl_printer *p);

	/* Selection and printing of the macros used in a printed isl AST. */
	__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p);
	__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p,
		const char *min, const char *max);
	__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type,
		__isl_take isl_printer *p);
	__isl_give isl_printer *ppcg_ast_expr_print_macros(
		__isl_keep isl_ast_expr *expr, __isl_take isl_printer *p);
	__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p,
		__isl_keep isl_id_to_ast_expr *ref2expr);
	__isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p,
		__isl_keep isl_ast_node *node);

	/* Build an access AST expression from a size without domain. */
	__isl_give isl_ast_expr *ppcg_build_size_expr(
		__isl_take isl_multi_pw_aff *size,
		__isl_keep isl_ast_build *build);

	/* Printing of array declarations. */
	__isl_give isl_printer *ppcg_print_declaration_with_size(
		__isl_take isl_printer *p, const char *base_type,
		__isl_keep isl_ast_expr *size);
	__isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p,
		struct pet_array *array, __isl_keep isl_ast_build *build);
	__isl_give isl_printer *ppcg_print_exposed_declarations(
		__isl_take isl_printer *p, struct ppcg_scop *scop);
	__isl_give isl_printer *ppcg_print_hidden_declarations(
		__isl_take isl_printer *p, struct ppcg_scop *scop);

#ifdef __cplusplus
}
#endif

#endif


================================================
FILE: src/schedule.c
================================================
/*
 * Copyright 2010-2011 INRIA Saclay
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
 * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
 * 91893 Orsay, France
 */

#include <ctype.h>
#include <stdio.h>
#include <string.h>

#include <isl/set.h>
#include <isl/map.h>
#include <isl/constraint.h>

#include "grouping.h"
#include "schedule.h"

/* Extend "set" with one new parameter for each element of "ids".
 * The new parameters are appended after the existing ones and
 * are given the identifiers in "ids", in the same order.
 */
static __isl_give isl_set *add_params(__isl_take isl_set *set,
	__isl_keep isl_id_list *ids)
{
	int pos, n_id;
	unsigned first;

	n_id = isl_id_list_n_id(ids);

	/* Position of the first newly added parameter. */
	first = isl_set_dim(set, isl_dim_param);
	set = isl_set_add_dims(set, isl_dim_param, n_id);

	for (pos = 0; pos < n_id; ++pos)
		set = isl_set_set_dim_id(set, isl_dim_param, first + pos,
					isl_id_list_get_id(ids, pos));

	return set;
}

/* Introduce fresh parameters with identifiers "ids" in "set" and
 * equate them to the set dimensions starting at position "first".
 * As many dimensions are equated as there are elements in "ids".
 */
static __isl_give isl_set *parametrize(__isl_take isl_set *set,
	int first, __isl_keep isl_id_list *ids)
{
	int pos, n_id;
	unsigned first_param;

	/* The fresh parameters are added after the existing ones. */
	first_param = isl_set_dim(set, isl_dim_param);
	set = add_params(set, ids);

	n_id = isl_id_list_n_id(ids);
	for (pos = 0; pos < n_id; ++pos) {
		set = isl_set_equate(set, isl_dim_param, first_param + pos,
					isl_dim_set, first + pos);
	}

	return set;
}

/* Starting from the parameter space "space", construct a universe set
 * with "len" set dimensions and then equate the dimensions starting
 * at "first" to freshly created parameters with identifiers "ids".
 */
__isl_give isl_set *parametrization(__isl_take isl_space *space,
	int len, int first, __isl_keep isl_id_list *ids)
{
	isl_space *set_space;

	set_space = isl_space_set_from_params(space);
	set_space = isl_space_add_dims(set_space, isl_dim_set, len);

	return parametrize(isl_set_universe(set_space), first, ids);
}

/* Read a schedule from the file called "filename" and return it.
 * If the file cannot be opened, then a message is printed to stderr and
 * NULL is returned.
 */
static __isl_give isl_schedule *load_schedule(isl_ctx *ctx,
	const char *filename)
{
	isl_schedule *schedule = NULL;
	FILE *input = fopen(filename, "r");

	if (input) {
		schedule = isl_schedule_read_from_file(ctx, input);
		fclose(input);
	} else {
		fprintf(stderr, "Unable to open '%s' for reading\n", filename);
	}

	return schedule;
}

/* Write "schedule" to the file called "filename", using the block
 * YAML style.
 * Nothing is written if "schedule" is NULL.  If the file cannot be
 * opened, then a message is printed to stderr instead.
 */
static void save_schedule(__isl_keep isl_schedule *schedule,
	const char *filename)
{
	FILE *output;

	if (!schedule)
		return;

	output = fopen(filename, "w");
	if (output) {
		isl_printer *printer;

		printer = isl_printer_to_file(isl_schedule_get_ctx(schedule),
						output);
		printer = isl_printer_set_yaml_style(printer,
						ISL_YAML_STYLE_BLOCK);
		printer = isl_printer_print_schedule(printer, schedule);
		isl_printer_free(printer);
		fclose(output);
	} else {
		fprintf(stderr, "Unable to open '%s' for writing\n", filename);
	}
}

/* Compute a schedule that respects the schedule constraints "sc",
 * on the domain of "sc", without attempting to combine groups
 * of statements.
 * The constraints are dumped first if the user asked for that through
 * options->debug->dump_schedule_constraints.
 */
__isl_give isl_schedule *ppcg_compute_non_grouping_schedule(
	__isl_take isl_schedule_constraints *sc, struct ppcg_options *options)
{
	int dump = options->debug->dump_schedule_constraints;

	if (dump) {
		isl_schedule_constraints_dump(sc);
	}

	return isl_schedule_constraints_compute_schedule(sc);
}

/* Compute a schedule that respects the schedule constraints "sc",
 * on the domain of "sc".
 *
 * If options->group_chains is set, then the known correct schedule
 * "schedule" is used to combine groups of statements.
 * Otherwise, a schedule is computed without any grouping.
 */
__isl_give isl_schedule *ppcg_compute_schedule(
	__isl_take isl_schedule_constraints *sc,
	__isl_keep isl_schedule *schedule, struct ppcg_options *options)
{
	if (!options->group_chains)
		return ppcg_compute_non_grouping_schedule(sc, options);

	return ppcg_compute_grouping_schedule(sc, schedule, options);
}

/* Obtain a schedule, either by reading it from the file
 * options->load_schedule_file or, if no such file was specified,
 * by calling "compute" on "user".
 * A computed schedule is saved to options->save_schedule_file
 * if that option is set.  The obtained schedule is dumped
 * if options->debug->dump_schedule is set.
 */
__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx,
	struct ppcg_options *options,
	__isl_give isl_schedule *(*compute)(void *user), void *user)
{
	isl_schedule *result;

	if (!options->load_schedule_file) {
		result = compute(user);
		if (options->save_schedule_file)
			save_schedule(result, options->save_schedule_file);
	} else {
		result = load_schedule(ctx, options->load_schedule_file);
	}

	if (options->debug->dump_schedule)
		isl_schedule_dump(result);

	return result;
}

/* Set the AST loop type of every member of the band node "node"
 * to "type" and return the updated node.
 */
__isl_give isl_schedule_node *ppcg_set_schedule_node_type(
	__isl_take isl_schedule_node *node, enum isl_ast_loop_type type)
{
	int pos = 0;
	int n_member = isl_schedule_node_band_n_member(node);

	while (pos < n_member) {
		node = isl_schedule_node_band_member_set_ast_loop_type(node,
							pos, type);
		++pos;
	}

	return node;
}


================================================
FILE: src/schedule.h
================================================
#ifndef _SCHEDULE_H
#define _SCHEDULE_H

#include <isl/id.h>
#include <isl/space.h>
#include <isl/schedule.h>
#include <isl/schedule_node.h>

#include "ppcg_options.h"

#ifdef __cplusplus
extern "C"
{
#endif

	/* Given a parameter space "space", create a set of dimension "len"
	 * of which the dimensions starting at "first" are equated to
	 * freshly created parameters with the identifiers in "names".
	 */
	__isl_give isl_set *parametrization(__isl_take isl_space *space,
																			int len, int first, __isl_keep isl_id_list *names);

	/* Compute a schedule on the domain of "sc" that respects the schedule
	 * constraints in "sc", without trying to combine groups of statements.
	 */
	__isl_give isl_schedule *ppcg_compute_non_grouping_schedule(
			__isl_take isl_schedule_constraints *sc, struct ppcg_options *options);
	/* Compute a schedule on the domain of "sc" that respects the schedule
	 * constraints in "sc".  "schedule" is a known correct schedule that is
	 * used to combine groups of statements if options->group_chains is set.
	 */
	__isl_give isl_schedule *ppcg_compute_schedule(
			__isl_take isl_schedule_constraints *sc,
			__isl_keep isl_schedule *schedule, struct ppcg_options *options);

	/* Obtain a schedule, either by reading it from
	 * options->load_schedule_file or by calling "compute" on "user".
	 * A computed schedule is saved and/or the obtained schedule is dumped
	 * if requested through "options".
	 */
	__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx,
																						 struct ppcg_options *options,
																						 __isl_give isl_schedule *(*compute)(void *user), void *user);

	/* Mark all members of the band node "node" to be of AST loop type "type".
	 */
	__isl_give isl_schedule_node *ppcg_set_schedule_node_type(
			__isl_take isl_schedule_node *node, enum isl_ast_loop_type type);

#ifdef __cplusplus
}
#endif

#endif


================================================
FILE: src/tests/call.c
================================================
#include <stdlib.h>

/* Summary function referenced by the pencil_access attribute on copy().
 * It writes b[pos] and reads a[pos]; the local "c" is never used and
 * only serves to express the read.
 * NOTE(review): presumably only the accesses matter here, not the
 * value 0 that is stored - confirm against the pencil_access semantics.
 */
void copy_summary(int b[1000], int a[1000], int pos)
{
	b[pos] = 0;
	int c = a[pos];
}

/* Declaration of copy(); no definition appears in this file.
 * If the macro pencil_access is defined, then the declaration carries
 * the pencil_access attribute naming copy_summary.
 */
#ifdef pencil_access
__attribute__((pencil_access(copy_summary)))
#endif
void copy(int b[1000], int a[1000], int pos);

/* Initialize a[i] to i, call copy(b, a, i) for every i inside the scop
 * and then check that "b" is equal to "a".
 * Return EXIT_FAILURE on the first mismatch and EXIT_SUCCESS otherwise.
 */
int main()
{
	int a[1000], b[1000];

	for (int i = 0; i < 1000; ++i)
		a[i] = i;
#pragma scop
	for (int i = 0; i < 1000; ++i)
		copy(b, a, i);
#pragma endscop
	for (int i = 0; i < 1000; ++i)
		if (b[i] != a[i])
			return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/call2.c
================================================
#include <stdlib.h>

/* Summary function referenced by the pencil_access attribute on copy().
 * It writes b[pos] and reads a[pos]; the local "c" is never used and
 * only serves to express the read.
 */
void copy_summary(int b[1000], int a[1000], int pos)
{
	b[pos] = 0;
	int c = a[pos];
}

/* Declaration of copy(); no definition appears in this file.
 * If the macro pencil_access is defined, then the declaration carries
 * the pencil_access attribute naming copy_summary.
 */
#ifdef pencil_access
__attribute__((pencil_access(copy_summary)))
#endif
void copy(int b[1000], int a[1000], int pos);

/* Same check as a plain element-wise copy, except that source and
 * destination are two rows of the same two-dimensional array:
 * row a[0] is initialized, copy(a[1], a[0], i) is called for every i
 * inside the scop and row a[1] is then compared against row a[0].
 */
int main()
{
	int a[2][1000];

	for (int i = 0; i < 1000; ++i)
		a[0][i] = i;
#pragma scop
	for (int i = 0; i < 1000; ++i)
		copy(a[1], a[0], i);
#pragma endscop
	for (int i = 0; i < 1000; ++i)
		if (a[1][i] != a[0][i])
			return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/call2_opencl_functions.cl
================================================
/* Copy the element at position "pos" of "a" to the same position of "b".
 */
void copy(__global int b[1000], __global int a[1000], int pos)
{
	b[pos] = a[pos];
}


================================================
FILE: src/tests/call3.c
================================================
#include <stdlib.h>

/* Summary function referenced by the pencil_access attribute on copy().
 * It writes all 100 elements of "b" and reads all 100 elements of "a";
 * the local "c" is never used and only serves to express the read.
 */
void copy_summary(int b[100], int a[100])
{
	for (int i = 0; i < 100; ++i) {
		b[i] = 0;
		int c = a[i];
	}
}

/* Declaration of copy(); no definition appears in this file.
 * If the macro pencil_access is defined, then the declaration carries
 * the pencil_access attribute naming copy_summary.
 */
#ifdef pencil_access
__attribute__((pencil_access(copy_summary)))
#endif
void copy(int b[100], int a[100]);

/* Initialize B[i] to i, call copy(A[i], B) for every row of "A"
 * inside the scop and then check that every row of "A" is equal to "B",
 * i.e., that A[j][i] is equal to B[i] for all j and i.
 */
int main()
{
	int A[100][100], B[100];

	for (int i = 0; i < 100; ++i)
		B[i] = i;
#pragma scop
	for (int i = 0; i < 100; ++i)
		copy(A[i], B);
#pragma endscop
	for (int i = 0; i < 100; ++i)
		for (int j = 0; j < 100; ++j)
			if (A[j][i] != B[i])
				return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/call3_opencl_functions.cl
================================================
/* Copy the 100 elements of "a" to "b".
 */
void copy(__global int b[100], __global int a[100])
{
	for (int i = 0; i < 100; ++i)
		b[i] = a[i];
}


================================================
FILE: src/tests/call4.c
================================================
#include <stdlib.h>

/* Return the element of "a" at position "pos".
 *
 * The function has internal linkage ("static inline").  A plain "inline"
 * definition does not provide an external definition in C99 and later,
 * such that a call that is not actually inlined by the compiler
 * (e.g., when compiling without optimization) fails to link.
 */
static inline int get(int a[1000], int pos)
{
	int tmp = a[pos];
	return tmp;
}

/* Initialize a[i] to i, compute b[i] as the sum of two calls to get()
 * on consecutive positions of "a" inside the scop and then check that
 * b[i] is equal to a[i] + a[i + 1] for the first 999 positions.
 */
int main()
{
	int a[1000], b[1000];

	for (int i = 0; i < 1000; ++i)
		a[i] = i;
#pragma scop
	for (int i = 0; i < 999; ++i)
		b[i] = get(a, i) + get(a, i + 1);
#pragma endscop
	for (int i = 0; i < 999; ++i)
		if (b[i] != a[i] + a[i + 1])
			return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/call5.c
================================================
#include <stdlib.h>

/* Return "i" incremented by one.
 *
 * The function has internal linkage ("static inline").  A plain "inline"
 * definition does not provide an external definition in C99 and later,
 * such that a call that is not actually inlined by the compiler
 * (e.g., when compiling without optimization) fails to link.
 */
static inline int add_one(int i)
{
	return i + 1;
}

/* Initialize a[i] to i, compute b[i] through two nested calls
 * to add_one() on a[i] inside the scop and then check that
 * b[i] is equal to a[i] + 2 for the first 999 positions.
 */
int main()
{
	int a[1000], b[1000];

	for (int i = 0; i < 1000; ++i)
		a[i] = i;
#pragma scop
	for (int i = 0; i < 999; ++i)
		b[i] = add_one(add_one(a[i]));
#pragma endscop
	for (int i = 0; i < 999; ++i)
		if (b[i] != a[i] + 2)
			return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/call_opencl_functions.cl
================================================
/* Copy the element at position "pos" of "a" to the same position of "b".
 */
void copy(__global int b[1000], __global int a[1000], int pos)
{
	b[pos] = a[pos];
}


================================================
FILE: src/tests/dead.c
================================================
#include <stdlib.h>

/* Copy "a" to "b" inside the scop through the scalar "c".
 * The scalar "d" is assigned the value of "c", but is never read
 * afterwards.  The result in "b" is checked against "a".
 */
int main()
{
	int a[1000], b[1000];

	for (int i = 0; i < 1000; ++i)
		a[i] = i;
#pragma scop
	for (int i = 0; i < 1000; ++i) {
		int c;
		int d;
		c = a[i];
		d = c;
		b[i] = c;
	}
#pragma endscop
	for (int i = 0; i < 1000; ++i)
		if (b[i] != a[i])
			return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/iterator.c
================================================
#include <stdlib.h>

/* Use an iterator "i" that is declared outside of the scop and
 * access it again after the loop, where it has the value 100.
 * The statement following the loop therefore writes a[100],
 * which is the only element that is checked.
 */
int main()
{
	int i;
	int a[101];

	i = 0;
#pragma scop
	for (i = 0; i < 100; ++i)
		a[i] = i;
	a[i] = i;
#pragma endscop
	if (a[100] != 100)
		return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/live_out.c
================================================
#include <stdlib.h>

/* Check that a write access is not removed from the live-out
 * accesses only because a strict subset of the (potentially)
 * accessed elements are killed by a later write.
 *
 * Inside the scop, the index "i" is computed as 1 * 1, such that
 * the write A[i] = 1 targets A[1], while the later write only
 * targets A[0].  The test succeeds if A[1] is equal to 1 afterwards.
 */
int main()
{
	int A[10];

	A[1] = 0;
#pragma scop
	int i = 1;
	i = i * i;
	A[i] = 1;
	A[0] = 0;
#pragma endscop
	if (A[1] != 1)
		return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/local.c
================================================
#include <stdlib.h>

/* Use an array "B" that is declared inside the scop.
 * B[i] is set to i through a recurrence on the previous element and
 * "B" is then copied to "A", which is declared outside of the scop.
 * The test succeeds if A[i] is equal to i for all i.
 */
int main()
{
	int A[100];

#pragma scop
	{
		int B[100];
		B[0] = 0;
		for (int i = 1; i < 100; ++i)
			B[i] = B[i - 1] + 1;
		for (int i = 0; i < 100; ++i)
			A[i] = B[i];
	}
#pragma endscop
	for (int i = 0; i < 100; ++i)
		if (A[i] != i)
			return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/loop.c
================================================
#include <stdlib.h>

/* Initialize a[i] to i, copy "a" to "b" element by element
 * inside the scop and then check that "b" is equal to "a".
 */
int main()
{
	int a[1000], b[1000];

	for (int i = 0; i < 1000; ++i)
		a[i] = i;
#pragma scop
	for (int i = 0; i < 1000; ++i)
		b[i] = a[i];
#pragma endscop
	for (int i = 0; i < 1000; ++i)
		if (b[i] != a[i])
			return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/not_accessed.c
================================================
#include <stdlib.h>

/* Summary function referenced by the pencil_access attribute on copy().
 * It writes b[pos] and reads a[pos]; the local "d" is never used and
 * only serves to express the read.  The argument "c" is not accessed.
 */
void copy_summary(int b[1000], int a[1000], int pos, int c[1000])
{
	b[pos] = 0;
	int d = a[pos];
}

/* Declaration of copy(); no definition appears in this file.
 * If the macro pencil_access is defined, then the declaration carries
 * the pencil_access attribute naming copy_summary.
 */
#ifdef pencil_access
__attribute__((pencil_access(copy_summary)))
#endif
void copy(int b[1000], int a[1000], int pos, int c[1000]);

/* Initialize a[i] to i, call copy(b, a, i, c) for every i inside
 * the scop and then check that "b" is equal to "a".
 * The array "c" is passed to copy(), but it is never initialized
 * or checked by this function.
 */
int main()
{
	int a[1000], b[1000], c[1000];

	for (int i = 0; i < 1000; ++i)
		a[i] = i;
#pragma scop
	for (int i = 0; i < 1000; ++i)
		copy(b, a, i, c);
#pragma endscop
	for (int i = 0; i < 1000; ++i)
		if (b[i] != a[i])
			return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/not_accessed_opencl_functions.cl
================================================
/* Copy the element at position "pos" of "a" to the same position of "b".
 * The argument "c" is not accessed.
 */
void copy(__global int b[1000], __global int a[1000], int pos,
	__global int c[1000])
{
	b[pos] = a[pos];
}


================================================
FILE: src/tests/scalar.c
================================================
#include <stdlib.h>

/* Assign a value to the scalar "a" inside a scop that contains
 * nothing else and check that the value is visible afterwards.
 */
int main()
{
	int a;
#pragma scop
	a = 1;
#pragma endscop
	if (a != 1)
		return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/shared_sink.c
================================================
#include <stdlib.h>

/* Check that the sources of live ranges with the same sink
 * are executed in order.
 *
 * The scalar "set" is written by two statements (the initialization
 * to 0 and the conditional assignment of 1) and read by a single one.
 * Only A[0] is initialized before the scop and only A[0] is checked
 * afterwards.
 * NOTE(review): A[1] to A[127] are read inside the loop without having
 * been initialized; their values do not affect the final check.
 */
int main()
{
	int A[128];
	int n = 128;

	A[0] = 0;
#pragma scop
	for (int i = 0; i < n; ++i) {
		int set = 0;
		if (A[i] < 2)
			set = 1;
		if (set)
			A[i] = 2;
	}
#pragma endscop
	if (A[0] != 2)
		return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/struct.c
================================================
#include <stdlib.h>

/* Structure with a single two-dimensional array member,
 * used as the element type of the arrays in main().
 */
struct s {
	int c[10][10];
};

/* Fill the array member of every element of the two-dimensional
 * array of structures "a" with i + j + k + l outside of the scop,
 * fill "b" with the same values inside the scop and
 * check that "b" is equal to "a".
 */
int main()
{
	struct s a[10][10], b[10][10];

	for (int i = 0; i < 10; ++i)
		for (int j = 0; j < 10; ++j)
			for (int k = 0; k < 10; ++k)
				for (int l = 0; l < 10; ++l)
					a[i][j].c[k][l] = i + j + k + l;
#pragma scop
	for (int i = 0; i < 10; ++i)
		for (int j = 0; j < 10; ++j)
			for (int k = 0; k < 10; ++k)
				for (int l = 0; l < 10; ++l)
					b[i][j].c[k][l] = i + j + k + l;
#pragma endscop
	for (int i = 0; i < 10; ++i)
		for (int j = 0; j < 10; ++j)
			for (int k = 0; k < 10; ++k)
				for (int l = 0; l < 10; ++l)
					if (b[i][j].c[k][l] != a[i][j].c[k][l])
						return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/struct2.c
================================================
#include <stdlib.h>

/* Structure with a single scalar member.
 */
struct s {
	int a;
};

/* Write the member of the scalar structure "a" inside the scop,
 * copy that member to the member of each element of "b" and
 * check that all of them are equal to 42.
 */
int main()
{
	struct s a, b[10];

#pragma scop
	a.a = 42;
	for (int i = 0; i < 10; ++i)
		b[i].a = a.a;
#pragma endscop
	for (int i = 0; i < 10; ++i)
		if (b[i].a != 42)
			return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/struct3.c
================================================
#include <stdlib.h>

/* Structure with two scalar members.
 */
struct s {
	int a;
	int b;
};

/* Assign the entire structure "a" to each element of "b" inside the scop.
 * The member a.b is written before the scop and the member a.a
 * inside the scop.  Afterwards, check that the "a" member of
 * each element of "b" is 42 and that a.b still has
 * the value 57 assigned before the scop.
 */
int main()
{
	struct s a, b[10];

	a.b = 57;
#pragma scop
	a.a = 42;
	for (int i = 0; i < 10; ++i)
		b[i] = a;
#pragma endscop
	for (int i = 0; i < 10; ++i)
		if (b[i].a != 42)
			return EXIT_FAILURE;
	if (a.b != 57)
		return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/struct4.c
================================================
#include <stdlib.h>

/* Structure with two scalar members.
 */
struct s {
	int a;
	int b;
};

/* Use a structure "b" that is declared inside the loop body in the scop.
 * Its members are set to 1 and i and their sum is stored in a[i].
 * The test succeeds if a[i] is equal to 1 + i for all i.
 */
int main()
{
	int a[10];

	for (int i = 0; i < 10; ++i)
		a[i] = 0;
#pragma scop
	for (int i = 0; i < 10; ++i) {
		struct s b;
		b.a = 1;
		b.b = i;
		a[i] = b.a + b.b;
	}
#pragma endscop
	for (int i = 0; i < 10; ++i)
		if (a[i] != 1 + i)
			return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/tests/struct5.c
================================================
#include <stdlib.h>

/* Structure with two scalar members.
 */
struct s {
	int a;
	int b;
};

/* Use a single-element array of structures "b" that is declared
 * inside the loop body in the scop.
 * The members of b[0] are set to 1 and i and their sum is stored in a[i].
 * The test succeeds if a[i] is equal to 1 + i for all i.
 */
int main()
{
	int a[10];

	for (int i = 0; i < 10; ++i)
		a[i] = 0;
#pragma scop
	for (int i = 0; i < 10; ++i) {
		struct s b[1];
		b[0].a = 1;
		b[0].b = i;
		a[i] = b[0].a + b[0].b;
	}
#pragma endscop
	for (int i = 0; i < 10; ++i)
		if (a[i] != 1 + i)
			return EXIT_FAILURE;

	return EXIT_SUCCESS;
}


================================================
FILE: src/util.c
================================================
/*
 * Copyright 2012-2013 Ecole Normale Superieure
 *
 * Use of this software is governed by the MIT license
 *
 * Written by Sven Verdoolaege,
 * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
 */

#include <isl/space.h>
#include <isl/val.h>
#include <isl/aff.h>
#include <isl/set.h>

#include "util.h"

/* Return an isl_multi_val in "space" of which every element
 * is equal to "val".
 * NULL is returned if "space" is NULL.
 */
__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space,
	int val)
{
	int pos, n_dim;
	isl_ctx *ctx;
	isl_val *fill;
	isl_multi_val *res;

	if (!space)
		return NULL;

	/* Extract everything needed from "space" before it is consumed. */
	ctx = isl_space_get_ctx(space);
	n_dim = isl_space_dim(space, isl_dim_set);
	res = isl_multi_val_zero(space);

	fill = isl_val_int_from_si(ctx, val);
	for (pos = 0; pos < n_dim; ++pos) {
		res = isl_multi_val_set_val(res, pos, isl_val_copy(fill));
	}
	isl_val_free(fill);

	return res;
}

/* Return an isl_multi_val in "space" of which the elements are taken
 * from "list", in order.
 * "list" is assumed to have at least as many entries
 * as the set dimension of "space".
 * NULL is returned if "space" is NULL.
 */
__isl_give isl_multi_val *ppcg_multi_val_from_int_list(
	__isl_take isl_space *space, int *list)
{
	int pos, n_dim;
	isl_ctx *ctx;
	isl_multi_val *res;

	if (!space)
		return NULL;

	/* Extract everything needed from "space" before it is consumed. */
	ctx = isl_space_get_ctx(space);
	n_dim = isl_space_dim(space, isl_dim_set);
	res = isl_multi_val_zero(space);

	for (pos = 0; pos < n_dim; ++pos)
		res = isl_multi_val_set_val(res, pos,
					isl_val_int_from_si(ctx, list[pos]));

	return res;
}

/* Compute the size of a bounding box around the origin and "set",
 * where "set" is assumed to contain only non-negative elements.
 * That is, for each set dimension, the size is the maximal value
 * attained by "set" in that dimension plus one.
 *
 * If some dimension does not have an upper bound, then a message
 * is printed to stderr and "set" is freed (and replaced by NULL)
 * before the computation continues.
 * NOTE(review): the remaining isl calls are presumably expected to
 * propagate this NULL such that NULL is returned - confirm against
 * the isl error handling conventions.
 */
__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set)
{
	int dim, n_dim;
	isl_multi_pw_aff *size;

	n_dim = isl_set_dim(set, isl_dim_set);
	size = isl_multi_pw_aff_zero(isl_set_get_space(set));
	for (dim = 0; dim < n_dim; ++dim) {
		isl_space *domain;
		isl_aff *cst;
		isl_pw_aff *max;

		if (!isl_set_dim_has_upper_bound(set, isl_dim_set, dim)) {
			const char *name = isl_set_get_tuple_name(set);

			fprintf(stderr, "unable to determine extent of '%s' "
				"in dimension %d\n", name ? name : "", dim);
			set = isl_set_free(set);
		}
		max = isl_set_dim_max(isl_set_copy(set), dim);

		/* Build the constant 1 on the domain of "max" and add it. */
		domain = isl_pw_aff_get_domain_space(max);
		cst = isl_aff_zero_on_domain(isl_local_space_from_space(domain));
		cst = isl_aff_add_constant_si(cst, 1);
		max = isl_pw_aff_add(max, isl_pw_aff_from_aff(cst));

		size = isl_multi_pw_aff_set_pw_aff(size, dim, max);
	}
	isl_set_free(set);

	return size;
}


================================================
FILE: src/util.h
================================================
#ifndef UTIL_H
#define UTIL_H

#include <string.h>

#include <isl/space.h>
#include <isl/val.h>

#ifdef __cplusplus
extern "C"
{
#endif

	/* Compare the start of "s" to "prefix", looking at no more characters
	 * than the length of "prefix".
	 * The result is that of the corresponding strncmp() call and is
	 * in particular zero if and only if "s" starts with "prefix".
	 */
	static inline int prefixcmp(const char *s, const char *prefix)
	{
		size_t prefix_len = strlen(prefix);

		return strncmp(s, prefix, prefix_len);
	}

	/* Construct an isl_multi_val living in "space" with all values
	 * equal to "val".
	 */
	__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space,
																										int val);
	/* Construct an isl_multi_val living in "space" with values specified
	 * by "list", which is assumed to have at least as many entries
	 * as the set dimension of "space".
	 */
	__isl_give isl_multi_val *ppcg_multi_val_from_int_list(
			__isl_take isl_space *space, int *list);
	/* Compute the size of a bounding box around the origin and "set",
	 * where "set" is assumed to contain only non-negative elements.
	 */
	__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set);

#ifdef __cplusplus
}
#endif

#endif


================================================
FILE: src/version.c
================================================
#include "gitversion.h"

const char *ppcg_version(void)
{
	return GIT_HEAD_ID"\n";
}